From 880f3d5e7f00a9d9b39705b6dffe3b49b7773454 Mon Sep 17 00:00:00 2001 From: Aswin Zayasankaran <156493059+Aswinmcw@users.noreply.github.com> Date: Mon, 9 Dec 2024 16:37:02 +0530 Subject: [PATCH 01/59] [CCL] Add separate CMakeLists.txt for CCL ops (#15649) ### Ticket #15636 ### Problem description Needs separate CMakeLists.txt for CCL ops ### What's changed Add separate CMakeLists.txt for CCL ops ### Checklist - [x] Post commit CI passes - https://github.com/tenstorrent/tt-metal/actions/runs/12139056025 - [x] T3k pipelines - https://github.com/tenstorrent/tt-metal/actions/runs/12139058716 - [ ] Blackhole Post commit (if applicable) - [ ] Model regression CI testing passes (if applicable) - [ ] Device performance regression CI testing passes (if applicable) - [ ] New/Existing tests provide coverage for changes --- ttnn/CMakeLists.txt | 31 +++---------------- ttnn/cpp/ttnn/operations/ccl/CMakeLists.txt | 26 ++++++++++++++++ .../experimental/ccl/CMakeLists.txt | 12 +++++++ 3 files changed, 42 insertions(+), 27 deletions(-) create mode 100644 ttnn/cpp/ttnn/operations/ccl/CMakeLists.txt create mode 100644 ttnn/cpp/ttnn/operations/experimental/ccl/CMakeLists.txt diff --git a/ttnn/CMakeLists.txt b/ttnn/CMakeLists.txt index 723bf1d48330..26c658482223 100644 --- a/ttnn/CMakeLists.txt +++ b/ttnn/CMakeLists.txt @@ -12,34 +12,7 @@ set(ALL_TTNN_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/graph/graph_processor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/graph/graph_trace_utils.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/graph/graph_pybind.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/ccl/erisc_datamover_builder.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/ccl/all_gather/all_gather.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/ccl/all_gather/all_gather_pybind.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/ccl/all_gather/device/all_gather_op.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/ccl/all_gather/device/multi_core/all_gather_op_multi_core.cpp - 
${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/experimental/ccl/all_gather_matmul/all_gather_matmul.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/experimental/ccl/all_gather_matmul/all_gather_matmul_pybind.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/experimental/ccl/all_gather_matmul/device/all_gather_matmul_op.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/experimental/ccl/all_gather_matmul/device/multi_core/all_gather_matmul_op_multi_core.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/experimental/ccl/all_reduce/all_reduce.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/experimental/ccl/all_reduce/all_reduce_pybind.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/experimental/ccl/all_reduce/device/all_reduce_op.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/ccl/ccl_op_fusion.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/ccl/ccl_common.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/ccl/ccl_host_datastructures.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/ccl/common/types/ccl_types_args_emitters.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/ccl/common/uops/ccl_command.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/ccl/reduce_scatter/device/host/reduce_scatter_full_worker_grid.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/ccl/reduce_scatter/device/reduce_scatter_op.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/ccl/reduce_scatter/reduce_scatter.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/ccl/reduce_scatter/reduce_scatter_pybind.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/core/compute_kernel/compute_kernel_config.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/ccl/reduce_scatter/host/reduce_scatter_worker_builder.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/ccl/reduce_scatter/host/reduce_scatter_common.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/ccl/barrier/device/host/barrier_full_worker_grid.cpp - 
${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/ccl/barrier/device/barrier_op.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/ccl/barrier/barrier.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/ccl/barrier/barrier_pybind.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/conv/conv2d/conv2d.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/conv/conv2d/conv2d_utils.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/conv/conv2d/conv2d_pybind.cpp @@ -612,11 +585,15 @@ endforeach() ### Setup TTNN as a shared library with optional Python bindings add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/tensor) +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/ccl) +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations//experimental/ccl) add_subdirectory(cpp/ttnn/deprecated) set(TTNN_FINAL_SRC ${TTNN_SRC} ${QUEUE_SRCS} ${TENSOR_SRCS} + ${CCL_TTNN_SRCS} + ${CCL_EXPERIMENTAL_TTNN_SRCS} ${TT_DNN_SRCS} ) diff --git a/ttnn/cpp/ttnn/operations/ccl/CMakeLists.txt b/ttnn/cpp/ttnn/operations/ccl/CMakeLists.txt new file mode 100644 index 000000000000..148d928be910 --- /dev/null +++ b/ttnn/cpp/ttnn/operations/ccl/CMakeLists.txt @@ -0,0 +1,26 @@ +set(CCL_TTNN_SRCS + # Common + ${CMAKE_CURRENT_SOURCE_DIR}/erisc_datamover_builder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/ccl_op_fusion.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/ccl_common.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/ccl_host_datastructures.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/common/types/ccl_types_args_emitters.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/common/uops/ccl_command.cpp + # CCL Ops + ${CMAKE_CURRENT_SOURCE_DIR}/all_gather/all_gather.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/all_gather/all_gather_pybind.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/all_gather/device/all_gather_op.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/all_gather/device/multi_core/all_gather_op_multi_core.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/reduce_scatter/device/host/reduce_scatter_full_worker_grid.cpp + 
${CMAKE_CURRENT_SOURCE_DIR}/reduce_scatter/device/reduce_scatter_op.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/reduce_scatter/reduce_scatter.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/reduce_scatter/reduce_scatter_pybind.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/reduce_scatter/host/reduce_scatter_worker_builder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/reduce_scatter/host/reduce_scatter_common.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/barrier/device/host/barrier_full_worker_grid.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/barrier/device/barrier_op.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/barrier/barrier.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/barrier/barrier_pybind.cpp + CACHE INTERNAL + "CCL sources to reuse in ttnn build" +) diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/CMakeLists.txt b/ttnn/cpp/ttnn/operations/experimental/ccl/CMakeLists.txt new file mode 100644 index 000000000000..82767c44a093 --- /dev/null +++ b/ttnn/cpp/ttnn/operations/experimental/ccl/CMakeLists.txt @@ -0,0 +1,12 @@ +set(CCL_EXPERIMENTAL_TTNN_SRCS + #Experimental Ops + ${CMAKE_CURRENT_SOURCE_DIR}/all_gather_matmul/all_gather_matmul.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/all_gather_matmul/all_gather_matmul_pybind.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/all_gather_matmul/device/all_gather_matmul_op.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/all_gather_matmul/device/multi_core/all_gather_matmul_op_multi_core.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/all_reduce/all_reduce.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/all_reduce/all_reduce_pybind.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/all_reduce/device/all_reduce_op.cpp + CACHE INTERNAL + "CCL Experimental sources to reuse in ttnn build" +) From 4393945d01a0c263820b84660b126d8b7982a614 Mon Sep 17 00:00:00 2001 From: Sankar Manoj Date: Mon, 9 Dec 2024 18:19:39 +0530 Subject: [PATCH 02/59] #0: Fix Squeezebert (#15828) Force merging to fix model perf. ### Problem description Squeezebert fails after changes to Conv1DConfig. ### What's changed Removed compute kernel args from Conv1dConfig. 
### Checklist - [x] Model regression CI testing passes (https://github.com/tenstorrent/tt-metal/actions/runs/12234614018) --- .../squeezebert/tt/ttnn_functional_squeezebert.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/models/demos/squeezebert/tt/ttnn_functional_squeezebert.py b/models/demos/squeezebert/tt/ttnn_functional_squeezebert.py index 250e302d2c21..7355663e951d 100644 --- a/models/demos/squeezebert/tt/ttnn_functional_squeezebert.py +++ b/models/demos/squeezebert/tt/ttnn_functional_squeezebert.py @@ -71,9 +71,6 @@ def ttnn_conv1d( conv_config = ttnn.Conv1dConfig( dtype=ttnn.bfloat16, weights_dtype=ttnn.bfloat8_b, - math_approx_mode_enabled=math_approx, - fp32_dest_acc_enabled=fp32_accum, - packer_l1_accum_enabled=packer_l1_acc, activation=activation, input_channels_alignment=(16 if use_shallow_conv_variant else 32), deallocate_activation=deallocate_activation, @@ -84,7 +81,13 @@ def ttnn_conv1d( ttnn.TensorMemoryLayout.HEIGHT_SHARDED if height_sharding else ttnn.TensorMemoryLayout.BLOCK_SHARDED ), core_grid=get_shard_grid_from_num_cores(56, device), + ) + compute_config = ttnn.init_device_compute_kernel_config( + device.arch(), math_fidelity=math_fidelity, + math_approx_mode=math_approx, + fp32_dest_acc_en=fp32_accum, + packer_l1_acc=packer_l1_acc, ) [tt_output_tensor_on_device, out_length, weights_device, bias_device] = ttnn.Conv1d( @@ -100,6 +103,7 @@ def ttnn_conv1d( batch_size=tt_input_tensor.shape[0], input_length=tt_input_tensor.shape[1], conv_config=conv_config, + compute_config=compute_config, conv_op_cache={}, debug=debug, groups=groups, From 41ce2b28085bd947c1d65cf1f875760ca5341132 Mon Sep 17 00:00:00 2001 From: Bryan Wilder Field Lozano Date: Mon, 9 Dec 2024 21:28:55 +0530 Subject: [PATCH 03/59] Don't use tensix_types.h in ttnn (#15806) --- .../core/compute_kernel/compute_kernel_config.cpp | 5 +++-- .../device/create_qkv_heads_program_factory.cpp | 8 +++----- 
...e_qkv_heads_from_separate_tensors_device_operation.cpp | 6 ++---- .../cpp/ttnn/operations/reduction/topk/device/topk_op.cpp | 7 +++---- .../reduction/topk/device/topk_program_factory.hpp | 7 +++---- 5 files changed, 14 insertions(+), 19 deletions(-) diff --git a/ttnn/cpp/ttnn/operations/core/compute_kernel/compute_kernel_config.cpp b/ttnn/cpp/ttnn/operations/core/compute_kernel/compute_kernel_config.cpp index 7a249b45264f..90cf3942767f 100644 --- a/ttnn/cpp/ttnn/operations/core/compute_kernel/compute_kernel_config.cpp +++ b/ttnn/cpp/ttnn/operations/core/compute_kernel/compute_kernel_config.cpp @@ -8,8 +8,9 @@ #define DATUMS_PER_ROW 16 -// FIXME: ARCH_NAME specific include -#include "tensix_types.h" // DEST_REGISTER_FULL_SIZE +// This parameter is the same for all supported architectures +// Check this invariant when adding new architectures +#define DEST_REGISTER_FULL_SIZE 64 * 16 namespace ttnn { diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads/device/create_qkv_heads_program_factory.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads/device/create_qkv_heads_program_factory.cpp index 8b0ce2c5053b..b37b6ae1686f 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads/device/create_qkv_heads_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads/device/create_qkv_heads_program_factory.cpp @@ -7,9 +7,6 @@ #include "tt_metal/common/constants.hpp" #include "tt_metal/detail/util.hpp" -// FIXME: ARCH_NAME specific include -#include "tensix_types.h" // L1_SIZE - using namespace tt::constants; using namespace tt; @@ -92,14 +89,15 @@ static inline operation::ProgramWithCallbacks create_heads_combined_qkv_sharded( block_ht * TILE_HEIGHT); uint32_t per_core_tiles = block_ht * block_wt; + const uint32_t l1_size = input_tensor.device()->l1_size_per_core(); auto data_format = tt_metal::datatype_to_dataformat_converter(input_tensor.get_dtype()); uint32_t 
single_tile_size = tile_size(data_format); TT_FATAL( - L1_SIZE >= 2 * per_core_tiles * single_tile_size, + l1_size >= 2 * per_core_tiles * single_tile_size, "Workload of Tiles {} at Tile Size {} (times 2 for output) exceeds L1 capacity {}", per_core_tiles, single_tile_size, - L1_SIZE); + l1_size); std::vector num_tiles_per_group; num_tiles_per_group.reserve(output.size()); diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads_from_separate_tensors/device/create_qkv_heads_from_separate_tensors_device_operation.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads_from_separate_tensors/device/create_qkv_heads_from_separate_tensors_device_operation.cpp index a0fcbe427b60..fa69d508b164 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads_from_separate_tensors/device/create_qkv_heads_from_separate_tensors_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads_from_separate_tensors/device/create_qkv_heads_from_separate_tensors_device_operation.cpp @@ -7,9 +7,6 @@ #include "tt_metal/host_api.hpp" -// FIXME: ARCH_NAME specific include -#include "tensix_types.h" // L1_SIZE - namespace ttnn::operations::experimental::transformer { void CreateQKVHeadsSeparateTensorsDeviceOperation::validate(const std::vector& input_tensors) const { @@ -122,10 +119,11 @@ void CreateQKVHeadsSeparateTensorsDeviceOperation::validate(const std::vectorl1_size_per_core(); const uint32_t single_tile_size = tt::tile_size(tt::tt_metal::datatype_to_dataformat_converter(q_input_tensor.get_dtype())); TT_FATAL( - L1_SIZE >= 2 * (per_core_q_tiles + 2 * per_core_k_tiles) * single_tile_size, "Workload exceeds L1 capacity"); + l1_size >= 2 * (per_core_q_tiles + 2 * per_core_k_tiles) * single_tile_size, "Workload exceeds L1 capacity"); // TODO: Add this back when output is HEIGHT sharded only! 
// TT_FATAL(this->output_mem_config.memory_layout == TensorMemoryLayout::HEIGHT_SHARDED, "Error"); diff --git a/ttnn/cpp/ttnn/operations/reduction/topk/device/topk_op.cpp b/ttnn/cpp/ttnn/operations/reduction/topk/device/topk_op.cpp index 8d96b25ffec5..7ceb4d619161 100644 --- a/ttnn/cpp/ttnn/operations/reduction/topk/device/topk_op.cpp +++ b/ttnn/cpp/ttnn/operations/reduction/topk/device/topk_op.cpp @@ -5,9 +5,6 @@ #include "topk_op.hpp" #include "topk_program_factory.hpp" -// FIXME: ARCH_NAME specific include -#include "tensix_types.h" // L1_SIZE - namespace topk_utils { static inline bool verify_available_cores( @@ -16,6 +13,7 @@ static inline bool verify_available_cores( uint16_t max_dim, CoreCoord grid, uint16_t k, + const uint32_t l1_size, const uint32_t value_tile_size, const uint32_t index_tile_size) { const auto max_cores = grid.y - 1; // reserve one core for the gather - switch to grid.x as it allows for more @@ -30,7 +28,7 @@ static inline bool verify_available_cores( (split_size / tt::constants::TILE_WIDTH) * (value_tile_size + index_tile_size); // we divide the width into split_size chunks and each chunk, as well // as a matching set of indices, is processed by a core - if (num_cores <= max_cores && (memory_cost_gather + memory_cost_local) < L1_SIZE && num_cores > 1) { + if (num_cores <= max_cores && (memory_cost_gather + memory_cost_local) < l1_size && num_cores > 1) { return true; } } @@ -79,6 +77,7 @@ void TopK::validate_with_output_tensors( input_shape[this->dim] / 2, device->compute_with_storage_grid_size(), this->k, + device->l1_size_per_core(), value_tile_size, index_tile_size), "Not enough cores available to run topk operation"); diff --git a/ttnn/cpp/ttnn/operations/reduction/topk/device/topk_program_factory.hpp b/ttnn/cpp/ttnn/operations/reduction/topk/device/topk_program_factory.hpp index ab854db36201..1996aacd555b 100644 --- a/ttnn/cpp/ttnn/operations/reduction/topk/device/topk_program_factory.hpp +++ 
b/ttnn/cpp/ttnn/operations/reduction/topk/device/topk_program_factory.hpp @@ -8,9 +8,6 @@ #include "tt_metal/host_api.hpp" #include "tt_log.h" -// FIXME: ARCH_NAME specific include -#include "tensix_types.h" // L1_SIZE - namespace ttnn::operations::reduction::detail { operation::ProgramWithCallbacks topk_single_core_interleaved( @@ -179,6 +176,7 @@ static inline std::tuple cores_utilized( uint16_t max_dim, CoreCoord grid, uint16_t k, + const uint32_t l1_size, const uint32_t value_tile_size, const uint32_t index_tile_size) { const auto max_cores = grid.y - 1; // reserve one core for the gather - switch to grid.x as it allows for more @@ -193,7 +191,7 @@ static inline std::tuple cores_utilized( (split_size / tt::constants::TILE_WIDTH) * (value_tile_size + index_tile_size); // we divide the width into split_size chunks and each chunk, as well // as a matching set of indices, is processed by a core - if (num_cores <= max_cores && (memory_cost_gather + memory_cost_local) < L1_SIZE && num_cores > 1) { + if (num_cores <= max_cores && (memory_cost_gather + memory_cost_local) < l1_size && num_cores > 1) { return {num_cores + 1, split_size, rem, num_cores * k}; } } @@ -237,6 +235,7 @@ operation::ProgramWithCallbacks topk_multicore_interleaved( input_shape[dim] / 2, device->compute_with_storage_grid_size(), k, + device->l1_size_per_core(), value_tile_size, index_tile_size); From cac4dcbef543f497fe82940d7aa1541d92e58455 Mon Sep 17 00:00:00 2001 From: Raymond Kim Date: Mon, 9 Dec 2024 11:15:04 -0500 Subject: [PATCH 04/59] #0: [skip ci] Add two more pipelines to data collection --- .github/workflows/_produce-data.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/_produce-data.yaml b/.github/workflows/_produce-data.yaml index 7328c0b9b403..c66c5bb57023 100644 --- a/.github/workflows/_produce-data.yaml +++ b/.github/workflows/_produce-data.yaml @@ -22,6 +22,7 @@ on: - "(Single-card) Model perf tests" - "(Single-card) Device perf tests" - "(Single-card) 
Demo tests" + - "(Single-card) Tests for new models" - "Nightly fast dispatch tests" - "(Single-card) Tests for new models" - "(T3K) T3000 demo tests" @@ -40,6 +41,7 @@ on: - "(TGG) TGG frequent tests" - "ttnn - Run sweeps" - "Blackhole post-commit tests" + - "Custom test dispatch" types: - completed From c1a246ef0563d409dd0f6bf4e42d6243dd1a280d Mon Sep 17 00:00:00 2001 From: Sanjay Poojary Date: Mon, 9 Dec 2024 10:31:54 -0600 Subject: [PATCH 05/59] Remove firmware dependency on generated_bank_to_noc_coord_mapping.h (#15819) ### Ticket https://github.com/tenstorrent/tt-metal/issues/12844 ### Problem description Firmware build is dependent on runtime values. ### What's changed This PR is same as this one - https://github.com/tenstorrent/tt-metal/pull/15070. Only difference is the 'uninit' changes has been removed. The original PR changes were reverted since the CI had ND hangs on N300. Removing 'uninit' has fixed the hang. Tested with multiple CI runs. https://github.com/tenstorrent/tt-metal/actions/runs/12222728886 https://github.com/tenstorrent/tt-metal/actions/runs/12206639040 https://github.com/tenstorrent/tt-metal/actions/runs/12227870153 https://github.com/tenstorrent/tt-metal/actions/runs/12237979786 Firmware now declares a global array for dram_bank_to_noc_xy, l1_bank_to_noc_xy, bank_to_dram_offset, bank_to_l1_offset. During build, values are written to L1 memory. Firmware during initialization would copy these values from L1 to the above global arrays. Moved l1_to_local_mem_copy to substitutes.cpp. 
Removed 'inline' keyword as the function is used in multiple places and let LTO decide the inlining ### Checklist - [x] Post commit CI passes - https://github.com/tenstorrent/tt-metal/actions/runs/12227870153 - [x] Blackhole Post commit (if applicable) - https://github.com/tenstorrent/tt-metal/actions/runs/12227923651 - [ ] Model regression CI testing passes (if applicable) - [ ] Device performance regression CI testing passes (if applicable) - [ ] **(For models and ops writers)** Full [new models](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) tests passes - [ ] New/Existing tests provide coverage for changes --- tt_metal/hw/firmware/src/brisc.cc | 10 +- tt_metal/hw/firmware/src/erisc.cc | 10 +- tt_metal/hw/firmware/src/idle_erisc.cc | 10 +- tt_metal/hw/firmware/src/ncrisc.cc | 10 +- tt_metal/hw/firmware/src/slave_idle_erisc.cc | 1 - tt_metal/hw/inc/blackhole/dev_mem_map.h | 11 ++ .../hw/inc/blackhole/eth_l1_address_map.h | 13 ++ tt_metal/hw/inc/dataflow_api.h | 14 +- tt_metal/hw/inc/firmware_common.h | 50 +++---- tt_metal/hw/inc/grayskull/dev_mem_map.h | 14 +- .../hw/inc/grayskull/eth_l1_address_map.h | 2 + tt_metal/hw/inc/wormhole/dev_mem_map.h | 14 +- tt_metal/hw/inc/wormhole/eth_l1_address_map.h | 11 ++ tt_metal/hw/toolchain/substitutes.cpp | 31 +++++ tt_metal/impl/device/device.cpp | 73 ++++++++--- tt_metal/impl/device/device.hpp | 8 +- tt_metal/impl/kernels/kernel.cpp | 2 - tt_metal/jit_build/build.hpp | 1 - tt_metal/jit_build/genfiles.cpp | 124 ------------------ tt_metal/jit_build/genfiles.hpp | 9 -- tt_metal/llrt/blackhole/bh_hal_active_eth.cpp | 4 + tt_metal/llrt/blackhole/bh_hal_idle_eth.cpp | 2 + tt_metal/llrt/blackhole/bh_hal_tensix.cpp | 2 + tt_metal/llrt/grayskull/gs_hal.cpp | 2 + tt_metal/llrt/hal.hpp | 1 + tt_metal/llrt/wormhole/wh_hal_active_eth.cpp | 3 + tt_metal/llrt/wormhole/wh_hal_idle_eth.cpp | 2 + tt_metal/llrt/wormhole/wh_hal_tensix.cpp | 2 + 28 files changed, 239 insertions(+), 197 deletions(-) diff 
--git a/tt_metal/hw/firmware/src/brisc.cc b/tt_metal/hw/firmware/src/brisc.cc index 320b779d936f..5554f2edcf37 100644 --- a/tt_metal/hw/firmware/src/brisc.cc +++ b/tt_metal/hw/firmware/src/brisc.cc @@ -20,7 +20,6 @@ #include "tools/profiler/kernel_profiler.hpp" #include "dev_msgs.h" #include "risc_attribs.h" -#include "generated_bank_to_noc_coord_mapping.h" #include "circular_buffer.h" #include "circular_buffer_init.h" #include "dataflow_api.h" @@ -67,6 +66,13 @@ uint32_t tt_l1_ptr *rta_l1_base __attribute__((used)); uint32_t tt_l1_ptr *crta_l1_base __attribute__((used)); uint32_t tt_l1_ptr *sem_l1_base[ProgrammableCoreType::COUNT] __attribute__((used)); +// These arrays are stored in local memory of FW, but primarily used by the kernel which shares +// FW symbols. Hence mark these as 'used' so that FW compiler doesn't optimize it out. +uint16_t dram_bank_to_noc_xy[NUM_NOCS][NUM_DRAM_BANKS] __attribute__((used)); +uint16_t l1_bank_to_noc_xy[NUM_NOCS][NUM_L1_BANKS] __attribute__((used)); +int32_t bank_to_dram_offset[NUM_DRAM_BANKS] __attribute__((used)); +int32_t bank_to_l1_offset[NUM_L1_BANKS] __attribute__((used)); + #define MEM_MOVER_VIEW_IRAM_BASE_ADDR (0x4 << 12) #if defined(PROFILE_KERNEL) @@ -343,6 +349,8 @@ int main() { do_crt1((uint32_t*)MEM_BRISC_INIT_LOCAL_L1_BASE_SCRATCH); + noc_bank_table_init(MEM_BANK_TO_NOC_SCRATCH); + mailboxes->launch_msg_rd_ptr = 0; // Initialize the rdptr to 0 noc_index = 0; risc_init(); diff --git a/tt_metal/hw/firmware/src/erisc.cc b/tt_metal/hw/firmware/src/erisc.cc index dcf1ffc60a7b..44d760a069c2 100644 --- a/tt_metal/hw/firmware/src/erisc.cc +++ b/tt_metal/hw/firmware/src/erisc.cc @@ -5,7 +5,6 @@ #include "ethernet/dataflow_api.h" #include "ethernet/tunneling.h" #include "firmware_common.h" -#include "generated_bank_to_noc_coord_mapping.h" #include "noc_parameters.h" #include "risc_attribs.h" #include "tools/profiler/kernel_profiler.hpp" @@ -34,6 +33,13 @@ uint32_t tt_l1_ptr *rta_l1_base __attribute__((used)); uint32_t 
tt_l1_ptr *crta_l1_base __attribute__((used)); uint32_t tt_l1_ptr *sem_l1_base[ProgrammableCoreType::COUNT] __attribute__((used)); +// These arrays are stored in local memory of FW, but primarily used by the kernel which shares +// FW symbols. Hence mark these as 'used' so that FW compiler doesn't optimize it out. +uint16_t dram_bank_to_noc_xy[NUM_NOCS][NUM_DRAM_BANKS] __attribute__((used)); +uint16_t l1_bank_to_noc_xy[NUM_NOCS][NUM_L1_BANKS] __attribute__((used)); +int32_t bank_to_dram_offset[NUM_DRAM_BANKS] __attribute__((used)); +int32_t bank_to_l1_offset[NUM_L1_BANKS] __attribute__((used)); + void __attribute__((noinline)) Application(void) { WAYPOINT("I"); @@ -43,6 +49,8 @@ void __attribute__((noinline)) Application(void) { rtos_context_switch_ptr = (void (*)())RtosTable[0]; + noc_bank_table_init(eth_l1_mem::address_map::ERISC_MEM_BANK_TO_NOC_SCRATCH); + risc_init(); noc_init(MEM_NOC_ATOMIC_RET_VAL_ADDR); diff --git a/tt_metal/hw/firmware/src/idle_erisc.cc b/tt_metal/hw/firmware/src/idle_erisc.cc index 4e027e0dd7fb..455629e95c7b 100644 --- a/tt_metal/hw/firmware/src/idle_erisc.cc +++ b/tt_metal/hw/firmware/src/idle_erisc.cc @@ -19,7 +19,6 @@ #include "tools/profiler/kernel_profiler.hpp" #include "dev_msgs.h" #include "risc_attribs.h" -#include "generated_bank_to_noc_coord_mapping.h" #include "circular_buffer.h" #include "dataflow_api.h" @@ -42,6 +41,13 @@ uint32_t tt_l1_ptr *sem_l1_base[ProgrammableCoreType::COUNT] __attribute__((used uint8_t my_x[NUM_NOCS] __attribute__((used)); uint8_t my_y[NUM_NOCS] __attribute__((used)); +// These arrays are stored in local memory of FW, but primarily used by the kernel which shares +// FW symbols. Hence mark these as 'used' so that FW compiler doesn't optimize it out. 
+uint16_t dram_bank_to_noc_xy[NUM_NOCS][NUM_DRAM_BANKS] __attribute__((used)); +uint16_t l1_bank_to_noc_xy[NUM_NOCS][NUM_L1_BANKS] __attribute__((used)); +int32_t bank_to_dram_offset[NUM_DRAM_BANKS] __attribute__((used)); +int32_t bank_to_l1_offset[NUM_L1_BANKS] __attribute__((used)); + //c_tensix_core core; tt_l1_ptr mailboxes_t * const mailboxes = (tt_l1_ptr mailboxes_t *)(MEM_IERISC_MAILBOX_BASE); @@ -101,6 +107,8 @@ int main() { do_crt1((uint32_t *)MEM_IERISC_INIT_LOCAL_L1_BASE_SCRATCH); uint32_t heartbeat = 0; + noc_bank_table_init(MEM_IERISC_BANK_TO_NOC_SCRATCH); + risc_init(); mailboxes->slave_sync.all = RUN_SYNC_MSG_ALL_SLAVES_DONE; diff --git a/tt_metal/hw/firmware/src/ncrisc.cc b/tt_metal/hw/firmware/src/ncrisc.cc index fb3c6e566b3a..ba91c04713b1 100644 --- a/tt_metal/hw/firmware/src/ncrisc.cc +++ b/tt_metal/hw/firmware/src/ncrisc.cc @@ -11,7 +11,6 @@ #include "firmware_common.h" #include "tools/profiler/kernel_profiler.hpp" #include "risc_attribs.h" -#include "generated_bank_to_noc_coord_mapping.h" #include "circular_buffer.h" #include "circular_buffer_init.h" @@ -40,6 +39,13 @@ uint32_t tt_l1_ptr *rta_l1_base __attribute__((used)); uint32_t tt_l1_ptr *crta_l1_base __attribute__((used)); uint32_t tt_l1_ptr *sem_l1_base[ProgrammableCoreType::COUNT] __attribute__((used)); +// These arrays are stored in local memory of FW, but primarily used by the kernel which shares +// FW symbols. Hence mark these as 'used' so that FW compiler doesn't optimize it out. 
+uint16_t dram_bank_to_noc_xy[NUM_NOCS][NUM_DRAM_BANKS] __attribute__((used)); +int32_t bank_to_dram_offset[NUM_DRAM_BANKS] __attribute__((used)); +uint16_t l1_bank_to_noc_xy[NUM_NOCS][NUM_L1_BANKS] __attribute__((used)); +int32_t bank_to_l1_offset[NUM_L1_BANKS] __attribute__((used)); + #if defined(PROFILE_KERNEL) namespace kernel_profiler { uint32_t wIndex __attribute__((used)); @@ -79,6 +85,8 @@ int main(int argc, char *argv[]) { do_crt1((uint32_t tt_l1_ptr *)MEM_NCRISC_INIT_LOCAL_L1_BASE_SCRATCH); + noc_bank_table_init(MEM_BANK_TO_NOC_SCRATCH); + risc_init(); // If NCRISC has IRAM it needs to halt before BRISC copies data from L1 to IRAM diff --git a/tt_metal/hw/firmware/src/slave_idle_erisc.cc b/tt_metal/hw/firmware/src/slave_idle_erisc.cc index 164313f27df6..8e0b4500a7ac 100644 --- a/tt_metal/hw/firmware/src/slave_idle_erisc.cc +++ b/tt_metal/hw/firmware/src/slave_idle_erisc.cc @@ -11,7 +11,6 @@ #include "firmware_common.h" #include "tools/profiler/kernel_profiler.hpp" #include "risc_attribs.h" -#include "generated_bank_to_noc_coord_mapping.h" #include "circular_buffer.h" #include "debug/waypoint.h" diff --git a/tt_metal/hw/inc/blackhole/dev_mem_map.h b/tt_metal/hw/inc/blackhole/dev_mem_map.h index 4f68f18e9af2..3ef1012727ab 100644 --- a/tt_metal/hw/inc/blackhole/dev_mem_map.h +++ b/tt_metal/hw/inc/blackhole/dev_mem_map.h @@ -41,6 +41,11 @@ #define MEM_NCRISC_LOCAL_SIZE (8 * 1024) #define MEM_TRISC_LOCAL_SIZE (4 * 1024) +// Memory for (dram/l1)_bank_to_noc_xy arrays, size needs to be atleast 2 * NUM_NOCS * (NUM_DRAM_BANKS + NUM_L1_BANKS) +#define MEM_BANK_TO_NOC_XY_SIZE 1024 +// Memory for bank_to_dram_offset and bank_to_l1_offset arrays, size needs to be atleast 4 * (NUM_DRAM_BANKS + NUM_L1_BANKS) +#define MEM_BANK_OFFSET_SIZE 1024 + ///////////// // Firmware/kernel code holes #define MEM_BRISC_FIRMWARE_SIZE (5 * 1024 + 128) @@ -91,6 +96,9 @@ #define MEM_TRISC1_INIT_LOCAL_L1_BASE_SCRATCH (MEM_TRISC0_INIT_LOCAL_L1_BASE_SCRATCH + MEM_TRISC_LOCAL_SIZE) #define 
MEM_TRISC2_INIT_LOCAL_L1_BASE_SCRATCH (MEM_TRISC1_INIT_LOCAL_L1_BASE_SCRATCH + MEM_TRISC_LOCAL_SIZE) +#define MEM_BANK_TO_NOC_SCRATCH (MEM_TRISC2_INIT_LOCAL_L1_BASE_SCRATCH + MEM_TRISC_LOCAL_SIZE) +#define MEM_BANK_TO_NOC_SIZE (MEM_BANK_TO_NOC_XY_SIZE + MEM_BANK_OFFSET_SIZE) + ///////////// // Stack info // Increasing the stack size comes at the expense of less local memory for globals @@ -130,6 +138,9 @@ #define MEM_IERISC_STACK_BASE (MEM_LOCAL_BASE + MEM_IERISC_LOCAL_SIZE - MEM_IERISC_STACK_SIZE) #define MEM_SLAVE_IERISC_STACK_BASE (MEM_LOCAL_BASE + MEM_SLAVE_IERISC_LOCAL_SIZE - MEM_SLAVE_IERISC_STACK_SIZE) +#define MEM_IERISC_BANK_TO_NOC_SCRATCH (MEM_SLAVE_IERISC_INIT_LOCAL_L1_BASE_SCRATCH + MEM_SLAVE_IERISC_LOCAL_SIZE) +#define MEM_IERISC_BANK_TO_NOC_SIZE (MEM_BANK_TO_NOC_XY_SIZE + MEM_BANK_OFFSET_SIZE) + ///////////// // Padding/alignment restriction needed in linker scripts for erisc #define MEM_IERISC_KERNEL_PAD 32 diff --git a/tt_metal/hw/inc/blackhole/eth_l1_address_map.h b/tt_metal/hw/inc/blackhole/eth_l1_address_map.h index e99d13af3d4b..05d071dfdb43 100644 --- a/tt_metal/hw/inc/blackhole/eth_l1_address_map.h +++ b/tt_metal/hw/inc/blackhole/eth_l1_address_map.h @@ -26,6 +26,13 @@ struct address_map { static constexpr std::int32_t DATA_BUFFER_SIZE_ETH = 4 * 1024; static constexpr std::int32_t DATA_BUFFER_SIZE_NOC = 16 * 1024; static constexpr std::int32_t DATA_BUFFER_SIZE = 24 * 1024; + // Memory for (dram/l1)_bank_to_noc_xy arrays, size needs to be atleast 2 * NUM_NOCS * (NUM_DRAM_BANKS + + // NUM_L1_BANKS) + static constexpr std::int32_t ERISC_MEM_BANK_TO_NOC_XY_SIZE = 1024; + // Memory for bank_to_dram_offset and bank_to_l1_offset arrays, size needs to be atleast 4 * (NUM_DRAM_BANKS + + // NUM_L1_BANKS) + static constexpr std::int32_t ERISC_MEM_BANK_OFFSET_SIZE = 1024; + // Kernel config buffer is WIP // Size is presently based on the old sizes of the RTAs + CB config + Sems static constexpr std::int32_t ERISC_L1_KERNEL_CONFIG_SIZE = 96 * 4 + 8 * 16; 
@@ -65,6 +72,12 @@ struct address_map { static_assert((ERISC_L1_UNRESERVED_BASE % 32) == 0); + // This scratch address is same as ERISC_L1_UNRESERVED_BASE, as the scratch space is used to copy data during + // runtime build, and is unused once FW copies the data to local memory during FW initialization. + static constexpr std::int32_t ERISC_MEM_BANK_TO_NOC_SCRATCH = + (ERISC_L1_KERNEL_CONFIG_BASE + ERISC_L1_KERNEL_CONFIG_SIZE + 31) & ~31; + static constexpr std::int32_t ERISC_MEM_BANK_TO_NOC_SIZE = ERISC_MEM_BANK_TO_NOC_XY_SIZE + ERISC_MEM_BANK_OFFSET_SIZE; + static constexpr std::int32_t LAUNCH_ERISC_APP_FLAG = L1_EPOCH_Q_BASE + 4; // BIDIR Tunneling Kernel Space diff --git a/tt_metal/hw/inc/dataflow_api.h b/tt_metal/hw/inc/dataflow_api.h index f5ee832f60b5..59f6fc28963e 100644 --- a/tt_metal/hw/inc/dataflow_api.h +++ b/tt_metal/hw/inc/dataflow_api.h @@ -10,9 +10,7 @@ #include "chlkc_unpack_tile_dims.h" #define DATA_FORMATS_DEFINED #endif -#if __has_include("generated_bank_to_noc_coord_mapping.h") -#include "generated_bank_to_noc_coord_mapping.h" -#endif +#include #include @@ -37,9 +35,15 @@ constexpr uint8_t proc_type = static_cast diff --git a/tt_metal/hw/inc/firmware_common.h b/tt_metal/hw/inc/firmware_common.h index c292a7261a86..9f051b32abb1 100644 --- a/tt_metal/hw/inc/firmware_common.h +++ b/tt_metal/hw/inc/firmware_common.h @@ -13,39 +13,17 @@ #include "dev_mem_map.h" #include "hostdevcommon/kernel_structs.h" #include "dev_msgs.h" +#include "noc/noc_parameters.h" +#include "debug/dprint.h" + +extern uint16_t dram_bank_to_noc_xy[NUM_NOCS][NUM_DRAM_BANKS]; +extern int32_t bank_to_dram_offset[NUM_DRAM_BANKS]; +extern uint16_t l1_bank_to_noc_xy[NUM_NOCS][NUM_L1_BANKS]; +extern int32_t bank_to_l1_offset[NUM_L1_BANKS]; extern void kernel_init(uint32_t kernel_init); extern void kernel_launch(uint32_t kernel_base_addr); - -inline void l1_to_local_mem_copy(uint32_t* dst, uint32_t tt_l1_ptr* src, int32_t len) { -#pragma GCC unroll 0 - while (len >= 3) { - auto v0 = 
src[0], v1 = src[1], v2 = src[2]; - // 1) Make sure the optimizer does not think this is memcpy by - // hiding the pointer bookkeeping in an asm. - // 2) The scheduler doesn't know the above loads have 6 cycle - // latency. We emit the 3 bookkeeping adds as a single block - // in the load shadow before the stores. The optimizer will - // not be able to move these. - // 3) We don't need early clobbers here because of the +r - // constraint -- early clobbers would pessimize. - asm inline( - "addi %0,%0,3*%3\n\t" - "addi %1,%1,3*%3\n\t" - "addi %2,%2,-3" - : "+r"(src), "+r"(dst), "+r"(len) - : "i"(sizeof(v0))); - dst[-3] = v0, dst[-2] = v1, dst[-1] = v2; - } - // There are 0, 1 or 2 words of residue. This is smaller than a loop. - // We get smaller code layout by expecting the conditions to be true. - if (__builtin_expect(len >= 1, true)) { - dst[0] = src[0]; - if (__builtin_expect(len >= 2, true)) { - dst[1] = src[1]; - } - } -} +void l1_to_local_mem_copy(uint32_t* dst, uint32_t tt_l1_ptr* src, int32_t len); inline void do_crt1(uint32_t tt_l1_ptr* data_image) { // Clear bss. 
@@ -59,6 +37,18 @@ inline void do_crt1(uint32_t tt_l1_ptr* data_image) { l1_to_local_mem_copy(__ldm_data_start, data_image, __ldm_data_end - __ldm_data_start); } +inline void noc_bank_table_init(uint64_t mem_bank_to_noc_addr) { + int32_t dram_to_noc_size_bytes = sizeof(dram_bank_to_noc_xy); + l1_to_local_mem_copy((uint*)dram_bank_to_noc_xy, (uint tt_l1_ptr*)mem_bank_to_noc_addr, dram_to_noc_size_bytes >> 2); + int32_t l1_to_noc_size_bytes = sizeof(l1_bank_to_noc_xy); + l1_to_local_mem_copy((uint*)l1_bank_to_noc_xy, (uint tt_l1_ptr*)(mem_bank_to_noc_addr + dram_to_noc_size_bytes), l1_to_noc_size_bytes >> 2); + + int32_t dram_offsets_size_bytes = sizeof(bank_to_dram_offset); + l1_to_local_mem_copy((uint*)bank_to_dram_offset, (uint tt_l1_ptr*)(mem_bank_to_noc_addr + dram_to_noc_size_bytes + l1_to_noc_size_bytes), dram_offsets_size_bytes >> 2); + int32_t l1_offsets_size_bytes = sizeof(bank_to_l1_offset); + l1_to_local_mem_copy((uint*)bank_to_l1_offset, (uint tt_l1_ptr*)(mem_bank_to_noc_addr + dram_to_noc_size_bytes + l1_to_noc_size_bytes + dram_offsets_size_bytes), l1_offsets_size_bytes >> 2); +} + FORCE_INLINE uint32_t firmware_config_init( tt_l1_ptr mailboxes_t* const mailboxes, uint32_t core_type_index, uint32_t dispatch_class) { diff --git a/tt_metal/hw/inc/grayskull/dev_mem_map.h b/tt_metal/hw/inc/grayskull/dev_mem_map.h index ba2077838c24..d7d829e73925 100644 --- a/tt_metal/hw/inc/grayskull/dev_mem_map.h +++ b/tt_metal/hw/inc/grayskull/dev_mem_map.h @@ -40,15 +40,20 @@ #define MEM_NCRISC_LOCAL_SIZE (4 * 1024) #define MEM_TRISC_LOCAL_SIZE (2 * 1024) +// Memory for (dram/l1)_bank_to_noc_xy arrays, size needs to be atleast 2 * NUM_NOCS * (NUM_DRAM_BANKS + NUM_L1_BANKS) +#define MEM_BANK_TO_NOC_XY_SIZE 1024 +// Memory for bank_to_dram_offset and bank_to_l1_offset arrays, size needs to be atleast 4 * (NUM_DRAM_BANKS + NUM_L1_BANKS) +#define MEM_BANK_OFFSET_SIZE 1024 + #define NCRISC_HAS_IRAM 1 #define MEM_NCRISC_IRAM_BASE 0xFFC00000 #define MEM_NCRISC_IRAM_SIZE (16 * 
1024) ///////////// // Firmware/kernel code holes -#define MEM_BRISC_FIRMWARE_SIZE (5 * 1024 + 416) +#define MEM_BRISC_FIRMWARE_SIZE (5 * 1024 + 624) // TODO: perhaps put NCRISC FW in the scratch area and free 1.5K after init (GS/WH) -#define MEM_NCRISC_FIRMWARE_SIZE 1616 +#define MEM_NCRISC_FIRMWARE_SIZE 1824 #define MEM_TRISC0_FIRMWARE_SIZE 1536 #define MEM_TRISC1_FIRMWARE_SIZE 1536 #define MEM_TRISC2_FIRMWARE_SIZE 1536 @@ -100,6 +105,9 @@ #define MEM_TRISC1_INIT_LOCAL_L1_BASE_SCRATCH (MEM_TRISC0_INIT_LOCAL_L1_BASE_SCRATCH + MEM_TRISC_LOCAL_SIZE) #define MEM_TRISC2_INIT_LOCAL_L1_BASE_SCRATCH (MEM_TRISC1_INIT_LOCAL_L1_BASE_SCRATCH + MEM_TRISC_LOCAL_SIZE) +#define MEM_BANK_TO_NOC_SCRATCH (MEM_TRISC2_INIT_LOCAL_L1_BASE_SCRATCH + MEM_TRISC_LOCAL_SIZE) +#define MEM_BANK_TO_NOC_SIZE (MEM_BANK_TO_NOC_XY_SIZE + MEM_BANK_OFFSET_SIZE) + ///////////// // Stack info // Increasing the stack size comes at the expense of less local memory for globals @@ -125,5 +133,7 @@ #define MEM_IERISC_MAP_END 0 #define MEM_IERISC_INIT_LOCAL_L1_BASE_SCRATCH 0 #define MEM_IERISC_STACK_SIZE 0 +#define MEM_IERISC_BANK_TO_NOC_SCRATCH 0 +#define MEM_IERISC_BANK_TO_NOC_SIZE 0 #define MEM_IERISC_KERNEL_PAD 0 diff --git a/tt_metal/hw/inc/grayskull/eth_l1_address_map.h b/tt_metal/hw/inc/grayskull/eth_l1_address_map.h index 0ad8580b15b1..26332938fcbc 100644 --- a/tt_metal/hw/inc/grayskull/eth_l1_address_map.h +++ b/tt_metal/hw/inc/grayskull/eth_l1_address_map.h @@ -37,6 +37,8 @@ struct address_map { static constexpr std::int32_t ERISC_L1_UNRESERVED_SIZE = 0; static constexpr std::int32_t ERISC_L1_TUNNEL_BUFFER_SIZE = 0; + static constexpr std::int32_t ERISC_MEM_BANK_TO_NOC_SCRATCH = 0; + static constexpr std::int32_t ERISC_MEM_BANK_TO_NOC_SIZE = 0; static constexpr std::uint32_t RETRAIN_COUNT_ADDR = 0x1EDC; static constexpr std::uint32_t RETRAIN_FORCE_ADDR = 0x1EFC; diff --git a/tt_metal/hw/inc/wormhole/dev_mem_map.h b/tt_metal/hw/inc/wormhole/dev_mem_map.h index c107c20d4b96..0d9e1dd932c7 100644 --- 
a/tt_metal/hw/inc/wormhole/dev_mem_map.h +++ b/tt_metal/hw/inc/wormhole/dev_mem_map.h @@ -41,13 +41,18 @@ #define MEM_NCRISC_LOCAL_SIZE (4 * 1024) #define MEM_TRISC_LOCAL_SIZE (2 * 1024) +// Memory for (dram/l1)_bank_to_noc_xy arrays, size needs to be atleast 2 * NUM_NOCS * (NUM_DRAM_BANKS + NUM_L1_BANKS) +#define MEM_BANK_TO_NOC_XY_SIZE 1024 +// Memory for bank_to_dram_offset and bank_to_l1_offset arrays, size needs to be atleast 4 * (NUM_DRAM_BANKS + NUM_L1_BANKS) +#define MEM_BANK_OFFSET_SIZE 1024 + #define NCRISC_HAS_IRAM 1 #define MEM_NCRISC_IRAM_BASE 0xFFC00000 #define MEM_NCRISC_IRAM_SIZE (16 * 1024) ///////////// // Firmware/kernel code holes -#define MEM_BRISC_FIRMWARE_SIZE (5 * 1024 + 64) +#define MEM_BRISC_FIRMWARE_SIZE (5 * 1024 + 256) // TODO: perhaps put NCRISC FW in the scratch area and free 1.5K after init (GS/WH) #define MEM_NCRISC_FIRMWARE_SIZE 1536 #define MEM_TRISC0_FIRMWARE_SIZE 1536 @@ -102,6 +107,9 @@ #define MEM_TRISC1_INIT_LOCAL_L1_BASE_SCRATCH (MEM_TRISC0_INIT_LOCAL_L1_BASE_SCRATCH + MEM_TRISC_LOCAL_SIZE) #define MEM_TRISC2_INIT_LOCAL_L1_BASE_SCRATCH (MEM_TRISC1_INIT_LOCAL_L1_BASE_SCRATCH + MEM_TRISC_LOCAL_SIZE) +#define MEM_BANK_TO_NOC_SCRATCH (MEM_TRISC2_INIT_LOCAL_L1_BASE_SCRATCH + MEM_TRISC_LOCAL_SIZE) +#define MEM_BANK_TO_NOC_SIZE (MEM_BANK_TO_NOC_XY_SIZE + MEM_BANK_OFFSET_SIZE) + ///////////// // Stack info // Increasing the stack size comes at the expense of less local memory for globals @@ -137,6 +145,10 @@ #define MEM_IERISC_STACK_SIZE 1024 #define MEM_IERISC_STACK_BASE (MEM_LOCAL_BASE + MEM_IERISC_LOCAL_SIZE - MEM_IERISC_STACK_SIZE) +#define MEM_IERISC_BANK_TO_NOC_SCRATCH (MEM_IERISC_INIT_LOCAL_L1_BASE_SCRATCH + MEM_IERISC_LOCAL_SIZE) +#define MEM_IERISC_BANK_TO_NOC_SIZE (MEM_BANK_TO_NOC_XY_SIZE + MEM_BANK_OFFSET_SIZE) + + ///////////// // Padding/alignment restriction needed in linker scripts for erisc #define MEM_IERISC_KERNEL_PAD 32 diff --git a/tt_metal/hw/inc/wormhole/eth_l1_address_map.h 
b/tt_metal/hw/inc/wormhole/eth_l1_address_map.h index 68e67eb92481..39d41601bef9 100644 --- a/tt_metal/hw/inc/wormhole/eth_l1_address_map.h +++ b/tt_metal/hw/inc/wormhole/eth_l1_address_map.h @@ -26,6 +26,11 @@ struct address_map { static constexpr std::int32_t DATA_BUFFER_SIZE_ETH = 4 * 1024; static constexpr std::int32_t DATA_BUFFER_SIZE_NOC = 16 * 1024; static constexpr std::int32_t DATA_BUFFER_SIZE = 24 * 1024; + // Memory for (dram/l1)_bank_to_noc_xy arrays, size needs to be atleast 2 * NUM_NOCS * (NUM_DRAM_BANKS + NUM_L1_BANKS) + static constexpr std::int32_t ERISC_MEM_BANK_TO_NOC_XY_SIZE = 1024; + // Memory for bank_to_dram_offset and bank_to_l1_offset arrays, size needs to be atleast 4 * (NUM_DRAM_BANKS + NUM_L1_BANKS) + static constexpr std::int32_t ERISC_MEM_BANK_OFFSET_SIZE = 1024; + // Kernel config buffer is WIP // Size is presently based on the old sizes of the RTAs + CB config + Sems static constexpr std::int32_t ERISC_L1_KERNEL_CONFIG_SIZE = 96 * 4 + 8 * 16; @@ -65,6 +70,12 @@ struct address_map { static_assert((ERISC_L1_UNRESERVED_BASE % 32) == 0); + // This scratch address is same as ERISC_L1_UNRESERVED_BASE, as the scratch space is used to copy data during + // runtime build, and is unused once FW copies the data to local memory during FW initialization. 
+ static constexpr std::int32_t ERISC_MEM_BANK_TO_NOC_SCRATCH = + (ERISC_L1_KERNEL_CONFIG_BASE + ERISC_L1_KERNEL_CONFIG_SIZE + 31) & ~31; + static constexpr std::int32_t ERISC_MEM_BANK_TO_NOC_SIZE = ERISC_MEM_BANK_TO_NOC_XY_SIZE + ERISC_MEM_BANK_OFFSET_SIZE; + static constexpr std::int32_t LAUNCH_ERISC_APP_FLAG = L1_EPOCH_Q_BASE + 4; // BIDIR Tunneling Kernel Space diff --git a/tt_metal/hw/toolchain/substitutes.cpp b/tt_metal/hw/toolchain/substitutes.cpp index a4e5feb40a0c..45764316f8ce 100644 --- a/tt_metal/hw/toolchain/substitutes.cpp +++ b/tt_metal/hw/toolchain/substitutes.cpp @@ -37,3 +37,34 @@ extern "C" void wzerorange(uint32_t* start, uint32_t* end) { start[-1] = 0; } } + +// Let the LTO decide if this needs to be inline. +void l1_to_local_mem_copy(uint32_t* dst, uint32_t __attribute__((rvtt_l1_ptr))* src, int32_t len) { +#pragma GCC unroll 0 + while (len >= 3) { + auto v0 = src[0], v1 = src[1], v2 = src[2]; + // 1) Make sure the optimizer does not think this is memcpy by + // hiding the pointer bookkeeping in an asm. + // 2) The scheduler doesn't know the above loads have 6 cycle + // latency. We emit the 3 bookkeeping adds as a single block + // in the load shadow before the stores. The optimizer will + // not be able to move these. + // 3) We don't need early clobbers here because of the +r + // constraint -- early clobbers would pessimize. + asm inline( + "addi %0,%0,3*%3\n\t" + "addi %1,%1,3*%3\n\t" + "addi %2,%2,-3" + : "+r"(src), "+r"(dst), "+r"(len) + : "i"(sizeof(v0))); + dst[-3] = v0, dst[-2] = v1, dst[-1] = v2; + } + // There are 0, 1 or 2 words of residue. This is smaller than a loop. + // We get smaller code layout by expecting the conditions to be true. 
+ if (__builtin_expect(len >= 1, true)) { + dst[0] = src[0]; + if (__builtin_expect(len >= 2, true)) { + dst[1] = src[1]; + } + } +} diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp index fe6953d4c230..c5a3c75660e3 100644 --- a/tt_metal/impl/device/device.cpp +++ b/tt_metal/impl/device/device.cpp @@ -7,7 +7,6 @@ #include "tt_metal/device.hpp" #include "common/core_coord.hpp" #include "tt_metal/host_api.hpp" -#include "tt_metal/jit_build/genfiles.hpp" #include "tt_metal/impl/device/device.hpp" #include "tt_metal/impl/trace/trace.hpp" #include "tt_metal/common/core_descriptor.hpp" @@ -28,6 +27,7 @@ #include "tt_metal/impl/sub_device/sub_device_types.hpp" #include "tt_metal/tt_stl/span.hpp" #include "tt_metal/types.hpp" +#include "noc/noc_parameters.h" // FIXME: ARCH_NAME specific #include "eth_l1_address_map.h" @@ -407,13 +407,36 @@ void Device::build_firmware() { log_debug(tt::LogMetal, "Building base firmware for device {}", this->id_); ZoneScoped; - this->generate_device_headers(this->build_env_.get_out_firmware_root_path()); jit_build_set(this->firmware_build_states_, nullptr); } +void Device::initialize_device_bank_to_noc_tables(const HalProgrammableCoreType &core_type, CoreCoord phys_core) +{ + const uint32_t dram_to_noc_sz_in_bytes = dram_bank_to_noc_xy_.size() * sizeof(uint16_t); + const uint32_t l1_to_noc_sz_in_bytes = l1_bank_to_noc_xy_.size() * sizeof(uint16_t); + const uint32_t dram_offset_sz_in_bytes = dram_bank_offset_map_.size() * sizeof(int32_t); + const uint32_t l1_offset_sz_in_bytes = l1_bank_offset_map_.size() * sizeof(int32_t); + + const uint64_t mem_bank_to_noc_addr = hal.get_dev_addr(core_type, HalL1MemAddrType::BANK_TO_NOC_SCRATCH); + const uint32_t mem_bank_to_noc_size = hal.get_dev_size(core_type, HalL1MemAddrType::BANK_TO_NOC_SCRATCH); + + TT_ASSERT((dram_to_noc_sz_in_bytes + l1_to_noc_sz_in_bytes + dram_offset_sz_in_bytes + l1_offset_sz_in_bytes) <= mem_bank_to_noc_size, + "Size of bank_to_noc table is greater 
than available space"); + + tt::Cluster::instance().write_core(&dram_bank_to_noc_xy_[0], dram_to_noc_sz_in_bytes, tt_cxy_pair(this->id(), phys_core), mem_bank_to_noc_addr); + uint64_t l1_noc_addr = mem_bank_to_noc_addr + dram_to_noc_sz_in_bytes; + tt::Cluster::instance().write_core(&l1_bank_to_noc_xy_[0], l1_to_noc_sz_in_bytes, tt_cxy_pair(this->id(), phys_core), l1_noc_addr); + + uint64_t dram_offset_addr = l1_noc_addr + l1_to_noc_sz_in_bytes; + tt::Cluster::instance().write_core(&dram_bank_offset_map_[0], dram_offset_sz_in_bytes, tt_cxy_pair(this->id(), phys_core), dram_offset_addr); + uint64_t l1_offset_addr = dram_offset_addr + dram_offset_sz_in_bytes; + tt::Cluster::instance().write_core(&l1_bank_offset_map_[0], l1_offset_sz_in_bytes, tt_cxy_pair(this->id(), phys_core), l1_offset_addr); +} + void Device::initialize_firmware(const HalProgrammableCoreType &core_type, CoreCoord phys_core, launch_msg_t *launch_msg, go_msg_t* go_msg) { ZoneScoped; + this->initialize_device_bank_to_noc_tables(core_type, phys_core); uint32_t core_type_idx = hal.get_programmable_core_type_index(core_type); uint32_t processor_class_count = hal.get_processor_classes_count(core_type); @@ -2948,6 +2971,7 @@ bool Device::initialize(const uint8_t num_hw_cqs, size_t l1_small_size, size_t t this->initialize_cluster(); this->initialize_default_sub_device_state(l1_small_size, trace_region_size, l1_bank_remap); this->initialize_build(); + this->generate_device_bank_to_noc_tables(); // For minimal setup, don't initialize FW, watcher, dprint. They won't work if we're attaching to a hung chip. 
if (minimal) @@ -3553,37 +3577,48 @@ void Device::MarkAllocationsSafe() { tt::tt_metal::allocator::mark_allocations_safe(*this->get_initialized_allocator()); } -void Device::generate_device_headers(const std::string &path) const +void Device::generate_device_bank_to_noc_tables() { const size_t num_dram_banks = this->num_banks(BufferType::DRAM); - const size_t num_dram_banks_pow2 = std::pow(2, std::ceil(std::log2(num_dram_banks))); std::vector dram_noc_coord_per_bank(num_dram_banks); - std::vector dram_offsets_per_bank(num_dram_banks); + dram_bank_offset_map_.clear(); + dram_bank_offset_map_.resize(num_dram_banks); for (unsigned bank_id = 0; bank_id < num_dram_banks; bank_id++) { dram_noc_coord_per_bank[bank_id] = this->dram_core_from_dram_channel(this->dram_channel_from_bank_id(bank_id)); - dram_offsets_per_bank[bank_id] = this->bank_offset(BufferType::DRAM, bank_id); + dram_bank_offset_map_[bank_id] = this->bank_offset(BufferType::DRAM, bank_id); } const size_t num_l1_banks = this->num_banks(BufferType::L1); - const size_t num_l1_banks_pow2 = std::pow(2, std::ceil(std::log2(num_l1_banks))); std::vector l1_noc_coord_per_bank(num_l1_banks); - std::vector l1_offset_per_bank(num_l1_banks); + l1_bank_offset_map_.clear(); + l1_bank_offset_map_.resize(num_l1_banks); for (unsigned bank_id = 0; bank_id < num_l1_banks; bank_id++) { l1_noc_coord_per_bank[bank_id] = this->worker_core_from_logical_core(this->logical_core_from_bank_id(bank_id)); - l1_offset_per_bank[bank_id] = this->bank_offset(BufferType::L1, bank_id); + l1_bank_offset_map_[bank_id] = this->bank_offset(BufferType::L1, bank_id); } const metal_SocDescriptor& soc_d = tt::Cluster::instance().get_soc_desc(this->id()); - // Generate header file in proper location - jit_build_genfiles_bank_to_noc_coord_descriptor ( - path, - soc_d.grid_size, - dram_noc_coord_per_bank, - dram_offsets_per_bank, - l1_noc_coord_per_bank, - l1_offset_per_bank, - this->get_allocator_alignment() - ); + dram_bank_to_noc_xy_.clear(); + 
dram_bank_to_noc_xy_.reserve(tt::tt_metal::hal.get_num_nocs() * dram_noc_coord_per_bank.size()); + for (unsigned int noc = 0; noc < tt::tt_metal::hal.get_num_nocs(); noc++) { + for (unsigned int bank_id = 0; bank_id < dram_noc_coord_per_bank.size(); bank_id++) { + uint16_t noc_x = tt::tt_metal::hal.noc_coordinate(noc, soc_d.grid_size.x, dram_noc_coord_per_bank[bank_id].x); + uint16_t noc_y = tt::tt_metal::hal.noc_coordinate(noc, soc_d.grid_size.y, dram_noc_coord_per_bank[bank_id].y); + uint16_t xy = ((noc_y << NOC_ADDR_NODE_ID_BITS) | noc_x) << NOC_COORD_REG_OFFSET; + dram_bank_to_noc_xy_.push_back(xy); + } + } + + l1_bank_to_noc_xy_.clear(); + l1_bank_to_noc_xy_.reserve(tt::tt_metal::hal.get_num_nocs() * l1_noc_coord_per_bank.size()); + for (unsigned int noc = 0; noc < tt::tt_metal::hal.get_num_nocs(); noc++) { + for (unsigned int bank_id = 0; bank_id < l1_noc_coord_per_bank.size(); bank_id++) { + uint16_t noc_x = tt::tt_metal::hal.noc_coordinate(noc, soc_d.grid_size.x, l1_noc_coord_per_bank[bank_id].x); + uint16_t noc_y = tt::tt_metal::hal.noc_coordinate(noc, soc_d.grid_size.y, l1_noc_coord_per_bank[bank_id].y); + uint16_t xy = ((noc_y << NOC_ADDR_NODE_ID_BITS) | noc_x) << NOC_COORD_REG_OFFSET; + l1_bank_to_noc_xy_.push_back(xy); + } + } } size_t Device::get_device_kernel_defines_hash() { diff --git a/tt_metal/impl/device/device.hpp b/tt_metal/impl/device/device.hpp index 045a1097aacc..616a831e0462 100644 --- a/tt_metal/impl/device/device.hpp +++ b/tt_metal/impl/device/device.hpp @@ -231,7 +231,7 @@ class Device { // machine inf float sfpu_inf() const; - void generate_device_headers(const std::string &path) const; + void generate_device_bank_to_noc_tables(); const JitBuildEnv& build_env() const { return this->build_env_; } const string build_firmware_target_path(uint32_t programmable_core, uint32_t processor_class, int i) const; const string build_kernel_target_path(uint32_t programmable_core, uint32_t processor_class, int i, const string& kernel_name) const; @@ 
-259,6 +259,7 @@ class Device { void initialize_build(); void initialize_device_kernel_defines(); void build_firmware(); + void initialize_device_bank_to_noc_tables(const HalProgrammableCoreType &core_type, CoreCoord phys_core); void initialize_firmware(const HalProgrammableCoreType &core_type, CoreCoord phys_core, launch_msg_t *launch_msg, go_msg_t* go_msg); void reset_cores(); void initialize_and_launch_firmware(); @@ -396,6 +397,11 @@ class Device { SubDeviceManagerId next_sub_device_manager_id_ = {0}; SubDeviceManagerId default_sub_device_manager_id_ = {0}; detail::SubDeviceManager *default_sub_device_manager_ = nullptr; + + std::vector dram_bank_offset_map_; + std::vector l1_bank_offset_map_; + std::vector dram_bank_to_noc_xy_; + std::vector l1_bank_to_noc_xy_; }; } // namespace v0 diff --git a/tt_metal/impl/kernels/kernel.cpp b/tt_metal/impl/kernels/kernel.cpp index d21d2c1735d2..a3f67470d217 100644 --- a/tt_metal/impl/kernels/kernel.cpp +++ b/tt_metal/impl/kernels/kernel.cpp @@ -335,7 +335,6 @@ void ComputeKernel::set_build_options(JitBuildOptions &build_options) const { void DataMovementKernel::generate_binaries(Device *device, JitBuildOptions &build_options) const { jit_build_genfiles_kernel_include(device->build_env(), *this, this->kernel_src_); - device->generate_device_headers(build_options.path); uint32_t tensix_core_type = hal.get_programmable_core_type_index(this->get_kernel_programmable_core_type()); uint32_t dm_class_idx = magic_enum::enum_integer(HalProcessorClassType::DM); int riscv_id = static_cast::type>(this->config_.processor); @@ -344,7 +343,6 @@ void DataMovementKernel::generate_binaries(Device *device, JitBuildOptions &buil void EthernetKernel::generate_binaries(Device *device, JitBuildOptions &build_options) const { jit_build_genfiles_kernel_include(device->build_env(), *this, this->kernel_src_); - device->generate_device_headers(build_options.path); uint32_t erisc_core_type = 
hal.get_programmable_core_type_index(this->get_kernel_programmable_core_type()); uint32_t dm_class_idx = magic_enum::enum_integer(HalProcessorClassType::DM); int erisc_id = magic_enum::enum_integer(this->config_.processor); diff --git a/tt_metal/jit_build/build.hpp b/tt_metal/jit_build/build.hpp index 45c153439f0d..ccd4a7860d25 100644 --- a/tt_metal/jit_build/build.hpp +++ b/tt_metal/jit_build/build.hpp @@ -50,7 +50,6 @@ class JitBuildEnv { tt::ARCH get_arch() const { return arch_; } const string& get_root_path() const { return root_; } const string& get_out_root_path() const { return out_root_; } - const string& get_out_firmware_root_path() const { return out_firmware_root_; } const string& get_out_kernel_root_path() const { return out_kernel_root_; } private: diff --git a/tt_metal/jit_build/genfiles.cpp b/tt_metal/jit_build/genfiles.cpp index a008db74e1ea..ab920c1d1b05 100644 --- a/tt_metal/jit_build/genfiles.cpp +++ b/tt_metal/jit_build/genfiles.cpp @@ -451,128 +451,4 @@ void jit_build_genfiles_descriptors(const JitBuildEnv& env, JitBuildOptions& opt } } -std::string generate_bank_to_noc_coord_descriptor_string( - tt_xy_pair grid_size, - std::vector& dram_bank_map, - std::vector& dram_bank_offset_map, - std::vector& l1_bank_map, - std::vector& l1_bank_offset_map, - uint32_t allocator_alignment) { - stringstream ss; - - ss << "// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc." << endl; - ss << "//" << endl; - ss << "// SPDX-License-Identifier: Apache-2.0" << endl; - ss << endl; - ss << "/*" << endl; - ss << " * This file is autogenerated by tt-metal runtime" << endl; - ss << " * DO NOT EDIT" << endl; - ss << " * This file contains values that are visible to the device compiled code." << endl; - ss << " * CAREFUL: when included in the FW_BUILD, it defines global variables." << endl; - ss << " * When included in KERNEL_BUILD, it declares global variables." 
<< endl; - ss << " */" << endl; - ss << endl; - ss << "#pragma once" << endl; - ss << endl; - ss << "#include " << endl; - ss << endl; - - ss << "static_assert(NUM_NOCS == 2);" << endl; - ss << endl; - - ss << "#ifdef KERNEL_BUILD" << endl; - ss << endl; - ss << "extern uint16_t dram_bank_to_noc_xy[NUM_NOCS][NUM_DRAM_BANKS];" << endl; - ss << "extern int32_t bank_to_dram_offset[NUM_DRAM_BANKS];" << endl; - ss << "extern uint16_t l1_bank_to_noc_xy[NUM_NOCS][NUM_L1_BANKS];" << endl; - ss << "extern int32_t bank_to_l1_offset[NUM_L1_BANKS];" << endl; - - ss << endl; - ss << "#else // !KERNEL_BUILD (FW_BUILD)" << endl; - ss << endl; - - ss << "uint16_t dram_bank_to_noc_xy[NUM_NOCS][NUM_DRAM_BANKS] __attribute__((used)) = {" << endl; - for (unsigned int noc = 0; noc < 2; noc++) { - ss << " {" - << "\t// noc=" << noc << endl; - for (unsigned int bank_id = 0; bank_id < dram_bank_map.size(); bank_id++) { - uint16_t noc_x = tt::tt_metal::hal.noc_coordinate(noc, grid_size.x, dram_bank_map[bank_id].x); - uint16_t noc_y = tt::tt_metal::hal.noc_coordinate(noc, grid_size.y, dram_bank_map[bank_id].y); - ss << " (((" << noc_y << " << NOC_ADDR_NODE_ID_BITS) | " << noc_x << ") << NOC_COORD_REG_OFFSET)," - << "\t// NOC_X=" << noc_x << " NOC_Y=" << noc_y << endl; - } - ss << " }," << endl; - } - ss << "};" << endl; - ss << endl; - ss << "int32_t bank_to_dram_offset[NUM_DRAM_BANKS] __attribute__((used)) = {" << endl; - for (unsigned int bank_id = 0; bank_id < dram_bank_map.size(); bank_id++) { - ss << " " << dram_bank_offset_map[bank_id] << "," << endl; - } - ss << "};" << endl; - ss << endl; - - ss << "uint16_t l1_bank_to_noc_xy[NUM_NOCS][NUM_L1_BANKS] __attribute__((used)) = {" << endl; - for (unsigned int noc = 0; noc < 2; noc++) { - ss << " {" - << "\t// noc=" << noc << endl; - for (unsigned int bank_id = 0; bank_id < l1_bank_map.size(); bank_id++) { - uint16_t noc_x = tt::tt_metal::hal.noc_coordinate(noc, grid_size.x, l1_bank_map[bank_id].x); - uint16_t noc_y = 
tt::tt_metal::hal.noc_coordinate(noc, grid_size.y, l1_bank_map[bank_id].y); - ss << " (((" << noc_y << " << NOC_ADDR_NODE_ID_BITS) | " << noc_x << ") << NOC_COORD_REG_OFFSET)," - << "\t// NOC_X=" << noc_x << " NOC_Y=" << noc_y << endl; - } - ss << " }," << endl; - } - ss << "};" << endl; - ss << endl; - ss << "int32_t bank_to_l1_offset[NUM_L1_BANKS] __attribute__((used)) = {" << endl; - for (unsigned int bank_id = 0; bank_id < l1_bank_map.size(); bank_id++) { - ss << " " << l1_bank_offset_map[bank_id] << "," << endl; - } - ss << "};" << endl; - ss << endl; - - ss << "#endif // FW_BUILD" << endl; - - return ss.str(); -} -void jit_build_genfiles_bank_to_noc_coord_descriptor( - const string& path, - tt_xy_pair grid_size, - std::vector& dram_bank_map, - std::vector& dram_bank_offset_map, - std::vector& l1_bank_map, - std::vector& l1_bank_offset_map, - uint32_t allocator_alignment) { - string output_string = generate_bank_to_noc_coord_descriptor_string( - grid_size, - dram_bank_map, - dram_bank_offset_map, - l1_bank_map, - l1_bank_offset_map, - allocator_alignment); - - fs::create_directories(path + "/brisc"); - ofstream file_stream_br(path + "/brisc/generated_bank_to_noc_coord_mapping.h"); - file_stream_br << output_string; - file_stream_br.close(); - fs::create_directories(path + "/ncrisc"); - ofstream file_stream_nc(path + "/ncrisc/generated_bank_to_noc_coord_mapping.h"); - file_stream_nc << output_string; - file_stream_nc.close(); - fs::create_directories(path + "/erisc"); - ofstream file_stream_ec(path + "/erisc/generated_bank_to_noc_coord_mapping.h"); - file_stream_ec << output_string; - file_stream_ec.close(); - fs::create_directories(path + "/idle_erisc"); - ofstream file_stream_iec(path + "/idle_erisc/generated_bank_to_noc_coord_mapping.h"); - file_stream_iec << output_string; - file_stream_iec.close(); - fs::create_directories(path + "/slave_idle_erisc"); - ofstream file_stream_siec(path + "/slave_idle_erisc/generated_bank_to_noc_coord_mapping.h"); - 
file_stream_siec << output_string; - file_stream_siec.close(); -} - } // namespace tt::tt_metal diff --git a/tt_metal/jit_build/genfiles.hpp b/tt_metal/jit_build/genfiles.hpp index 4dee07a44ab3..c21459daabda 100644 --- a/tt_metal/jit_build/genfiles.hpp +++ b/tt_metal/jit_build/genfiles.hpp @@ -21,15 +21,6 @@ void jit_build_genfiles_kernel_include( void jit_build_genfiles_triscs_src( const JitBuildEnv& env, const JitBuildSettings& settings, const KernelSource& kernel_src); -void jit_build_genfiles_bank_to_noc_coord_descriptor( - const std::string& path, - tt_xy_pair grid_size, - std::vector& dram_bank_map, - std::vector& dram_bank_offset_map, - std::vector& l1_bank_map, - std::vector& l1_bank_offset_map, - uint32_t allocator_alignment); - void jit_build_genfiles_descriptors(const JitBuildEnv& env, JitBuildOptions& options); } // namespace tt::tt_metal diff --git a/tt_metal/llrt/blackhole/bh_hal_active_eth.cpp b/tt_metal/llrt/blackhole/bh_hal_active_eth.cpp index 021f58f10752..2fe01d1cd57d 100644 --- a/tt_metal/llrt/blackhole/bh_hal_active_eth.cpp +++ b/tt_metal/llrt/blackhole/bh_hal_active_eth.cpp @@ -46,6 +46,8 @@ HalCoreInfoType create_active_eth_mem_map() { GET_ETH_MAILBOX_ADDRESS_HOST(launch_msg_rd_ptr); mem_map_bases[static_cast(HalL1MemAddrType::FW_VERSION_ADDR)] = eth_l1_mem::address_map::FW_VERSION_ADDR; + mem_map_bases[static_cast(HalL1MemAddrType::BANK_TO_NOC_SCRATCH)] = + eth_l1_mem::address_map::ERISC_MEM_BANK_TO_NOC_SCRATCH; std::vector mem_map_sizes; mem_map_sizes.resize(static_cast(HalL1MemAddrType::COUNT)); @@ -65,6 +67,8 @@ HalCoreInfoType create_active_eth_mem_map() { mem_map_sizes[static_cast(HalL1MemAddrType::GO_MSG)] = sizeof(go_msg_t); mem_map_sizes[static_cast(HalL1MemAddrType::LAUNCH_MSG_BUFFER_RD_PTR)] = sizeof(std::uint32_t); mem_map_sizes[static_cast(HalL1MemAddrType::FW_VERSION_ADDR)] = sizeof(std::uint32_t); + mem_map_sizes[static_cast(HalL1MemAddrType::BANK_TO_NOC_SCRATCH)] = + eth_l1_mem::address_map::ERISC_MEM_BANK_TO_NOC_SIZE; 
std::vector> processor_classes(NumEthDispatchClasses - 1); std::vector processor_types(1); diff --git a/tt_metal/llrt/blackhole/bh_hal_idle_eth.cpp b/tt_metal/llrt/blackhole/bh_hal_idle_eth.cpp index f7f91ed7f441..72ba9e91a226 100644 --- a/tt_metal/llrt/blackhole/bh_hal_idle_eth.cpp +++ b/tt_metal/llrt/blackhole/bh_hal_idle_eth.cpp @@ -49,6 +49,7 @@ HalCoreInfoType create_idle_eth_mem_map() { mem_map_bases[static_cast(HalL1MemAddrType::GO_MSG)] = GET_IERISC_MAILBOX_ADDRESS_HOST(go_message); mem_map_bases[static_cast(HalL1MemAddrType::LAUNCH_MSG_BUFFER_RD_PTR)] = GET_IERISC_MAILBOX_ADDRESS_HOST(launch_msg_rd_ptr); + mem_map_bases[static_cast(HalL1MemAddrType::BANK_TO_NOC_SCRATCH)] = MEM_IERISC_BANK_TO_NOC_SCRATCH; std::vector mem_map_sizes; mem_map_sizes.resize(static_cast(HalL1MemAddrType::COUNT)); @@ -66,6 +67,7 @@ HalCoreInfoType create_idle_eth_mem_map() { ; mem_map_sizes[static_cast(HalL1MemAddrType::GO_MSG)] = sizeof(go_msg_t); mem_map_sizes[static_cast(HalL1MemAddrType::LAUNCH_MSG_BUFFER_RD_PTR)] = sizeof(std::uint32_t); + mem_map_sizes[static_cast(HalL1MemAddrType::BANK_TO_NOC_SCRATCH)] = MEM_IERISC_BANK_TO_NOC_SIZE; std::vector> processor_classes(NumEthDispatchClasses); std::vector processor_types(1); diff --git a/tt_metal/llrt/blackhole/bh_hal_tensix.cpp b/tt_metal/llrt/blackhole/bh_hal_tensix.cpp index d0414dcfbc04..eb17f10bf112 100644 --- a/tt_metal/llrt/blackhole/bh_hal_tensix.cpp +++ b/tt_metal/llrt/blackhole/bh_hal_tensix.cpp @@ -46,6 +46,7 @@ HalCoreInfoType create_tensix_mem_map() { mem_map_bases[static_cast(HalL1MemAddrType::LAUNCH_MSG_BUFFER_RD_PTR)] = GET_MAILBOX_ADDRESS_HOST(launch_msg_rd_ptr); mem_map_bases[static_cast(HalL1MemAddrType::LOCAL)] = MEM_LOCAL_BASE; + mem_map_bases[static_cast(HalL1MemAddrType::BANK_TO_NOC_SCRATCH)] = MEM_BANK_TO_NOC_SCRATCH; std::vector mem_map_sizes; mem_map_sizes.resize(static_cast(HalL1MemAddrType::COUNT)); @@ -62,6 +63,7 @@ HalCoreInfoType create_tensix_mem_map() { 
mem_map_sizes[static_cast(HalL1MemAddrType::GO_MSG)] = sizeof(go_msg_t); mem_map_sizes[static_cast(HalL1MemAddrType::LAUNCH_MSG_BUFFER_RD_PTR)] = sizeof(uint32_t); mem_map_sizes[static_cast(HalL1MemAddrType::LOCAL)] = MEM_TRISC_LOCAL_SIZE; // TRISC, BRISC, or NCRISC? + mem_map_sizes[static_cast(HalL1MemAddrType::BANK_TO_NOC_SCRATCH)] = MEM_BANK_TO_NOC_SIZE; std::vector> processor_classes(NumTensixDispatchClasses); std::vector processor_types; diff --git a/tt_metal/llrt/grayskull/gs_hal.cpp b/tt_metal/llrt/grayskull/gs_hal.cpp index 5477beeec650..71a889179b8d 100644 --- a/tt_metal/llrt/grayskull/gs_hal.cpp +++ b/tt_metal/llrt/grayskull/gs_hal.cpp @@ -61,6 +61,7 @@ void Hal::initialize_gs() { mem_map_bases[static_cast(HalL1MemAddrType::LAUNCH_MSG_BUFFER_RD_PTR)] = GET_MAILBOX_ADDRESS_HOST(launch_msg_rd_ptr); mem_map_bases[static_cast(HalL1MemAddrType::LOCAL)] = MEM_LOCAL_BASE; + mem_map_bases[static_cast(HalL1MemAddrType::BANK_TO_NOC_SCRATCH)] = MEM_BANK_TO_NOC_SCRATCH; std::vector mem_map_sizes; mem_map_sizes.resize(static_cast(HalL1MemAddrType::COUNT)); @@ -77,6 +78,7 @@ void Hal::initialize_gs() { mem_map_sizes[static_cast(HalL1MemAddrType::GO_MSG)] = sizeof(go_msg_t); mem_map_sizes[static_cast(HalL1MemAddrType::LAUNCH_MSG_BUFFER_RD_PTR)] = sizeof(uint32_t); mem_map_sizes[static_cast(HalL1MemAddrType::LOCAL)] = MEM_TRISC_LOCAL_SIZE; // TRISC, BRISC, or NCRISC? 
+ mem_map_sizes[static_cast(HalL1MemAddrType::BANK_TO_NOC_SCRATCH)] = MEM_BANK_TO_NOC_SIZE; std::vector> processor_classes(NumTensixDispatchClasses); std::vector processor_types; diff --git a/tt_metal/llrt/hal.hpp b/tt_metal/llrt/hal.hpp index f7da19e2f976..80e880026961 100644 --- a/tt_metal/llrt/hal.hpp +++ b/tt_metal/llrt/hal.hpp @@ -51,6 +51,7 @@ enum class HalL1MemAddrType : uint8_t { LAUNCH_MSG_BUFFER_RD_PTR, FW_VERSION_ADDR, // Really only applicable to active eth core right now LOCAL, + BANK_TO_NOC_SCRATCH, COUNT // Keep this last so it always indicates number of enum options }; diff --git a/tt_metal/llrt/wormhole/wh_hal_active_eth.cpp b/tt_metal/llrt/wormhole/wh_hal_active_eth.cpp index 0d1241020c5b..c0af4cc0bd72 100644 --- a/tt_metal/llrt/wormhole/wh_hal_active_eth.cpp +++ b/tt_metal/llrt/wormhole/wh_hal_active_eth.cpp @@ -43,6 +43,8 @@ HalCoreInfoType create_active_eth_mem_map() { GET_ETH_MAILBOX_ADDRESS_HOST(launch_msg_rd_ptr); mem_map_bases[static_cast(HalL1MemAddrType::FW_VERSION_ADDR)] = eth_l1_mem::address_map::FW_VERSION_ADDR; + mem_map_bases[static_cast(HalL1MemAddrType::BANK_TO_NOC_SCRATCH)] = + eth_l1_mem::address_map::ERISC_MEM_BANK_TO_NOC_SCRATCH; std::vector mem_map_sizes; mem_map_sizes.resize(static_cast(HalL1MemAddrType::COUNT)); @@ -62,6 +64,7 @@ HalCoreInfoType create_active_eth_mem_map() { mem_map_sizes[static_cast(HalL1MemAddrType::GO_MSG)] = sizeof(go_msg_t); mem_map_sizes[static_cast(HalL1MemAddrType::LAUNCH_MSG_BUFFER_RD_PTR)] = sizeof(uint32_t); mem_map_sizes[static_cast(HalL1MemAddrType::FW_VERSION_ADDR)] = sizeof(std::uint32_t); + mem_map_sizes[static_cast(HalL1MemAddrType::BANK_TO_NOC_SCRATCH)] = eth_l1_mem::address_map::ERISC_MEM_BANK_TO_NOC_SIZE; std::vector> processor_classes(NumEthDispatchClasses); std::vector processor_types(1); diff --git a/tt_metal/llrt/wormhole/wh_hal_idle_eth.cpp b/tt_metal/llrt/wormhole/wh_hal_idle_eth.cpp index a2ce00faf433..6a5b617a3d25 100644 --- a/tt_metal/llrt/wormhole/wh_hal_idle_eth.cpp +++ 
b/tt_metal/llrt/wormhole/wh_hal_idle_eth.cpp @@ -49,6 +49,7 @@ HalCoreInfoType create_idle_eth_mem_map() { mem_map_bases[static_cast(HalL1MemAddrType::GO_MSG)] = GET_IERISC_MAILBOX_ADDRESS_HOST(go_message); mem_map_bases[static_cast(HalL1MemAddrType::LAUNCH_MSG_BUFFER_RD_PTR)] = GET_IERISC_MAILBOX_ADDRESS_HOST(launch_msg_rd_ptr); + mem_map_bases[static_cast(HalL1MemAddrType::BANK_TO_NOC_SCRATCH)] = MEM_IERISC_BANK_TO_NOC_SCRATCH; std::vector mem_map_sizes; mem_map_sizes.resize(static_cast(HalL1MemAddrType::COUNT)); @@ -66,6 +67,7 @@ HalCoreInfoType create_idle_eth_mem_map() { ; mem_map_sizes[static_cast(HalL1MemAddrType::GO_MSG)] = sizeof(go_msg_t); mem_map_sizes[static_cast(HalL1MemAddrType::LAUNCH_MSG_BUFFER_RD_PTR)] = sizeof(std::uint32_t); + mem_map_sizes[static_cast(HalL1MemAddrType::BANK_TO_NOC_SCRATCH)] = MEM_IERISC_BANK_TO_NOC_SIZE; std::vector> processor_classes(NumEthDispatchClasses); std::vector processor_types(1); diff --git a/tt_metal/llrt/wormhole/wh_hal_tensix.cpp b/tt_metal/llrt/wormhole/wh_hal_tensix.cpp index 7de8185bacb8..e4d6c42981e6 100644 --- a/tt_metal/llrt/wormhole/wh_hal_tensix.cpp +++ b/tt_metal/llrt/wormhole/wh_hal_tensix.cpp @@ -47,6 +47,7 @@ HalCoreInfoType create_tensix_mem_map() { mem_map_bases[static_cast(HalL1MemAddrType::LAUNCH_MSG_BUFFER_RD_PTR)] = GET_MAILBOX_ADDRESS_HOST(launch_msg_rd_ptr); mem_map_bases[static_cast(HalL1MemAddrType::LOCAL)] = MEM_LOCAL_BASE; + mem_map_bases[static_cast(HalL1MemAddrType::BANK_TO_NOC_SCRATCH)] = MEM_BANK_TO_NOC_SCRATCH; std::vector mem_map_sizes; mem_map_sizes.resize(static_cast(HalL1MemAddrType::COUNT)); @@ -63,6 +64,7 @@ HalCoreInfoType create_tensix_mem_map() { mem_map_sizes[static_cast(HalL1MemAddrType::GO_MSG)] = sizeof(go_msg_t); mem_map_sizes[static_cast(HalL1MemAddrType::LAUNCH_MSG_BUFFER_RD_PTR)] = sizeof(std::uint32_t); mem_map_sizes[static_cast(HalL1MemAddrType::LOCAL)] = MEM_TRISC_LOCAL_SIZE; // TRISC, BRISC, or NCRISC? 
+ mem_map_sizes[static_cast(HalL1MemAddrType::BANK_TO_NOC_SCRATCH)] = MEM_BANK_TO_NOC_SIZE; std::vector> processor_classes(NumTensixDispatchClasses); std::vector processor_types; From c71be2d27ced6ad5a54aea91947d1f99ae5693be Mon Sep 17 00:00:00 2001 From: "Jack (Xun) Cai" Date: Mon, 9 Dec 2024 11:33:29 -0500 Subject: [PATCH 06/59] #15816: Fix grid size error for flash decode gqa (#15817) ### Ticket Quick patch for #15816 ### Checklist - [x] Post commit CI passes: https://github.com/tenstorrent/tt-metal/actions/runs/12226827636 --- .../device/sdpa_decode_program_factory.cpp | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/sdpa_decode_program_factory.cpp b/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/sdpa_decode_program_factory.cpp index 93f918ff0929..7c09d0e4de08 100644 --- a/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/sdpa_decode_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/sdpa_decode_program_factory.cpp @@ -132,12 +132,14 @@ operation::ProgramWithCallbacks sdpa_decode_multi_core( // balance the number of cores to use based on batch uint32_t max_num_cores_for_compute = program_config->max_cores_per_head_batch * B * num_kv_heads; uint32_t num_cores_per_batch = std::min(num_cores_available, max_num_cores_for_compute) / B; - uint32_t num_active_cores = num_cores_per_batch * B; //// for core assignment, it is the same whether there's 1 core for head or 1 core for many heads uint32_t num_cores_per_head = std::max((uint32_t)1, num_cores_per_batch / num_kv_heads); - uint32_t num_heads_per_core = std::max((uint32_t)1, num_kv_heads / num_cores_per_batch); + uint32_t num_heads_per_core = std::max((uint32_t)1, (uint32_t)std::ceil((float)num_kv_heads / num_cores_per_batch)); uint32_t num_reducer_cores = num_kv_heads * B / num_heads_per_core; uint32_t num_output_cores = B; + uint32_t num_active_cores = num_cores_per_head * 
num_kv_heads * B / num_heads_per_core; + //// recalculate num_cores_per_batch based on num_active_cores + num_cores_per_batch = num_active_cores / B; TT_FATAL( ((num_cores_per_head >= 1) && (num_heads_per_core == 1)) || @@ -146,10 +148,10 @@ operation::ProgramWithCallbacks sdpa_decode_multi_core( // create core group, assume n batch and k_heads: // this is a 1D list of cores sorted by batch_output1, worker, ..., batch_output2, worker, ..., batch_output n, - // worker, ... Within each batch, we will assign head reducers. e.g. the following mapping: (batch_output1, worker1, - // worker2), (worker3, worker4, worker5), ..., (... worker3*k-1, worker3*k) (head_reducer1, h_worker1, - // h_worker2), (head_reducer2, h_worker1, h_worker2), ..., (head_reducerk, h_worker1, h_worker2) head_reducer2 to - // head_reducerk then send the result to head_reducer1, which is also the batch_output1 + // worker, ... Within each batch, we will assign head reducers. e.g. the following mapping: + // (batch_output1, worker1, worker2), (worker3, worker4, worker5), ..., (... worker3*k-1, worker3*k) + // (head_reducer1, h_worker1, h_worker2), (head_reducer2, h_worker1, h_worker2), ..., (head_reducerk, h_worker1, + // h_worker2) head_reducer2 to head_reducerk then send the result to head_reducer1, which is also the batch_output1 std::vector core_group; std::vector core_group_idle; if (is_q_sharded || is_output_sharded) { From 9bcc739f4a02fb12f2fa2f26a4b6eabdb952ff55 Mon Sep 17 00:00:00 2001 From: Nilaykumar Patel Date: Mon, 9 Dec 2024 22:40:00 +0530 Subject: [PATCH 07/59] Align ttnn version of upsample to torch/tf. (#15580) ### Ticket [Link](https://github.com/tenstorrent/tt-metal/issues/15158) ### Problem description Current upsample interface allows user to pass more than 2 scaling factor. For 2D spacial data that is not necessary. ### What's changed Restrict scale factor to 1 or 2. 
Signed-off-by: Nilaykumar Patel --- .../segformer/tt/ttnn_segformer_decode_head.py | 2 +- models/demos/yolov4/ttnn/neck.py | 4 ++-- .../functional_unet/tt/unet_shallow_ttnn.py | 2 +- tests/ttnn/sweep_tests/sweeps/sweeps/upsample.py | 3 +-- tests/ttnn/unit_tests/operations/test_upsample.py | 8 +++----- .../ttnn/operations/pool/upsample/upsample.cpp | 15 +-------------- .../ttnn/operations/pool/upsample/upsample.hpp | 2 +- .../operations/pool/upsample/upsample_pybind.cpp | 2 +- 8 files changed, 11 insertions(+), 27 deletions(-) diff --git a/models/demos/segformer/tt/ttnn_segformer_decode_head.py b/models/demos/segformer/tt/ttnn_segformer_decode_head.py index 6aed216c5787..4be9a957a8ce 100644 --- a/models/demos/segformer/tt/ttnn_segformer_decode_head.py +++ b/models/demos/segformer/tt/ttnn_segformer_decode_head.py @@ -78,7 +78,7 @@ def __call__(self, encoder_hidden_states: ttnn.bfloat8_b, parameters) -> ttnn.Te encoder_hidden_state = ttnn.upsample( encoder_hidden_state, - scale_factor=(128 // encoder_hidden_state.shape[2], 128 // encoder_hidden_state.shape[2], 1), + scale_factor=(128 // encoder_hidden_state.shape[2], 128 // encoder_hidden_state.shape[2]), mode="bilinear", ) diff --git a/models/demos/yolov4/ttnn/neck.py b/models/demos/yolov4/ttnn/neck.py index f7e3b5412785..d86d9faa5272 100644 --- a/models/demos/yolov4/ttnn/neck.py +++ b/models/demos/yolov4/ttnn/neck.py @@ -262,7 +262,7 @@ def __call__(self, device, input_tensor): ttnn.TensorMemoryLayout.BLOCK_SHARDED, ttnn.types.BufferType.L1, shard_spec ) - output_tensor_upsample_1 = ttnn.upsample(output_tensor, (2, 2, 1), memory_config=out_sharded_mem_config) + output_tensor_upsample_1 = ttnn.upsample(output_tensor, (2, 2), memory_config=out_sharded_mem_config) output_tensor_upsample_1 = ttnn.sharded_to_interleaved(output_tensor_upsample_1, ttnn.L1_MEMORY_CONFIG) output_tensor_upsample_1 = ttnn.reshape(output_tensor_upsample_1, (1, 1, 400, 256)) output_tensor_upsample_1 = ttnn.to_layout(output_tensor_upsample_1, 
layout=ttnn.TILE_LAYOUT) @@ -336,7 +336,7 @@ def __call__(self, device, input_tensor): ttnn.TensorMemoryLayout.BLOCK_SHARDED, ttnn.types.BufferType.L1, shard_spec ) - output_tensor_upsample_2 = ttnn.upsample(output_tensor, (2, 2, 1), memory_config=out_sharded_mem_config) + output_tensor_upsample_2 = ttnn.upsample(output_tensor, (2, 2), memory_config=out_sharded_mem_config) output_tensor_upsample_2 = ttnn.sharded_to_interleaved(output_tensor_upsample_2, ttnn.L1_MEMORY_CONFIG) output_tensor_upsample_2 = ttnn.reshape(output_tensor_upsample_2, (1, 1, 1600, 128)) output_tensor_upsample_2 = ttnn.to_layout(output_tensor_upsample_2, ttnn.TILE_LAYOUT) diff --git a/models/experimental/functional_unet/tt/unet_shallow_ttnn.py b/models/experimental/functional_unet/tt/unet_shallow_ttnn.py index fb087ad92795..95eff2e07764 100644 --- a/models/experimental/functional_unet/tt/unet_shallow_ttnn.py +++ b/models/experimental/functional_unet/tt/unet_shallow_ttnn.py @@ -262,7 +262,7 @@ def upsample(self, x): else: x = ttnn.interleaved_to_sharded(x, shardspec) - x = ttnn.upsample(x, (2, 2, 1), memory_config=x.memory_config()) + x = ttnn.upsample(x, (2, 2), memory_config=x.memory_config()) x = ttnn.reshape( x, (1, 1, self.conv1.batch_size * self.conv1.input_height * self.conv1.input_width, x.shape[-1]) ) diff --git a/tests/ttnn/sweep_tests/sweeps/sweeps/upsample.py b/tests/ttnn/sweep_tests/sweeps/sweeps/upsample.py index 0b752e7cef29..88ae18ccd9c2 100644 --- a/tests/ttnn/sweep_tests/sweeps/sweeps/upsample.py +++ b/tests/ttnn/sweep_tests/sweeps/sweeps/upsample.py @@ -37,8 +37,7 @@ def run( torch_result = m(tt_input) torch_result = torch_result.permute(0, 2, 3, 1) - ## ttnn uses NHWC, so need to set scale_factor_c = 1 - scale_factor = (scale_h, scale_w, 1) + scale_factor = (scale_h, scale_w) input_tensor = ttnn.from_torch(input, device=device) output_tensor = ttnn.upsample(input_tensor, scale_factor) output_tensor = ttnn.to_torch(output_tensor) diff --git 
a/tests/ttnn/unit_tests/operations/test_upsample.py b/tests/ttnn/unit_tests/operations/test_upsample.py index 86047a865814..e4a8846e3fc0 100644 --- a/tests/ttnn/unit_tests/operations/test_upsample.py +++ b/tests/ttnn/unit_tests/operations/test_upsample.py @@ -83,7 +83,7 @@ def test_upsample_single_core(device, input_shapes, scale_h, scale_w): torch_result = torch_result.permute(0, 2, 3, 1) ## ttnn uses NHWC, so need to set scale_factor_c = 1 - scale_factor = (scale_h, scale_w, 1) + scale_factor = (scale_h, scale_w) input_tensor = ttnn.from_torch(input, device=device) output_tensor = ttnn.upsample(input_tensor, scale_factor) output_tensor = ttnn.to_torch(output_tensor) @@ -204,8 +204,7 @@ def test_upsample_multi_core(device, input_shape, scale_h, scale_w, shard_strate print(f"in_shard_mem_config: {in_sharded_mem_config}") print(f"out_shard_mem_config: {out_sharded_mem_config}") - ## ttnn uses NHWC, so need to set scale_factor_c = 1 - scale_factor = (scale_h, scale_w, 1) + scale_factor = (scale_h, scale_w) input_tensor = ttnn.from_torch(tt_input, device=device, memory_config=ttnn.L1_MEMORY_CONFIG) input_tensor = ttnn.to_memory_config(input_tensor, memory_config=in_sharded_mem_config) output_tensor = ttnn.upsample(input_tensor, scale_factor, memory_config=out_sharded_mem_config) @@ -337,8 +336,7 @@ def test_bilinear_multi_core( logger.debug(f"in_shard_mem_config: {in_sharded_mem_config}") logger.debug(f"out_shard_mem_config: {out_sharded_mem_config}") - ## ttnn uses NHWC, so need to set scale_factor_c = 1 - scale_factor = (scale_h, scale_w, 1) + scale_factor = (scale_h, scale_w) input_tensor = ttnn.from_torch(tt_input, device=device) input_tensor = ttnn.to_memory_config(input_tensor, memory_config=in_sharded_mem_config) output_tensor = ttnn.upsample( diff --git a/ttnn/cpp/ttnn/operations/pool/upsample/upsample.cpp b/ttnn/cpp/ttnn/operations/pool/upsample/upsample.cpp index dbd111f0358d..576a237a7db2 100644 --- a/ttnn/cpp/ttnn/operations/pool/upsample/upsample.cpp +++ 
b/ttnn/cpp/ttnn/operations/pool/upsample/upsample.cpp @@ -10,7 +10,7 @@ namespace ttnn::operations::upsample { ttnn::Tensor ExecuteUpSample::invoke( const ttnn::Tensor& input_tensor, - std::variant scale_factor, + std::variant scale_factor, const std::string& mode, const std::optional& output_mem_config, const std::optional& compute_kernel_config) { @@ -27,21 +27,8 @@ ttnn::Tensor ExecuteUpSample::invoke( scale_h = sf; scale_w = sf; } else if constexpr (std::is_same_v) { - scale_w = sf.at(0); - int scale_c = sf.at(1); - TT_FATAL(scale_c == 1, "Error"); - } else if constexpr (std::is_same_v) { scale_h = sf.at(0); scale_w = sf.at(1); - int scale_c = sf.at(2); - TT_FATAL(scale_c == 1, "Error"); - } else if constexpr (std::is_same_v) { - int scale_n = sf.at(0); - scale_h = sf.at(1); - scale_w = sf.at(2); - int scale_c = sf.at(3); - TT_FATAL(scale_n == 1, "Error"); - TT_FATAL(scale_c == 1, "Error"); } else { // static_assert(false, "Unsupported scale factor"); static_assert(sizeof(T) != 0, "Type check failed."); diff --git a/ttnn/cpp/ttnn/operations/pool/upsample/upsample.hpp b/ttnn/cpp/ttnn/operations/pool/upsample/upsample.hpp index e8bd68e634af..0a012304548e 100644 --- a/ttnn/cpp/ttnn/operations/pool/upsample/upsample.hpp +++ b/ttnn/cpp/ttnn/operations/pool/upsample/upsample.hpp @@ -15,7 +15,7 @@ namespace upsample { struct ExecuteUpSample { static ttnn::Tensor invoke( const ttnn::Tensor& input_tensor, - std::variant scale_factor, + std::variant scale_factor, const std::string& mode = std::string("nearest"), const std::optional& output_mem_config = std::nullopt, const std::optional& compute_kernel_config = std::nullopt); diff --git a/ttnn/cpp/ttnn/operations/pool/upsample/upsample_pybind.cpp b/ttnn/cpp/ttnn/operations/pool/upsample/upsample_pybind.cpp index 93d4137cd70a..06c72788e1ff 100644 --- a/ttnn/cpp/ttnn/operations/pool/upsample/upsample_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/pool/upsample/upsample_pybind.cpp @@ -25,7 +25,7 @@ void bind_upsample(py::module& 
module) { Args: input_tensor (ttnn.Tensor): the input tensor. - scale_factor (int or tt::tt_metal::Array2D or tt::tt_metal::Array3D or tt::tt_metal::Array4D): multiplier for spatial size. Has to match input size if it is a tuple. + scale_factor (int or tt::tt_metal::Array2D): multiplier for spatial size. Keyword args: From 4b6e84472d5fede53427b9302697e71b4b6877bc Mon Sep 17 00:00:00 2001 From: Evan Smal Date: Thu, 5 Dec 2024 16:15:39 +0000 Subject: [PATCH 08/59] Clean up Stable Diffusion tests and CI jobs Refactor the Stable Diffusion test suite and CI jobs to improve model stability and maintainability. - Move all Stable Diffusion tests into the model directory for better organization. - Remove unnecessary postfixes from test filenames. - Delete dead test code, including tests unrelated to SD modules and code for deprecated SD variants using 224x224 input shapes. - Re-enable previously broken tests in CI. - Update the CODEOWNERS file to reflect moved/deleted directories and files. --- ...atch-full-regressions-and-models-impl.yaml | 4 +- CODEOWNERS | 4 - .../test_multiple_iterations.py | 236 ----- .../tests}/test_basic_transformer_block.py | 0 .../tests}/test_cross_attention.py | 0 .../tests/test_cross_attn_up_block_2d.py | 0 .../stable_diffusion/tests}/test_demo.py | 0 .../stable_diffusion/tests}/test_embedding.py | 0 .../tests}/test_feedforward.py | 0 .../stable_diffusion/tests}/test_geglu.py | 0 .../stable_diffusion/tests/test_perf.py | 0 .../tests/test_resnet_block_2d.py | 84 -- .../tests}/test_sharded_matmuls.py | 0 .../tests/test_transformer_2d_model.py | 0 .../tests/test_unet_2d_condition_model.py | 3 +- .../stable_diffusion/tests/test_upblock_2d.py | 0 .../tests/test_upsample_2d.py | 0 .../tests}/test_upsample_nearest_2d.py | 0 .../test_basic_transformer_block.py | 1 + .../stable_diffusion/test_cross_attention.py | 1 + .../test_cross_attn_up_block_2d.py | 1 + .../single_card/stable_diffusion/test_demo.py | 1 + .../stable_diffusion/test_embedding.py | 1 + 
.../stable_diffusion/test_feedforward.py | 1 + .../stable_diffusion/test_geglu.py | 1 + .../stable_diffusion/test_resnet_block_2d.py | 1 + .../stable_diffusion/test_sharded_matmuls.py | 1 + .../test_transformer_2d_model.py | 1 + .../test_unet_2d_condition_model.py | 1 + .../stable_diffusion/test_upblock_2d.py | 1 + .../stable_diffusion/test_upsample_2d.py | 1 + .../test_upsample_nearest_2d.py | 1 + .../ttnn/integration_tests/stable_diffusion | 1 - tests/scripts/run_performance.sh | 4 +- .../test_sharded_attention.py | 966 ------------------ 35 files changed, 19 insertions(+), 1297 deletions(-) delete mode 100644 models/demos/wormhole/stable_diffusion/test_multiple_iterations.py rename {tests/ttnn/integration_tests/stable_diffusion => models/demos/wormhole/stable_diffusion/tests}/test_basic_transformer_block.py (100%) rename {tests/ttnn/integration_tests/stable_diffusion => models/demos/wormhole/stable_diffusion/tests}/test_cross_attention.py (100%) rename tests/ttnn/integration_tests/stable_diffusion/test_cross_attn_up_block_2d_new_conv.py => models/demos/wormhole/stable_diffusion/tests/test_cross_attn_up_block_2d.py (100%) rename {tests/ttnn/integration_tests/stable_diffusion => models/demos/wormhole/stable_diffusion/tests}/test_demo.py (100%) rename {tests/ttnn/integration_tests/stable_diffusion => models/demos/wormhole/stable_diffusion/tests}/test_embedding.py (100%) rename {tests/ttnn/integration_tests/stable_diffusion => models/demos/wormhole/stable_diffusion/tests}/test_feedforward.py (100%) rename {tests/ttnn/integration_tests/stable_diffusion => models/demos/wormhole/stable_diffusion/tests}/test_geglu.py (100%) rename tests/device_perf_tests/stable_diffusion/test_perf_stable_diffusion.py => models/demos/wormhole/stable_diffusion/tests/test_perf.py (100%) rename tests/ttnn/integration_tests/stable_diffusion/test_resnet_block_2d_new_conv.py => models/demos/wormhole/stable_diffusion/tests/test_resnet_block_2d.py (66%) rename 
{tests/ttnn/integration_tests/stable_diffusion => models/demos/wormhole/stable_diffusion/tests}/test_sharded_matmuls.py (100%) rename tests/ttnn/integration_tests/stable_diffusion/test_transformer_2d_model_new_conv.py => models/demos/wormhole/stable_diffusion/tests/test_transformer_2d_model.py (100%) rename tests/ttnn/integration_tests/stable_diffusion/test_unet_2d_condition_model_new_conv.py => models/demos/wormhole/stable_diffusion/tests/test_unet_2d_condition_model.py (98%) rename tests/ttnn/integration_tests/stable_diffusion/test_upblock_2d_new_conv.py => models/demos/wormhole/stable_diffusion/tests/test_upblock_2d.py (100%) rename tests/ttnn/integration_tests/stable_diffusion/test_upsample_2d_new_conv.py => models/demos/wormhole/stable_diffusion/tests/test_upsample_2d.py (100%) rename {tests/ttnn/integration_tests/stable_diffusion => models/demos/wormhole/stable_diffusion/tests}/test_upsample_nearest_2d.py (100%) create mode 120000 tests/nightly/single_card/stable_diffusion/test_basic_transformer_block.py create mode 120000 tests/nightly/single_card/stable_diffusion/test_cross_attention.py create mode 120000 tests/nightly/single_card/stable_diffusion/test_cross_attn_up_block_2d.py create mode 120000 tests/nightly/single_card/stable_diffusion/test_demo.py create mode 120000 tests/nightly/single_card/stable_diffusion/test_embedding.py create mode 120000 tests/nightly/single_card/stable_diffusion/test_feedforward.py create mode 120000 tests/nightly/single_card/stable_diffusion/test_geglu.py create mode 120000 tests/nightly/single_card/stable_diffusion/test_resnet_block_2d.py create mode 120000 tests/nightly/single_card/stable_diffusion/test_sharded_matmuls.py create mode 120000 tests/nightly/single_card/stable_diffusion/test_transformer_2d_model.py create mode 120000 tests/nightly/single_card/stable_diffusion/test_unet_2d_condition_model.py create mode 120000 tests/nightly/single_card/stable_diffusion/test_upblock_2d.py create mode 120000 
tests/nightly/single_card/stable_diffusion/test_upsample_2d.py create mode 120000 tests/nightly/single_card/stable_diffusion/test_upsample_nearest_2d.py delete mode 120000 tests/nightly/single_card/wh_b0_unstable/tests/ttnn/integration_tests/stable_diffusion delete mode 100644 tests/ttnn/integration_tests/stable_diffusion/test_sharded_attention.py diff --git a/.github/workflows/fast-dispatch-full-regressions-and-models-impl.yaml b/.github/workflows/fast-dispatch-full-regressions-and-models-impl.yaml index 762324fb3a34..0af646345b18 100644 --- a/.github/workflows/fast-dispatch-full-regressions-and-models-impl.yaml +++ b/.github/workflows/fast-dispatch-full-regressions-and-models-impl.yaml @@ -149,8 +149,8 @@ jobs: fail-fast: false matrix: test-config: - - model: "wh_b0_unstable" - cmd: ./tests/scripts/single_card/nightly/run_wh_b0_unstable.sh + - model: "stable_diffusion" + cmd: pytest --timeout 900 -n auto tests/nightly/single_card/stable_diffusion - model: "mamba 1" cmd: pytest --timeout 900 -n auto tests/nightly/single_card/mamba --splits 6 --group 1 - model: "mamba 2" diff --git a/CODEOWNERS b/CODEOWNERS index aa80b7671c43..3b74d00a0470 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -173,10 +173,6 @@ tests/**/dtx/ @mywoodstock @sankarmanoj-tt tests/**/*test*conv*.py @mywoodstock @sankarmanoj-tt tests/python_api_testing/conv/ @mywoodstock @sankarmanoj-tt tests/python_api_testing/unit_testing/fallback_ops @tt-aho -tests/ttnn/integration_tests/stable_diffusion @esmalTT @uaydonat @mywoodstock -tests/device_perf_tests/stable_diffusion/test_perf_stable_diffusion.py @esmalTT @uaydonat @mywoodstock -tests/ttnn/integration_tests/unet @esmalTT @uaydonat @mywoodstock -tests/nightly/wh_b0_only_eth/experimental/functional_unet @esmalTT @uaydonat @mywoodstock scripts/profiler/ @mo-tenstorrent scripts/docker @tenstorrent/metalium-developers-infra diff --git a/models/demos/wormhole/stable_diffusion/test_multiple_iterations.py 
b/models/demos/wormhole/stable_diffusion/test_multiple_iterations.py deleted file mode 100644 index 8db6aee6f39e..000000000000 --- a/models/demos/wormhole/stable_diffusion/test_multiple_iterations.py +++ /dev/null @@ -1,236 +0,0 @@ -# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. - -# SPDX-License-Identifier: Apache-2.0 - -import ttnn -import json -import torch -import pytest -import numpy as np -from PIL import Image -from loguru import logger -from tqdm.auto import tqdm -from datasets import load_dataset - -from transformers import CLIPTextModel, CLIPTokenizer -from diffusers import ( - AutoencoderKL, - UNet2DConditionModel, - LMSDiscreteScheduler, -) -from models.utility_functions import ( - comp_allclose_and_pcc, - enable_persistent_kernel_cache, - disable_persistent_kernel_cache, -) -from models.utility_functions import skip_for_wormhole_b0 -from ttnn.model_preprocessing import preprocess_model_parameters -from models.demos.wormhole.stable_diffusion.custom_preprocessing import custom_preprocessor -from models.demos.wormhole.stable_diffusion.tt.ttnn_functional_unet_2d_condition_model import ( - UNet2DConditionModel as UNet2D, -) - -from torchvision.transforms import ToTensor - - -def load_inputs(input_path): - with open(input_path) as f: - input_data = json.load(f) - assert input_data, "Input data is empty." 
- prompt = [item["prompt"] for item in input_data] - return prompt - - -def constant_prop_time_embeddings(timesteps, sample, time_proj): - timesteps = timesteps[None] - timesteps = timesteps.expand(sample.shape[0]) - t_emb = time_proj(timesteps) - return t_emb - - -def save_image_and_latents(latents, iter, vae, pre_fix="", pre_fix2=""): - pre_fix = "" if pre_fix == "" else f"{pre_fix}_" - pre_fix2 = "" if pre_fix2 == "" else f"{pre_fix2}_" - _latents = 1 / 0.18215 * latents - - with torch.no_grad(): - image = vae.decode(_latents).sample - # Image post-processing - image = (image / 2 + 0.5).clamp(0, 1) - image = image.detach().cpu().permute(0, 2, 3, 1).numpy() - images = (image * 255).round().astype("uint8") - pil_images = [Image.fromarray(image) for image in images][0] - pil_images.save(f"{pre_fix}{pre_fix2}image_iter_{iter}.png") - - -def guide(noise_pred, guidance_scale, t): # will return latents - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - return noise_pred - - -def latent_expansion(latents, scheduler, t): - latent_model_input = torch.cat([latents] * 2, dim=0) - latent_model_input = scheduler.scale_model_input(latent_model_input, timestep=t) - return latent_model_input - - -def calculate_fid_score(imgs_path1, imgs_path2): - fid = FrechetInceptionDistance(normalize=True) - fid.update(imgs_path1, real=False) - fid.update(imgs_path2, real=True) - return fid.compute() - - -def preprocess_images(image_paths): - images = [] - for image_path in image_paths: - image = Image.open(image_path) - image = image.convert("RGB") - image = image.resize((299, 299)) - image = ToTensor()(image) - images.append(image) - return torch.stack(images) - - -def run_demo_inference_diffusiondb(device, reset_seeds, input_path, num_inference_steps, image_size): - disable_persistent_kernel_cache() - - height, width = image_size - - experiment_name = f"diffusiondb_{height}x{width}" - 
input_prompt = [ - "oil painting frame of Breathtaking mountain range with a clear river running through it, surrounded by tall trees and misty clouds, serene, peaceful, mountain landscape, high detail" - ] - logger.info(f"input_prompts: {input_prompt}") - - # 1. Load the autoencoder model which will be used to decode the latents into image space. - vae = AutoencoderKL.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="vae") - - # 2. Load the tokenizer and text encoder to tokenize and encode the text. - tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14") - text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14") - - # 3. The UNet model for generating the latents. - unet = UNet2DConditionModel.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="unet") - - # 4. load the K-LMS scheduler with some fitting parameters. - ttnn_scheduler = LMSDiscreteScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - num_train_timesteps=1000, - ) - - torch_device = "cpu" - vae.to(torch_device) - text_encoder.to(torch_device) - unet.to(torch_device) - - guidance_scale = 7.5 # Scale for classifier-free guidance - generator = torch.manual_seed(174) # 10233 Seed generator to create the inital latent noise - batch_size = len(input_prompt) - - ## First, we get the text_embeddings for the prompt. These embeddings will be used to condition the UNet model. 
- # Tokenizer and Text Encoder - text_input = tokenizer( - input_prompt, - padding="max_length", - max_length=tokenizer.model_max_length, - truncation=True, - return_tensors="pt", - ) - text_embeddings = text_encoder(text_input.input_ids.to(torch_device))[0] - max_length = text_input.input_ids.shape[-1] - uncond_input = tokenizer([""] * batch_size, padding="max_length", max_length=max_length, return_tensors="pt") - uncond_embeddings = text_encoder(uncond_input.input_ids.to(torch_device))[0] - - # For classifier-free guidance, we need to do two forward passes: one with the conditioned input (text_embeddings), - # and another with the unconditional embeddings (uncond_embeddings). - # In practice, we can concatenate both into a single batch to avoid doing two forward passes. - text_embeddings = torch.cat([uncond_embeddings, text_embeddings]) - ttnn_text_embeddings = ttnn.from_torch(text_embeddings, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device) - - vae_scale_factor = 2 ** (len(vae.config.block_out_channels) - 1) - # Initial random noise - latents = torch.randn( - (batch_size, unet.config.in_channels, height // vae_scale_factor, width // vae_scale_factor), - generator=generator, - ) - latents = latents.to(torch_device) - - ttnn_scheduler.set_timesteps(num_inference_steps) - - latents = latents * ttnn_scheduler.init_noise_sigma - ttnn_latents = torch.tensor(latents) - - iter = 0 - config = unet.config - - parameters = preprocess_model_parameters( - initialize_model=lambda: unet, custom_preprocessor=custom_preprocessor, device=device - ) - input_height = 64 - input_width = 64 - reader_patterns_cache = {} if height == 512 and width == 512 else None - - model = UNet2D(device, parameters, 2, input_height, input_width, reader_patterns_cache) - # # Denoising loop - for t in tqdm(ttnn_scheduler.timesteps): - # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes. 
- ttnn_latent_model_input = latent_expansion(ttnn_latents, ttnn_scheduler, t) - ttnn_latent_model_input = ttnn.from_torch( - ttnn_latent_model_input, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device - ) - - _t = constant_prop_time_embeddings(t, ttnn_latent_model_input, unet.time_proj) - _t = _t.unsqueeze(0).unsqueeze(0) - _t = ttnn.from_torch(_t, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device) - - # predict the noise residual - with torch.no_grad(): - ttnn_output = model( - ttnn_latent_model_input, # input - timestep=_t, - encoder_hidden_states=ttnn_text_embeddings, - class_labels=None, - attention_mask=None, - cross_attention_kwargs=None, - return_dict=True, - config=config, - ) - noise_pred = ttnn.to_torch(ttnn_output) - - # perform guidance - noise_pred = guide(noise_pred, guidance_scale, t) - - ttnn_latents = ttnn_scheduler.step(noise_pred, t, ttnn_latents).prev_sample - save_image_and_latents(ttnn_latents, iter, vae, pre_fix=f"{experiment_name}_tt", pre_fix2="") - - iter += 1 - enable_persistent_kernel_cache() - - latents = ttnn_latents - # scale and decode the image latents with vae - latents = 1 / 0.18215 * latents - with torch.no_grad(): - image = vae.decode(latents).sample - - # Image post-processing - image = (image / 2 + 0.5).clamp(0, 1) - image = image.detach().cpu().permute(0, 2, 3, 1).numpy() - images = (image * 255).round().astype("uint8") - pil_images = [Image.fromarray(image) for image in images][0] - ttnn_output_path = f"{experiment_name}_ttnn.png" - pil_images.save(ttnn_output_path) - - ref_paths = [ref_img_path, ref_img_path] - ttnn_paths = [ttnn_output_path, ttnn_output_path] - - ref_images = preprocess_images(ref_paths) - ttnn_images = preprocess_images(ttnn_paths) - - -def test_tt2_multiple_iteration(device, reset_seeds, input_path): - # 30 iterations, generate 512x512 image - return run_demo_inference_diffusiondb(device, reset_seeds, input_path, 30, (512, 512)) diff --git 
a/tests/ttnn/integration_tests/stable_diffusion/test_basic_transformer_block.py b/models/demos/wormhole/stable_diffusion/tests/test_basic_transformer_block.py similarity index 100% rename from tests/ttnn/integration_tests/stable_diffusion/test_basic_transformer_block.py rename to models/demos/wormhole/stable_diffusion/tests/test_basic_transformer_block.py diff --git a/tests/ttnn/integration_tests/stable_diffusion/test_cross_attention.py b/models/demos/wormhole/stable_diffusion/tests/test_cross_attention.py similarity index 100% rename from tests/ttnn/integration_tests/stable_diffusion/test_cross_attention.py rename to models/demos/wormhole/stable_diffusion/tests/test_cross_attention.py diff --git a/tests/ttnn/integration_tests/stable_diffusion/test_cross_attn_up_block_2d_new_conv.py b/models/demos/wormhole/stable_diffusion/tests/test_cross_attn_up_block_2d.py similarity index 100% rename from tests/ttnn/integration_tests/stable_diffusion/test_cross_attn_up_block_2d_new_conv.py rename to models/demos/wormhole/stable_diffusion/tests/test_cross_attn_up_block_2d.py diff --git a/tests/ttnn/integration_tests/stable_diffusion/test_demo.py b/models/demos/wormhole/stable_diffusion/tests/test_demo.py similarity index 100% rename from tests/ttnn/integration_tests/stable_diffusion/test_demo.py rename to models/demos/wormhole/stable_diffusion/tests/test_demo.py diff --git a/tests/ttnn/integration_tests/stable_diffusion/test_embedding.py b/models/demos/wormhole/stable_diffusion/tests/test_embedding.py similarity index 100% rename from tests/ttnn/integration_tests/stable_diffusion/test_embedding.py rename to models/demos/wormhole/stable_diffusion/tests/test_embedding.py diff --git a/tests/ttnn/integration_tests/stable_diffusion/test_feedforward.py b/models/demos/wormhole/stable_diffusion/tests/test_feedforward.py similarity index 100% rename from tests/ttnn/integration_tests/stable_diffusion/test_feedforward.py rename to 
models/demos/wormhole/stable_diffusion/tests/test_feedforward.py diff --git a/tests/ttnn/integration_tests/stable_diffusion/test_geglu.py b/models/demos/wormhole/stable_diffusion/tests/test_geglu.py similarity index 100% rename from tests/ttnn/integration_tests/stable_diffusion/test_geglu.py rename to models/demos/wormhole/stable_diffusion/tests/test_geglu.py diff --git a/tests/device_perf_tests/stable_diffusion/test_perf_stable_diffusion.py b/models/demos/wormhole/stable_diffusion/tests/test_perf.py similarity index 100% rename from tests/device_perf_tests/stable_diffusion/test_perf_stable_diffusion.py rename to models/demos/wormhole/stable_diffusion/tests/test_perf.py diff --git a/tests/ttnn/integration_tests/stable_diffusion/test_resnet_block_2d_new_conv.py b/models/demos/wormhole/stable_diffusion/tests/test_resnet_block_2d.py similarity index 66% rename from tests/ttnn/integration_tests/stable_diffusion/test_resnet_block_2d_new_conv.py rename to models/demos/wormhole/stable_diffusion/tests/test_resnet_block_2d.py index 51afb5afd0d2..91a0f3755e51 100644 --- a/tests/ttnn/integration_tests/stable_diffusion/test_resnet_block_2d_new_conv.py +++ b/models/demos/wormhole/stable_diffusion/tests/test_resnet_block_2d.py @@ -25,90 +25,6 @@ def ttnn_to_torch(input): return input -@skip_for_grayskull() -@pytest.mark.parametrize( - "batch_size, in_channels, input_height, input_width, index1,index2,block_name,out_channels", - [ - (2, 320, 32, 32, 0, 0, "down", None), - (2, 320, 16, 16, 0, 0, "down", None), - (2, 640, 16, 16, 1, 1, "down", None), - (2, 640, 8, 8, 1, 1, "down", None), - (2, 1280, 8, 8, 2, 1, "down", None), - (2, 1280, 4, 4, 2, 1, "down", None), - (2, 2560, 4, 4, 0, 0, "up", 1280), - (2, 2560, 8, 8, 0, 0, "up", 1280), - (2, 1920, 8, 8, 2, 0, "up", 640), - (2, 1920, 16, 16, 2, 0, "up", 640), - (2, 1280, 16, 16, 3, 0, "down", None), - (2, 960, 16, 16, 3, 0, "up", 320), - (2, 960, 32, 32, 3, 0, "up", 320), - (2, 640, 32, 32, 3, 1, "up", 320), - ], -) -def 
test_resnet_block_2d_256x256( - device, batch_size, in_channels, input_height, input_width, index1, index2, block_name, out_channels -): - pytest.skip() - # setup pytorch model - model_name = "CompVis/stable-diffusion-v1-4" - pipe = StableDiffusionPipeline.from_pretrained(model_name, torch_dtype=torch.float32) - - model = pipe.unet - model.eval() - - parameters = preprocess_model_parameters( - model_name=model_name, initialize_model=lambda: model, custom_preprocessor=custom_preprocessor, device=device - ) - - if block_name == "up": - parameters = parameters.up_blocks[index1].resnets[index2] - resnet = pipe.unet.up_blocks[index1].resnets[index2] - elif block_name == "down": - parameters = parameters.down_blocks[index1].resnets[index2] - resnet = pipe.unet.down_blocks[index1].resnets[index2] - else: - parameters = parameters.mid_block.resnets[index2] - resnet = pipe.unet.mid_block.resnets[index2] - - ############ start of residual block ############# - temb_channels = 1280 - groups = 32 - time_embedding_norm = "default" - output_scale_factor = 1 - use_in_shortcut = None - ########## end of residual block ############# - hidden_states_shape = [batch_size, in_channels, input_height, input_width] - temb_shape = [1, 1, 2, 1280] - - input = torch.randn(hidden_states_shape) - temb = torch.randn(temb_shape) - - torch_output = resnet(input, temb.squeeze(0).squeeze(0)) - - input = ttnn.from_torch(input, ttnn.bfloat16) - input = ttnn.to_layout(input, ttnn.TILE_LAYOUT) - input = ttnn.to_device(input, device, memory_config=ttnn.L1_MEMORY_CONFIG) - - temb = ttnn.from_torch(temb, ttnn.bfloat16) - temb = ttnn.to_layout(temb, ttnn.TILE_LAYOUT) - temb = ttnn.to_device(temb, device, memory_config=ttnn.L1_MEMORY_CONFIG) - ttnn_output = resnetBlock2D( - input, - temb=temb, - temb_channels=temb_channels, - time_embedding_norm=time_embedding_norm, - in_channels=in_channels, - out_channels=out_channels, - use_in_shortcut=use_in_shortcut, - groups=groups, - 
output_scale_factor=output_scale_factor, - parameters=parameters, - device=device, - ) - ttnn_output = ttnn_to_torch(ttnn_output) - assert_with_pcc(torch_output, ttnn_output, pcc=0.99) - - @skip_for_grayskull() @pytest.mark.parametrize("device_params", [{"l1_small_size": 32768}], indirect=True) @pytest.mark.parametrize( diff --git a/tests/ttnn/integration_tests/stable_diffusion/test_sharded_matmuls.py b/models/demos/wormhole/stable_diffusion/tests/test_sharded_matmuls.py similarity index 100% rename from tests/ttnn/integration_tests/stable_diffusion/test_sharded_matmuls.py rename to models/demos/wormhole/stable_diffusion/tests/test_sharded_matmuls.py diff --git a/tests/ttnn/integration_tests/stable_diffusion/test_transformer_2d_model_new_conv.py b/models/demos/wormhole/stable_diffusion/tests/test_transformer_2d_model.py similarity index 100% rename from tests/ttnn/integration_tests/stable_diffusion/test_transformer_2d_model_new_conv.py rename to models/demos/wormhole/stable_diffusion/tests/test_transformer_2d_model.py diff --git a/tests/ttnn/integration_tests/stable_diffusion/test_unet_2d_condition_model_new_conv.py b/models/demos/wormhole/stable_diffusion/tests/test_unet_2d_condition_model.py similarity index 98% rename from tests/ttnn/integration_tests/stable_diffusion/test_unet_2d_condition_model_new_conv.py rename to models/demos/wormhole/stable_diffusion/tests/test_unet_2d_condition_model.py index 35b1253ea540..72efdb4e178e 100644 --- a/tests/ttnn/integration_tests/stable_diffusion/test_unet_2d_condition_model_new_conv.py +++ b/models/demos/wormhole/stable_diffusion/tests/test_unet_2d_condition_model.py @@ -63,7 +63,6 @@ def unsqueeze_all_params_to_4d(params): @skip_for_grayskull() -@pytest.mark.skipif(is_wormhole_b0() or is_blackhole(), reason="#10923: CB / L1 buffer clash") @pytest.mark.parametrize( "device_params", [{"l1_small_size": 32768}], ids=["device_params=l1_small_size_24576"], indirect=True ) @@ -204,7 +203,7 @@ def 
test_unet_2d_condition_model_512x512(device, batch_size, in_channels, input_ # print(iter) # print(f"Time taken for 50 iterations: {total_time}") # print(f"Samples per second: {50 / total_time}") - passing, output = comp_pcc(torch_output, ttnn_output, pcc=0.99) + passing, output = comp_pcc(torch_output, ttnn_output, pcc=0.981) print(output) assert passing diff --git a/tests/ttnn/integration_tests/stable_diffusion/test_upblock_2d_new_conv.py b/models/demos/wormhole/stable_diffusion/tests/test_upblock_2d.py similarity index 100% rename from tests/ttnn/integration_tests/stable_diffusion/test_upblock_2d_new_conv.py rename to models/demos/wormhole/stable_diffusion/tests/test_upblock_2d.py diff --git a/tests/ttnn/integration_tests/stable_diffusion/test_upsample_2d_new_conv.py b/models/demos/wormhole/stable_diffusion/tests/test_upsample_2d.py similarity index 100% rename from tests/ttnn/integration_tests/stable_diffusion/test_upsample_2d_new_conv.py rename to models/demos/wormhole/stable_diffusion/tests/test_upsample_2d.py diff --git a/tests/ttnn/integration_tests/stable_diffusion/test_upsample_nearest_2d.py b/models/demos/wormhole/stable_diffusion/tests/test_upsample_nearest_2d.py similarity index 100% rename from tests/ttnn/integration_tests/stable_diffusion/test_upsample_nearest_2d.py rename to models/demos/wormhole/stable_diffusion/tests/test_upsample_nearest_2d.py diff --git a/tests/nightly/single_card/stable_diffusion/test_basic_transformer_block.py b/tests/nightly/single_card/stable_diffusion/test_basic_transformer_block.py new file mode 120000 index 000000000000..61408ffa9e72 --- /dev/null +++ b/tests/nightly/single_card/stable_diffusion/test_basic_transformer_block.py @@ -0,0 +1 @@ +../../../../models/demos/wormhole/stable_diffusion/tests/test_basic_transformer_block.py \ No newline at end of file diff --git a/tests/nightly/single_card/stable_diffusion/test_cross_attention.py b/tests/nightly/single_card/stable_diffusion/test_cross_attention.py new file mode 
120000 index 000000000000..c161012b8867 --- /dev/null +++ b/tests/nightly/single_card/stable_diffusion/test_cross_attention.py @@ -0,0 +1 @@ +../../../../models/demos/wormhole/stable_diffusion/tests/test_cross_attention.py \ No newline at end of file diff --git a/tests/nightly/single_card/stable_diffusion/test_cross_attn_up_block_2d.py b/tests/nightly/single_card/stable_diffusion/test_cross_attn_up_block_2d.py new file mode 120000 index 000000000000..8fce2d91ed28 --- /dev/null +++ b/tests/nightly/single_card/stable_diffusion/test_cross_attn_up_block_2d.py @@ -0,0 +1 @@ +../../../../models/demos/wormhole/stable_diffusion/tests/test_cross_attn_up_block_2d.py \ No newline at end of file diff --git a/tests/nightly/single_card/stable_diffusion/test_demo.py b/tests/nightly/single_card/stable_diffusion/test_demo.py new file mode 120000 index 000000000000..c375047f6338 --- /dev/null +++ b/tests/nightly/single_card/stable_diffusion/test_demo.py @@ -0,0 +1 @@ +../../../../models/demos/wormhole/stable_diffusion/tests/test_demo.py \ No newline at end of file diff --git a/tests/nightly/single_card/stable_diffusion/test_embedding.py b/tests/nightly/single_card/stable_diffusion/test_embedding.py new file mode 120000 index 000000000000..3e89c1284247 --- /dev/null +++ b/tests/nightly/single_card/stable_diffusion/test_embedding.py @@ -0,0 +1 @@ +../../../../models/demos/wormhole/stable_diffusion/tests/test_embedding.py \ No newline at end of file diff --git a/tests/nightly/single_card/stable_diffusion/test_feedforward.py b/tests/nightly/single_card/stable_diffusion/test_feedforward.py new file mode 120000 index 000000000000..915332488d58 --- /dev/null +++ b/tests/nightly/single_card/stable_diffusion/test_feedforward.py @@ -0,0 +1 @@ +../../../../models/demos/wormhole/stable_diffusion/tests/test_feedforward.py \ No newline at end of file diff --git a/tests/nightly/single_card/stable_diffusion/test_geglu.py b/tests/nightly/single_card/stable_diffusion/test_geglu.py new file mode 
120000 index 000000000000..5880ea6e17d9 --- /dev/null +++ b/tests/nightly/single_card/stable_diffusion/test_geglu.py @@ -0,0 +1 @@ +../../../../models/demos/wormhole/stable_diffusion/tests/test_geglu.py \ No newline at end of file diff --git a/tests/nightly/single_card/stable_diffusion/test_resnet_block_2d.py b/tests/nightly/single_card/stable_diffusion/test_resnet_block_2d.py new file mode 120000 index 000000000000..1b6513e5b502 --- /dev/null +++ b/tests/nightly/single_card/stable_diffusion/test_resnet_block_2d.py @@ -0,0 +1 @@ +../../../../models/demos/wormhole/stable_diffusion/tests/test_resnet_block_2d.py \ No newline at end of file diff --git a/tests/nightly/single_card/stable_diffusion/test_sharded_matmuls.py b/tests/nightly/single_card/stable_diffusion/test_sharded_matmuls.py new file mode 120000 index 000000000000..d5d12d47849c --- /dev/null +++ b/tests/nightly/single_card/stable_diffusion/test_sharded_matmuls.py @@ -0,0 +1 @@ +../../../../models/demos/wormhole/stable_diffusion/tests/test_sharded_matmuls.py \ No newline at end of file diff --git a/tests/nightly/single_card/stable_diffusion/test_transformer_2d_model.py b/tests/nightly/single_card/stable_diffusion/test_transformer_2d_model.py new file mode 120000 index 000000000000..d82d4a899f64 --- /dev/null +++ b/tests/nightly/single_card/stable_diffusion/test_transformer_2d_model.py @@ -0,0 +1 @@ +../../../../models/demos/wormhole/stable_diffusion/tests/test_transformer_2d_model.py \ No newline at end of file diff --git a/tests/nightly/single_card/stable_diffusion/test_unet_2d_condition_model.py b/tests/nightly/single_card/stable_diffusion/test_unet_2d_condition_model.py new file mode 120000 index 000000000000..c25a861ed357 --- /dev/null +++ b/tests/nightly/single_card/stable_diffusion/test_unet_2d_condition_model.py @@ -0,0 +1 @@ +../../../../models/demos/wormhole/stable_diffusion/tests/test_unet_2d_condition_model.py \ No newline at end of file diff --git 
a/tests/nightly/single_card/stable_diffusion/test_upblock_2d.py b/tests/nightly/single_card/stable_diffusion/test_upblock_2d.py new file mode 120000 index 000000000000..3997b30be69c --- /dev/null +++ b/tests/nightly/single_card/stable_diffusion/test_upblock_2d.py @@ -0,0 +1 @@ +../../../../models/demos/wormhole/stable_diffusion/tests/test_upblock_2d.py \ No newline at end of file diff --git a/tests/nightly/single_card/stable_diffusion/test_upsample_2d.py b/tests/nightly/single_card/stable_diffusion/test_upsample_2d.py new file mode 120000 index 000000000000..88a986498448 --- /dev/null +++ b/tests/nightly/single_card/stable_diffusion/test_upsample_2d.py @@ -0,0 +1 @@ +../../../../models/demos/wormhole/stable_diffusion/tests/test_upsample_2d.py \ No newline at end of file diff --git a/tests/nightly/single_card/stable_diffusion/test_upsample_nearest_2d.py b/tests/nightly/single_card/stable_diffusion/test_upsample_nearest_2d.py new file mode 120000 index 000000000000..815ccb622b42 --- /dev/null +++ b/tests/nightly/single_card/stable_diffusion/test_upsample_nearest_2d.py @@ -0,0 +1 @@ +../../../../models/demos/wormhole/stable_diffusion/tests/test_upsample_nearest_2d.py \ No newline at end of file diff --git a/tests/nightly/single_card/wh_b0_unstable/tests/ttnn/integration_tests/stable_diffusion b/tests/nightly/single_card/wh_b0_unstable/tests/ttnn/integration_tests/stable_diffusion deleted file mode 120000 index 608e08f48e29..000000000000 --- a/tests/nightly/single_card/wh_b0_unstable/tests/ttnn/integration_tests/stable_diffusion +++ /dev/null @@ -1 +0,0 @@ -../../../../../../../tests/ttnn/integration_tests/stable_diffusion \ No newline at end of file diff --git a/tests/scripts/run_performance.sh b/tests/scripts/run_performance.sh index 93f22682e18c..7c42512474db 100755 --- a/tests/scripts/run_performance.sh +++ b/tests/scripts/run_performance.sh @@ -73,7 +73,7 @@ run_perf_models_cnn_javelin() { # Run tests env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest 
models/experimental/functional_unet/tests/test_unet_perf.py -m $test_marker - env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto tests/device_perf_tests/stable_diffusion -m $test_marker --timeout=480 + env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/wormhole/stable_diffusion/tests -m $test_marker --timeout=480 ## Merge all the generated reports env python models/perf/merge_perf_results.py @@ -83,7 +83,7 @@ run_device_perf_models() { set -eo pipefail local test_marker=$1 - env pytest tests/device_perf_tests/stable_diffusion -m $test_marker --timeout=600 + env pytest models/demos/wormhole/stable_diffusion/tests -m $test_marker --timeout=600 env pytest models/demos/distilbert/tests -m $test_marker diff --git a/tests/ttnn/integration_tests/stable_diffusion/test_sharded_attention.py b/tests/ttnn/integration_tests/stable_diffusion/test_sharded_attention.py deleted file mode 100644 index 1b45761e11c8..000000000000 --- a/tests/ttnn/integration_tests/stable_diffusion/test_sharded_attention.py +++ /dev/null @@ -1,966 +0,0 @@ -# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
- -# SPDX-License-Identifier: Apache-2.0 - -import torch -import math -import pytest -import ttnn - -from tests.ttnn.utils_for_testing import assert_with_pcc -from models.utility_functions import ( - comp_pcc, - tt2torch_tensor, - torch2tt_tensor, - is_wormhole_b0, - skip_for_grayskull, -) -from models.demos.wormhole.stable_diffusion.tt.ttnn_functional_utility_functions import ( - determine_largest_subblock_size, - determine_blocking, -) - - -# Test matmul attention sequence with InterleavedToShardedPartialOp -@skip_for_grayskull() -@pytest.mark.parametrize("seq_len", [4096, 1024]) -@pytest.mark.parametrize("num_slices", [16]) -@pytest.mark.parametrize("num_cores", [64]) -@pytest.mark.parametrize("num_heads", [16]) -@pytest.mark.parametrize("data_format", [ttnn.bfloat8_b]) -def test_time_sharded_attnention_hwb( - device, - seq_len, - num_slices, - num_cores, - num_heads, - data_format, - function_level_defaults, -): - pytest.skip() - compute_grid_size = device.compute_with_storage_grid_size() - if num_cores > (compute_grid_size.x * compute_grid_size.y): - pytest.skip(f"Need {num_cores} cores to run this test but core grid is {compute_grid_size}") - grid_size = (8, 8) - - M = seq_len - K = 64 - N = seq_len - - query_layer_shape = [1, num_heads, seq_len, 64] - key_layer_transposed_shape = [1, num_heads, 64, seq_len] - value_layer_shape = [1, num_heads, seq_len, 64] - output_shape = [1, num_heads, seq_len, 64] - - torch_query_layer = torch.randn(query_layer_shape).bfloat16().float() - torch_key_layer_transposed = torch.randn(key_layer_transposed_shape).bfloat16().float() - torch_value_layer = torch.randn(value_layer_shape).bfloat16().float() - torch_output = torch.randn(output_shape).bfloat16().float() - - dram_interleaved_memory_config = ttnn.DRAM_MEMORY_CONFIG - - height_sharded_mem_config = ttnn.MemoryConfig( - memory_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED, buffer_type=ttnn.BufferType.L1 - ) - block_sharded_mem_config = ttnn.MemoryConfig( - 
memory_layout=ttnn.TensorMemoryLayout.BLOCK_SHARDED, - buffer_type=ttnn.BufferType.L1, - ) - - # compare output to regular case - reference_query_layer = torch2tt_tensor( - torch_query_layer, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - reference_key_layer_transposed = torch2tt_tensor( - torch_key_layer_transposed, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - reference_value_layer = torch2tt_tensor( - torch_value_layer, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - - attn_weights_qkt = torch_query_layer @ torch_key_layer_transposed - attn_weights_torch_sm = torch.nn.functional.softmax(attn_weights_qkt, dim=-1) - attn_weights_torch = attn_weights_torch_sm @ torch_value_layer - - compute_kernel_config = ttnn.WormholeComputeKernelConfig( - math_fidelity=ttnn.MathFidelity.LoFi, - math_approx_mode=True, - fp32_dest_acc_en=False, - packer_l1_acc=False, - ) - - passing = True - output = None - - mm_out = torch2tt_tensor( - torch_output, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - - tiles_per_shard = math.ceil((((num_heads * seq_len) / num_cores) / num_slices) / 32) - mm_output_block_shard_spec = [seq_len // 8, seq_len // 8] - tiles_per_shard = math.ceil((((num_heads * seq_len) / num_cores) / num_slices) / 32) - mm_output_height_shard_spec = [tiles_per_shard * 32, seq_len] - - heads_per_slice = num_heads // num_slices - for i in range(num_slices): - q_slice = ttnn.interleaved_to_sharded_partial( - reference_query_layer, - ttnn.CoreCoord(1, grid_size[0]), - [M // grid_size[0], K], - num_slices, - i, - ttnn.TensorMemoryLayout.HEIGHT_SHARDED, - ttnn.ShardOrientation.ROW_MAJOR, - ) - k_slice = ttnn.interleaved_to_sharded_partial( - reference_key_layer_transposed, - ttnn.CoreCoord(grid_size[1], 1), - [K, N // grid_size[1]], - num_slices, - i, - ttnn.TensorMemoryLayout.WIDTH_SHARDED, - 
ttnn.ShardOrientation.ROW_MAJOR, - ) - - program_config = ttnn.MatmulMultiCoreReuseMultiCastProgramConfig( - compute_with_storage_grid_size=grid_size, - in0_block_w=K // 32, - out_subblock_h=1, - out_subblock_w=1, - per_core_M=M // (32 * grid_size[0]), - per_core_N=N // (32 * grid_size[1]), - transpose_mcast=False, - fused_activation=None, - ) - - mm_slice = ttnn.matmul( - q_slice, - k_slice, - program_config=program_config, - memory_config=block_sharded_mem_config, - dtype=data_format, - compute_kernel_config=compute_kernel_config, - ) - # mmt = tt2torch_tensor(mm_slice) - # passed, message = comp_pcc(mmt, attn_weights_qkt[:, i * heads_per_slice : (i + 1) * heads_per_slice, :, :]) - # print(message) - # assert passed - k_slice.deallocate() - q_slice.deallocate() - - height_per_core = seq_len // 64 - output_shard_grid = ttnn.CoreRangeSet({ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(7, 7))}) - output_shard_spec = ttnn.ShardSpec( - output_shard_grid, [height_per_core, seq_len], ttnn.ShardOrientation.ROW_MAJOR, False - ) - output_mem_config = ttnn.MemoryConfig( - ttnn.TensorMemoryLayout.HEIGHT_SHARDED, ttnn.BufferType.L1, output_shard_spec - ) - mm_slice = ttnn.reshard( - mm_slice, - output_mem_config, - ) - mm_slice = ttnn.move(mm_slice) - - softmax_program_config = ttnn.SoftmaxShardedMultiCoreProgramConfig( - compute_with_storage_grid_size=grid_size, - subblock_w=1, - block_h=mm_output_height_shard_spec[0] // 32, - block_w=mm_output_height_shard_spec[1] // 32, - ) - # print(program_config) - - mm_slice = ttnn.softmax_in_place(mm_slice, program_config=softmax_program_config) - # mmt = tt2torch_tensor(mm_slice) - # passed, message = comp_pcc(mmt, attn_weights_torch_sm[:, i * heads_per_slice : (i + 1) * heads_per_slice, :, :]) - # print(message) - # assert passed - - program_config = ttnn.MatmulMultiCoreReuseMultiCast1DProgramConfig( - compute_with_storage_grid_size=grid_size, - in0_block_w=seq_len // 32, - per_core_M=tiles_per_shard, - per_core_N=2, - 
out_subblock_h=1, - out_subblock_w=1, - fuse_batch=True, - fused_activation=None, - mcast_in0=False, - ) - v_slice = ttnn.slice( - reference_value_layer, - (0, (i * heads_per_slice), 0, 0), - (1, (i * heads_per_slice) + (heads_per_slice), seq_len, 64), - memory_config=dram_interleaved_memory_config, - ) - - mm_slice = ttnn.matmul( - mm_slice, - v_slice, - program_config=program_config, - memory_config=height_sharded_mem_config, - dtype=data_format, - compute_kernel_config=compute_kernel_config, - ) - v_slice.deallocate() - - ttnn.sharded_to_interleaved_partial( - mm_slice, - mm_out, - num_slices, - i, - memory_config=dram_interleaved_memory_config, - ) - - mm_slice.deallocate() - - mm_out_torch = tt2torch_tensor(mm_out) - - passing, output = comp_pcc(mm_out_torch, attn_weights_torch) - - print(output) - assert passing - - -# Test matmul attention sequence with InterleavedToShardedPartialOp -@skip_for_grayskull() -@pytest.mark.parametrize("seq_len", [4096, 1024]) -@pytest.mark.parametrize("num_slices", [16]) -@pytest.mark.parametrize("num_cores", [64]) -@pytest.mark.parametrize("num_heads", [16]) -@pytest.mark.parametrize("data_format", [ttnn.bfloat8_b]) -def test_time_sharded_attnention( - device, - seq_len, - num_slices, - num_cores, - num_heads, - data_format, - function_level_defaults, -): - pytest.skip() # ND hang on CI - compute_grid_size = device.compute_with_storage_grid_size() - if num_cores > (compute_grid_size.x * compute_grid_size.y): - pytest.skip(f"Need {num_cores} cores to run this test but core grid is {compute_grid_size}") - grid_size = (8, 8) - - query_layer_shape = [1, num_heads, seq_len, 64] - key_layer_transposed_shape = [1, num_heads, 64, seq_len] - value_layer_shape = [1, num_heads, seq_len, 64] - output_shape = [1, num_heads, seq_len, 64] - - torch_query_layer = torch.randn(query_layer_shape).bfloat16().float() - torch_key_layer_transposed = torch.randn(key_layer_transposed_shape).bfloat16().float() - torch_value_layer = 
torch.randn(value_layer_shape).bfloat16().float() - torch_output = torch.randn(output_shape).bfloat16().float() - - dram_interleaved_memory_config = ttnn.DRAM_MEMORY_CONFIG - l1_interleaved_memory_config = ttnn.L1_MEMORY_CONFIG - - height_sharded_memory_config = ttnn.MemoryConfig( - memory_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED, buffer_type=ttnn.BufferType.L1 - ) - - # compare output to regular case - reference_query_layer = torch2tt_tensor( - torch_query_layer, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - reference_key_layer_transposed = torch2tt_tensor( - torch_key_layer_transposed, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - reference_value_layer = torch2tt_tensor( - torch_value_layer, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - - compute_kernel_config = ttnn.WormholeComputeKernelConfig( - math_fidelity=ttnn.MathFidelity.LoFi, - math_approx_mode=True, - fp32_dest_acc_en=False, - packer_l1_acc=True, - ) - - passing = True - output = None - - mm_out = torch2tt_tensor( - torch_output, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - tiles_per_shard = math.ceil((((num_heads * seq_len) / num_cores) / num_slices) / 32) - mm_activations_height_shard_spec = [tiles_per_shard * 32, 2 * 32] - mm_output_height_shard_spec = [tiles_per_shard * 32, seq_len] - - heads_per_slice = num_heads // num_slices - for i in range(num_slices): - slice = ttnn.interleaved_to_sharded_partial( - reference_query_layer, - grid_size, - mm_activations_height_shard_spec, - num_slices, - i, - ttnn.TensorMemoryLayout.HEIGHT_SHARDED, - ttnn.ShardOrientation.ROW_MAJOR, - ) - program_config = ttnn.MatmulMultiCoreReuseMultiCast1DProgramConfig( - compute_with_storage_grid_size=grid_size, - in0_block_w=2, - per_core_M=tiles_per_shard, - per_core_N=seq_len // 32, - out_subblock_h=1, - out_subblock_w=1, - fuse_batch=True, 
- fused_activation=None, - mcast_in0=False, - ) - - k_slice = ttnn.slice( - reference_key_layer_transposed, - (0, (i * heads_per_slice), 0, 0), - (1, (i * heads_per_slice) + (heads_per_slice), 64, seq_len), - memory_config=l1_interleaved_memory_config, - ) - mm_slice = ttnn.matmul( - slice, - k_slice, - program_config=program_config, - memory_config=height_sharded_memory_config, - dtype=data_format, - compute_kernel_config=compute_kernel_config, - ) - k_slice.deallocate() - slice.deallocate() - - softmax_program_config = ttnn.SoftmaxShardedMultiCoreProgramConfig( - compute_with_storage_grid_size=grid_size, - subblock_w=1, - block_h=mm_output_height_shard_spec[0] // 32, - block_w=mm_output_height_shard_spec[1] // 32, - ) - - mm_slice = ttnn.softmax_in_place(mm_slice, program_config=softmax_program_config) - - program_config = ttnn.MatmulMultiCoreReuseMultiCast1DProgramConfig( - compute_with_storage_grid_size=grid_size, - in0_block_w=seq_len // 32, - per_core_M=tiles_per_shard, - per_core_N=2, - out_subblock_h=1, - out_subblock_w=1, - fuse_batch=True, - fused_activation=None, - mcast_in0=False, - ) - v_slice = ttnn.slice( - reference_value_layer, - (0, (i * heads_per_slice), 0, 0), - (1, (i * heads_per_slice) + (heads_per_slice), seq_len, 64), - memory_config=l1_interleaved_memory_config, - ) - mm_slice = ttnn.matmul( - mm_slice, - v_slice, - program_config=program_config, - memory_config=height_sharded_memory_config, - dtype=data_format, - compute_kernel_config=compute_kernel_config, - ) - v_slice.deallocate() - - ttnn.sharded_to_interleaved_partial( - mm_slice, - mm_out, - num_slices, - i, - memory_config=dram_interleaved_memory_config, - ) - - mm_slice.deallocate() - - return - - mm_out_torch = tt2torch_tensor(mm_out) - - attn_weights = ttnn.matmul( - reference_query_layer, reference_key_layer_transposed, memory_config=dram_interleaved_memory_config - ) - attn_weights = ttnn.softmax_in_place(attn_weights) - attn_weights = ttnn.matmul(attn_weights, 
reference_value_layer, memory_config=dram_interleaved_memory_config) - - attn_weights_torch = tt2torch_tensor(attn_weights) - passing, output = comp_pcc(mm_out_torch, attn_weights_torch) - - print(output) - assert passing - - -# Test matmul attention sequence with InterleavedToShardedPartialOp -@skip_for_grayskull() -@pytest.mark.parametrize("seq_len", [4096, 1024, 256, 64]) -@pytest.mark.parametrize("kv_len", [96]) -@pytest.mark.parametrize("num_heads", [16]) -@pytest.mark.parametrize("data_format", [ttnn.bfloat8_b]) -@pytest.mark.parametrize("reshard_for_softmax", [True, False]) -def test_cross_attnention( - device, - seq_len, - kv_len, - num_heads, - data_format, - reshard_for_softmax, - function_level_defaults, -): - if seq_len == 64 and reshard_for_softmax: - pytest.skip() - compute_grid_size = device.compute_with_storage_grid_size() - grid_size = (8, 2) - num_cores = grid_size[0] * grid_size[1] - if num_cores > (compute_grid_size.x * compute_grid_size.y): - pytest.skip(f"Need {num_cores} cores to run this test but core grid is {compute_grid_size}") - - query_layer_shape = [1, num_heads, seq_len, 64] - key_layer_transposed_shape = [1, num_heads, 64, kv_len] - value_layer_shape = [1, num_heads, kv_len, 64] - output_shape = [1, num_heads, seq_len, 64] - - torch_query_layer = torch.randn(query_layer_shape).bfloat16().float() - torch_key_layer_transposed = torch.randn(key_layer_transposed_shape).bfloat16().float() - torch_value_layer = torch.randn(value_layer_shape).bfloat16().float() - torch_output = torch.randn(output_shape).bfloat16().float() - - dram_interleaved_memory_config = ttnn.DRAM_MEMORY_CONFIG - l1_interleaved_memory_config = ttnn.L1_MEMORY_CONFIG - - height_sharded_memory_config = ttnn.MemoryConfig( - memory_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED, buffer_type=ttnn.BufferType.L1 - ) - - # compare output to regular case - reference_query_layer = torch2tt_tensor( - torch_query_layer, - device, - tt_memory_config=l1_interleaved_memory_config, - 
tt_dtype=data_format, - ) - reference_key_layer_transposed = torch2tt_tensor( - torch_key_layer_transposed, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - reference_value_layer = torch2tt_tensor( - torch_value_layer, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - - compute_kernel_config = ttnn.WormholeComputeKernelConfig( - math_fidelity=ttnn.MathFidelity.LoFi, - math_approx_mode=True, - fp32_dest_acc_en=False, - packer_l1_acc=False, - ) - - passing = True - output = None - - q_sharded = ttnn.interleaved_to_sharded( - reference_query_layer, - grid_size, - [num_heads * seq_len // num_cores, 64], - ttnn.TensorMemoryLayout.HEIGHT_SHARDED, - ttnn.ShardOrientation.COL_MAJOR, - ) - - program_config = ttnn.MatmulMultiCoreReuseProgramConfig( - compute_with_storage_grid_size=grid_size, - in0_block_w=2, - out_subblock_h=1, - out_subblock_w=1, - per_core_M=num_heads * seq_len // num_cores // 32, - per_core_N=kv_len // 32, - ) - print(program_config) - - compute_kernel_config = ttnn.WormholeComputeKernelConfig( - math_fidelity=ttnn.MathFidelity.LoFi, - math_approx_mode=True, - fp32_dest_acc_en=False, - packer_l1_acc=False, - ) - - mm_slice = ttnn.matmul( - q_sharded, - reference_key_layer_transposed, - program_config=program_config, - memory_config=height_sharded_memory_config, - dtype=data_format, - compute_kernel_config=compute_kernel_config, - ) - q_sharded.deallocate() - - if reshard_for_softmax: - height_per_core = num_heads * seq_len // 64 - orig_mem_config = mm_slice.memory_config() - if seq_len == 1024: - mm_slice = ttnn.sharded_to_interleaved(mm_slice, dram_interleaved_memory_config) - mm_slice = ttnn.interleaved_to_sharded( - mm_slice, - (8, 8), - [height_per_core, kv_len], - ttnn.TensorMemoryLayout.HEIGHT_SHARDED, - ttnn.ShardOrientation.COL_MAJOR, - ) - else: - output_shard_grid = ttnn.CoreRangeSet({ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(7, 7))}) - output_shard_spec 
= ttnn.ShardSpec( - output_shard_grid, [height_per_core, kv_len], ttnn.ShardOrientation.COL_MAJOR, False - ) - output_mem_config = ttnn.MemoryConfig( - ttnn.TensorMemoryLayout.HEIGHT_SHARDED, ttnn.BufferType.L1, output_shard_spec - ) - mm_slice = ttnn.reshard( - mm_slice, - output_mem_config, - ) - softmax_program_config = ttnn.SoftmaxShardedMultiCoreProgramConfig( - compute_with_storage_grid_size=(8, 8), - subblock_w=1, - block_h=32, - block_w=3, - ) - mm_slice = ttnn.softmax_in_place(mm_slice, program_config=softmax_program_config) - mm_slice = ttnn.reshard(mm_slice, orig_mem_config) - - else: - softmax_program_config = ttnn.SoftmaxShardedMultiCoreProgramConfig( - compute_with_storage_grid_size=grid_size, - subblock_w=1, - block_h=seq_len // 32, - block_w=kv_len // 32, - ) - mm_slice = ttnn.softmax_in_place(mm_slice, program_config=softmax_program_config) - - v_sharded = ttnn.interleaved_to_sharded( - reference_value_layer, - grid_size, - [num_heads * kv_len // num_cores, 64], - ttnn.TensorMemoryLayout.HEIGHT_SHARDED, - ttnn.ShardOrientation.COL_MAJOR, - ) - compute_kernel_config = ttnn.WormholeComputeKernelConfig( - math_fidelity=ttnn.MathFidelity.LoFi, - math_approx_mode=True, - fp32_dest_acc_en=False, - packer_l1_acc=False, - ) - program_config = ttnn.MatmulMultiCoreReuseProgramConfig( - compute_with_storage_grid_size=grid_size, - in0_block_w=kv_len // 32, - out_subblock_h=1, - out_subblock_w=1, - per_core_M=num_heads * seq_len // num_cores // 32, - per_core_N=2, - ) - mm_slice = ttnn.matmul( - mm_slice, - v_sharded, - program_config=program_config, - memory_config=height_sharded_memory_config, - dtype=data_format, - compute_kernel_config=compute_kernel_config, - ) - v_sharded.deallocate() - - mm_out_torch = tt2torch_tensor(mm_slice) - - attn_weights_torch = torch_query_layer @ torch_key_layer_transposed - attn_weights_torch = torch.nn.functional.softmax(attn_weights_torch, dim=-1) - attn_weights_torch = attn_weights_torch @ torch_value_layer - - passing, 
output = comp_pcc(mm_out_torch, attn_weights_torch) - - print(output) - assert passing - - -# Test matmul attention sequence with InterleavedToShardedPartialOp -@skip_for_grayskull() -@pytest.mark.parametrize("seq_len", [1024, 256, 64]) -@pytest.mark.parametrize("num_heads", [16]) -@pytest.mark.parametrize("data_format", [ttnn.bfloat8_b]) -@pytest.mark.parametrize("reshard_for_softmax", [True, False]) -def test_attention( - device, - seq_len, - num_heads, - data_format, - reshard_for_softmax, - function_level_defaults, -): - if (seq_len == 64 or seq_len == 1024) and reshard_for_softmax: - pytest.skip() - compute_grid_size = device.compute_with_storage_grid_size() - grid_size = (2, 8) - num_cores = grid_size[0] * grid_size[1] - if num_cores > (compute_grid_size.x * compute_grid_size.y): - pytest.skip(f"Need {num_cores} cores to run this test but core grid is {compute_grid_size}") - - query_layer_shape = [1, num_heads, seq_len, 64] - key_layer_transposed_shape = [1, num_heads, 64, seq_len] - value_layer_shape = [1, num_heads, seq_len, 64] - output_shape = [1, num_heads, seq_len, 64] - - torch_query_layer = torch.randn(query_layer_shape).bfloat16().float() - torch_key_layer_transposed = torch.randn(key_layer_transposed_shape).bfloat16().float() - torch_value_layer = torch.randn(value_layer_shape).bfloat16().float() - torch_output = torch.randn(output_shape).bfloat16().float() - - dram_interleaved_memory_config = ttnn.DRAM_MEMORY_CONFIG - l1_interleaved_memory_config = ttnn.L1_MEMORY_CONFIG - - height_sharded_memory_config = ttnn.MemoryConfig( - memory_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED, buffer_type=ttnn.BufferType.L1 - ) - - # compare output to regular case - reference_query_layer = torch2tt_tensor( - torch_query_layer, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - reference_key_layer_transposed = torch2tt_tensor( - torch_key_layer_transposed, - device, - tt_memory_config=dram_interleaved_memory_config, - 
tt_dtype=data_format, - ) - reference_value_layer = torch2tt_tensor( - torch_value_layer, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - - compute_kernel_config = ttnn.WormholeComputeKernelConfig( - math_fidelity=ttnn.MathFidelity.LoFi, - math_approx_mode=True, - fp32_dest_acc_en=False, - packer_l1_acc=False, - ) - - passing = True - output = None - - q_sharded = ttnn.interleaved_to_sharded( - reference_query_layer, - grid_size, - [num_heads * seq_len // num_cores, 64], - ttnn.TensorMemoryLayout.HEIGHT_SHARDED, - ttnn.ShardOrientation.ROW_MAJOR, - ) - M = num_heads * seq_len - K = 64 - N = seq_len - program_config = ttnn.MatmulMultiCoreReuseProgramConfig( - compute_with_storage_grid_size=grid_size, - in0_block_w=K // 32, - out_subblock_h=1, - out_subblock_w=1, - per_core_M=M // num_cores // 32, - per_core_N=N // 32, - ) - print(program_config) - - compute_kernel_config = ttnn.WormholeComputeKernelConfig( - math_fidelity=ttnn.MathFidelity.LoFi, - math_approx_mode=True, - fp32_dest_acc_en=False, - packer_l1_acc=False, - ) - - mm_slice = ttnn.matmul( - q_sharded, - reference_key_layer_transposed, - program_config=program_config, - memory_config=height_sharded_memory_config, - dtype=data_format, - compute_kernel_config=compute_kernel_config, - ) - q_sharded.deallocate() - - if reshard_for_softmax: - height_per_core = num_heads * seq_len // 64 - orig_mem_config = mm_slice.memory_config() - if seq_len == 1024: - mm_slice = ttnn.sharded_to_interleaved(mm_slice, l1_interleaved_memory_config) - mm_slice = ttnn.interleaved_to_sharded( - mm_slice, - (8, 8), - [height_per_core, seq_len], - ttnn.TensorMemoryLayout.HEIGHT_SHARDED, - ttnn.ShardOrientation.ROW_MAJOR, - ) - softmax_program_config = ttnn.SoftmaxShardedMultiCoreProgramConfig( - compute_with_storage_grid_size=(8, 8), - subblock_w=1, - block_h=height_per_core // 32, - block_w=seq_len // 32, - ) - mm_slice = ttnn.softmax_in_place(mm_slice, 
program_config=softmax_program_config) - mm_slice = ttnn.sharded_to_interleaved(mm_slice, l1_interleaved_memory_config) - mm_slice = ttnn.interleaved_to_sharded( - mm_slice, - (8, 2), - [num_heads * seq_len // 16, seq_len], - ttnn.TensorMemoryLayout.HEIGHT_SHARDED, - ttnn.ShardOrientation.COL_MAJOR, - ) - - else: - output_shard_grid = ttnn.CoreRangeSet({ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(7, 7))}) - output_shard_spec = ttnn.ShardSpec( - output_shard_grid, [height_per_core, seq_len], ttnn.ShardOrientation.COL_MAJOR, False - ) - output_mem_config = ttnn.MemoryConfig( - ttnn.TensorMemoryLayout.HEIGHT_SHARDED, ttnn.BufferType.L1, output_shard_spec - ) - mm_slice = ttnn.reshard( - mm_slice, - output_mem_config, - ) - softmax_program_config = ttnn.SoftmaxShardedMultiCoreProgramConfig( - compute_with_storage_grid_size=(8, 8), - subblock_w=1, - block_h=height_per_core // 32, - block_w=seq_len // 32, - ) - mm_slice = ttnn.softmax_in_place(mm_slice, program_config=softmax_program_config) - mm_slice = ttnn.reshard(mm_slice, orig_mem_config) - else: - softmax_program_config = ttnn.SoftmaxShardedMultiCoreProgramConfig( - compute_with_storage_grid_size=grid_size, - subblock_w=1, - block_h=seq_len // 32, - block_w=seq_len // 32, - ) - print(softmax_program_config) - mm_slice = ttnn.softmax_in_place(mm_slice, program_config=softmax_program_config) - - v_sharded = ttnn.interleaved_to_sharded( - reference_value_layer, - grid_size, - [num_heads * seq_len // num_cores, 64], - ttnn.TensorMemoryLayout.HEIGHT_SHARDED, - ttnn.ShardOrientation.ROW_MAJOR, - ) - compute_kernel_config = ttnn.WormholeComputeKernelConfig( - math_fidelity=ttnn.MathFidelity.LoFi, - math_approx_mode=True, - fp32_dest_acc_en=False, - packer_l1_acc=False, - ) - program_config = ttnn.MatmulMultiCoreReuseProgramConfig( - compute_with_storage_grid_size=grid_size, - in0_block_w=seq_len // 32, - out_subblock_h=1, - out_subblock_w=1, - per_core_M=num_heads * seq_len // num_cores // 32, - per_core_N=2, - ) 
- print(program_config) - mm_slice = ttnn.matmul( - mm_slice, - v_sharded, - program_config=program_config, - memory_config=height_sharded_memory_config, - dtype=data_format, - compute_kernel_config=compute_kernel_config, - ) - v_sharded.deallocate() - - mm_out_torch = tt2torch_tensor(mm_slice) - - attn_weights_torch = torch_query_layer @ torch_key_layer_transposed - attn_weights_torch = torch.nn.functional.softmax(attn_weights_torch, dim=-1) - attn_weights_torch = attn_weights_torch @ torch_value_layer - - passing, output = comp_pcc(mm_out_torch, attn_weights_torch) - - print(output) - assert passing - - -@skip_for_grayskull() -@pytest.mark.parametrize("size", [4096, 1024, 256, 64]) -@pytest.mark.parametrize("is_qkv", [1, 2, 3]) -@pytest.mark.parametrize("data_format", [ttnn.bfloat8_b]) -def test_q_and_kv( - device, - size, - data_format, - is_qkv, - function_level_defaults, -): - # Test matmul attention sequence with InterleavedToShardedPartialOp - sizes = {4096: [1, 8192, 320, 512], 1024: [1, 2048, 640, 768], 256: [1, 512, 1280, 1280], 64: [1, 128, 1280, 1280]} - grid_sizes = {4096: (5, 8), 1024: (5, 8), 256: (8, 8), 64: (8, 4)} - B, M, K, N = sizes[size] - N = N * is_qkv - grid_size = grid_sizes[size] - compute_grid_size = device.compute_with_storage_grid_size() - num_cores = grid_size[0] * grid_size[1] - if num_cores > (compute_grid_size.x * compute_grid_size.y): - pytest.skip(f"Need {num_cores} cores to run this test but core grid is {compute_grid_size}") - - in_0_shape = [1, B, M, K] - in_1_shape = [1, B, K, N] - in_2_shape = [1, B, 192, K] - in_3_shape = [1, B, K, 2 * N] - - in_0_torch = torch.randn(in_0_shape).bfloat16().float() - in_1_torch = torch.randn(in_1_shape).bfloat16().float() - in_2_torch = torch.randn(in_2_shape).bfloat16().float() - in_3_torch = torch.randn(in_3_shape).bfloat16().float() - - dram_interleaved_memory_config = ttnn.DRAM_MEMORY_CONFIG - l1_interleaved_memory_config = ttnn.L1_MEMORY_CONFIG - - height_sharded_memory_config = 
ttnn.MemoryConfig( - memory_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED, buffer_type=ttnn.BufferType.L1 - ) - - block_sharded_memory_config = ttnn.MemoryConfig( - memory_layout=ttnn.TensorMemoryLayout.BLOCK_SHARDED, buffer_type=ttnn.BufferType.L1 - ) - - # compare output to regular case - in_0 = torch2tt_tensor( - in_0_torch, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - in_1 = torch2tt_tensor( - in_1_torch, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - in_2 = torch2tt_tensor( - in_2_torch, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - in_3 = torch2tt_tensor( - in_3_torch, - device, - tt_memory_config=dram_interleaved_memory_config, - tt_dtype=data_format, - ) - - compute_kernel_config = ttnn.WormholeComputeKernelConfig( - math_fidelity=ttnn.MathFidelity.LoFi, - math_approx_mode=True, - fp32_dest_acc_en=False, - packer_l1_acc=False, - ) - - passing = True - output = None - - in_0_sharded = ttnn.interleaved_to_sharded( - in_0, - grid_size, - [M // grid_size[1], K // grid_size[0]], - ttnn.TensorMemoryLayout.BLOCK_SHARDED, - ttnn.ShardOrientation.ROW_MAJOR, - ) - M, K = in_0.shape[-2], in_0.shape[-1] - N = in_1.shape[-1] - in0_block_h, in0_block_w, out_subblock_h, out_subblock_w, out_block_h, out_block_w = determine_blocking( - M, K, N, grid_size - ) - program_config = ttnn.MatmulMultiCoreReuseMultiCastProgramConfig( - compute_with_storage_grid_size=grid_size, - in0_block_w=in0_block_w, - out_subblock_h=out_subblock_h, - out_subblock_w=out_subblock_w, - per_core_M=out_block_h, - per_core_N=out_block_w, - transpose_mcast=False, - fused_activation=None, - ) - - compute_kernel_config = ttnn.WormholeComputeKernelConfig( - math_fidelity=ttnn.MathFidelity.LoFi, - math_approx_mode=True, - fp32_dest_acc_en=False, - packer_l1_acc=False, - ) - mm = ttnn.matmul( - in_0_sharded if size != 4096 else in_0, - in_1, - program_config=program_config, 
- memory_config=block_sharded_memory_config, - dtype=ttnn.bfloat8_b, - compute_kernel_config=compute_kernel_config, - ) - in_0_sharded.deallocate() - - M, K, N = in_2.shape[-2], in_2.shape[-1], in_3.shape[-1] - in0_block_h = M // grid_size[1] // 32 - in0_block_w = K // grid_size[0] // 32 - out_block_h = math.ceil(M / grid_size[1] / 32) - out_block_w = math.ceil(N / grid_size[0] / 32) - out_subblock_h, out_subblock_w = determine_largest_subblock_size(out_block_h, out_block_w) - program_config = ttnn.MatmulMultiCoreReuseMultiCastProgramConfig( - compute_with_storage_grid_size=grid_size, - in0_block_w=in0_block_w, - out_subblock_h=out_subblock_h, - out_subblock_w=out_subblock_w, - per_core_M=out_block_h, - per_core_N=out_block_w, - transpose_mcast=False, - fused_activation=None, - ) - compute_kernel_config = ttnn.WormholeComputeKernelConfig( - math_fidelity=ttnn.MathFidelity.LoFi, - math_approx_mode=True, - fp32_dest_acc_en=False, - packer_l1_acc=False, - ) - - mm_out_torch = tt2torch_tensor(mm) - - out_torch = in_0_torch @ in_1_torch - - passing, output = comp_pcc(mm_out_torch, out_torch) - - print(output) - assert passing From c12bf8e053ab06d744f1e05d8685ec02527a96a6 Mon Sep 17 00:00:00 2001 From: Shwetank Singh Date: Tue, 3 Dec 2024 11:01:35 +0000 Subject: [PATCH 09/59] #14179: Formating conv2d results. 
Co-authored-by: Lewis Panos --- .../demos/convnet_mnist/tt/convnet_mnist.py | 8 +++++-- models/demos/segformer/tt/common.py | 4 +++- .../tt/ttnn_functional_squeezebert.py | 4 +++- ..._functional_resnet50_large_new_conv_api.py | 24 ++++++++++++++----- .../ttnn_functional_resnet50_new_conv_api.py | 20 ++++++++++++---- ...functional_resnet50_xlarge_new_conv_api.py | 24 ++++++++++++++----- ...ctional_resnet50_xlarge_new_conv_api_24.py | 24 ++++++++++++++----- ...unctional_resnet50_xxlarge_new_conv_api.py | 24 ++++++++++++++----- models/demos/vgg/tt/ttnn_vgg.py | 8 +++++-- models/demos/wormhole/mamba/tt/mamba_conv.py | 4 +++- .../ttnn_functional_downsample_2d_new_conv.py | 4 +++- .../ttnn_functional_resnetblock2d_new_conv.py | 24 +++++++++++++------ ...ttnn_functional_transformer_2d_new_conv.py | 12 ++++++---- ...tional_unet_2d_condition_model_new_conv.py | 9 +++++-- .../ttnn_functional_upsample_2d_new_conv.py | 4 +++- models/demos/yolov4/ttnn/common.py | 4 +++- .../functional_unet/tt/unet_shallow_ttnn.py | 4 +++- .../sweep_utils/conv2d_common.py | 8 +++++-- .../ttnn/unit_tests/operations/test_conv1d.py | 4 +++- .../unit_tests/operations/test_new_conv2d.py | 16 +++++++++---- .../operations/test_prepare_conv_weights.py | 2 +- .../operations/test_small_resnet50_block.py | 20 ++++++++++++---- ttnn/ttnn/operations/conv1d.py | 16 ++++++++----- ttnn/ttnn/operations/conv2d.py | 19 ++++++++++++++- 24 files changed, 216 insertions(+), 74 deletions(-) diff --git a/models/demos/convnet_mnist/tt/convnet_mnist.py b/models/demos/convnet_mnist/tt/convnet_mnist.py index f0443e938c4c..1d9ac8acba0a 100644 --- a/models/demos/convnet_mnist/tt/convnet_mnist.py +++ b/models/demos/convnet_mnist/tt/convnet_mnist.py @@ -35,7 +35,7 @@ def convnet_mnist( packer_l1_acc=False, ) x = ttnn.to_layout(input_tensor, layout=ttnn.ROW_MAJOR_LAYOUT) - [x, out_height, out_width, weights_device, bias_device] = ttnn.conv2d( + x = ttnn.conv2d( input_tensor=x, weight_tensor=parameters.conv1.weight, 
in_channels=1, @@ -53,6 +53,8 @@ def convnet_mnist( conv_op_cache={}, debug=True, groups=1, + return_output_dim=False, + return_weights_and_bias=False, ) x = ttnn.relu(x) @@ -79,7 +81,7 @@ def convnet_mnist( dilation=[1, 1], ) - [x, out_height, out_width, weights_device, bias_device] = ttnn.conv2d( + x, [out_height, out_width] = ttnn.conv2d( input_tensor=x, weight_tensor=parameters.conv2.weight, in_channels=32, @@ -96,6 +98,8 @@ def convnet_mnist( conv_op_cache={}, debug=False, groups=1, + return_output_dim=True, + return_weights_and_bias=False, ) x = ttnn.relu(x) diff --git a/models/demos/segformer/tt/common.py b/models/demos/segformer/tt/common.py index 10a4509bc4ed..d777116d2321 100644 --- a/models/demos/segformer/tt/common.py +++ b/models/demos/segformer/tt/common.py @@ -60,7 +60,7 @@ def __call__(self, device, input_tensor): if self.act_block_h is not None: conv_config.act_block_h_override = self.act_block_h - [output_tensor, _out_height, _out_width, self.weights, self.bias] = ttnn.conv2d( + [output_tensor, [_out_height, _out_width]] = ttnn.conv2d( input_tensor=input_tensor, weight_tensor=self.weights, bias_tensor=self.bias, @@ -76,6 +76,8 @@ def __call__(self, device, input_tensor): conv_config=conv_config, compute_config=compute_config, groups=self.groups, + return_output_dim=True, + return_weights_and_bias=False, ) return output_tensor, _out_height, _out_width diff --git a/models/demos/squeezebert/tt/ttnn_functional_squeezebert.py b/models/demos/squeezebert/tt/ttnn_functional_squeezebert.py index 7355663e951d..b0fa2c604313 100644 --- a/models/demos/squeezebert/tt/ttnn_functional_squeezebert.py +++ b/models/demos/squeezebert/tt/ttnn_functional_squeezebert.py @@ -90,7 +90,7 @@ def ttnn_conv1d( packer_l1_acc=packer_l1_acc, ) - [tt_output_tensor_on_device, out_length, weights_device, bias_device] = ttnn.Conv1d( + [tt_output_tensor_on_device, out_length, [weights_device, bias_device]] = ttnn.Conv1d( input_tensor=tt_input_tensor, weight_tensor=weights, 
in_channels=tt_input_tensor.shape[-1], @@ -107,6 +107,8 @@ def ttnn_conv1d( conv_op_cache={}, debug=debug, groups=groups, + return_output_dim=True, + return_weights_and_bias=True, ) tt_output_tensor_on_device = ttnn.squeeze(tt_output_tensor_on_device, 0) diff --git a/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_large_new_conv_api.py b/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_large_new_conv_api.py index 123e6a1cef48..ce49bfbfa516 100644 --- a/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_large_new_conv_api.py +++ b/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_large_new_conv_api.py @@ -167,7 +167,7 @@ def run_downsample_if_req( shard_layout = ( ttnn.TensorMemoryLayout.HEIGHT_SHARDED if height_sharding else ttnn.TensorMemoryLayout.BLOCK_SHARDED ) - ds_out, _, _, self.ds_conv_weight_tensor, self.ds_conv_bias_tensor = ttnn.conv2d( + ds_out, [self.ds_conv_weight_tensor, self.ds_conv_bias_tensor] = ttnn.conv2d( input_tensor=x, weight_tensor=self.ds_conv_weight_tensor, in_channels=self.ds_conv_input_channels, @@ -192,6 +192,8 @@ def run_downsample_if_req( device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] ), conv_op_cache=conv_op_cache, + return_output_dim=False, + return_weights_and_bias=True, ) ttnn.deallocate(x) ds_out = ttnn.reallocate(ds_out) @@ -216,7 +218,7 @@ def __call__( # conv1 is 1x1 conv # print("Running conv1") module_input_height = input_height - out, input_height, input_width, self.conv1_weight_tensor, self.conv1_bias_tensor = ttnn.conv2d( + out, [input_height, input_width], [self.conv1_weight_tensor, self.conv1_bias_tensor] = ttnn.conv2d( input_tensor=x, weight_tensor=self.conv1_weight_tensor, in_channels=self.conv1_input_channels, @@ -242,6 +244,8 @@ def __call__( device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] ), conv_op_cache=conv_op_cache, + return_output_dim=True, + return_weights_and_bias=True, ) act_block_h_override = 0 @@ -281,7 +285,7 @@ def __call__( ) # if ds_out_mem_config and 
ds_out_mem_config != ttnn.get_memory_config(out): # out = ttnn.to_memory_config(out, ds_out_mem_config) - out, input_height, input_width, self.conv2_weight_tensor, self.conv2_bias_tensor = ttnn.conv2d( + out, [input_height, input_width], [self.conv2_weight_tensor, self.conv2_bias_tensor] = ttnn.conv2d( input_tensor=out, weight_tensor=self.conv2_weight_tensor, in_channels=self.conv2_input_channels, @@ -310,11 +314,13 @@ def __call__( device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] ), conv_op_cache=conv_op_cache, + return_output_dim=True, + return_weights_and_bias=True, ) # conv3 is 1x1 conv # print("Running conv3") - out, _, _, self.conv3_weight_tensor, self.conv3_bias_tensor = ttnn.conv2d( + out, [self.conv3_weight_tensor, self.conv3_bias_tensor] = ttnn.conv2d( input_tensor=out, weight_tensor=self.conv3_weight_tensor, in_channels=self.conv3_input_channels, @@ -339,6 +345,8 @@ def __call__( device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] ), conv_op_cache=conv_op_cache, + return_weights_and_bias=True, + return_output_dim=False, ) if not self.run_downsample_before_conv2: @@ -554,7 +562,7 @@ def first_run(self, input_tensor, device, batch_size, ops_parallel_config) -> tt input_tensor, device=device, memory_config=self.grayskull_conv1_input_memory_config ) - x, x_height, x_width, self.conv1_weight_tensor, self.conv1_bias_tensor = ttnn.conv2d( + x, [x_height, x_width], [self.conv1_weight_tensor, self.conv1_bias_tensor] = ttnn.conv2d( input_tensor=input_tensor, weight_tensor=self.conv1_weight_tensor, in_channels=self.conv1_input_channels, @@ -579,6 +587,8 @@ def first_run(self, input_tensor, device, batch_size, ops_parallel_config) -> tt device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] ), conv_op_cache=conv_op_cache, + return_output_dim=True, + return_weights_and_bias=True, ) # Relu is fused with conv1 @@ -867,7 +877,7 @@ def optimized_run(self, input_tensor, device, batch_size, ops_parallel_config, c input_tensor, 
device=device, memory_config=self.grayskull_conv1_input_memory_config ) - x, x_height, x_width, self.conv1_weight_tensor, self.conv1_bias_tensor = ttnn.conv2d( + x, [x_height, x_width], [self.conv1_weight_tensor, self.conv1_bias_tensor] = ttnn.conv2d( input_tensor=input_tensor, weight_tensor=self.conv1_weight_tensor, in_channels=self.conv1_input_channels, @@ -892,6 +902,8 @@ def optimized_run(self, input_tensor, device, batch_size, ops_parallel_config, c device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] ), conv_op_cache=conv_op_cache, + return_output_dim=True, + return_weights_and_bias=True, ) # Relu is fused with conv1 diff --git a/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_new_conv_api.py b/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_new_conv_api.py index 107e562a73d0..a8944b654c37 100644 --- a/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_new_conv_api.py +++ b/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_new_conv_api.py @@ -160,7 +160,7 @@ def run_downsample_if_req( ): if self.downsample: logger.debug(f"Running downsample") - ds_out, _, _, self.ds_conv_weight_tensor, self.ds_conv_bias_tensor = ttnn.conv2d( + ds_out, [self.ds_conv_weight_tensor, self.ds_conv_bias_tensor] = ttnn.conv2d( input_tensor=x, weight_tensor=self.ds_conv_weight_tensor, in_channels=self.ds_conv_input_channels, @@ -198,6 +198,8 @@ def run_downsample_if_req( packer_l1_acc=packer_l1_accum_enabled, ), conv_op_cache=conv_op_cache, + return_output_dim=False, + return_weights_and_bias=True, ) ttnn.deallocate(x) ds_out = ttnn.reallocate(ds_out) @@ -229,7 +231,7 @@ def __call__( # conv1 is 1x1 conv logger.debug(f"Running conv1") module_input_height = input_height - out, input_height, input_width, self.conv1_weight_tensor, self.conv1_bias_tensor = ttnn.conv2d( + out, [input_height, input_width], [self.conv1_weight_tensor, self.conv1_bias_tensor] = ttnn.conv2d( input_tensor=x, weight_tensor=self.conv1_weight_tensor, in_channels=self.conv1_input_channels, 
@@ -258,6 +260,8 @@ def __call__( packer_l1_acc=packer_l1_acc, ), conv_op_cache=conv_op_cache, + return_output_dim=True, + return_weights_and_bias=True, ) act_block_h_override = 0 @@ -313,7 +317,7 @@ def __call__( reallocate_halo_output = batch_size == 20 logger.debug(f"Running conv2") - out, input_height, input_width, self.conv2_weight_tensor, self.conv2_bias_tensor = ttnn.conv2d( + out, [input_height, input_width], [self.conv2_weight_tensor, self.conv2_bias_tensor] = ttnn.conv2d( input_tensor=out, weight_tensor=self.conv2_weight_tensor, in_channels=self.conv2_input_channels, @@ -349,6 +353,8 @@ def __call__( packer_l1_acc=packer_l1_acc, ), conv_op_cache=conv_op_cache, + return_output_dim=True, + return_weights_and_bias=True, ) logger.debug( @@ -367,7 +373,7 @@ def __call__( # conv3 is 1x1 conv logger.debug(f"Running conv3") - out, _, _, self.conv3_weight_tensor, self.conv3_bias_tensor = ttnn.conv2d( + out, [self.conv3_weight_tensor, self.conv3_bias_tensor] = ttnn.conv2d( input_tensor=out, weight_tensor=self.conv3_weight_tensor, in_channels=self.conv3_input_channels, @@ -395,6 +401,8 @@ def __call__( packer_l1_acc=packer_l1_acc, ), conv_op_cache=conv_op_cache, + return_output_dim=False, + return_weights_and_bias=True, ) if not run_downsample_before_conv2: @@ -734,7 +742,7 @@ def run(self, input_tensor, device, ops_parallel_config, conv_op_cache={}) -> tt logger.debug(f"==== first conv") # first conv - x, x_height, x_width, self.conv1_weight_tensor, self.conv1_bias_tensor = ttnn.conv2d( + x, [x_height, x_width], [self.conv1_weight_tensor, self.conv1_bias_tensor] = ttnn.conv2d( input_tensor=fold_output_tensor, weight_tensor=self.conv1_weight_tensor, in_channels=self.conv1_input_channels, @@ -750,6 +758,8 @@ def run(self, input_tensor, device, ops_parallel_config, conv_op_cache={}) -> tt conv_config=self.conv1_config, compute_config=self.conv1_compute_config, conv_op_cache=conv_op_cache, + return_output_dim=True, + return_weights_and_bias=True, ) # Relu is fused with 
conv1 if self.batch_size == 20: diff --git a/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_xlarge_new_conv_api.py b/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_xlarge_new_conv_api.py index 90853eb06bc4..a5427f1fc879 100644 --- a/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_xlarge_new_conv_api.py +++ b/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_xlarge_new_conv_api.py @@ -162,7 +162,7 @@ def run_downsample_if_req( height_sharding=None, ): if self.downsample: - ds_out, _, _, self.ds_conv_weight_tensor, self.ds_conv_bias_tensor = ttnn.conv2d( + ds_out, [self.ds_conv_weight_tensor, self.ds_conv_bias_tensor] = ttnn.conv2d( input_tensor=x, weight_tensor=self.ds_conv_weight_tensor, in_channels=self.ds_conv_input_channels, @@ -189,6 +189,8 @@ def run_downsample_if_req( device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] ), conv_op_cache=conv_op_cache, + return_output_dim=False, + return_weights_and_bias=True, ) ttnn.deallocate(x) ds_out = ttnn.reallocate(ds_out) @@ -211,7 +213,7 @@ def __call__( # conv1 is 1x1 conv # print("Running conv1") module_input_height = input_height - out, input_height, input_width, self.conv1_weight_tensor, self.conv1_bias_tensor = ttnn.conv2d( + out, [input_height, input_width], [self.conv1_weight_tensor, self.conv1_bias_tensor] = ttnn.conv2d( input_tensor=x, weight_tensor=self.conv1_weight_tensor, in_channels=self.conv1_input_channels, @@ -237,6 +239,8 @@ def __call__( device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] ), conv_op_cache=conv_op_cache, + return_output_dim=True, + return_weights_and_bias=True, ) act_block_h_override = 0 @@ -274,7 +278,7 @@ def __call__( # self.conv1_input_channels == 256 and # self.downsample ) - out, input_height, input_width, self.conv2_weight_tensor, self.conv2_bias_tensor = ttnn.conv2d( + out, [input_height, input_width], [self.conv2_weight_tensor, self.conv2_bias_tensor] = ttnn.conv2d( input_tensor=out, weight_tensor=self.conv2_weight_tensor, 
in_channels=self.conv2_input_channels, @@ -303,11 +307,13 @@ def __call__( device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] ), conv_op_cache=conv_op_cache, + return_output_dim=True, + return_weights_and_bias=True, ) # conv3 is 1x1 conv # print("Running conv3") - out, _, _, self.conv3_weight_tensor, self.conv3_bias_tensor = ttnn.conv2d( + out, [self.conv3_weight_tensor, self.conv3_bias_tensor] = ttnn.conv2d( input_tensor=out, weight_tensor=self.conv3_weight_tensor, in_channels=self.conv3_input_channels, @@ -332,6 +338,8 @@ def __call__( device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] ), conv_op_cache=conv_op_cache, + return_output_dim=False, + return_weights_and_bias=True, ) if not self.run_downsample_before_conv2: @@ -524,7 +532,7 @@ def first_run(self, input_tensor, device, batch_size, ops_parallel_config) -> tt elif batch_size == 20: act_block_h_override = 640 - x, x_height, x_width, self.conv1_weight_tensor, self.conv1_bias_tensor = ttnn.conv2d( + x, [x_height, x_width], [self.conv1_weight_tensor, self.conv1_bias_tensor] = ttnn.conv2d( input_tensor=input_tensor, weight_tensor=self.conv1_weight_tensor, in_channels=self.conv1_input_channels, @@ -549,6 +557,8 @@ def first_run(self, input_tensor, device, batch_size, ops_parallel_config) -> tt device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] ), conv_op_cache=conv_op_cache, + return_output_dim=True, + return_weights_and_bias=True, ) # Relu is fused with conv1 @@ -829,7 +839,7 @@ def optimized_run(self, input_tensor, device, batch_size, ops_parallel_config, c else: act_block_h_override = 0 - x, x_height, x_width, self.conv1_weight_tensor, self.conv1_bias_tensor = ttnn.conv2d( + x, [x_height, x_width], [self.conv1_weight_tensor, self.conv1_bias_tensor] = ttnn.conv2d( input_tensor=input_tensor, weight_tensor=self.conv1_weight_tensor, in_channels=self.conv1_input_channels, @@ -854,6 +864,8 @@ def optimized_run(self, input_tensor, device, batch_size, ops_parallel_config, c 
device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] ), conv_op_cache=conv_op_cache, + return_output_dim=True, + return_weights_and_bias=True, ) # Relu is fused with conv1 diff --git a/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_xlarge_new_conv_api_24.py b/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_xlarge_new_conv_api_24.py index 77894c783187..6bc5013bbf6e 100644 --- a/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_xlarge_new_conv_api_24.py +++ b/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_xlarge_new_conv_api_24.py @@ -164,7 +164,7 @@ def run_downsample_if_req( height_sharding=None, ): if self.downsample: - ds_out, _, _, self.ds_conv_weight_tensor, self.ds_conv_bias_tensor = ttnn.conv2d( + ds_out, [self.ds_conv_weight_tensor, self.ds_conv_bias_tensor] = ttnn.conv2d( input_tensor=x, weight_tensor=self.ds_conv_weight_tensor, in_channels=self.ds_conv_input_channels, @@ -191,6 +191,8 @@ def run_downsample_if_req( device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] ), conv_op_cache=conv_op_cache, + return_output_dim=False, + return_weights_and_bias=True, ) ttnn.deallocate(x) ds_out = ttnn.reallocate(ds_out) @@ -213,7 +215,7 @@ def __call__( # conv1 is 1x1 conv # print("Running conv1") module_input_height = input_height - out, input_height, input_width, self.conv1_weight_tensor, self.conv1_bias_tensor = ttnn.conv2d( + out, [input_height, input_width], [self.conv1_weight_tensor, self.conv1_bias_tensor] = ttnn.conv2d( input_tensor=x, weight_tensor=self.conv1_weight_tensor, in_channels=self.conv1_input_channels, @@ -239,6 +241,8 @@ def __call__( device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] ), conv_op_cache=conv_op_cache, + return_output_dim=True, + return_weights_and_bias=True, ) act_block_h_override = 0 @@ -277,7 +281,7 @@ def __call__( logger.info( f"Running conv2 with reallocate_halo_output={reallocate_halo_output}, input_height={input_height}, conv2_output_channels={self.conv2_output_channels}" 
) - out, input_height, input_width, self.conv2_weight_tensor, self.conv2_bias_tensor = ttnn.conv2d( + out, [input_height, input_width], [self.conv2_weight_tensor, self.conv2_bias_tensor] = ttnn.conv2d( input_tensor=out, weight_tensor=self.conv2_weight_tensor, in_channels=self.conv2_input_channels, @@ -306,11 +310,13 @@ def __call__( device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] ), conv_op_cache=conv_op_cache, + return_output_dim=True, + return_weights_and_bias=True, ) # conv3 is 1x1 conv # print("Running conv3") - out, _, _, self.conv3_weight_tensor, self.conv3_bias_tensor = ttnn.conv2d( + out, self.conv3_weight_tensor, self.conv3_bias_tensor = ttnn.conv2d( input_tensor=out, weight_tensor=self.conv3_weight_tensor, in_channels=self.conv3_input_channels, @@ -335,6 +341,8 @@ def __call__( device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] ), conv_op_cache=conv_op_cache, + return_output_dim=True, + return_weights_and_bias=True, ) if not self.run_downsample_before_conv2: @@ -549,7 +557,7 @@ def first_run(self, input_tensor, device, batch_size, ops_parallel_config) -> tt input_tensor, device=device, memory_config=self.grayskull_conv1_input_memory_config ) - x, x_height, x_width, self.conv1_weight_tensor, self.conv1_bias_tensor = ttnn.conv2d( + x, [x_height, x_width], [self.conv1_weight_tensor, self.conv1_bias_tensor] = ttnn.conv2d( input_tensor=input_tensor, weight_tensor=self.conv1_weight_tensor, in_channels=self.conv1_input_channels, @@ -574,6 +582,8 @@ def first_run(self, input_tensor, device, batch_size, ops_parallel_config) -> tt device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] ), conv_op_cache=conv_op_cache, + return_output_dim=True, + return_weights_and_bias=True, ) # Relu is fused with conv1 @@ -882,7 +892,7 @@ def optimized_run(self, input_tensor, device, batch_size, ops_parallel_config, c elif batch_size == 20: act_block_h_override = 640 - x, x_height, x_width, self.conv1_weight_tensor, self.conv1_bias_tensor = 
ttnn.conv2d( + x, [x_height, x_width], [self.conv1_weight_tensor, self.conv1_bias_tensor] = ttnn.conv2d( input_tensor=input_tensor, weight_tensor=self.conv1_weight_tensor, in_channels=self.conv1_input_channels, @@ -907,6 +917,8 @@ def optimized_run(self, input_tensor, device, batch_size, ops_parallel_config, c device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] ), conv_op_cache=conv_op_cache, + return_output_dim=True, + return_weights_and_bias=True, ) # Relu is fused with conv1 diff --git a/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_xxlarge_new_conv_api.py b/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_xxlarge_new_conv_api.py index e1cba745a8c4..d59a6c752380 100644 --- a/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_xxlarge_new_conv_api.py +++ b/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_xxlarge_new_conv_api.py @@ -163,7 +163,7 @@ def run_downsample_if_req( height_sharding=None, ): if self.downsample: - ds_out, _, _, self.ds_conv_weight_tensor, self.ds_conv_bias_tensor = ttnn.conv2d( + ds_out, [self.ds_conv_weight_tensor, self.ds_conv_bias_tensor] = ttnn.conv2d( input_tensor=x, weight_tensor=self.ds_conv_weight_tensor, in_channels=self.ds_conv_input_channels, @@ -191,6 +191,8 @@ def run_downsample_if_req( device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] ), conv_op_cache=conv_op_cache, + return_output_dim=False, + return_weights_and_bias=True, ) ttnn.deallocate(x) ds_out = ttnn.reallocate(ds_out) @@ -218,7 +220,7 @@ def __call__( # conv1 is 1x1 conv # print("Running conv1") module_input_height = input_height - out, input_height, input_width, self.conv1_weight_tensor, self.conv1_bias_tensor = ttnn.conv2d( + out, [input_height, input_width], [self.conv1_weight_tensor, self.conv1_bias_tensor] = ttnn.conv2d( input_tensor=x, weight_tensor=self.conv1_weight_tensor, in_channels=self.conv1_input_channels, @@ -245,6 +247,8 @@ def __call__( device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] ), 
conv_op_cache=conv_op_cache, + return_output_dim=True, + return_weights_and_bias=True, ) if is_wormhole_b0(): @@ -325,7 +329,7 @@ def __call__( # self.conv1_input_channels == 256 and # self.downsample ) - out, input_height, input_width, self.conv2_weight_tensor, self.conv2_bias_tensor = ttnn.conv2d( + out, [input_height, input_width], [self.conv2_weight_tensor, self.conv2_bias_tensor] = ttnn.conv2d( input_tensor=out, weight_tensor=self.conv2_weight_tensor, in_channels=self.conv2_input_channels, @@ -355,11 +359,13 @@ def __call__( device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] ), conv_op_cache=conv_op_cache, + return_output_dim=True, + return_weights_and_bias=True, ) # conv3 is 1x1 conv # print("Running conv3") - out, _, _, self.conv3_weight_tensor, self.conv3_bias_tensor = ttnn.conv2d( + out, [self.conv3_weight_tensor, self.conv3_bias_tensor] = ttnn.conv2d( input_tensor=out, weight_tensor=self.conv3_weight_tensor, in_channels=self.conv3_input_channels, @@ -385,6 +391,8 @@ def __call__( device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] ), conv_op_cache=conv_op_cache, + return_output_dim=False, + return_weights_and_bias=True, ) if not self.run_downsample_before_conv2: @@ -589,7 +597,7 @@ def first_run(self, input_tensor, device, batch_size, ops_parallel_config) -> tt else: act_block_h_override = 0 - x, x_height, x_width, self.conv1_weight_tensor, self.conv1_bias_tensor = ttnn.conv2d( + x, [x_height, x_width], [self.conv1_weight_tensor, self.conv1_bias_tensor] = ttnn.conv2d( input_tensor=input_tensor, weight_tensor=self.conv1_weight_tensor, in_channels=self.conv1_input_channels, @@ -615,6 +623,8 @@ def first_run(self, input_tensor, device, batch_size, ops_parallel_config) -> tt device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] ), conv_op_cache=conv_op_cache, + return_output_dim=True, + return_weights_and_bias=True, ) # Relu is fused with conv1 @@ -925,7 +935,7 @@ def optimized_run(self, input_tensor, device, batch_size, 
ops_parallel_config, c elif batch_size == 1: act_block_h_override = 256 - x, x_height, x_width, self.conv1_weight_tensor, self.conv1_bias_tensor = ttnn.conv2d( + x, [x_height, x_width], [self.conv1_weight_tensor, self.conv1_bias_tensor] = ttnn.conv2d( input_tensor=input_tensor, weight_tensor=self.conv1_weight_tensor, in_channels=self.conv1_input_channels, @@ -950,6 +960,8 @@ def optimized_run(self, input_tensor, device, batch_size, ops_parallel_config, c device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] ), conv_op_cache=conv_op_cache, + return_output_dim=True, + return_weights_and_bias=True, ) # Relu is fused with conv1 diff --git a/models/demos/vgg/tt/ttnn_vgg.py b/models/demos/vgg/tt/ttnn_vgg.py index ace558cfa111..82f5dd1c03d5 100644 --- a/models/demos/vgg/tt/ttnn_vgg.py +++ b/models/demos/vgg/tt/ttnn_vgg.py @@ -116,7 +116,7 @@ def ttnn_vgg16( tt_bias = parameters.features[conv_feature_ids[iter_conv_id]].bias # Call ttnn.conv conv_op_cache = {} - [tt_output_tensor_on_device, out_height, out_width, weights_device, bias_device] = ttnn.conv2d( + [tt_output_tensor_on_device, [out_height, out_width], [weights_device, bias_device]] = ttnn.conv2d( input_tensor=tt_x, weight_tensor=tt_weight, in_channels=conv_ttnn_params[iter_conv_id][0], @@ -132,6 +132,8 @@ def ttnn_vgg16( conv_config=conv_config, compute_config=compute_config, conv_op_cache=conv_op_cache, + return_output_dim=True, + return_weights_and_bias=True, ) tt_x = ttnn.from_device(tt_output_tensor_on_device) ttnn.deallocate(tt_output_tensor_on_device) @@ -243,7 +245,7 @@ def ttnn_vgg11( # Call ttnn.conv conv_op_cache = {} - [tt_output_tensor_on_device, out_height, out_width, weights_device, bias_device] = ttnn.conv2d( + [tt_output_tensor_on_device, [out_height, out_width], [weights_device, bias_device]] = ttnn.conv2d( input_tensor=tt_x, weight_tensor=tt_weight, in_channels=conv_ttnn_params_2[iter_conv_id][0], @@ -259,6 +261,8 @@ def ttnn_vgg11( conv_config=conv_config, 
compute_config=compute_config, conv_op_cache=conv_op_cache, + return_output_dim=True, + return_weights_and_bias=True, ) tt_x = ttnn.from_device(tt_output_tensor_on_device) ttnn.deallocate(tt_output_tensor_on_device) diff --git a/models/demos/wormhole/mamba/tt/mamba_conv.py b/models/demos/wormhole/mamba/tt/mamba_conv.py index 799ea950392a..c7e897ad4847 100644 --- a/models/demos/wormhole/mamba/tt/mamba_conv.py +++ b/models/demos/wormhole/mamba/tt/mamba_conv.py @@ -90,7 +90,7 @@ def __call__(self, input_tensor): input_tensor_splits = self.prepare_input(input_tensor) output_tensor_splits = [] for i in range(self.config.channels_split_factor): - [tt_output_tensor_on_device, out_length, weights_device, _] = ttnn.Conv1d( + [tt_output_tensor_on_device, out_length, [weights_device, _]] = ttnn.Conv1d( input_tensor=input_tensor_splits[i], weight_tensor=self.tt_weight_tensor_splits[i], in_channels=self.config.input_channels // self.config.channels_split_factor, @@ -107,6 +107,8 @@ def __call__(self, input_tensor): conv_op_cache={}, debug=False, groups=self.config.groups // self.config.channels_split_factor, + return_output_dim=True, + return_weights_and_bias=True, ) self.tt_weight_tensor_splits[i] = weights_device output_tensor_splits.append(ttnn.sharded_to_interleaved(tt_output_tensor_on_device)) diff --git a/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_downsample_2d_new_conv.py b/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_downsample_2d_new_conv.py index 1d1478f94d46..570d2457f1ae 100644 --- a/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_downsample_2d_new_conv.py +++ b/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_downsample_2d_new_conv.py @@ -146,7 +146,7 @@ def __call__( if self.conv_config_override and "act_block_h" in self.conv_config_override: conv_config.act_block_h_override = self.conv_config_override["act_block_h"] - [hidden_states, _out_height, _out_width, self.conv_weights, self.conv_bias] = ttnn.conv2d( + 
[hidden_states, [self.conv_weights, self.conv_bias]] = ttnn.conv2d( input_tensor=hidden_states, in_channels=self.in_channels, out_channels=self.out_channels, @@ -162,6 +162,8 @@ def __call__( conv_config=conv_config, compute_config=compute_config, conv_op_cache=conv_cache, + return_output_dim=False, + return_weights_and_bias=True, ) # hidden_states = run_ttnn_conv_with_pre_and_post_tensor_formatting( # self.device, diff --git a/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_resnetblock2d_new_conv.py b/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_resnetblock2d_new_conv.py index 6d94f60975e3..45024d9c9d99 100644 --- a/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_resnetblock2d_new_conv.py +++ b/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_resnetblock2d_new_conv.py @@ -474,7 +474,7 @@ def __call__( ) if self.conv1_config_override and "act_block_h" in self.conv2_config_override: conv_config.act_block_h_override = self.conv1_config_override["act_block_h"] - [hidden_states, _out_height, _out_width, self.conv1s_weights[0], self.conv1s_bias[0]] = ttnn.conv2d( + [hidden_states, [self.conv1s_weights[0], self.conv1s_bias[0]]] = ttnn.conv2d( input_tensor=hidden_states, weight_tensor=self.conv1s_weights[0], in_channels=self.conv1_in_channels, @@ -490,6 +490,8 @@ def __call__( conv_config=conv_config, compute_config=compute_config, conv_op_cache=conv_cache, + return_output_dim=False, + return_weights_and_bias=True, ) else: @@ -551,10 +553,8 @@ def __call__( [ split_hidden_states[i], - _out_height, - _out_width, - self.conv1s_weights[i], - self.conv1s_bias[i], + [_out_height, _out_width], + [self.conv1s_weights[i], self.conv1s_bias[i]], ] = ttnn.conv2d( input_tensor=split_hidden_states[i], weight_tensor=self.conv1s_weights[i], @@ -571,6 +571,8 @@ def __call__( conv_config=conv_config, compute_config=compute_config, conv_op_cache=conv_cache, + return_output_dim=True, + return_weights_and_bias=True, ) if i != 0: 
split_hidden_states[i] = ttnn.add( @@ -680,7 +682,7 @@ def __call__( ) if self.conv2_config_override and "act_block_h" in self.conv2_config_override: conv_config.act_block_h_override = self.conv2_config_override["act_block_h"] - [hidden_states, _out_height, _out_width, self.conv2_weights, self.conv2_bias] = ttnn.conv2d( + [hidden_states, [_out_height, _out_width], [self.conv2_weights, self.conv2_bias]] = ttnn.conv2d( input_tensor=hidden_states, weight_tensor=self.conv2_weights, bias_tensor=self.conv2_bias, @@ -696,6 +698,8 @@ def __call__( conv_config=conv_config, compute_config=compute_config, conv_op_cache=conv_cache, + return_output_dim=True, + return_weights_and_bias=True, ) use_in_shortcut = in_channels != out_channels if use_in_shortcut is None else use_in_shortcut @@ -726,7 +730,11 @@ def __call__( fp32_dest_acc_en=True, packer_l1_acc=False, ) - [input_tensor, _out_height, _out_width, self.conv_shortcut_weights, self.conv_shortcut_bias] = ttnn.conv2d( + [ + input_tensor, + [_out_height, _out_width], + [self.conv_shortcut_weights, self.conv_shortcut_bias], + ] = ttnn.conv2d( input_tensor=input_tensor, weight_tensor=self.conv_shortcut_weights, in_channels=self.conv_shortcut_in_channels, @@ -742,6 +750,8 @@ def __call__( conv_config=conv_config, compute_config=compute_config, conv_op_cache=conv_cache, + return_output_dim=True, + return_weights_and_bias=True, ) if ttnn.get_memory_config(input_tensor) != ttnn.get_memory_config(hidden_states): diff --git a/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_transformer_2d_new_conv.py b/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_transformer_2d_new_conv.py index 3a856dce04ee..e89a957357e1 100644 --- a/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_transformer_2d_new_conv.py +++ b/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_transformer_2d_new_conv.py @@ -252,7 +252,7 @@ def __call__( math_fidelity=ttnn.MathFidelity.LoFi, 
fp32_dest_acc_en=self.compute_kernel_config.fp32_dest_acc_en, ) - [hidden_states, _out_height, _out_width, self.proj_in_conv_weights, self.proj_in_conv_bias] = ttnn.conv2d( + [hidden_states, [self.proj_in_conv_weights, self.proj_in_conv_bias]] = ttnn.conv2d( input_tensor=hidden_states, in_channels=self.proj_in_in_channels, out_channels=self.proj_in_out_channels, @@ -268,6 +268,8 @@ def __call__( conv_config=conv_config, compute_config=compute_config, conv_op_cache=conv_cache, + return_output_dim=False, + return_weights_and_bias=True, ) inner_dim = hidden_states.shape[-1] @@ -297,10 +299,8 @@ def __call__( # hidden_states = ttnn.to_memory_config(hidden_states, self.proj_out.conv.input_sharded_memory_config) [ hidden_states, - _out_height, - _out_width, - self.proj_out_conv_weights, - self.proj_out_conv_bias, + [_out_height, _out_width], + [self.proj_out_conv_weights, self.proj_out_conv_bias], ] = ttnn.conv2d( input_tensor=hidden_states, in_channels=self.proj_out_in_channels, @@ -316,6 +316,8 @@ def __call__( bias_tensor=self.proj_out_conv_bias, conv_config=conv_config, conv_op_cache=conv_cache, + return_output_dim=True, + return_weights_and_bias=True, ) if output_bfloat16: diff --git a/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_unet_2d_condition_model_new_conv.py b/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_unet_2d_condition_model_new_conv.py index a3525c385982..1003c1efc4e4 100644 --- a/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_unet_2d_condition_model_new_conv.py +++ b/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_unet_2d_condition_model_new_conv.py @@ -396,7 +396,8 @@ def __call__( fp32_dest_acc_en=True, packer_l1_acc=False, ) - [sample, _out_height, _out_width, self.conv_in_weights, self.conv_in_bias] = ttnn.conv2d( + + [sample, [self.conv_in_weights, self.conv_in_bias]] = ttnn.conv2d( input_tensor=sample, weight_tensor=self.conv_in_weights, bias_tensor=self.conv_in_bias, @@ -412,6 +413,8 @@ def __call__( 
conv_config=conv_config, compute_config=compute_config, conv_op_cache=conv_cache, + return_output_dim=False, + return_weights_and_bias=True, ) sample = ttnn.reallocate(sample) # TODO: Test remove @@ -663,7 +666,7 @@ def __call__( fp32_dest_acc_en=True, packer_l1_acc=False, ) - [sample, _out_height, _out_width, self.conv_out_weights, self.conv_out_bias] = ttnn.conv2d( + [sample, [self.conv_out_weights, self.conv_out_bias]] = ttnn.conv2d( input_tensor=sample, in_channels=self.conv_out_in_channels, out_channels=self.conv_out_out_channels, @@ -679,6 +682,8 @@ def __call__( conv_config=conv_config, compute_config=compute_config, conv_op_cache=conv_cache, + return_output_dim=False, + return_weights_and_bias=True, ) sample = ttnn.to_memory_config(sample, ttnn.L1_MEMORY_CONFIG) sample = ttnn.clone(sample, memory_config=ttnn.L1_MEMORY_CONFIG, dtype=ttnn.bfloat16) diff --git a/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_upsample_2d_new_conv.py b/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_upsample_2d_new_conv.py index 54056a715269..52e9fb5c9139 100644 --- a/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_upsample_2d_new_conv.py +++ b/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_upsample_2d_new_conv.py @@ -106,7 +106,7 @@ def __call__(self, input, in_channels, out_channels): ) if self.conv_config_override and "act_block_h" in self.conv_config_override: conv_config.act_block_h_override = self.conv_config_override["act_block_h"] - [tt_out, _out_height, _out_width, self.conv_weight_tensor, self.conv_bias_tensor] = ttnn.conv2d( + [tt_out, [self.conv_weight_tensor, self.conv_bias_tensor]] = ttnn.conv2d( input_tensor=tt_out, in_channels=self.conv_in_channels, out_channels=self.conv_out_channels, @@ -122,5 +122,7 @@ def __call__(self, input, in_channels, out_channels): conv_config=conv_config, compute_config=compute_config, conv_op_cache=conv_cache, + return_output_dim=False, + return_weights_and_bias=True, ) return tt_out diff 
--git a/models/demos/yolov4/ttnn/common.py b/models/demos/yolov4/ttnn/common.py index 9d3b154aaf42..1579f9112f92 100644 --- a/models/demos/yolov4/ttnn/common.py +++ b/models/demos/yolov4/ttnn/common.py @@ -102,7 +102,7 @@ def __call__(self, device, input_tensor): if self.act_block_h is not None: conv_config.act_block_h_override = self.act_block_h - [output_tensor, _out_height, _out_width, self.weights, self.bias] = ttnn.conv2d( + output_tensor, [self.weights, self.bias] = ttnn.conv2d( input_tensor=input_tensor, weight_tensor=self.weights, bias_tensor=self.bias, @@ -117,5 +117,7 @@ def __call__(self, device, input_tensor): input_width=self.input_params[2], conv_config=conv_config, compute_config=compute_config, + return_output_dim=False, + return_weights_and_bias=True, ) return output_tensor diff --git a/models/experimental/functional_unet/tt/unet_shallow_ttnn.py b/models/experimental/functional_unet/tt/unet_shallow_ttnn.py index 95eff2e07764..8a5157d51dc3 100644 --- a/models/experimental/functional_unet/tt/unet_shallow_ttnn.py +++ b/models/experimental/functional_unet/tt/unet_shallow_ttnn.py @@ -147,7 +147,7 @@ def __init__( self.bias = ttnn.from_torch(bias, dtype=ttnn.float32, mesh_mapper=mesh_mapper) def __call__(self, x): - x, _, _, self.weight, self.bias = ttnn.conv2d( + x, [self.weight, self.bias] = ttnn.conv2d( input_tensor=x, weight_tensor=self.weight, bias_tensor=self.bias, @@ -164,6 +164,8 @@ def __call__(self, x): compute_config=self.compute_config, conv_op_cache=self.cache, groups=2, + return_output_dim=False, + return_weights_and_bias=True, ) return x diff --git a/tests/sweep_framework/sweep_utils/conv2d_common.py b/tests/sweep_framework/sweep_utils/conv2d_common.py index 2dc1709bdbdf..c7509247213e 100644 --- a/tests/sweep_framework/sweep_utils/conv2d_common.py +++ b/tests/sweep_framework/sweep_utils/conv2d_common.py @@ -139,7 +139,7 @@ def run_full( {ttnn.CoreRange(core_grid[0], core_grid[1]), ttnn.CoreRange(core_grid[2], core_grid[3])} ) start_time = 
start_measuring_time() - [tt_output_tensor_on_device, out_height, out_width, weights_device, bias_device] = ttnn.conv2d( + [tt_output_tensor_on_device, [out_height, out_width], [weights_device, bias_device]] = ttnn.conv2d( input_tensor=tt_input_tensor, weight_tensor=tt_weight_tensor, in_channels=input_channels, @@ -156,6 +156,8 @@ def run_full( conv_config=conv_config, compute_config=compute_config, groups=groups, + return_output_dim=True, + return_weights_and_bias=True, ) tt_output_tensor = ttnn.from_device(tt_output_tensor_on_device) @@ -223,7 +225,7 @@ def run_short( tt_input_tensor = ttnn.from_torch(torch_input_tensor, ttnn.bfloat16) start_time = start_measuring_time() - [tt_output_tensor_on_device, out_height, out_width, weights_device, bias_device] = ttnn.conv2d( + [tt_output_tensor_on_device, [out_height, out_width], [weights_device, bias_device]] = ttnn.conv2d( input_tensor=tt_input_tensor, weight_tensor=tt_weight_tensor, in_channels=input_channels, @@ -238,6 +240,8 @@ def run_short( input_height=input_height, input_width=input_width, groups=groups, + return_output_dim=True, + return_weights_and_bias=True, ) tt_output_tensor = ttnn.from_device(tt_output_tensor_on_device) diff --git a/tests/ttnn/unit_tests/operations/test_conv1d.py b/tests/ttnn/unit_tests/operations/test_conv1d.py index a7ca4c9c30c4..7013ef6b2db2 100644 --- a/tests/ttnn/unit_tests/operations/test_conv1d.py +++ b/tests/ttnn/unit_tests/operations/test_conv1d.py @@ -107,7 +107,7 @@ def run_conv( conv_config.override_sharding_config = True print("Setting num_cores_nhw to 98") - [tt_output_tensor_on_device, out_length, weights_device, bias_device] = ttnn.Conv1d( + [tt_output_tensor_on_device, out_length, [weights_device, bias_device]] = ttnn.Conv1d( input_tensor=tt_input_tensor, weight_tensor=tt_weight_tensor, in_channels=input_channels, @@ -124,6 +124,8 @@ def run_conv( conv_op_cache=reader_patterns_cache, debug=debug, groups=groups, + return_output_dim=True, + return_weights_and_bias=True, ) 
tt_output_tensor = ttnn.from_device(tt_output_tensor_on_device) diff --git a/tests/ttnn/unit_tests/operations/test_new_conv2d.py b/tests/ttnn/unit_tests/operations/test_new_conv2d.py index 25d4b0bc00f0..d41c5deae4f7 100644 --- a/tests/ttnn/unit_tests/operations/test_new_conv2d.py +++ b/tests/ttnn/unit_tests/operations/test_new_conv2d.py @@ -165,7 +165,7 @@ def run_conv( conv_config.override_sharding_config = True print("Setting num_cores_nhw to 98") - [tt_output_tensor_on_device, out_height, out_width, weights_device, bias_device] = ttnn.conv2d( + [tt_output_tensor_on_device, [out_height, out_width], [weights_device, bias_device]] = ttnn.conv2d( input_tensor=tt_input_tensor, weight_tensor=tt_weight_tensor, in_channels=input_channels, @@ -185,6 +185,8 @@ def run_conv( debug=debug, groups=groups, memory_config=memory_config, + return_weights_and_bias=True, + return_output_dim=True, ) tt_output_tensor = ttnn.from_device(tt_output_tensor_on_device) @@ -313,7 +315,7 @@ def run_conv_with_split( tt_input_tensor = ttnn.from_torch(torch_input_tensor, ttnn.bfloat16) # tt_input_tensor_on_device = convs[i].copy_input_to_device(tt_input_tensor) # tt_output_tensor_on_device = convs[i](tt_input_tensor_on_device) - [tt_output_tensor_on_device, out_height, out_width, weights_device, bias_device] = ttnn.conv2d( + [tt_output_tensor_on_device, [out_height, out_width], [weights_device, bias_device]] = ttnn.conv2d( input_tensor=tt_input_tensor, weight_tensor=tt_weight_tensor, in_channels=split_input_channels, @@ -329,6 +331,8 @@ def run_conv_with_split( conv_config=conv_config, compute_config=compute_config, conv_op_cache=reader_patterns_cache, + return_output_dim=True, + return_weights_and_bias=True, ) tt_conv_output_tensor = ttnn.from_device(tt_output_tensor_on_device) torch_conv_output_tensor = ttnn.to_torch(tt_conv_output_tensor) @@ -649,7 +653,7 @@ def test_conv_ws( fp32_dest_acc_en=fp32_accum, packer_l1_acc=packer_l1_acc, ) - [tt_output_tensor_on_device, out_height, out_width, 
weights_device, bias_device] = ttnn.conv2d( + [tt_output_tensor_on_device, [out_height, out_width], [weights_device, bias_device]] = ttnn.conv2d( input_tensor=tt_input_tensor, weight_tensor=tt_weight_tensor, in_channels=input_channels, @@ -667,6 +671,8 @@ def test_conv_ws( conv_op_cache=reader_patterns_cache, debug=debug, groups=groups, + return_output_dim=True, + return_weights_and_bias=True, ) tt_output_tensor = ttnn.from_device(tt_output_tensor_on_device) @@ -2742,7 +2748,7 @@ def test_shallow_conv_with_tiled_input(device): tt_input = ttnn.reshape(tt_input, (1, 1, batch_size * img_h * img_w, in_channels)) - tt_out, out_height, out_width, _, _ = ttnn.conv2d( + [tt_out, [out_height, out_width], [weights_device, bias_device]] = ttnn.conv2d( input_tensor=tt_input, weight_tensor=tt_kernel, in_channels=in_channels, @@ -2761,6 +2767,8 @@ def test_shallow_conv_with_tiled_input(device): device.arch(), ), memory_config=ttnn.DRAM_MEMORY_CONFIG, + return_output_dim=True, + return_weights_and_bias=True, ) tt_output_tensor = ttnn.from_device(tt_out) diff --git a/tests/ttnn/unit_tests/operations/test_prepare_conv_weights.py b/tests/ttnn/unit_tests/operations/test_prepare_conv_weights.py index 5a59200a178c..09cafdd0aca3 100644 --- a/tests/ttnn/unit_tests/operations/test_prepare_conv_weights.py +++ b/tests/ttnn/unit_tests/operations/test_prepare_conv_weights.py @@ -178,7 +178,7 @@ def test_prepare_conv_weights( tt_weight_tensor_formatted = ttnn.to_device(tt_weight_tensor_formatted, device) tt_bias_tensor_formatted = ttnn.to_device(tt_bias_tensor_formatted, device) if has_bias else None (k := next(iter(conv_kwargs)), conv_kwargs.pop(k)) ##removing 1st element from dict - tt_output_tensor_on_device, _, _, _, _ = ttnn.conv2d( + tt_output_tensor_on_device = ttnn.conv2d( input_tensor=tt_input_tensor, weight_tensor=tt_weight_tensor_formatted, bias_tensor=tt_bias_tensor_formatted, diff --git a/tests/ttnn/unit_tests/operations/test_small_resnet50_block.py 
b/tests/ttnn/unit_tests/operations/test_small_resnet50_block.py index bf233351d1f4..638251f20da1 100644 --- a/tests/ttnn/unit_tests/operations/test_small_resnet50_block.py +++ b/tests/ttnn/unit_tests/operations/test_small_resnet50_block.py @@ -103,7 +103,7 @@ def __call__(self, x, device, batch_size, input_height, input_width, conv_op_cac # logger.info("This module input shape - ", self.module_input_shape) # conv1 is 1x1 conv # print("Running conv1") - x, input_height, input_width, self.identity_conv_weight_tensor, _ = ttnn.conv2d( + x, [input_height, input_width], [self.identity_conv_weight_tensor, _] = ttnn.conv2d( input_tensor=x, weight_tensor=self.identity_conv_weight_tensor, in_channels=self.conv1_input_channels, @@ -124,9 +124,11 @@ def __call__(self, x, device, batch_size, input_height, input_width, conv_op_cac math_fidelity=self.model_config["MATH_FIDELITY"], ), conv_op_cache=conv_op_cache, + return_output_dim=True, + return_weights_and_bias=True, ) - out, input_height, input_width, self.conv1_weight_tensor, self.conv1_bias_tensor = ttnn.conv2d( + out, [input_height, input_width], [self.conv1_weight_tensor, self.conv1_bias_tensor] = ttnn.conv2d( input_tensor=x, weight_tensor=self.conv1_weight_tensor, in_channels=self.conv1_input_channels, @@ -149,10 +151,12 @@ def __call__(self, x, device, batch_size, input_height, input_width, conv_op_cac math_fidelity=self.model_config["MATH_FIDELITY"], ), conv_op_cache=conv_op_cache, + return_output_dim=True, + return_weights_and_bias=True, ) if self.downsample: - ds_out, _, _, self.ds_conv_weight_tensor, self.ds_conv_bias_tensor = ttnn.conv2d( + ds_out, [self.ds_conv_weight_tensor, self.ds_conv_bias_tensor] = ttnn.conv2d( input_tensor=x, weight_tensor=self.ds_conv_weight_tensor, in_channels=self.ds_conv_input_channels, @@ -174,13 +178,15 @@ def __call__(self, x, device, batch_size, input_height, input_width, conv_op_cac math_fidelity=self.model_config["MATH_FIDELITY"], ), conv_op_cache=conv_op_cache, + 
return_output_dim=False, + return_weights_and_bias=True, ) ttnn.deallocate(x) else: ds_out = x # print("Running conv2") - out, input_height, input_width, self.conv2_weight_tensor, self.conv2_bias_tensor = ttnn.conv2d( + out, [input_height, input_width], [self.conv2_weight_tensor, self.conv2_bias_tensor] = ttnn.conv2d( input_tensor=out, weight_tensor=self.conv2_weight_tensor, in_channels=self.conv2_input_channels, @@ -203,11 +209,13 @@ def __call__(self, x, device, batch_size, input_height, input_width, conv_op_cac math_fidelity=self.model_config["MATH_FIDELITY"], ), conv_op_cache=conv_op_cache, + return_output_dim=True, + return_weights_and_bias=True, ) # conv3 is 1x1 conv # print("Running conv3") - out, _, _, self.conv3_weight_tensor, self.conv3_bias_tensor = ttnn.conv2d( + out, [self.conv3_weight_tensor, self.conv3_bias_tensor] = ttnn.conv2d( input_tensor=out, weight_tensor=self.conv3_weight_tensor, in_channels=self.conv3_input_channels, @@ -229,6 +237,8 @@ def __call__(self, x, device, batch_size, input_height, input_width, conv_op_cac math_fidelity=self.model_config["MATH_FIDELITY"], ), conv_op_cache=conv_op_cache, + return_output_dim=False, + return_weights_and_bias=True, ) # underscore version is in_place = True diff --git a/ttnn/ttnn/operations/conv1d.py b/ttnn/ttnn/operations/conv1d.py index b899f01e3b33..ef8187cbd786 100644 --- a/ttnn/ttnn/operations/conv1d.py +++ b/ttnn/ttnn/operations/conv1d.py @@ -31,6 +31,8 @@ def Conv1d( compute_config: ttnn.DeviceComputeKernelConfig = None, conv_op_cache={}, # basic conv object caching in python needed for intermediate refactoring. Not needed after full op refactoring in C++. 
debug=False, + return_output_dim=False, + return_weights_and_bias=False, ) -> Tuple[ttnn.Tensor, int, int, ttnn.Tensor, ttnn.Tensor]: # Reshape the input and weight tensors to 4D for conv2d operation # Should be no-op as input_tensor is in RM layout @@ -64,12 +66,14 @@ def Conv1d( compute_config=compute_config, ) - return ( - output_tensor_new, - output_length_new, - weight_tensor_on_dev_new, - bias_tensor_on_dev_new, - ) + if return_output_dim and return_weights_and_bias: + return output_tensor_new, output_length_new, [weight_tensor_on_dev_new, bias_tensor_on_dev_new] + elif return_weights_and_bias: + return output_tensor_new, [weight_tensor_on_dev_new, bias_tensor_on_dev_new] + elif return_output_dim: + return output_tensor_new, output_length_new + else: + return output_tensor_new __all__ = [] diff --git a/ttnn/ttnn/operations/conv2d.py b/ttnn/ttnn/operations/conv2d.py index 2f0fa3ee7362..84079a566535 100644 --- a/ttnn/ttnn/operations/conv2d.py +++ b/ttnn/ttnn/operations/conv2d.py @@ -180,8 +180,16 @@ def conv2d( memory_config: ttnn.MemoryConfig = None, # memory config overrides by user conv_op_cache={}, # basic conv object caching in python needed for intermediate refactoring. Not needed after full op refactoring in C++. 
debug=False, # ignored + return_output_dim=False, + return_weights_and_bias=False, ) -> Tuple[ttnn.Tensor, int, int, ttnn.Tensor, ttnn.Tensor]: - return ttnn._ttnn.operations.conv.conv2d( + ( + conv_output, + output_height, + output_width, + prepared_device_weight, + prepared_device_bias, + ) = ttnn._ttnn.operations.conv.conv2d( input_tensor=input_tensor, weight_tensor=weight_tensor, device=device, @@ -201,5 +209,14 @@ def conv2d( memory_config=memory_config, ) + if return_output_dim and return_weights_and_bias: + return conv_output, [output_height, output_width], [prepared_device_weight, prepared_device_bias] + elif return_weights_and_bias: + return conv_output, [prepared_device_weight, prepared_device_bias] + elif return_output_dim: + return conv_output, [output_height, output_width] + else: + return conv_output + __all__ = [] From e4ee8e768e4e5d948868c5ca276e563e2bbebd23 Mon Sep 17 00:00:00 2001 From: Allan Liu Date: Tue, 3 Dec 2024 15:47:57 +0000 Subject: [PATCH 10/59] Remove deprecated addresses from eth mem map --- tt_metal/hw/inc/blackhole/eth_l1_address_map.h | 8 +------- tt_metal/hw/inc/grayskull/eth_l1_address_map.h | 3 --- tt_metal/hw/inc/wormhole/eth_l1_address_map.h | 8 +------- tt_metal/impl/device/device.cpp | 2 +- 4 files changed, 3 insertions(+), 18 deletions(-) diff --git a/tt_metal/hw/inc/blackhole/eth_l1_address_map.h b/tt_metal/hw/inc/blackhole/eth_l1_address_map.h index 05d071dfdb43..6cfe5eadaf8f 100644 --- a/tt_metal/hw/inc/blackhole/eth_l1_address_map.h +++ b/tt_metal/hw/inc/blackhole/eth_l1_address_map.h @@ -58,10 +58,7 @@ struct address_map { static constexpr std::int32_t ERISC_APP_ROUTING_INFO_BASE = TILE_HEADER_BUFFER_BASE; static constexpr std::int32_t ERISC_APP_SYNC_INFO_BASE = ERISC_APP_ROUTING_INFO_BASE + ERISC_APP_ROUTING_INFO_SIZE; - static constexpr uint32_t ISSUE_CQ_CB_BASE = ERISC_APP_SYNC_INFO_BASE + ERISC_APP_SYNC_INFO_SIZE; - static constexpr uint32_t COMPLETION_CQ_CB_BASE = ISSUE_CQ_CB_BASE + 7 * L1_ALIGNMENT; - - static 
constexpr std::int32_t ERISC_MEM_MAILBOX_BASE = COMPLETION_CQ_CB_BASE + 7 * L1_ALIGNMENT; + static constexpr std::uint32_t ERISC_MEM_MAILBOX_BASE = ERISC_APP_SYNC_INFO_BASE + ERISC_APP_SYNC_INFO_SIZE; static constexpr std::uint32_t ERISC_MEM_MAILBOX_SIZE = 3344; static constexpr std::uint32_t ERISC_MEM_MAILBOX_END = ERISC_MEM_MAILBOX_BASE + ERISC_MEM_MAILBOX_SIZE; @@ -80,9 +77,6 @@ struct address_map { static constexpr std::int32_t LAUNCH_ERISC_APP_FLAG = L1_EPOCH_Q_BASE + 4; - // BIDIR Tunneling Kernel Space - static constexpr std::int32_t ERISC_L1_TUNNEL_BUFFER_SIZE = ERISC_L1_UNRESERVED_SIZE / 2; - template struct TAssertEquality { static_assert(A == B, "Not equal"); diff --git a/tt_metal/hw/inc/grayskull/eth_l1_address_map.h b/tt_metal/hw/inc/grayskull/eth_l1_address_map.h index 26332938fcbc..edec6f63c307 100644 --- a/tt_metal/hw/inc/grayskull/eth_l1_address_map.h +++ b/tt_metal/hw/inc/grayskull/eth_l1_address_map.h @@ -27,8 +27,6 @@ struct address_map { static constexpr std::int32_t ERISC_FIRMWARE_SIZE = 16; static constexpr std::int32_t ERISC_L1_UNRESERVED_BASE = 0; - static constexpr std::uint32_t ISSUE_CQ_CB_BASE = 0; - static constexpr std::uint32_t COMPLETION_CQ_CB_BASE = 0; static constexpr std::int32_t LAUNCH_ERISC_APP_FLAG = 0; static constexpr std::uint32_t FW_VERSION_ADDR = 0; @@ -36,7 +34,6 @@ struct address_map { static constexpr std::int32_t MAX_L1_LOADING_SIZE = 1; static constexpr std::int32_t ERISC_L1_UNRESERVED_SIZE = 0; - static constexpr std::int32_t ERISC_L1_TUNNEL_BUFFER_SIZE = 0; static constexpr std::int32_t ERISC_MEM_BANK_TO_NOC_SCRATCH = 0; static constexpr std::int32_t ERISC_MEM_BANK_TO_NOC_SIZE = 0; diff --git a/tt_metal/hw/inc/wormhole/eth_l1_address_map.h b/tt_metal/hw/inc/wormhole/eth_l1_address_map.h index 39d41601bef9..3c87023d8555 100644 --- a/tt_metal/hw/inc/wormhole/eth_l1_address_map.h +++ b/tt_metal/hw/inc/wormhole/eth_l1_address_map.h @@ -56,10 +56,7 @@ struct address_map { static constexpr std::int32_t 
ERISC_APP_ROUTING_INFO_BASE = TILE_HEADER_BUFFER_BASE; static constexpr std::int32_t ERISC_APP_SYNC_INFO_BASE = ERISC_APP_ROUTING_INFO_BASE + ERISC_APP_ROUTING_INFO_SIZE; - static constexpr uint32_t ISSUE_CQ_CB_BASE = ERISC_APP_SYNC_INFO_BASE + ERISC_APP_SYNC_INFO_SIZE; - static constexpr uint32_t COMPLETION_CQ_CB_BASE = ISSUE_CQ_CB_BASE + 7 * L1_ALIGNMENT; - - static constexpr std::int32_t ERISC_MEM_MAILBOX_BASE = COMPLETION_CQ_CB_BASE + 7 * L1_ALIGNMENT; + static constexpr std::int32_t ERISC_MEM_MAILBOX_BASE = ERISC_APP_SYNC_INFO_BASE + ERISC_APP_SYNC_INFO_SIZE; static constexpr std::uint32_t ERISC_MEM_MAILBOX_SIZE = 3232; static constexpr std::uint32_t ERISC_MEM_MAILBOX_END = ERISC_MEM_MAILBOX_BASE + ERISC_MEM_MAILBOX_SIZE; @@ -78,9 +75,6 @@ struct address_map { static constexpr std::int32_t LAUNCH_ERISC_APP_FLAG = L1_EPOCH_Q_BASE + 4; - // BIDIR Tunneling Kernel Space - static constexpr std::int32_t ERISC_L1_TUNNEL_BUFFER_SIZE = ERISC_L1_UNRESERVED_SIZE / 2; - template struct TAssertEquality { static_assert(A == B, "Not equal"); diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp index c5a3c75660e3..2f8196d7c05e 100644 --- a/tt_metal/impl/device/device.cpp +++ b/tt_metal/impl/device/device.cpp @@ -779,7 +779,7 @@ void Device::clear_l1_state() { // These L1 ranges are restricted becase UMD base routing FW uses L1 below FIRMWARE_BASE and // between TILE_HEADER_BUFFER_BASE to COMMAND_Q_BASE std::vector zero_vec_above_tile_header_buffer( - (eth_l1_mem::address_map::ISSUE_CQ_CB_BASE - eth_l1_mem::address_map::TILE_HEADER_BUFFER_BASE) / sizeof(uint32_t), + (eth_l1_mem::address_map::MAX_L1_LOADING_SIZE - eth_l1_mem::address_map::TILE_HEADER_BUFFER_BASE) / sizeof(uint32_t), 0); // Clear erisc sync info From edce56346faab8f0d638035fa902512f2606c6fc Mon Sep 17 00:00:00 2001 From: Austin Ho Date: Thu, 5 Dec 2024 17:00:07 +0000 Subject: [PATCH 11/59] #0: Remove incorrect assertion on cb page sizes being a multiple of 4 (should have been 16) and 
update circular buffer init/usage to support any page size for cbs that aren't used by compute. Compute CBs still require 16B multiple for address/sizes --- .../test_CircularBuffer_allocation.cpp | 13 +++++------- .../test_CircularBuffer_creation.cpp | 20 ++++--------------- .../dispatch_program/test_EnqueueProgram.cpp | 14 ++++++------- .../dispatch/random_program_fixture.hpp | 6 ++++-- .../command_queue/random_program.cpp | 2 +- tt_metal/hw/inc/circular_buffer.h | 6 ++++++ tt_metal/hw/inc/circular_buffer_constants.h | 4 ++-- tt_metal/hw/inc/circular_buffer_init.h | 6 +++--- tt_metal/hw/inc/dataflow_api.h | 4 ++-- tt_metal/hw/inc/debug/dprint_tile.h | 9 ++++----- tt_metal/hw/inc/remote_circular_buffer_api.h | 8 +++++--- .../impl/buffers/circular_buffer_types.cpp | 4 ---- tt_metal/impl/dispatch/command_queue.cpp | 12 +++++------ tt_metal/impl/program/program.cpp | 2 +- tt_metal/tt_metal.cpp | 9 +++------ 15 files changed, 53 insertions(+), 66 deletions(-) diff --git a/tests/tt_metal/tt_metal/api/circular_buffer/test_CircularBuffer_allocation.cpp b/tests/tt_metal/tt_metal/api/circular_buffer/test_CircularBuffer_allocation.cpp index 624226798ecc..8c3321858408 100644 --- a/tests/tt_metal/tt_metal/api/circular_buffer/test_CircularBuffer_allocation.cpp +++ b/tests/tt_metal/tt_metal/api/circular_buffer/test_CircularBuffer_allocation.cpp @@ -41,8 +41,7 @@ void validate_cb_address( for (const auto& [buffer_index, expected_address] : address_per_buffer_index) { auto base_index = UINT32_WORDS_PER_LOCAL_CIRCULAR_BUFFER_CONFIG * buffer_index; - EXPECT_EQ( - expected_address >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES, cb_config_vector.at(base_index)); + EXPECT_EQ(expected_address, cb_config_vector.at(base_index)); } } } @@ -358,9 +357,8 @@ TEST_F(DeviceFixture, TensixTestUpdateCircularBufferPageSize) { for (const auto& [buffer_index, expected_address] : address_per_buffer_index) { auto base_index = UINT32_WORDS_PER_LOCAL_CIRCULAR_BUFFER_CONFIG * buffer_index; - EXPECT_EQ( - 
expected_address >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES, - cb_config_vector.at(base_index)); // address validation + EXPECT_EQ(expected_address, + cb_config_vector.at(base_index)); // address validation EXPECT_EQ( num_pages_per_buffer_index.at(buffer_index), cb_config_vector.at(base_index + 2)); // num pages validation @@ -391,9 +389,8 @@ TEST_F(DeviceFixture, TensixTestUpdateCircularBufferPageSize) { for (const auto& [buffer_index, expected_address] : address_per_buffer_index) { auto base_index = UINT32_WORDS_PER_LOCAL_CIRCULAR_BUFFER_CONFIG * buffer_index; - EXPECT_EQ( - expected_address >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES, - cb_config_vector.at(base_index)); // address validation + EXPECT_EQ(expected_address, + cb_config_vector.at(base_index)); // address validation EXPECT_EQ( num_pages_per_buffer_index.at(buffer_index), cb_config_vector.at(base_index + 2)); // num pages validation diff --git a/tests/tt_metal/tt_metal/api/circular_buffer/test_CircularBuffer_creation.cpp b/tests/tt_metal/tt_metal/api/circular_buffer/test_CircularBuffer_creation.cpp index b9d1c369973f..1278c8abb7d1 100644 --- a/tests/tt_metal/tt_metal/api/circular_buffer/test_CircularBuffer_creation.cpp +++ b/tests/tt_metal/tt_metal/api/circular_buffer/test_CircularBuffer_creation.cpp @@ -65,22 +65,10 @@ TEST_F(DeviceFixture, TensixTestCreateCircularBufferAtValidIndices) { uint32_t l1_unreserved_base = devices_.at(0)->get_base_allocator_addr(HalMemType::L1); std::map> golden_cb_config = { - {0, - {l1_unreserved_base >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES, - cb_config.page_size >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES, - cb_config.num_pages}}, - {2, - {l1_unreserved_base >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES, - cb_config.page_size >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES, - cb_config.num_pages}}, - {16, - {l1_unreserved_base >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES, - cb_config.page_size >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES, - cb_config.num_pages}}, - {24, - {l1_unreserved_base >> 
CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES, - cb_config.page_size >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES, - cb_config.num_pages}}}; + {0, {l1_unreserved_base, cb_config.page_size, cb_config.num_pages}}, + {2, {l1_unreserved_base, cb_config.page_size, cb_config.num_pages}}, + {16, {l1_unreserved_base, cb_config.page_size, cb_config.num_pages}}, + {24, {l1_unreserved_base, cb_config.page_size, cb_config.num_pages}}}; std::map data_format_spec = { {0, cb_config.data_format}, {2, cb_config.data_format}, diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_EnqueueProgram.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_EnqueueProgram.cpp index 356f7766820d..5dd7eea0042f 100644 --- a/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_EnqueueProgram.cpp +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_EnqueueProgram.cpp @@ -101,7 +101,7 @@ bool cb_config_successful(Device* device, Program& program, const DummyProgramMu tt::tt_metal::detail::ReadFromDeviceL1( device, core_coord, - program.get_sem_base_addr(device, core_coord, CoreType::WORKER), + program.get_cb_base_addr(device, core_coord, CoreType::WORKER), cb_config_buffer_size, cb_config_vector); @@ -110,8 +110,8 @@ bool cb_config_successful(Device* device, Program& program, const DummyProgramMu const uint32_t index = program_config.cb_config_vector[i].cb_id * sizeof(uint32_t); const uint32_t cb_num_pages = program_config.cb_config_vector[i].num_pages; const uint32_t cb_size = cb_num_pages * program_config.cb_config_vector[i].page_size; - const bool addr_match = cb_config_vector.at(index) == ((cb_addr) >> 4); - const bool size_match = cb_config_vector.at(index + 1) == (cb_size >> 4); + const bool addr_match = cb_config_vector.at(index) == cb_addr; + const bool size_match = cb_config_vector.at(index + 1) == cb_size; const bool num_pages_match = cb_config_vector.at(index + 2) == cb_num_pages; pass &= (addr_match and size_match and num_pages_match); @@ -860,15 +860,15 @@ 
TEST_F(CommandQueueSingleCardProgramFixture, TensixTestMultiCBSharedAddressSpace uint32_t cb_addr = device->get_base_allocator_addr(HalMemType::L1); uint32_t intermediate_index = intermediate_cb * sizeof(uint32_t); - bool addr_match_intermediate = cb_config_vector.at(intermediate_index) == ((cb_addr) >> 4); - bool size_match_intermediate = cb_config_vector.at(intermediate_index + 1) == (cb_size >> 4); + bool addr_match_intermediate = cb_config_vector.at(intermediate_index) == (cb_addr); + bool size_match_intermediate = cb_config_vector.at(intermediate_index + 1) == (cb_size); bool num_pages_match_intermediate = cb_config_vector.at(intermediate_index + 2) == num_tiles; bool pass_intermediate = (addr_match_intermediate and size_match_intermediate and num_pages_match_intermediate); EXPECT_TRUE(pass_intermediate); uint32_t out_index = out_cb * sizeof(uint32_t); - bool addr_match_out = cb_config_vector.at(out_index) == ((cb_addr) >> 4); - bool size_match_out = cb_config_vector.at(out_index + 1) == (cb_size >> 4); + bool addr_match_out = cb_config_vector.at(out_index) == cb_addr; + bool size_match_out = cb_config_vector.at(out_index + 1) == cb_size; bool num_pages_match_out = cb_config_vector.at(out_index + 2) == num_tiles; bool pass_out = (addr_match_out and size_match_out and num_pages_match_out); EXPECT_TRUE(pass_out); diff --git a/tests/tt_metal/tt_metal/dispatch/random_program_fixture.hpp b/tests/tt_metal/tt_metal/dispatch/random_program_fixture.hpp index 55c9d3d40a46..fc87c2b58df1 100644 --- a/tests/tt_metal/tt_metal/dispatch/random_program_fixture.hpp +++ b/tests/tt_metal/tt_metal/dispatch/random_program_fixture.hpp @@ -9,6 +9,7 @@ #include "llrt/hal.hpp" #include "tt_metal/host_api.hpp" #include "tt_metal/detail/tt_metal.hpp" +#include "tt_metal/hw/inc/circular_buffer_constants.h" #include "tt_metal/impl/kernels/kernel.hpp" #include "tt_metal/common/tt_backend_api_types.hpp" #include "dispatch_test_utils.hpp" @@ -141,13 +142,14 @@ class RandomProgramFixture : 
virtual public CommandQueueSingleCardProgramFixture const uint32_t num_cbs = this->generate_random_num(min, max); std::vector cb_page_sizes; for (uint32_t cb_idx = 0; cb_idx < num_cbs; cb_idx++) { - const uint32_t cb_page_size = this->generate_random_num(MIN_CB_PAGE_SIZE, MAX_CB_PAGE_SIZE, 16); + const uint32_t cb_page_size = + this->generate_random_num(MIN_CB_PAGE_SIZE, MAX_CB_PAGE_SIZE, CIRCULAR_BUFFER_COMPUTE_WORD_SIZE); const uint32_t cb_total_size = this->generate_random_num(MIN_CB_TOTAL_SIZE, MAX_CB_TOTAL_SIZE, cb_page_size); CircularBufferConfig config = CircularBufferConfig(cb_total_size, {{cb_idx, tt::DataFormat::Float16_b}}) .set_page_size(cb_idx, cb_page_size); CreateCircularBuffer(program, cores, config); - cb_page_sizes.push_back(cb_page_size / 16); + cb_page_sizes.push_back(cb_page_size); } return cb_page_sizes; } diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp index f3e64ae1b3ec..3d5ee14d71be 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp @@ -45,7 +45,7 @@ void kernel_main() { (uint32_t tt_l1_ptr*)(kernel_config_base + mailboxes->launch[mailboxes->launch_msg_rd_ptr].kernel_config.local_cb_offset); uint32_t cb_val = reinterpret_cast(cb_l1_base + i * 4)[3]; - uint32_t expected = ((i + 1) * page_size) >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES; + uint32_t expected = ((i + 1) * page_size); if (cb_val != expected) { DPRINT << "Problem with CB idx: " << i << " Expected: " << expected << " Got: " << cb_val << ENDL(); while (true); // Purposefully hang the kernel if CBs did not arrive correctly diff --git a/tt_metal/hw/inc/circular_buffer.h b/tt_metal/hw/inc/circular_buffer.h index 35942bb49d72..68a2b3436cc9 100644 --- a/tt_metal/hw/inc/circular_buffer.h +++ 
b/tt_metal/hw/inc/circular_buffer.h @@ -103,3 +103,9 @@ FORCE_INLINE RemoteSenderCBInterface& get_remote_sender_cb_interface(uint32_t cb FORCE_INLINE RemoteReceiverCBInterface& get_remote_receiver_cb_interface(uint32_t cb_id) { return cb_interface[cb_id].remote_receiver_cb_interface; } + +#if defined(COMPILE_FOR_TRISC) +constexpr uint32_t cb_addr_shift = CIRCULAR_BUFFER_COMPUTE_ADDR_SHIFT; +#else +constexpr uint32_t cb_addr_shift = 0; +#endif diff --git a/tt_metal/hw/inc/circular_buffer_constants.h b/tt_metal/hw/inc/circular_buffer_constants.h index 8ff3fda763b7..3b80937753ce 100644 --- a/tt_metal/hw/inc/circular_buffer_constants.h +++ b/tt_metal/hw/inc/circular_buffer_constants.h @@ -9,5 +9,5 @@ constexpr static std::uint32_t NUM_CIRCULAR_BUFFERS = 32; constexpr static std::uint32_t UINT32_WORDS_PER_LOCAL_CIRCULAR_BUFFER_CONFIG = 4; constexpr static std::uint32_t UINT32_WORDS_PER_REMOTE_CIRCULAR_BUFFER_CONFIG = 2; -constexpr static std::uint32_t CIRCULAR_BUFFER_WORD_SIZE_BYTES = 16; -constexpr static std::uint32_t CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES = 4; +constexpr static std::uint32_t CIRCULAR_BUFFER_COMPUTE_WORD_SIZE = 16; +constexpr static std::uint32_t CIRCULAR_BUFFER_COMPUTE_ADDR_SHIFT = 4; diff --git a/tt_metal/hw/inc/circular_buffer_init.h b/tt_metal/hw/inc/circular_buffer_init.h index 29f2af20cb4d..b44025080222 100644 --- a/tt_metal/hw/inc/circular_buffer_init.h +++ b/tt_metal/hw/inc/circular_buffer_init.h @@ -25,10 +25,10 @@ inline void setup_local_cb_read_write_interfaces( for (uint32_t cb_id = start_cb_index; cb_id < max_cb_index; cb_id++) { // NOTE: fifo_addr, fifo_size and fifo_limit in 16B words! 
- uint32_t fifo_addr = circular_buffer_config_addr[0]; - uint32_t fifo_size = circular_buffer_config_addr[1]; + uint32_t fifo_addr = circular_buffer_config_addr[0] >> cb_addr_shift; + uint32_t fifo_size = circular_buffer_config_addr[1] >> cb_addr_shift; uint32_t fifo_num_pages = circular_buffer_config_addr[2]; - uint32_t fifo_page_size = circular_buffer_config_addr[3]; + uint32_t fifo_page_size = circular_buffer_config_addr[3] >> cb_addr_shift; uint32_t fifo_limit = fifo_addr + fifo_size; LocalCBInterface& local_interface = get_local_cb_interface(cb_id); diff --git a/tt_metal/hw/inc/dataflow_api.h b/tt_metal/hw/inc/dataflow_api.h index 59f6fc28963e..c6877db3e0a1 100644 --- a/tt_metal/hw/inc/dataflow_api.h +++ b/tt_metal/hw/inc/dataflow_api.h @@ -418,7 +418,7 @@ constexpr inline DataFormat get_dataformat(const std::int32_t operand) { FORCE_INLINE uint32_t get_write_ptr(uint32_t operand) { // return byte address (fifo_wr_ptr is 16B address) - uint32_t wr_ptr_bytes = get_local_cb_interface(operand).fifo_wr_ptr << 4; + uint32_t wr_ptr_bytes = get_local_cb_interface(operand).fifo_wr_ptr; return wr_ptr_bytes; } @@ -437,7 +437,7 @@ uint32_t get_write_ptr(uint32_t operand) { FORCE_INLINE uint32_t get_read_ptr(uint32_t operand) { // return byte address (fifo_rd_ptr is 16B address) - uint32_t rd_ptr_bytes = get_local_cb_interface(operand).fifo_rd_ptr << 4; + uint32_t rd_ptr_bytes = get_local_cb_interface(operand).fifo_rd_ptr; return rd_ptr_bytes; } diff --git a/tt_metal/hw/inc/debug/dprint_tile.h b/tt_metal/hw/inc/debug/dprint_tile.h index 85aa838d8e55..1e737f66cf12 100644 --- a/tt_metal/hw/inc/debug/dprint_tile.h +++ b/tt_metal/hw/inc/debug/dprint_tile.h @@ -17,13 +17,12 @@ #endif // Macros for printing circular buffer internals -#define CB_RD_PTR(id) (get_local_cb_interface(id).fifo_rd_ptr << 4) // only valid in unpacker thread -#define CB_RD_LIM(id) ((get_local_cb_interface(id).fifo_limit_plus_1 - 1) << 4) -#define CB_RD_SZ(id) (get_local_cb_interface(id).fifo_size << 4) 
+#define CB_RD_PTR(id) (get_local_cb_interface(id).fifo_rd_ptr << cb_addr_shift) // only valid in unpacker thread +#define CB_RD_SZ(id) (get_local_cb_interface(id).fifo_size << cb_addr_shift) -#define CB_WR_PTR(id) (get_local_cb_interface(id).fifo_wr_ptr << 4) // only valid in packer thread +#define CB_WR_PTR(id) (get_local_cb_interface(id).fifo_wr_ptr << cb_addr_shift) // only valid in packer thread #define CB_PAGE_COUNT(id) (get_local_cb_interface(id).fifo_num_pages) -#define CB_PAGE_SIZE(id) (get_local_cb_interface(id).fifo_page_size << 4) +#define CB_PAGE_SIZE(id) (get_local_cb_interface(id).fifo_page_size << cb_addr_shift) // // Slices/samples elements of a tile 'itile' from cb using a given numpy style slice object SliceRange. diff --git a/tt_metal/hw/inc/remote_circular_buffer_api.h b/tt_metal/hw/inc/remote_circular_buffer_api.h index 712458d62b14..044e3705f939 100644 --- a/tt_metal/hw/inc/remote_circular_buffer_api.h +++ b/tt_metal/hw/inc/remote_circular_buffer_api.h @@ -4,6 +4,7 @@ #pragma once +#include "tt_metal/hw/inc/circular_buffer.h" #include "tt_metal/hw/inc/debug/assert.h" #include "utils/utils.h" #ifndef COMPILE_FOR_TRISC @@ -242,11 +243,12 @@ FORCE_INLINE void align_local_cbs_to_remote_cb( // We assert that the offset of sender and receiver common attributes are the same // so we can use either interface here const RemoteReceiverCBInterface& remote_cb = get_remote_receiver_cb_interface(remote_cb_index); - uint32_t fifo_limit = remote_cb.fifo_limit_page_aligned >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES; - uint32_t fifo_size = fifo_limit - (remote_cb.fifo_start_addr >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES); - uint32_t fifo_ptr = remote_cb.fifo_rd_ptr >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES; + uint32_t fifo_limit = remote_cb.fifo_limit_page_aligned >> cb_addr_shift; + uint32_t fifo_size = fifo_limit - (remote_cb.fifo_start_addr >> cb_addr_shift); + uint32_t fifo_ptr = remote_cb.fifo_rd_ptr >> cb_addr_shift; for (uint32_t i = 0; i < num_local_cbs; i++) 
{ LocalCBInterface& local_cb = get_local_cb_interface(local_cb_indices[i]); + ASSERT(fifo_size % local_cb.fifo_page_size == 0); uint32_t fifo_num_pages = fifo_size / local_cb.fifo_page_size; local_cb.fifo_limit = fifo_limit; local_cb.fifo_size = fifo_size; diff --git a/tt_metal/impl/buffers/circular_buffer_types.cpp b/tt_metal/impl/buffers/circular_buffer_types.cpp index a14738e0edb7..7877c2648206 100644 --- a/tt_metal/impl/buffers/circular_buffer_types.cpp +++ b/tt_metal/impl/buffers/circular_buffer_types.cpp @@ -43,10 +43,6 @@ CircularBufferConfig& CircularBufferConfig::set_page_size(uint8_t buffer_index, if (this->total_size_ % page_size != 0) { TT_THROW("Total circular buffer size {} B must be divisible by page size {} B", this->total_size_, page_size); } - // TODO: Should use CIRCULAR_BUFFER_WORD_SIZE_BYTES here - if (page_size % sizeof(uint32_t) != 0) { - TT_THROW("Page size must be divisible by sizeof(uint32_t) because buffers holds uint32_t values"); - } this->page_sizes_[buffer_index] = page_size; return *this; diff --git a/tt_metal/impl/dispatch/command_queue.cpp b/tt_metal/impl/dispatch/command_queue.cpp index 7b90f313dbe1..e0ef8b96cfc4 100644 --- a/tt_metal/impl/dispatch/command_queue.cpp +++ b/tt_metal/impl/dispatch/command_queue.cpp @@ -901,8 +901,8 @@ void EnqueueProgramCommand::assemble_device_commands( circular_buffers_on_corerange.size()); for (const std::shared_ptr& cb : circular_buffers_on_corerange) { program_command_sequence.circular_buffers_on_core_ranges[i].emplace_back(cb); - const uint32_t cb_address = cb->address() >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES; - const uint32_t cb_size = cb->size() >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES; + const uint32_t cb_address = cb->address(); + const uint32_t cb_size = cb->size(); for (const auto& buffer_index : cb->local_buffer_indices()) { // 1 cmd for all 32 buffer indices, populate with real data for specified indices // cb config payload @@ -910,7 +910,7 @@ void 
EnqueueProgramCommand::assemble_device_commands( cb_config_payload[base_index] = cb_address; cb_config_payload[base_index + 1] = cb_size; cb_config_payload[base_index + 2] = cb->num_pages(buffer_index); - cb_config_payload[base_index + 3] = cb->page_size(buffer_index) >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES; + cb_config_payload[base_index + 3] = cb->page_size(buffer_index); max_index = std::max(max_index, base_index + UINT32_WORDS_PER_LOCAL_CIRCULAR_BUFFER_CONFIG); } for (const auto& buffer_index : cb->remote_buffer_indices()) { @@ -1363,8 +1363,8 @@ void EnqueueProgramCommand::update_device_commands( for (const auto& cbs_on_core_range : cached_program_command_sequence.circular_buffers_on_core_ranges) { uint32_t* cb_config_payload = cached_program_command_sequence.cb_configs_payloads[i]; for (const std::shared_ptr& cb : cbs_on_core_range) { - const uint32_t cb_address = cb->address() >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES; - const uint32_t cb_size = cb->size() >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES; + const uint32_t cb_address = cb->address(); + const uint32_t cb_size = cb->size(); for (const auto& buffer_index : cb->local_buffer_indices()) { // 1 cmd for all 32 buffer indices, populate with real data for specified indices @@ -1373,7 +1373,7 @@ void EnqueueProgramCommand::update_device_commands( cb_config_payload[base_index] = cb_address; cb_config_payload[base_index + 1] = cb_size; cb_config_payload[base_index + 2] = cb->num_pages(buffer_index); - cb_config_payload[base_index + 3] = cb->page_size(buffer_index) >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES; + cb_config_payload[base_index + 3] = cb->page_size(buffer_index); } for (const auto& buffer_index : cb->remote_buffer_indices()) { const uint32_t base_index = remote_offset_index + (NUM_CIRCULAR_BUFFERS - 1 - buffer_index) * diff --git a/tt_metal/impl/program/program.cpp b/tt_metal/impl/program/program.cpp index 09f90b8017b9..216ffcae5b32 100644 --- a/tt_metal/impl/program/program.cpp +++ 
b/tt_metal/impl/program/program.cpp @@ -776,7 +776,7 @@ void detail::Program_::allocate_circular_buffers(const Device *device) { } } } - + computed_addr = align(computed_addr, device->get_allocator_alignment()); for (const CoreRange &core_range : circular_buffer->core_ranges().ranges()) { for (CircularBufferAllocator &cb_allocator : this->cb_allocators_) { if (cb_allocator.core_range.intersects(core_range)) { diff --git a/tt_metal/tt_metal.cpp b/tt_metal/tt_metal.cpp index 38f061df9876..e59f14430cd6 100644 --- a/tt_metal/tt_metal.cpp +++ b/tt_metal/tt_metal.cpp @@ -745,13 +745,10 @@ bool ConfigureDeviceWithProgram(Device* device, Program& program, bool fd_bootlo uint32_t size_in_bytes = circular_buffer->size(); uint32_t num_pages = circular_buffer->num_pages(buffer_index); uint32_t page_size = size_in_bytes / num_pages; - circular_buffer_config_vec[base_index] = - addr_in_bytes >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES; // convert to addr in 16B words - circular_buffer_config_vec[base_index + 1] = - size_in_bytes >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES; // convert to addr in 16B words + circular_buffer_config_vec[base_index] = addr_in_bytes; // convert to addr in 16B words + circular_buffer_config_vec[base_index + 1] = size_in_bytes; // convert to addr in 16B words circular_buffer_config_vec[base_index + 2] = num_pages; - circular_buffer_config_vec[base_index + 3] = - page_size >> CIRCULAR_BUFFER_LOG2_WORD_SIZE_BYTES; + circular_buffer_config_vec[base_index + 3] = page_size; } for (uint32_t buffer_index : circular_buffer->remote_buffer_indices()) { uint32_t base_index = From 1b5c6763489c3ba5e0b14e4f2ea1464bb3e17f51 Mon Sep 17 00:00:00 2001 From: Mo Date: Thu, 5 Dec 2024 23:01:24 +0000 Subject: [PATCH 12/59] #14050: Quick fix on profiler macros --- tt_metal/tools/profiler/kernel_profiler.hpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tt_metal/tools/profiler/kernel_profiler.hpp b/tt_metal/tools/profiler/kernel_profiler.hpp index 78844333dcdd..81d4f5a3d1ab 
100644 --- a/tt_metal/tools/profiler/kernel_profiler.hpp +++ b/tt_metal/tools/profiler/kernel_profiler.hpp @@ -446,4 +446,8 @@ inline __attribute__((always_inline)) void recordEvent(uint16_t event_id) { #define DeviceZoneSetCounter(counter) +#define DeviceTimestampedData(data_id, data) + +#define DeviceRecordEvent(event_id) + #endif From bd4adedb3a967ef4a1d2cd9f64283527f6232e32 Mon Sep 17 00:00:00 2001 From: Mo Date: Fri, 6 Dec 2024 17:03:03 +0000 Subject: [PATCH 13/59] #14849: Profiler sub-device post proc fix and smoke test --- .../tools/profiler/test_device_profiler.py | 55 +++++++++++++++---- .../dispatch_program/test_sub_device.cpp | 2 + .../dispatch_trace/test_sub_device.cpp | 1 + tt_metal/tools/profiler/process_device_log.py | 5 +- tt_metal/tools/profiler/tt_metal_profiler.cpp | 32 +++++------ 5 files changed, 65 insertions(+), 30 deletions(-) diff --git a/tests/tt_metal/tools/profiler/test_device_profiler.py b/tests/tt_metal/tools/profiler/test_device_profiler.py index 01d71f987aa0..1c2ab3823cc5 100644 --- a/tests/tt_metal/tools/profiler/test_device_profiler.py +++ b/tests/tt_metal/tools/profiler/test_device_profiler.py @@ -7,6 +7,7 @@ import re import inspect import pytest +import subprocess import pandas as pd @@ -24,6 +25,30 @@ PROG_EXMP_DIR = "programming_examples/profiler" +def get_device_data(setupStr=""): + postProcessRun = os.system( + f"cd {PROFILER_SCRIPTS_ROOT} && " f"./process_device_log.py {setupStr} --no-artifacts --no-print-stats" + ) + + assert postProcessRun == 0, f"Log process script crashed with exit code {postProcessRun}" + + devicesData = {} + with open(f"{PROFILER_ARTIFACTS_DIR}/output/device/device_analysis_data.json", "r") as devicesDataJson: + devicesData = json.load(devicesDataJson) + + return devicesData + + +def run_gtest_profiler_test(testbin, testname): + clear_profiler_runtime_artifacts() + output = subprocess.check_output( + f"cd {TT_METAL_HOME} && {testbin} --gtest_filter={testname}", stderr=subprocess.STDOUT, shell=True + 
).decode("UTF-8") + print(output) + if "SKIPPED" not in output: + get_device_data() + + def run_device_profiler_test(testName=None, setup=False, slowDispatch=False): name = inspect.stack()[1].function testCommand = f"build/{PROG_EXMP_DIR}/{name}" @@ -41,17 +66,7 @@ def run_device_profiler_test(testName=None, setup=False, slowDispatch=False): if setup: setupStr = f"-s {name}" - postProcessRun = os.system( - f"cd {PROFILER_SCRIPTS_ROOT} && " f"./process_device_log.py {setupStr} --no-artifacts --no-print-stats" - ) - - assert postProcessRun == 0, f"Log process script crashed with exit code {postProcessRun}" - - devicesData = {} - with open(f"{PROFILER_ARTIFACTS_DIR}/output/device/device_analysis_data.json", "r") as devicesDataJson: - devicesData = json.load(devicesDataJson) - - return devicesData + return get_device_data(setupStr) def get_function_name(): @@ -231,6 +246,8 @@ def test_profiler_host_device_sync(): assert freq < (reportedFreq * (1 + TOLERANCE)), f"Frequency too large on device {device}" assert freq > (reportedFreq * (1 - TOLERANCE)), f"Frequency too small on device {device}" + os.environ["TT_METAL_PROFILER_SYNC"] = "0" + def test_timestamped_events(): OP_COUNT = 2 @@ -268,3 +285,19 @@ def test_timestamped_events(): devicesData["data"]["devices"]["0"]["cores"]["DEVICE"]["riscs"]["TENSIX"]["events"]["all_events"] ) assert eventCount in REF_COUNT_DICT[ENV_VAR_ARCH_NAME], "Wrong event count" + + +def test_sub_device_profiler(): + run_gtest_profiler_test( + "./build/test/tt_metal/unit_tests_dispatch", "CommandQueueSingleCardFixture.TensixTestSubDeviceBasicPrograms" + ) + os.environ["TT_METAL_PROFILER_SYNC"] = "1" + run_gtest_profiler_test( + "./build/test/tt_metal/unit_tests_dispatch", + "CommandQueueSingleCardFixture.TensixActiveEthTestSubDeviceBasicEthPrograms", + ) + os.environ["TT_METAL_PROFILER_SYNC"] = "0" + run_gtest_profiler_test( + "./build/test/tt_metal/unit_tests_dispatch_trace", + 
"CommandQueueSingleCardTraceFixture.TensixTestSubDeviceTraceBasicPrograms", + ) diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_sub_device.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_sub_device.cpp index 6016433f5565..f140433f3a9e 100644 --- a/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_sub_device.cpp +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_sub_device.cpp @@ -108,6 +108,7 @@ TEST_F(CommandQueueSingleCardFixture, TensixTestSubDeviceBasicPrograms) { EnqueueProgram(device->command_queue(), incrementer_program, false); } Synchronize(device); + detail::DumpDeviceProfileResults(device); } } @@ -136,5 +137,6 @@ TEST_F(CommandQueueSingleCardFixture, TensixActiveEthTestSubDeviceBasicEthProgra EnqueueProgram(device->command_queue(), incrementer_program, false); } Synchronize(device); + detail::DumpDeviceProfileResults(device); } } diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_trace/test_sub_device.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_trace/test_sub_device.cpp index 74fa8256ab80..5caff9052aaa 100644 --- a/tests/tt_metal/tt_metal/dispatch/dispatch_trace/test_sub_device.cpp +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_trace/test_sub_device.cpp @@ -63,6 +63,7 @@ TEST_F(CommandQueueSingleCardTraceFixture, TensixTestSubDeviceTraceBasicPrograms ReplayTrace(device, device->command_queue().id(), tid_2, false); } Synchronize(device); + detail::DumpDeviceProfileResults(device); } } diff --git a/tt_metal/tools/profiler/process_device_log.py b/tt_metal/tools/profiler/process_device_log.py index 4fbba9654023..ecdd1396cbff 100755 --- a/tt_metal/tools/profiler/process_device_log.py +++ b/tt_metal/tools/profiler/process_device_log.py @@ -309,6 +309,7 @@ def get_ops(timeseries): opsDict[opID].append(ts) ordered_ops = list(opsDict.keys()) + # sort over timestamps ordered_ops.sort(key=lambda x: opsDict[x][0][1]) ops = [] @@ -327,9 +328,7 @@ def get_ops(timeseries): if (risc == "BRISC" and 
timerID["zone_name"] == "BRISC-FW" and timerID["type"] == "ZONE_START") or ( risc == "ERISC" and timerID["zone_name"] == "ERISC-FW" and timerID["type"] == "ZONE_START" ): - for opDuration in coresOp.values(): - assert len(opDuration) == 2, "Unexpected FW start" - + assert len(coresOp[core]) == 2, "Unexpected FW end" ops.append({"timeseries": []}) coresOp = {} elif (risc == "BRISC" and timerID["zone_name"] == "BRISC-FW" and timerID["type"] == "ZONE_END") or ( diff --git a/tt_metal/tools/profiler/tt_metal_profiler.cpp b/tt_metal/tools/profiler/tt_metal_profiler.cpp index d42d379b6352..e90e9caa236d 100644 --- a/tt_metal/tools/profiler/tt_metal_profiler.cpp +++ b/tt_metal/tools/profiler/tt_metal_profiler.cpp @@ -99,22 +99,22 @@ void syncDeviceHost( smallestHostime.emplace(device_id, 0); constexpr uint16_t sampleCount = 249; - if (sync_program == nullptr) { - sync_program = std::make_shared(); - - std::map kernel_defines = { - {"SAMPLE_COUNT", std::to_string(sampleCount)}, - }; - - tt_metal::KernelHandle brisc_kernel = tt_metal::CreateKernel( - *sync_program, - "tt_metal/tools/profiler/sync/sync_kernel.cpp", - logical_core, - tt_metal::DataMovementConfig{ - .processor = tt_metal::DataMovementProcessor::RISCV_0, - .noc = tt_metal::NOC::RISCV_0_default, - .defines = kernel_defines}); - } + // TODO(MO): Always recreate a new program until subdevice + // allows using the first program generated by default manager + sync_program = std::make_shared(); + + std::map kernel_defines = { + {"SAMPLE_COUNT", std::to_string(sampleCount)}, + }; + + tt_metal::KernelHandle brisc_kernel = tt_metal::CreateKernel( + *sync_program, + "tt_metal/tools/profiler/sync/sync_kernel.cpp", + logical_core, + tt_metal::DataMovementConfig{ + .processor = tt_metal::DataMovementProcessor::RISCV_0, + .noc = tt_metal::NOC::RISCV_0_default, + .defines = kernel_defines}); EnqueueProgram(device->command_queue(), *sync_program, false); From ff2250da92bc0deab710546a3a95022812cc5918 Mon Sep 17 00:00:00 2001 
From: Oleg Milyutin Date: Mon, 9 Dec 2024 15:13:30 -0500 Subject: [PATCH 14/59] #14974: Clean up creation functions in the ttnn numpy package (#15671) ### Ticket #14974 ### Problem description Creation functions like `zeros`, `ones`, `full` have a dedicated place `creation.hpp`. Currently, several unit tests rely on the redundant interfaces defined in `functions.hpp` in the numpy package. ### What's changed * Migrated the use of `numpy` versions of `zeros`, `ones`, `full`, `arange` in tests to `ttnn::{zeros,ones,full}`. * Moved the base implementation of `full` and `arange` into `creation.hpp` - the only place where they are used. * Minor simplification / refactor changes to deduplicate code. ### Checklist - [X] [Post commit CI passes](https://github.com/tenstorrent/tt-metal/actions/runs/12152247431) - [X] [T3K unit + frequent tests pass](https://github.com/tenstorrent/tt-metal/actions/runs/12152254447) - [X] New/Existing tests provide coverage for changes --- tests/tt_eager/ops/test_bcast_op.cpp | 18 +- tests/tt_eager/ops/test_bmm_op.cpp | 18 +- ...test_tilize_zero_padding_channels_last.cpp | 5 +- .../tensors/test_async_tensor_apis.cpp | 91 ++++---- tests/tt_eager/tensors/test_copy_and_move.cpp | 13 +- .../tt_metal/common/dispatch_fixture.hpp | 2 +- .../test_create_tensor_multi_device.cpp | 19 ++ .../unit_tests/gtests/test_async_runtime.cpp | 56 +++-- .../unit_tests/operations/test_creation.py | 33 +++ ttnn/cpp/pybind11/operations/creation.hpp | 40 ++-- ttnn/cpp/ttnn/operations/creation.hpp | 203 ++++++++++-------- ttnn/cpp/ttnn/operations/numpy/functions.hpp | 191 +--------------- ttnn/cpp/ttnn/tensor/types.hpp | 19 ++ 13 files changed, 312 insertions(+), 396 deletions(-) diff --git a/tests/tt_eager/ops/test_bcast_op.cpp b/tests/tt_eager/ops/test_bcast_op.cpp index a761c1eba516..05be3303c062 100644 --- a/tests/tt_eager/ops/test_bcast_op.cpp +++ b/tests/tt_eager/ops/test_bcast_op.cpp @@ -3,16 +3,13 @@ // SPDX-License-Identifier: Apache-2.0 #include 
"tt_metal/host_api.hpp" +#include "ttnn/cpp/ttnn/operations/creation.hpp" #include "ttnn/tensor/tensor.hpp" #include "ttnn/operations/data_movement/bcast/bcast.hpp" #include "common/constants.hpp" #include #include -#include -#include -#include - using namespace tt; using namespace tt_metal; using namespace constants; @@ -53,9 +50,8 @@ int main(int argc, char** argv) { } Tensor a = ttnn::numpy::random::random(input_shape_a).to(Layout::TILE).to(device); - Tensor b = ttnn::numpy::zeros({1, 1, TILE_HEIGHT, TILE_WIDTH}, DataType::BFLOAT16) - .to(Layout::TILE) - .to(device); + Tensor b = ttnn::zeros( + ttnn::Shape({1, 1, TILE_HEIGHT, TILE_WIDTH}), DataType::BFLOAT16, Layout::TILE, *device); for (auto bcast_math : magic_enum::enum_values()) { Tensor c = ttnn::bcast(0, a, b, bcast_math, bcast_dim); @@ -72,28 +68,28 @@ int main(int argc, char** argv) { { Tensor a = ttnn::numpy::random::random({1, 1, 32, 4544}).to(Layout::TILE).to(device); - Tensor b = ttnn::numpy::zeros({1, 1, 32, 4544}, DataType::BFLOAT16).to(Layout::TILE).to(device); + Tensor b = ttnn::zeros(ttnn::Shape({1, 1, 32, 4544}), DataType::BFLOAT16, Layout::TILE, *device); Tensor c = ttnn::bcast(0, a, b, ttnn::BcastOpMath::MUL, ttnn::BcastOpDim::H); Tensor d = c.cpu(); } { Tensor a = ttnn::numpy::random::random({1, 1, 32, 4544}).to(Layout::TILE).to(device); - Tensor b = ttnn::numpy::zeros({1, 1, 32, 4544}, DataType::BFLOAT16).to(Layout::TILE).to(device); + Tensor b = ttnn::zeros(ttnn::Shape({1, 1, 32, 4544}), DataType::BFLOAT16, Layout::TILE, *device); Tensor c = ttnn::bcast(0, a, b, ttnn::BcastOpMath::ADD, ttnn::BcastOpDim::H); Tensor d = c.cpu(); } { Tensor a = ttnn::numpy::random::random({1, 71, 32, 32}).to(Layout::TILE).to(device); - Tensor b = ttnn::numpy::zeros({1, 1, 32, 32}, DataType::BFLOAT16).to(Layout::TILE).to(device); + Tensor b = ttnn::zeros(ttnn::Shape({1, 1, 32, 32}), DataType::BFLOAT16, Layout::TILE, *device); Tensor c = ttnn::bcast(0, a, b, ttnn::BcastOpMath::MUL, ttnn::BcastOpDim::HW); Tensor 
d = c.cpu(); } { Tensor a = ttnn::numpy::random::random({1, 71, 32, 64}).to(Layout::TILE).to(device); - Tensor b = ttnn::numpy::zeros({1, 1, 32, 32}, DataType::BFLOAT16).to(Layout::TILE).to(device); + Tensor b = ttnn::zeros(ttnn::Shape({1, 1, 32, 32}), DataType::BFLOAT16, Layout::TILE, *device); Tensor c = ttnn::bcast(0, a, b, ttnn::BcastOpMath::MUL, ttnn::BcastOpDim::HW); Tensor d = c.cpu(); } diff --git a/tests/tt_eager/ops/test_bmm_op.cpp b/tests/tt_eager/ops/test_bmm_op.cpp index c7760e673543..f769870b595f 100644 --- a/tests/tt_eager/ops/test_bmm_op.cpp +++ b/tests/tt_eager/ops/test_bmm_op.cpp @@ -3,15 +3,13 @@ // SPDX-License-Identifier: Apache-2.0 #include "tt_metal/host_api.hpp" +#include "ttnn/cpp/ttnn/operations/creation.hpp" #include "ttnn/tensor/tensor.hpp" +#include "ttnn/tensor/types.hpp" #include "ttnn/operations/matmul/device/matmul_op.hpp" #include "common/constants.hpp" #include "ttnn/operations/numpy/functions.hpp" -#include -#include -#include - using namespace tt; using namespace tt_metal; using namespace constants; @@ -37,14 +35,14 @@ int main(int argc, char** argv) { uint32_t Kt = 2; uint32_t Nt = 4; uint32_t B = 5; - tt::tt_metal::LegacyShape shapea = {B, 1, Mt * TILE_HEIGHT, Kt * TILE_WIDTH}; - tt::tt_metal::LegacyShape shapeb = {B, 1, Kt * TILE_HEIGHT, Nt * TILE_WIDTH}; - tt::tt_metal::LegacyShape shapeb1 = {1, 1, Kt * TILE_HEIGHT, Nt * TILE_WIDTH}; + ttnn::Shape shapea({B, 1, Mt * TILE_HEIGHT, Kt * TILE_WIDTH}); + ttnn::Shape shapeb({B, 1, Kt * TILE_HEIGHT, Nt * TILE_WIDTH}); + ttnn::Shape shapeb1({1, 1, Kt * TILE_HEIGHT, Nt * TILE_WIDTH}); // Allocates a DRAM buffer on device populated with values specified by initialize - Tensor a = ttnn::numpy::random::random(shapea).to(Layout::TILE).to(device); - Tensor b = ttnn::numpy::zeros(shapeb, DataType::BFLOAT16).to(Layout::TILE).to(device); - Tensor b1 = ttnn::numpy::zeros(shapeb1, DataType::BFLOAT16).to(Layout::TILE).to(device); + Tensor a = 
ttnn::numpy::random::random(shapea.value).to(Layout::TILE).to(device); + Tensor b = ttnn::zeros(shapeb, DataType::BFLOAT16, Layout::TILE, *device); + Tensor b1 = ttnn::zeros(shapeb1, DataType::BFLOAT16, Layout::TILE, *device); Tensor mm = ttnn::operations::matmul::matmul( a, diff --git a/tests/tt_eager/ops/test_tilize_zero_padding_channels_last.cpp b/tests/tt_eager/ops/test_tilize_zero_padding_channels_last.cpp index 1a0d13092adf..a94093f8ac6c 100644 --- a/tests/tt_eager/ops/test_tilize_zero_padding_channels_last.cpp +++ b/tests/tt_eager/ops/test_tilize_zero_padding_channels_last.cpp @@ -7,12 +7,12 @@ #include #include "common/constants.hpp" +#include "ttnn/cpp/ttnn/operations/creation.hpp" #include "ttnn/tensor/host_buffer/functions.hpp" #include "ttnn/tensor/host_buffer/types.hpp" #include "ttnn/tensor/tensor.hpp" #include "ttnn/operations/data_movement/tilize_with_val_padding/tilize_with_val_padding.hpp" #include "tt_metal/host_api.hpp" -#include "ttnn/operations/numpy/functions.hpp" using namespace tt; using namespace tt_metal; @@ -37,7 +37,8 @@ int main(int argc, char** argv) { //////////////////////////////////////////////////////////////////////////// ttnn::SimpleShape shape{1, 32, 61, 32}; // Allocates a DRAM buffer on device populated with values specified by initialize - Tensor a = ttnn::numpy::arange(0, shape.volume(), 1).reshape(shape).to(device); + Tensor a = ttnn::arange(/*start=*/0, /*stop=*/shape.volume(), /*step=*/1, DataType::BFLOAT16, std::ref(*device)) + .reshape(shape); Tensor b = ttnn::tilize_with_zero_padding(a); Tensor c = b.cpu(); //////////////////////////////////////////////////////////////////////////// diff --git a/tests/tt_eager/tensors/test_async_tensor_apis.cpp b/tests/tt_eager/tensors/test_async_tensor_apis.cpp index b762f10acf74..3ef44800178e 100644 --- a/tests/tt_eager/tensors/test_async_tensor_apis.cpp +++ b/tests/tt_eager/tensors/test_async_tensor_apis.cpp @@ -2,13 +2,11 @@ // // SPDX-License-Identifier: Apache-2.0 -#include 
#include -#include -#include #include "common/bfloat16.hpp" #include "common/constants.hpp" +#include "ttnn/cpp/ttnn/operations/creation.hpp" #include "ttnn/tensor/host_buffer/functions.hpp" #include "ttnn/tensor/host_buffer/types.hpp" #include "ttnn/tensor/tensor.hpp" @@ -16,16 +14,16 @@ #include "ttnn/tensor/types.hpp" #include "tests/tt_metal/tt_metal/common/dispatch_fixture.hpp" #include "tt_metal/host_api.hpp" -#include "ttnn/operations/numpy/functions.hpp" #include "ttnn/operations/eltwise/binary/binary.hpp" #include "ttnn/operations/eltwise/unary/unary.hpp" -using namespace tt; -using namespace tt_metal; -using namespace constants; - +namespace tt::tt_metal { namespace { + +using ::tt::constants::TILE_HEIGHT; +using ::tt::constants::TILE_WIDTH; + uint32_t get_device_buffer_address(const Tensor& tensor) { TT_FATAL(std::holds_alternative(tensor.get_storage()), "Tensor storage is not DeviceStorage"); auto buffer = std::get(tensor.get_storage()).buffer; @@ -33,13 +31,12 @@ uint32_t get_device_buffer_address(const Tensor& tensor) { buffer->device()->push_work([&]() { result = buffer->address(); }, true); return result; } -} // namespace TEST_F(DispatchFixture, TestTensorOwnershipSanity) { // Sanity test tensor read, write and update paths with synchronous // Ensure that tensor data is copied and owned as expected Device* device = this->devices_[0]; - Tensor host_tensor = ttnn::numpy::arange(0, 32 * 32 * 4, 1); + Tensor host_tensor = ttnn::arange(/*start=*/0, /*stop=*/32 * 32 * 4, /*step=*/1, DataType::FLOAT32); Tensor readback_tensor(1); auto func = [device, host_tensor, readback_tensor]() mutable { @@ -122,18 +119,12 @@ TEST_F(DispatchFixture, TestAsyncEltwiseBinary) { for (int i = 0; i < 5; i++) { // Initialize tensors and move them to DRAM - Tensor input_tensor_a = - ttnn::numpy::full( - tt::tt_metal::LegacyShape({1, 1, 1024, 1024}), static_cast(i), DataType::BFLOAT16, Layout::TILE) - .to(device); - Tensor input_tensor_b = - ttnn::numpy::full( - 
tt::tt_metal::LegacyShape({1, 1, 1024, 1024}), static_cast(i), DataType::BFLOAT16, Layout::TILE) - .to(device); - Tensor input_tensor_c = - ttnn::numpy::full( - tt::tt_metal::LegacyShape({1, 1, 1024, 1024}), static_cast(i), DataType::BFLOAT16, Layout::TILE) - .to(device); + Tensor input_tensor_a = ttnn::full( + ttnn::Shape({1, 1, 1024, 1024}), static_cast(i), DataType::BFLOAT16, Layout::TILE, *device); + Tensor input_tensor_b = ttnn::full( + ttnn::Shape({1, 1, 1024, 1024}), static_cast(i), DataType::BFLOAT16, Layout::TILE, *device); + Tensor input_tensor_c = ttnn::full( + ttnn::Shape({1, 1, 1024, 1024}), static_cast(i), DataType::BFLOAT16, Layout::TILE, *device); Tensor output_tensor_device = ttnn::multiply(ttnn::add(input_tensor_a, input_tensor_b), input_tensor_c); Tensor output_tensor_device_2 = ttnn::neg(ttnn::subtract(output_tensor_device, input_tensor_c)); @@ -181,12 +172,18 @@ TEST_F(DispatchFixture, TestAsyncRefCountManager) { for (int i = 0; i < 5; i++) { // Run for multiple loops to ensure deterministic behaviour with device addresses // Initialize 2 tensors on device - Tensor tensor1 = ttnn::numpy::full( - tt::tt_metal::LegacyShape({1, 1, 1024, 1024}), static_cast(i), DataType::BFLOAT16) - .to(device); - Tensor tensor2 = ttnn::numpy::full( - tt::tt_metal::LegacyShape({1, 1, 1024, 1024}), static_cast(i), DataType::BFLOAT16) - .to(device); + Tensor tensor1 = ttnn::full( + ttnn::Shape({1, 1, 1024, 1024}), + static_cast(i), + DataType::BFLOAT16, + /*layout=*/std::nullopt, + *device); + Tensor tensor2 = ttnn::full( + ttnn::Shape({1, 1, 1024, 1024}), + static_cast(i), + DataType::BFLOAT16, + /*layout=*/std::nullopt, + *device); uint32_t tensor2_device_buf_addr = get_device_buffer_address(tensor2); // Assign tensor1 to tensor2 and ensure that ref counts are appropriately updated with the buffer for tensor2 // deallocated @@ -195,18 +192,23 @@ TEST_F(DispatchFixture, TestAsyncRefCountManager) { EXPECT_EQ(tensor1.tensor_attributes->main_thread_ref_count, 2); // To 
check if tensor2 is deallocated, create a third tensor on device and ensure that its address matches the // prev addr for tensor2 - Tensor tensor3 = ttnn::numpy::full( - tt::tt_metal::LegacyShape({1, 1, 1024, 1024}), static_cast(i), DataType::BFLOAT16) - .to(device); + Tensor tensor3 = ttnn::full( + ttnn::Shape({1, 1, 1024, 1024}), + static_cast(i), + DataType::BFLOAT16, + /*layout=*/std::nullopt, + *device); EXPECT_EQ(get_device_buffer_address(tensor3), tensor2_device_buf_addr); EXPECT_EQ(get_device_buffer_address(tensor1), get_device_buffer_address(tensor2)); } log_info(LogTest, "Testing Device tensor self-assignment through function"); for (int i = 0; i < 5; i++) { - Tensor device_tensor = - ttnn::numpy::full( - tt::tt_metal::LegacyShape({1, 1, 1024, 1024}), static_cast(i), DataType::BFLOAT16) - .to(device); + Tensor device_tensor = ttnn::full( + ttnn::Shape({1, 1, 1024, 1024}), + static_cast(i), + DataType::BFLOAT16, + /*layout=*/std::nullopt, + *device); uint32_t device_tensor_address = get_device_buffer_address(device_tensor); // This step will copy the tensor to a temp rval and std::move it back to the caller's instance of device_tensor // Ensure ref count and address remain unchanged @@ -217,18 +219,19 @@ TEST_F(DispatchFixture, TestAsyncRefCountManager) { log_info(LogTest, "Testing Device tensor move assignment"); for (int i = 0; i < 5; i++) { - Tensor tensor1 = ttnn::numpy::full( - tt::tt_metal::LegacyShape({1, 1, 1024, 1024}), static_cast(i), DataType::BFLOAT16) - .to(device); + Tensor tensor1 = ttnn::full( + ttnn::Shape({1, 1, 1024, 1024}), + static_cast(i), + DataType::BFLOAT16, + /*layout=*/std::nullopt, + *device); Tensor tensor2 = std::move(tensor1); EXPECT_EQ(tensor2.tensor_attributes->main_thread_ref_count, 1); } log_info(LogTest, "Testing Device tensor self-assignment"); - Tensor tensor_to_self_assign = - ttnn::numpy::full( - tt::tt_metal::LegacyShape({1, 1, 1024, 1024}), static_cast(0), DataType::BFLOAT16) - .to(device); + Tensor 
tensor_to_self_assign = ttnn::full( + ttnn::Shape({1, 1, 1024, 1024}), static_cast(0), DataType::BFLOAT16, /*layout=*/std::nullopt, *device); uint32_t tensor_to_self_assign_address = get_device_buffer_address(tensor_to_self_assign); tensor_to_self_assign = tensor_to_self_assign; EXPECT_EQ(tensor_to_self_assign.tensor_attributes->main_thread_ref_count, 1); @@ -255,7 +258,7 @@ TEST_F(DispatchFixture, TestTensorAsyncDataMovement) { { // host_tensor only lives in this scope - Tensor host_tensor = ttnn::numpy::arange(tensor_start, tensor_stop, 1); + Tensor host_tensor = ttnn::arange(tensor_start, tensor_stop, /*step=*/1, DataType::FLOAT32); log_info(LogTest, "Spawning worker thread"); worker = std::thread([tensor_stop, host_tensor, readback_tensor, device]() mutable { // Sleep for 3 seconds to ensure that main thread deallocates host_tensor @@ -338,3 +341,5 @@ TEST_F(DispatchFixture, TestTensorAsyncDataMovement) { EXPECT_EQ(readback_tensor.get_layout(), Layout::ROW_MAJOR); EXPECT_EQ(readback_tensor.get_shape(), ttnn::Shape(tt::tt_metal::LegacyShape({1, 1, 32, tensor_stop / 32}))); } +} // namespace +} // namespace tt::tt_metal diff --git a/tests/tt_eager/tensors/test_copy_and_move.cpp b/tests/tt_eager/tensors/test_copy_and_move.cpp index f735791ad4cb..656585f33519 100644 --- a/tests/tt_eager/tensors/test_copy_and_move.cpp +++ b/tests/tt_eager/tensors/test_copy_and_move.cpp @@ -2,12 +2,9 @@ // // SPDX-License-Identifier: Apache-2.0 -#include -#include -#include - #include "common/bfloat16.hpp" #include "common/constants.hpp" +#include "ttnn/cpp/ttnn/operations/creation.hpp" #include "ttnn/tensor/host_buffer/functions.hpp" #include "ttnn/tensor/host_buffer/types.hpp" #include "ttnn/tensor/tensor.hpp" @@ -40,7 +37,7 @@ bool test_tensor_copy_semantics(Device* device) { pass &= dev_a_data == dev_a_copy_data; // host tensor updated with host tensor copy assignment - Tensor host_c = ttnn::numpy::arange(0, tt_metal::compute_volume(single_tile_shape), 1) + Tensor host_c = 
ttnn::arange(/*start=*/0, /*stop=*/tt_metal::compute_volume(single_tile_shape), /*step=*/1) .reshape(single_tile_shape) .to(Layout::TILE); Tensor host_c_copy = ttnn::numpy::random::random(single_tile_shape).to(Layout::TILE); @@ -58,7 +55,7 @@ bool test_tensor_copy_semantics(Device* device) { pass &= dev_a_data == host_d_copy_data; // dev tensor updated with host tensor copy assignment - Tensor host_e = ttnn::numpy::ones(single_tile_shape).to(Layout::TILE); + Tensor host_e = ttnn::ones(single_tile_shape, DataType::BFLOAT16, Layout::TILE); Tensor dev_e_copy = ttnn::numpy::random::random(single_tile_shape).to(Layout::TILE).to(device); dev_e_copy = host_e; pass &= (dev_e_copy.storage_type() == StorageType::OWNED); @@ -67,8 +64,8 @@ bool test_tensor_copy_semantics(Device* device) { pass &= host_e_data == dev_e_copy_data; // dev tensor updated with dev tensor copy assignment - Tensor dev_b = ttnn::numpy::ones(single_tile_shape).to(Layout::TILE).to(device); - Tensor dev_b_copy = ttnn::numpy::zeros(single_tile_shape).to(Layout::TILE).to(device); + Tensor dev_b = ttnn::ones(single_tile_shape, DataType::BFLOAT16, Layout::TILE, *device); + Tensor dev_b_copy = ttnn::zeros(single_tile_shape, DataType::BFLOAT16, Layout::TILE, *device); dev_b_copy = dev_b; pass &= (dev_b_copy.storage_type() == StorageType::DEVICE); auto dev_b_on_host = dev_b.cpu(); diff --git a/tests/tt_metal/tt_metal/common/dispatch_fixture.hpp b/tests/tt_metal/tt_metal/common/dispatch_fixture.hpp index 7656ac8c147f..57bfbcdb934d 100644 --- a/tests/tt_metal/tt_metal/common/dispatch_fixture.hpp +++ b/tests/tt_metal/tt_metal/common/dispatch_fixture.hpp @@ -46,7 +46,7 @@ class DispatchFixture : public ::testing::Test { } void ReadBuffer( tt::tt_metal::Device* device, - std::shared_ptr out_buffer, + const std::shared_ptr& out_buffer, std::vector& dst_vec) { if (this->slow_dispatch_) { tt::tt_metal::detail::ReadFromBuffer(out_buffer, dst_vec); diff --git 
a/tests/ttnn/unit_tests/gtests/tensor/test_create_tensor_multi_device.cpp b/tests/ttnn/unit_tests/gtests/tensor/test_create_tensor_multi_device.cpp index 585326afc8b1..f4279cc87535 100644 --- a/tests/ttnn/unit_tests/gtests/tensor/test_create_tensor_multi_device.cpp +++ b/tests/ttnn/unit_tests/gtests/tensor/test_create_tensor_multi_device.cpp @@ -170,6 +170,25 @@ TEST_P(MultiDeviceTensorCreationTest, FullLikeWithOptTensor) { EXPECT_TRUE(std::holds_alternative(distributed_tensor_config)); } +TEST_P(MultiDeviceTensorCreationTest, Arange) { + MeshDevice* mesh_device = this->mesh_device_.get(); + mesh_device->enable_async(GetParam()); + + Tensor tensor = ttnn::arange( + /*start=*/0, + /*end=*/1024, + /*step=*/1, + ttnn::DataType::BFLOAT16, + std::ref(*mesh_device)); + + EXPECT_EQ(tensor.storage_type(), StorageType::MULTI_DEVICE); + EXPECT_EQ(tensor.get_workers().size(), mesh_device->num_devices()); + EXPECT_EQ(tensor.shape(), ttnn::SimpleShape({1, 1, 1, 1024})); + + const auto distributed_tensor_config = get_distributed_tensor_config_from_tensor(tensor); + EXPECT_TRUE(std::holds_alternative(distributed_tensor_config)); +} + INSTANTIATE_TEST_SUITE_P(AllTests, MultiDeviceTensorCreationTest, ::testing::Bool()); } // namespace diff --git a/tests/ttnn/unit_tests/gtests/test_async_runtime.cpp b/tests/ttnn/unit_tests/gtests/test_async_runtime.cpp index 7e1ab23115ef..b5495a324dbb 100644 --- a/tests/ttnn/unit_tests/gtests/test_async_runtime.cpp +++ b/tests/ttnn/unit_tests/gtests/test_async_runtime.cpp @@ -2,6 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 +#include "ttnn/cpp/ttnn/operations/creation.hpp" #include "ttnn/tensor/tensor.hpp" #include "ttnn/tensor/layout/tensor_layout.hpp" #include "ttnn_multi_command_queue_fixture.hpp" @@ -10,14 +11,13 @@ #include "ttnn/operations/moreh/moreh_sum/moreh_sum.hpp" #include "common/bfloat16.hpp" #include "ttnn/async_runtime.hpp" -#include "ttnn/operations/numpy/functions.hpp" #include "tt_metal/impl/event/event.hpp" #include -using 
namespace tt; -using namespace tt_metal; -using MultiCommandQueueSingleDeviceFixture = ttnn::MultiCommandQueueSingleDeviceFixture; -using namespace constants; +namespace tt::tt_metal { +namespace { + +using MultiCommandQueueSingleDeviceFixture = ::ttnn::MultiCommandQueueSingleDeviceFixture; TEST_F(MultiCommandQueueSingleDeviceFixture, TestAsyncPreallocatedOutputs) { Device* device = this->device_; @@ -40,16 +40,14 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestAsyncPreallocatedOutputs) { host_data[i] = bfloat16(static_cast(1)); } // Create golden data using tt_eager APIs - Tensor np_tensor = ttnn::numpy::full(input_shape.value, static_cast(1), DataType::BFLOAT16) - .to(Layout::TILE) - .to(device); + Tensor np_tensor = ttnn::full(input_shape, static_cast(1), DataType::BFLOAT16, Layout::TILE, *device_); ttnn::SmallVector reduce_dims = {3}; Tensor np_out = ttnn::moreh_sum(np_tensor, reduce_dims, false, std::nullopt, std::nullopt, std::nullopt); Tensor np_out_host = np_out.cpu(); const bfloat16* golden_output = std::get>(std::get(np_out_host.get_storage()).buffer).begin(); // Enable Asynchronous Execution and test ttnn runtime APIs - device->enable_async(true); + device_->enable_async(true); // Events for host - device synchronization auto write_event = std::make_shared(); auto workload_event = std::make_shared(); @@ -63,9 +61,9 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestAsyncPreallocatedOutputs) { output_buf_size_datums * datum_size_bytes, tensor_layout.compute_packed_buffer_size_bytes(np_out.get_padded_shape())); auto input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device( - device, TensorSpec(input_shape.padded_shape(), tensor_layout)); + device_, TensorSpec(input_shape.padded_shape(), tensor_layout)); auto output_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device( - device, TensorSpec(np_out.get_padded_shape(), tensor_layout)); + device_, TensorSpec(np_out.get_padded_shape(), tensor_layout)); auto input_storage = 
tt::tt_metal::DeviceStorage{input_buffer}; auto output_storage = tt::tt_metal::DeviceStorage{output_buffer}; Tensor input_tensor = Tensor(input_storage, input_shape, DataType::BFLOAT16, Layout::TILE); @@ -73,13 +71,13 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestAsyncPreallocatedOutputs) { // Populate input_tensor with data ttnn::write_buffer(io_cq, input_tensor, {host_data}); // Record the completion of the write event - ttnn::record_event(device->command_queue(io_cq), write_event); + ttnn::record_event(device_->command_queue(io_cq), write_event); // Host stalls until write is completed, before sending workload ttnn::event_synchronize(write_event); // Dispatch workload. Preallocated output_tensor is populated by op/ ttnn::moreh_sum(input_tensor, /*dim*/ 3, false, output_tensor, std::nullopt, std::nullopt); // Record completion of workload - ttnn::record_event(device->command_queue(workload_dispatch_cq), workload_event); + ttnn::record_event(device_->command_queue(workload_dispatch_cq), workload_event); ttnn::event_synchronize(workload_event); // Read output back, once workload is complete ttnn::read_buffer(io_cq, output_tensor, {readback_data}); @@ -93,7 +91,7 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestAsyncPreallocatedOutputs) { // Deallocate tensors (tensor gives up buffer). Done asynchronously, so sync on queue after. input_tensor.deallocate(); output_tensor.deallocate(); - ttnn::queue_synchronize(device->command_queue(io_cq)); + ttnn::queue_synchronize(device_->command_queue(io_cq)); // Buffer only has 2 owners in main thread. 
EXPECT_EQ(input_buffer.use_count(), 2); EXPECT_EQ(output_buffer.use_count(), 2); @@ -103,8 +101,7 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestAsyncPreallocatedOutputs) { } TEST_F(MultiCommandQueueSingleDeviceFixture, TestAsyncRuntimeAllocatedBuffers) { - Device* device = this->device_; - device->enable_async(true); + device_->enable_async(true); MemoryConfig mem_cfg = MemoryConfig{ .memory_layout = tt::tt_metal::TensorMemoryLayout::INTERLEAVED, .buffer_type = BufferType::DRAM, @@ -131,26 +128,26 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestAsyncRuntimeAllocatedBuffers) { TensorLayout tensor_layout(DataType::BFLOAT16, PageConfig(Layout::TILE), mem_cfg); ASSERT_EQ(buf_size_datums * datum_size_bytes, tensor_layout.compute_packed_buffer_size_bytes(shape)); auto input_buffer = - tt::tt_metal::tensor_impl::allocate_buffer_on_device(device, TensorSpec(shape, tensor_layout)); + tt::tt_metal::tensor_impl::allocate_buffer_on_device(device_, TensorSpec(shape, tensor_layout)); auto input_storage = tt::tt_metal::DeviceStorage{input_buffer}; Tensor input_tensor = Tensor(input_storage, shape, DataType::BFLOAT16, Layout::TILE); - ttnn::write_buffer(io_cq, input_tensor, {host_data}); // Write using cq 1 - ttnn::record_event(device->command_queue(io_cq), write_event); // Record write on cq 1 + ttnn::write_buffer(io_cq, input_tensor, {host_data}); // Write using cq 1 + ttnn::record_event(device_->command_queue(io_cq), write_event); // Record write on cq 1 // Wait until cq 1 write is complete - ttnn::wait_for_event(device->command_queue(workload_dispatch_cq), write_event); + ttnn::wait_for_event(device_->command_queue(workload_dispatch_cq), write_event); // Run operation on cq 0 Tensor output_tensor = ttnn::sqrt(workload_dispatch_cq, input_tensor); auto dummy_buffer_0 = - tt::tt_metal::tensor_impl::allocate_buffer_on_device(device, TensorSpec(shape, tensor_layout)); + tt::tt_metal::tensor_impl::allocate_buffer_on_device(device_, TensorSpec(shape, tensor_layout)); 
output_tensor = ttnn::neg(workload_dispatch_cq, output_tensor); // Allocate this buffer to stress test async allocation across op execution and explicit allocation auto dummy_buffer_1 = - tt::tt_metal::tensor_impl::allocate_buffer_on_device(device, TensorSpec(shape, tensor_layout)); + tt::tt_metal::tensor_impl::allocate_buffer_on_device(device_, TensorSpec(shape, tensor_layout)); // Record cq 0 prog execution - ttnn::record_event(device->command_queue(workload_dispatch_cq), workload_event); + ttnn::record_event(device_->command_queue(workload_dispatch_cq), workload_event); // Wait until cq 0 prog execution is done - ttnn::wait_for_event(device->command_queue(io_cq), workload_event); + ttnn::wait_for_event(device_->command_queue(io_cq), workload_event); // Read using cq 1 ttnn::read_buffer(io_cq, output_tensor, {readback_data}); for (int i = 0; i < buf_size_datums; i++) { @@ -166,8 +163,7 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestAsyncRuntimeBufferDestructor) { // Test functionality for the buffer destructor, which will call deallocate asynchronously // We must ensure that the deallocate step, which can run after the buffer has been destroyed // does not rely on stale buffer state, after the buffer has been destroyed on host - Device* device = this->device_; - device->enable_async(true); + device_->enable_async(true); MemoryConfig mem_cfg = MemoryConfig{ .memory_layout = tt::tt_metal::TensorMemoryLayout::INTERLEAVED, .buffer_type = BufferType::DRAM, @@ -182,9 +178,9 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestAsyncRuntimeBufferDestructor) { TensorLayout tensor_layout(DataType::BFLOAT16, PageConfig(Layout::TILE), mem_cfg); TensorSpec tensor_spec(shape, tensor_layout); for (int loop = 0; loop < 100000; loop++) { - { - auto input_buffer_dummy = tt::tt_metal::tensor_impl::allocate_buffer_on_device(device, tensor_spec); - device->synchronize(); - } + auto input_buffer_dummy = tt::tt_metal::tensor_impl::allocate_buffer_on_device(device_, tensor_spec); + 
device_->synchronize(); } } +} // namespace +} // namespace tt::tt_metal diff --git a/tests/ttnn/unit_tests/operations/test_creation.py b/tests/ttnn/unit_tests/operations/test_creation.py index f6f6773dc815..79f09ca122df 100644 --- a/tests/ttnn/unit_tests/operations/test_creation.py +++ b/tests/ttnn/unit_tests/operations/test_creation.py @@ -297,6 +297,39 @@ def test_arange(device, start, end, step): assert_with_pcc(torch_output_tensor, output_tensor, 0.9999) +@pytest.mark.parametrize( + "start", + [4, 8, 16, 32], +) +@pytest.mark.parametrize( + "end", + [100, 200, 300], +) +@pytest.mark.parametrize( + "step", + [1, 2, 3, 4, 5], +) +def test_arange_multi_device(mesh_device, start, end, step): + torch_input_tensor = torch.rand((start, end, step), dtype=torch.bfloat16) + torch_output_tensor = torch.arange(start, end, step) + + output_tensor = ttnn.arange( + torch_input_tensor.shape[0], + torch_input_tensor.shape[1], + torch_input_tensor.shape[2], + ttnn.bfloat16, + mesh_device, + ) + output_tensor = ttnn.to_layout(output_tensor, ttnn.ROW_MAJOR_LAYOUT) + output_tensor = ttnn.from_device(output_tensor) + output_tensors = ttnn.to_torch(output_tensor, mesh_composer=ttnn.ListMeshToTensor(mesh_device)) + for output_tensor in output_tensors: + output_tensor = output_tensor[-1, -1, -1, :] + if divup((end - start), step) % 2 != 0: + output_tensor = output_tensor[:-1] + assert_with_pcc(torch_output_tensor, output_tensor, 0.9999) + + @pytest.mark.parametrize( "input_shapes", [ diff --git a/ttnn/cpp/pybind11/operations/creation.hpp b/ttnn/cpp/pybind11/operations/creation.hpp index 6581766ccb06..26a5e1778c59 100644 --- a/ttnn/cpp/pybind11/operations/creation.hpp +++ b/ttnn/cpp/pybind11/operations/creation.hpp @@ -138,7 +138,7 @@ auto create_pybind_empty_like_overload() { const ttnn::Tensor& reference, const std::optional& dtype, const std::optional& layout, - const std::optional> device, + const std::optional> device, const std::optional& memory_config) -> ttnn::Tensor { return 
self(reference, dtype, layout, device, memory_config); }, @@ -150,6 +150,26 @@ auto create_pybind_empty_like_overload() { py::arg("memory_config") = ttnn::DRAM_MEMORY_CONFIG}; } +template +auto create_pybind_arange_overload() { + return ttnn::pybind_overload_t{ + [](const creation_operation_t& self, + const int64_t start, + const int64_t end, + const int64_t step, + const DataType dtype, + const std::optional> device, + const MemoryConfig& memory_config) -> ttnn::Tensor { + return self(start, end, step, dtype, device, memory_config); + }, + py::arg("start") = 0, + py::arg("end"), + py::arg("step") = 1, + py::arg("dtype") = DataType::BFLOAT16, + py::arg("device") = std::nullopt, + py::arg("memory_config") = ttnn::DRAM_MEMORY_CONFIG}; +} + template void bind_full_operation(py::module& module, const creation_operation_t& operation) { auto doc = fmt::format( @@ -350,22 +370,8 @@ void bind_arange_operation(py::module& module, const creation_operation_t& opera module, operation, doc, - ttnn::pybind_overload_t{ - [](const creation_operation_t& self, - const int64_t start, - const int64_t end, - const int64_t step, - const DataType dtype, - const std::optional>& device, - const MemoryConfig& memory_config) -> ttnn::Tensor { - return self(start, end, step, dtype, device, memory_config); - }, - py::arg("start") = 0, - py::arg("end"), - py::arg("step") = 1, - py::arg("dtype") = DataType::BFLOAT16, - py::arg("device") = std::nullopt, - py::arg("memory_config") = ttnn::DRAM_MEMORY_CONFIG}); + create_pybind_arange_overload(), + create_pybind_arange_overload()); } template diff --git a/ttnn/cpp/ttnn/operations/creation.hpp b/ttnn/cpp/ttnn/operations/creation.hpp index acd2914c98ff..3267e2dab295 100644 --- a/ttnn/cpp/ttnn/operations/creation.hpp +++ b/ttnn/cpp/ttnn/operations/creation.hpp @@ -69,6 +69,89 @@ inline std::vector get_workers_from_device(OptionalAnyDevice device) { return device.has_value() ? 
device->get_devices() : std::vector{}; } +template +static Tensor arange_impl( + const int64_t start, + const int64_t stop, + const int64_t step, + const Layout layout = Layout::ROW_MAJOR, + OptionalAnyDevice device = std::nullopt, + const MemoryConfig& output_mem_config = MemoryConfig{ + .memory_layout = tt::tt_metal::TensorMemoryLayout::INTERLEAVED}) { + constexpr DataType data_type = tt::tt_metal::convert_to_data_type(); + // Current implementation restrictions + TT_ASSERT(step > 0, "Step must be greater than 0"); + TT_ASSERT(start < stop, "Start must be less than step"); + auto size = tt::div_up((stop - start), step); + if (size % 2 != 0) { + size++; + } + auto owned_buffer = tt::tt_metal::owned_buffer::create(size); + + auto index = 0; + for (auto value = start; value < stop; value += step) { + if constexpr (std::is_same_v) { + owned_buffer[index++] = T(static_cast(value)); + } else { + owned_buffer[index++] = static_cast(value); + } + } + auto output = Tensor( + OwnedStorage{owned_buffer}, + ttnn::SimpleShape{1, 1, 1, static_cast(size)}, + data_type, + Layout::ROW_MAJOR) + .to(layout); + if (device.has_value()) { + output = output.to(device->get_devices(), output_mem_config); + } + return output; +} + +template +static Tensor full_impl( + uint8_t queue_id, + const tt::tt_metal::LegacyShape& shape, + T value, + const Layout layout, + const std::vector& devices, + const MemoryConfig& output_mem_config, + std::optional optional_output_tensor) { + constexpr DataType data_type = tt::tt_metal::convert_to_data_type(); + TensorSpec tensor_spec( + shape.logical_shape(), + TensorLayout::fromLegacyPaddedShape(data_type, PageConfig(layout), MemoryConfig{}, shape)); + auto owned_buffer = tt::tt_metal::owned_buffer::create(tensor_spec.padded_shape().volume()); + // TODO: 15061 - Generalize the header to support generic vector / view types. 
+ std::fill(std::begin(owned_buffer), std::end(owned_buffer), value); + + if (!optional_output_tensor.has_value()) { + auto output = Tensor(OwnedStorage{owned_buffer}, shape, data_type, layout); + if (!devices.empty()) { + output = output.to(devices, output_mem_config); + } + return output; + } else { + const auto buffers = optional_output_tensor->buffers(); + const bool using_fast_dispatch = (std::getenv("TT_METAL_SLOW_DISPATCH_MODE") == nullptr); + + for (auto* buffer : buffers) { + if (using_fast_dispatch) { + auto& cmd_queue = buffer->device()->command_queue(queue_id); + if (CommandQueue::default_mode() == CommandQueue::CommandQueueMode::ASYNC) { + tt::tt_metal::EnqueueWriteBuffer(cmd_queue, *buffer, owned_buffer.get_ptr(), /*blocking=*/false); + } else { + tt::tt_metal::EnqueueWriteBuffer(cmd_queue, *buffer, owned_buffer.data(), /*blocking=*/false); + } + } else { + tt::tt_metal::detail::WriteToBuffer(*buffer, owned_buffer.get()); + } + } + + return *optional_output_tensor; + } +} + } // namespace detail template @@ -122,8 +205,19 @@ inline ttnn::Tensor full_impl( MemoryConfig mem_cfg = optional_output_tensor.has_value() ? 
optional_output_tensor.value().memory_config() : memory_config.value_or(ttnn::DRAM_MEMORY_CONFIG); - return numpy::full_impl( - queue_id, shape_value, fill_value, dtype_value, layout_value, workers, mem_cfg, optional_output_tensor); + auto concrete_full = [&](BufferType fill_value) { + return detail::full_impl( + queue_id, shape_value, fill_value, layout_value, workers, mem_cfg, optional_output_tensor); + }; + + switch (dtype_value) { + case DataType::UINT8: return concrete_full.template operator()(fill_value); + case DataType::UINT16: return concrete_full.template operator()(fill_value); + case DataType::UINT32: return concrete_full.template operator()(fill_value); + case DataType::FLOAT32: return concrete_full.template operator()(fill_value); + case DataType::BFLOAT16: return concrete_full.template operator()<::bfloat16>(static_cast(fill_value)); + default: TT_THROW("Unsupported DataType!"); + } } template @@ -287,10 +381,12 @@ struct EmptyLike { }; struct Full { + template + requires std::is_same_v or std::is_same_v static ttnn::Tensor invoke( uint8_t queue_id, const ttnn::Shape& shape, - const float fill_value, + const FillValueType fill_value, const std::optional& dtype = std::nullopt, const std::optional& layout = std::nullopt, detail::OptionalAnyDevice device = std::nullopt, @@ -307,48 +403,11 @@ struct Full { optional_output_tensor); } - static ttnn::Tensor invoke( - uint8_t queue_id, - const ttnn::Shape& shape, - const int fill_value, - const std::optional& dtype = std::nullopt, - const std::optional& layout = std::nullopt, - detail::OptionalAnyDevice device = std::nullopt, - const std::optional& memory_config = std::nullopt, - std::optional optional_output_tensor = std::nullopt) { - return full_impl( - queue_id, - shape, - fill_value, - dtype, - layout, - detail::get_workers_from_device(device), - memory_config, - optional_output_tensor); - } - - static ttnn::Tensor invoke( - const ttnn::Shape& shape, - const float fill_value, - const std::optional& dtype 
= std::nullopt, - const std::optional& layout = std::nullopt, - detail::OptionalAnyDevice device = std::nullopt, - const std::optional& memory_config = std::nullopt, - std::optional optional_output_tensor = std::nullopt) { - return full_impl( - ttnn::DefaultQueueId, - shape, - fill_value, - dtype, - layout, - detail::get_workers_from_device(device), - memory_config, - optional_output_tensor); - } - + template + requires std::is_same_v or std::is_same_v static ttnn::Tensor invoke( const ttnn::Shape& shape, - const int fill_value, + const FillValueType fill_value, const std::optional& dtype = std::nullopt, const std::optional& layout = std::nullopt, detail::OptionalAnyDevice device = std::nullopt, @@ -367,10 +426,12 @@ struct Full { }; struct FullLike { + template + requires std::is_same_v or std::is_same_v static ttnn::Tensor invoke( uint8_t queue_id, const ttnn::Tensor& tensor, - const float fill_value, + const FillValueType fill_value, const std::optional& dtype = std::nullopt, const std::optional& layout = std::nullopt, detail::OptionalAnyDevice device = std::nullopt, @@ -380,34 +441,11 @@ struct FullLike { queue_id, tensor, fill_value, dtype, layout, device, memory_config, optional_output_tensor); } + template + requires std::is_same_v or std::is_same_v static ttnn::Tensor invoke( - uint8_t queue_id, const ttnn::Tensor& tensor, - const int fill_value, - const std::optional& dtype = std::nullopt, - const std::optional& layout = std::nullopt, - detail::OptionalAnyDevice device = std::nullopt, - const std::optional& memory_config = std::nullopt, - std::optional optional_output_tensor = std::nullopt) { - return full_like_impl( - queue_id, tensor, fill_value, dtype, layout, device, memory_config, optional_output_tensor); - } - - static ttnn::Tensor invoke( - const ttnn::Tensor& tensor, - const float fill_value, - const std::optional& dtype = std::nullopt, - const std::optional& layout = std::nullopt, - detail::OptionalAnyDevice device = std::nullopt, - const 
std::optional& memory_config = std::nullopt, - std::optional optional_output_tensor = std::nullopt) { - return full_like_impl( - ttnn::DefaultQueueId, tensor, fill_value, dtype, layout, device, memory_config, optional_output_tensor); - } - - static ttnn::Tensor invoke( - const ttnn::Tensor& tensor, - const int fill_value, + const FillValueType fill_value, const std::optional& dtype = std::nullopt, const std::optional& layout = std::nullopt, detail::OptionalAnyDevice device = std::nullopt, @@ -418,12 +456,11 @@ struct FullLike { } }; -// TODO: #14974 - Onboard this API onto AnyDevice. struct Arange { static ttnn::Tensor invoke( const int64_t stop, const DataType dtype = DataType::BFLOAT16, - const std::optional>& device = std::nullopt, + detail::OptionalAnyDevice device = std::nullopt, const MemoryConfig& memory_config = ttnn::DRAM_MEMORY_CONFIG) { return Arange::invoke(0, stop, 1, dtype, device, memory_config); } @@ -433,20 +470,18 @@ struct Arange { const int64_t stop, const int64_t step = 1, const DataType dtype = ttnn::DataType::BFLOAT16, - const std::optional>& device_arg = std::nullopt, + detail::OptionalAnyDevice device = std::nullopt, const MemoryConfig& memory_config = ttnn::DRAM_MEMORY_CONFIG) { - Device* device = device_arg.has_value() ? 
&(device_arg.value().get()) : nullptr; + auto concrete_arange = [&]() { + return detail::arange_impl(start, stop, step, ttnn::ROW_MAJOR_LAYOUT, device, memory_config); + }; + switch (dtype) { - case DataType::BFLOAT16: - return numpy::arange<::bfloat16>(start, stop, step, ttnn::ROW_MAJOR_LAYOUT, device, memory_config); - case DataType::FLOAT32: - return numpy::arange(start, stop, step, ttnn::ROW_MAJOR_LAYOUT, device, memory_config); - case DataType::UINT16: - return numpy::arange(start, stop, step, ttnn::ROW_MAJOR_LAYOUT, device, memory_config); - case DataType::UINT32: - return numpy::arange(start, stop, step, ttnn::ROW_MAJOR_LAYOUT, device, memory_config); - case DataType::INT32: - return numpy::arange(start, stop, step, ttnn::ROW_MAJOR_LAYOUT, device, memory_config); + case DataType::BFLOAT16: return concrete_arange.template operator()<::bfloat16>(); + case DataType::FLOAT32: return concrete_arange.template operator()(); + case DataType::UINT16: return concrete_arange.template operator()(); + case DataType::UINT32: return concrete_arange.template operator()(); + case DataType::INT32: return concrete_arange.template operator()(); default: TT_THROW("Unsupported dtype"); } } diff --git a/ttnn/cpp/ttnn/operations/numpy/functions.hpp b/ttnn/cpp/ttnn/operations/numpy/functions.hpp index 31f1ec32efe1..51a3668eed1a 100644 --- a/ttnn/cpp/ttnn/operations/numpy/functions.hpp +++ b/ttnn/cpp/ttnn/operations/numpy/functions.hpp @@ -26,195 +26,6 @@ using tt::tt_metal::MemoryConfig; using tt::tt_metal::OwnedStorage; using tt::tt_metal::StorageType; using tt::tt_metal::Tensor; -namespace detail { - -template -constexpr static DataType get_data_type() { - if constexpr (std::is_same_v) { - return DataType::UINT8; - } else if constexpr (std::is_same_v) { - return DataType::UINT16; - } else if constexpr (std::is_same_v) { - return DataType::INT32; - } else if constexpr (std::is_same_v) { - return DataType::UINT32; - } else if constexpr (std::is_same_v) { - return DataType::FLOAT32; 
- } else if constexpr (std::is_same_v) { - return DataType::BFLOAT16; - } else { - TT_THROW("Unsupported DataType!"); - } -} - -template -static Tensor full( - uint8_t queue_id, - const tt::tt_metal::LegacyShape& shape, - T value, - const Layout layout, - const std::vector& devices, - const MemoryConfig& output_mem_config, - std::optional optional_output_tensor) { - constexpr DataType data_type = detail::get_data_type(); - TensorSpec tensor_spec( - shape.logical_shape(), - TensorLayout::fromLegacyPaddedShape(data_type, PageConfig(layout), MemoryConfig{}, shape)); - auto owned_buffer = tt::tt_metal::owned_buffer::create(tensor_spec.padded_shape().volume()); - // TODO: 15061 - Generalize the header to support generic vector / view types. - std::fill(std::begin(owned_buffer), std::end(owned_buffer), value); - - if (!optional_output_tensor.has_value()) { - auto output = Tensor(OwnedStorage{owned_buffer}, shape, data_type, layout); - if (!devices.empty()) { - output = output.to(devices, output_mem_config); - } - return output; - } else { - const auto buffers = optional_output_tensor->buffers(); - const bool using_fast_dispatch = (std::getenv("TT_METAL_SLOW_DISPATCH_MODE") == nullptr); - - for (auto* buffer : buffers) { - if (using_fast_dispatch) { - auto& cmd_queue = buffer->device()->command_queue(queue_id); - if (CommandQueue::default_mode() == CommandQueue::CommandQueueMode::ASYNC) { - tt::tt_metal::EnqueueWriteBuffer(cmd_queue, *buffer, owned_buffer.get_ptr(), /*blocking=*/false); - } else { - tt::tt_metal::EnqueueWriteBuffer(cmd_queue, *buffer, owned_buffer.data(), /*blocking=*/false); - } - } else { - tt::tt_metal::detail::WriteToBuffer(*buffer, owned_buffer.get()); - } - } - - return *optional_output_tensor; - } -} - -} // namespace detail - -template -static Tensor full_impl( - uint8_t queue_id, - const tt::tt_metal::LegacyShape& shape, - const T value, - const DataType data_type, - const Layout layout, - const std::vector& devices, - const MemoryConfig& 
output_mem_config, - std::optional optional_output_tensor) { - switch (data_type) { - case DataType::UINT8: { - return detail::full( - queue_id, shape, uint8_t(value), layout, devices, output_mem_config, optional_output_tensor); - } - case DataType::UINT16: { - return detail::full( - queue_id, shape, uint16_t(value), layout, devices, output_mem_config, optional_output_tensor); - } - case DataType::UINT32: { - return detail::full( - queue_id, shape, uint32_t(value), layout, devices, output_mem_config, optional_output_tensor); - } - case DataType::FLOAT32: { - return detail::full( - queue_id, shape, float(value), layout, devices, output_mem_config, optional_output_tensor); - } - case DataType::BFLOAT16: { - return detail::full<::bfloat16>( - queue_id, - shape, - ::bfloat16(static_cast(value)), - layout, - devices, - output_mem_config, - optional_output_tensor); - } - default: TT_THROW("Unsupported DataType!"); - } -} - -// TODO: #14974 - Can this be deleted, as it is only used in tests? -template -static Tensor full( - const tt::tt_metal::LegacyShape& shape, - const T value, - const DataType data_type, - const Layout layout = Layout::ROW_MAJOR, - Device* device = nullptr, - const MemoryConfig& output_mem_config = MemoryConfig{ - .memory_layout = tt::tt_metal::TensorMemoryLayout::INTERLEAVED}) { - return full_impl( - ttnn::DefaultQueueId, - shape, - value, - data_type, - layout, - device ? std::vector{device} : std::vector{}, - output_mem_config, - std::nullopt); -} - -// TODO: #14974 - Can this be deleted, as it is only used in tests? 
-static Tensor zeros( - const tt::tt_metal::LegacyShape& shape, - const DataType data_type = DataType::BFLOAT16, - const Layout layout = Layout::ROW_MAJOR, - Device* device = nullptr, - const MemoryConfig& output_mem_config = MemoryConfig{ - .memory_layout = tt::tt_metal::TensorMemoryLayout::INTERLEAVED}) { - return full(shape, 0.0f, data_type, layout, device, output_mem_config); -} - -// TODO: #14974 - Can this be deleted, as it is only used in tests? -static Tensor ones( - const tt::tt_metal::LegacyShape& shape, - const DataType data_type = DataType::BFLOAT16, - const Layout layout = Layout::ROW_MAJOR, - Device* device = nullptr, - const MemoryConfig& output_mem_config = MemoryConfig{ - .memory_layout = tt::tt_metal::TensorMemoryLayout::INTERLEAVED}) { - return full(shape, 1.0f, data_type, layout, device, output_mem_config); -} - -template -static Tensor arange( - const int64_t start, - const int64_t stop, - const int64_t step, - const Layout layout = Layout::ROW_MAJOR, - Device* device = nullptr, - const MemoryConfig& output_mem_config = MemoryConfig{ - .memory_layout = tt::tt_metal::TensorMemoryLayout::INTERLEAVED}) { - constexpr DataType data_type = detail::get_data_type(); - // Current implementation restrictions - TT_ASSERT(step > 0, "Step must be greater than 0"); - TT_ASSERT(start < stop, "Start must be less than step"); - auto size = tt::div_up((stop - start), step); - if (size % 2 != 0) { - size++; - } - auto owned_buffer = tt::tt_metal::owned_buffer::create(size); - - auto index = 0; - for (auto value = start; value < stop; value += step) { - if constexpr (std::is_same_v) { - owned_buffer[index++] = T(static_cast(value)); - } else { - owned_buffer[index++] = static_cast(value); - } - } - auto output = Tensor( - OwnedStorage{owned_buffer}, - ttnn::SimpleShape{1, 1, 1, static_cast(size)}, - data_type, - Layout::ROW_MAJOR) - .to(layout); - if (device != nullptr) { - output = output.to(device, output_mem_config); - } - return output; -} template static 
Tensor index_trilu( @@ -671,7 +482,7 @@ static void seed(std::size_t seed) { RANDOM_GENERATOR = std::mt19937(seed); } template static Tensor uniform(T low, T high, const tt::tt_metal::LegacyShape& shape, const Layout layout = Layout::ROW_MAJOR) { - constexpr DataType data_type = detail::get_data_type(); + constexpr DataType data_type = tt::tt_metal::convert_to_data_type(); auto owned_buffer = tt::tt_metal::owned_buffer::create(tt::tt_metal::compute_volume(shape)); diff --git a/ttnn/cpp/ttnn/tensor/types.hpp b/ttnn/cpp/ttnn/tensor/types.hpp index 3666c7101139..ed8c8a95b145 100644 --- a/ttnn/cpp/ttnn/tensor/types.hpp +++ b/ttnn/cpp/ttnn/tensor/types.hpp @@ -41,6 +41,25 @@ enum class DataType { INVALID = 8, }; +template +consteval inline DataType convert_to_data_type() { + if constexpr (std::is_same_v) { + return DataType::UINT8; + } else if constexpr (std::is_same_v) { + return DataType::UINT16; + } else if constexpr (std::is_same_v) { + return DataType::INT32; + } else if constexpr (std::is_same_v) { + return DataType::UINT32; + } else if constexpr (std::is_same_v) { + return DataType::FLOAT32; + } else if constexpr (std::is_same_v) { + return DataType::BFLOAT16; + } else { + static_assert(tt::stl::concepts::always_false_v, "Unsupported DataType!"); + } +} + inline bool is_floating_point(DataType dtype) { switch (dtype) { case DataType::BFLOAT16: From fd2cd27f93eddeb7ec16cba1890ba1dbf0235314 Mon Sep 17 00:00:00 2001 From: Juan Camilo Vega Date: Mon, 9 Dec 2024 16:37:43 -0500 Subject: [PATCH 15/59] #15835: Adding tile support to unsqueeze OP (#15837) ### Ticket Link to Github Issue https://github.com/tenstorrent/tt-metal/issues/15835 ### Problem description We do the following to unsqueeze: Add Tile Support Add parameter validation Fix bugs in logic for handling dim=rank case Added testing to validate the changes Sweeps pass rate from 56% to 88.62% Remaining errors should be resolved with this PR https://github.com/tenstorrent/tt-metal/pull/15289 as they pertain to 
the 0 sized dimensions cases ### Checklist - [x] Post commit CI passes https://github.com/tenstorrent/tt-metal/actions/runs/12240795525 - [ ] Blackhole Post commit (if applicable) - [ ] Model regression CI testing passes (if applicable) - [ ] Device performance regression CI testing passes (if applicable) - [ ] **(For models and ops writers)** Full [new models](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) tests passes - [x] New/Existing tests provide coverage for changes --- tests/ttnn/unit_tests/test_unsqueeze.py | 38 ++++++++++++++----- .../data_movement/unsqueeze/unsqueeze.cpp | 25 +++++++----- 2 files changed, 45 insertions(+), 18 deletions(-) diff --git a/tests/ttnn/unit_tests/test_unsqueeze.py b/tests/ttnn/unit_tests/test_unsqueeze.py index cbd10b3cc66c..40bf576cdf78 100644 --- a/tests/ttnn/unit_tests/test_unsqueeze.py +++ b/tests/ttnn/unit_tests/test_unsqueeze.py @@ -10,20 +10,40 @@ @pytest.mark.parametrize( - "input_shape, dim", + "input_shape, dim, layout", [ - ((1, 1, 256), 2), - ((1, 1, 256), -2), - ((1, 256), 1), - ((1, 1, 30), 2), - ((1, 1, 30), -2), - ((1, 30), 1), + ((1, 1, 253), 2, ttnn.ROW_MAJOR_LAYOUT), + ((1, 1, 253), -2, ttnn.ROW_MAJOR_LAYOUT), + ((1, 253), 1, ttnn.ROW_MAJOR_LAYOUT), + ((1, 1, 253), -2, ttnn.TILE_LAYOUT), + ((1, 253), 1, ttnn.TILE_LAYOUT), + ((57, 83), 1, ttnn.TILE_LAYOUT), + ((123, 259), -2, ttnn.TILE_LAYOUT), + ((57, 83), 1, ttnn.ROW_MAJOR_LAYOUT), + ((123, 259), -2, ttnn.ROW_MAJOR_LAYOUT), + ((8732,), 1, ttnn.ROW_MAJOR_LAYOUT), + ((8732,), -1, ttnn.ROW_MAJOR_LAYOUT), + ((8732,), 0, ttnn.ROW_MAJOR_LAYOUT), ], ) -def test_unsqueeze(device, input_shape, dim): +def test_unsqueeze(device, input_shape, dim, layout): torch_input_tensor = torch.rand(input_shape, dtype=torch.bfloat16) torch_unsqueeze_tensor = torch.unsqueeze(torch_input_tensor, dim) - input_tensor = ttnn.from_torch(torch_input_tensor, layout=ttnn.ROW_MAJOR_LAYOUT, device=device) + input_tensor = 
ttnn.from_torch(torch_input_tensor, layout=layout, device=device) ttnn_output = ttnn.unsqueeze(input_tensor, dim) torch_output_tensor = ttnn.to_torch(ttnn_output) assert torch.allclose(torch_output_tensor, torch_unsqueeze_tensor) + + +@pytest.mark.parametrize( + "input_shape, dim, layout", + [ + ((1, 1, 253), 4, ttnn.ROW_MAJOR_LAYOUT), + ((1, 1, 253), -5, ttnn.ROW_MAJOR_LAYOUT), + ], +) +def test_invalid_cases(device, input_shape, dim, layout): + torch_input_tensor = torch.rand(input_shape, dtype=torch.bfloat16) + input_tensor = ttnn.from_torch(torch_input_tensor, layout=layout, device=device) + with pytest.raises(RuntimeError): + ttnn.unsqueeze(input_tensor, dim) diff --git a/ttnn/cpp/ttnn/operations/data_movement/unsqueeze/unsqueeze.cpp b/ttnn/cpp/ttnn/operations/data_movement/unsqueeze/unsqueeze.cpp index 1ab05bd0bb9c..636b544ae2b4 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/unsqueeze/unsqueeze.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/unsqueeze/unsqueeze.cpp @@ -9,17 +9,24 @@ namespace ttnn::operations::data_movement { ttnn::Tensor UnsqueezeOperation::invoke(const ttnn::Tensor& input_tensor, const int dim) { const auto tensor_shape = input_tensor.get_shape(); - const auto rank = tensor_shape.rank(); - SmallVector output_shape_vector; + const uint32_t rank = tensor_shape.rank(); + const int32_t max_dim = (int)(rank); + const int32_t min_dim = -(max_dim)-1; - TT_FATAL( - input_tensor.get_layout() == Layout::ROW_MAJOR or (!tensor_shape.has_tile_padding()), - "Currently supporing ROW-MAJOR tensors or TILE tensors with no padding"); + SmallVector output_shape_vector; - int normal_dim = dim; + int normal_dim; // Handle negative dimension by converting it to positive + TT_FATAL( + (dim >= min_dim) && (dim <= max_dim), + "Dimension out of range (expected to be in range of [{},{}], but got {})", + min_dim, + max_dim, + dim); if (dim < 0) { - normal_dim += rank + 1; + normal_dim = rank + 1 + dim; + } else { + normal_dim = dim; } // Insert new dimension @@ 
-31,11 +38,11 @@ ttnn::Tensor UnsqueezeOperation::invoke(const ttnn::Tensor& input_tensor, const } // If the dimension is at the end, append it - if (normal_dim >= tensor_shape.size()) { + if (normal_dim == rank) { output_shape_vector.push_back(1); } - return ttnn::reshape(input_tensor, ttnn::SimpleShape(std::move(output_shape_vector))); + return ttnn::reshape(input_tensor, output_shape_vector); } } // namespace ttnn::operations::data_movement From 7415ed5713f3ee3891f9b73c1f98faa0b87f6e0e Mon Sep 17 00:00:00 2001 From: Denys Makoviichuk Date: Mon, 9 Dec 2024 13:51:20 -0800 Subject: [PATCH 16/59] [TT-Train]Added mappers updated tt tensor utils (#15637) ### Ticket TODO: add my ticket. ### Problem description Need to move distributed.py to the c++ using xtensor. ### What's changed * Added TensorToMesh and MeshToTensor * Added xtensor support * Added Indestructible ### Checklist - [x] Post commit CI passes - [x] Blackhole Post commit (if applicable) - [x] Model regression CI testing passes (if applicable) - [x] Device performance regression CI testing passes (if applicable) - [x] New/Existing tests provide coverage for changes https://github.com/tenstorrent/tt-metal/actions/runs/12243084710 --- tt-train/cmake/dependencies.cmake | 8 + tt-train/sources/ttml/CMakeLists.txt | 1 + .../sources/ttml/autograd/auto_context.cpp | 32 +- .../sources/ttml/autograd/auto_context.hpp | 14 +- .../sources/ttml/core/distributed_mapping.hpp | 283 ++++++++++++++++++ tt-train/sources/ttml/core/indestructible.hpp | 40 +++ tt-train/sources/ttml/core/mesh_device.cpp | 7 +- tt-train/sources/ttml/core/mesh_device.hpp | 4 +- .../sources/ttml/core/tt_tensor_utils.cpp | 67 ++++- .../sources/ttml/core/tt_tensor_utils.hpp | 53 +++- .../sources/ttml/core/ttnn_all_includes.hpp | 5 +- .../ttml/core/xtensor_all_includes.hpp | 18 ++ tt-train/sources/ttml/core/xtensor_utils.cpp | 65 ++++ tt-train/sources/ttml/core/xtensor_utils.hpp | 59 ++++ tt-train/tests/3rd_party/xtensor_test.cpp | 74 ++++- 
tt-train/tests/core/distributed_test.cpp | 245 +++++++++++++++ tt-train/tests/core/n300_utils_test.cpp | 167 +++++++++++ tt-train/tests/core/tensor_utils_test.cpp | 32 +- 18 files changed, 1145 insertions(+), 29 deletions(-) create mode 100644 tt-train/sources/ttml/core/distributed_mapping.hpp create mode 100644 tt-train/sources/ttml/core/indestructible.hpp create mode 100644 tt-train/sources/ttml/core/xtensor_all_includes.hpp create mode 100644 tt-train/sources/ttml/core/xtensor_utils.cpp create mode 100644 tt-train/sources/ttml/core/xtensor_utils.hpp create mode 100644 tt-train/tests/core/distributed_test.cpp create mode 100644 tt-train/tests/core/n300_utils_test.cpp diff --git a/tt-train/cmake/dependencies.cmake b/tt-train/cmake/dependencies.cmake index 8972da328919..2f6102a8c55a 100644 --- a/tt-train/cmake/dependencies.cmake +++ b/tt-train/cmake/dependencies.cmake @@ -58,6 +58,14 @@ CPMAddPackage(NAME xtl GITHUB_REPOSITORY xtensor-stack/xtl GIT_TAG 0.7.7 OPTIONS CPMAddPackage(NAME xtensor GITHUB_REPOSITORY xtensor-stack/xtensor GIT_TAG 0.25.0 OPTIONS "XTENSOR_ENABLE_TESTS OFF") +CPMAddPackage( + NAME xtensor-blas + GITHUB_REPOSITORY xtensor-stack/xtensor-blas + GIT_TAG 0.21.0 + OPTIONS + "XTENSOR_ENABLE_TESTS OFF" +) + include(${PROJECT_SOURCE_DIR}/cmake/fetch_msgpack.cmake) include(${PROJECT_SOURCE_DIR}/cmake/fetch_cli11.cmake) diff --git a/tt-train/sources/ttml/CMakeLists.txt b/tt-train/sources/ttml/CMakeLists.txt index 9919e85f89ce..0e241cd7bb67 100644 --- a/tt-train/sources/ttml/CMakeLists.txt +++ b/tt-train/sources/ttml/CMakeLists.txt @@ -96,6 +96,7 @@ target_link_libraries( magic_enum yaml-cpp::yaml-cpp xtensor + xtensor-blas xtl tokenizers_cpp wandbcpp diff --git a/tt-train/sources/ttml/autograd/auto_context.cpp b/tt-train/sources/ttml/autograd/auto_context.cpp index dbe16758b810..ea0e27e269b4 100644 --- a/tt-train/sources/ttml/autograd/auto_context.cpp +++ b/tt-train/sources/ttml/autograd/auto_context.cpp @@ -22,8 +22,8 @@ uint32_t 
AutoContext::get_seed() const { } AutoContext& AutoContext::get_instance() { - static AutoContext instance; - return instance; + static core::Indestructible instance{}; + return instance.get(); } std::optional AutoContext::add_backward_node(GradFunction&& grad_function, std::span links) { if (m_grads_mode == GradMode::DISABLED) { @@ -42,10 +42,36 @@ void AutoContext::reset_graph() { m_graph.reset(); } +void AutoContext::open_device() { + if (m_device) { + throw std::runtime_error("open_device was called after the device was created."); + } + m_device = std::make_unique(m_mesh_shape); +} + +void AutoContext::close_device() { + m_device = nullptr; +} + ttnn::distributed::MeshDevice& AutoContext::get_device() { - return device.get_device(); + if (!m_device) { + open_device(); + } + + return m_device->get_device(); } AutoContext::AutoContext() : m_generator(m_seed) { } + +void AutoContext::set_mesh_shape(tt::tt_metal::distributed::MeshShape shape) { + if (m_device) { + throw std::runtime_error("set_mesh_shape was called after the device was created."); + } + m_mesh_shape = shape; +} + +tt::tt_metal::distributed::MeshShape AutoContext::get_mesh_shape() const { + return m_mesh_shape; +} } // namespace ttml::autograd diff --git a/tt-train/sources/ttml/autograd/auto_context.hpp b/tt-train/sources/ttml/autograd/auto_context.hpp index 92002025cd73..a4124862ed31 100644 --- a/tt-train/sources/ttml/autograd/auto_context.hpp +++ b/tt-train/sources/ttml/autograd/auto_context.hpp @@ -4,8 +4,10 @@ #pragma once +#include #include +#include "core/indestructible.hpp" #include "core/mesh_device.hpp" #include "graph.hpp" @@ -40,6 +42,14 @@ class AutoContext { ~AutoContext() = default; // to make it work with unique_ptr. 
ttnn::distributed::MeshDevice& get_device(); + + void set_mesh_shape(tt::tt_metal::distributed::MeshShape shape); + [[nodiscard]] tt::tt_metal::distributed::MeshShape get_mesh_shape() const; + + void open_device(); + + void close_device(); + private: AutoContext(); uint32_t m_seed = 5489U; @@ -48,8 +58,10 @@ class AutoContext { GradMode m_grads_mode = GradMode::ENABLED; Graph m_graph; + tt::tt_metal::distributed::MeshShape m_mesh_shape = {1, 1}; + std::unique_ptr m_device; - core::MeshDevice device{0}; + friend class core::Indestructible; }; inline auto& ctx() { diff --git a/tt-train/sources/ttml/core/distributed_mapping.hpp b/tt-train/sources/ttml/core/distributed_mapping.hpp new file mode 100644 index 000000000000..102240e51e23 --- /dev/null +++ b/tt-train/sources/ttml/core/distributed_mapping.hpp @@ -0,0 +1,283 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include +#include +#include + +#include "core/xtensor_utils.hpp" + +namespace ttml::core { +template +std::vector> chunk(const xt::xarray& tensor, int num_chunks, int dim) { + if (num_chunks <= 0) { + throw std::invalid_argument("num_chunks must be > 0"); + } + if (dim < 0 || static_cast(dim) >= tensor.dimension()) { + throw std::invalid_argument("invalid dimension index"); + } + + int size_along_dim = static_cast(tensor.shape()[dim]); + if (num_chunks > size_along_dim) { + throw std::invalid_argument("num_chunks cannot exceed the size of the tensor along the given dimension."); + } + + if (num_chunks == 1) { + return {tensor}; + } + + int chunk_size = (size_along_dim + num_chunks - 1) / num_chunks; + int remaining_size = size_along_dim; + + std::vector> chunks; + chunks.reserve(static_cast(num_chunks)); + + int start = 0; + int end = 0; + for (int i = 0; i < num_chunks && end < size_along_dim; ++i) { + int current_chunk_size = std::min(chunk_size, remaining_size); + remaining_size -= current_chunk_size; + end = start + 
current_chunk_size; + + // Build indices for slicing + xt::xstrided_slice_vector indices(tensor.dimension(), xt::all()); + indices[dim] = xt::range(start, end); + + auto chunk_view = xt::strided_view(tensor, indices); + + // Construct xarray from the view + // This forces a copy of that slice into a new xarray + chunks.push_back(xt::xarray(chunk_view)); + start = end; + } + + return chunks; +} + +template +class XTensorToMesh { +public: + XTensorToMesh(tt::tt_metal::distributed::MeshShape mesh_shape) : m_mesh_shape(std::move(mesh_shape)) { + } + + std::vector> map(const xt::xarray& tensor) const { + return static_cast(this)->map_impl(tensor); + } + + std::unordered_map config() const { + return static_cast(this)->config_impl(); + } + +protected: + tt::tt_metal::distributed::MeshShape m_mesh_shape; + + size_t get_num_devices() const { + return m_mesh_shape.first * m_mesh_shape.second; + } +}; + +template +class MeshToXTensor { +public: + MeshToXTensor(tt::tt_metal::distributed::MeshShape mesh_shape) : m_mesh_shape(std::move(mesh_shape)) { + } + + std::vector> compose(const std::vector>& tensors) const { + return static_cast(this)->compose_impl(tensors); + } + +protected: + tt::tt_metal::distributed::MeshShape m_mesh_shape; +}; + +template +class ShardXTensorToMesh : public XTensorToMesh, T> { +public: + using Base = XTensorToMesh, T>; + ShardXTensorToMesh(tt::tt_metal::distributed::MeshShape mesh_shape, int dim) : + Base(std::move(mesh_shape)), m_shard_dim(dim) { + } + + std::vector> map_impl(const xt::xarray& tensor) const { + int num_devices = Base::get_num_devices(); + auto sliced_tensors = chunk(tensor, num_devices, m_shard_dim); + return sliced_tensors; + } + + std::unordered_map config_impl() const { + return {{"strategy", "shard"}, {"shard_dim", std::to_string(m_shard_dim)}}; + } + +private: + int m_shard_dim = 0; +}; + +template +class ShardTensor2dMesh : public XTensorToMesh, T> { +public: + using Base = XTensorToMesh, T>; + ShardTensor2dMesh( + 
tt::tt_metal::distributed::MeshShape mesh_shape, + const std::pair, std::optional>& dims) : + Base(std::move(mesh_shape)), m_dims(dims) { + // We trust the provided mesh shape and do not validate against a MeshDevice. + } + + std::vector> map_impl(const xt::xarray& tensor) const { + if (!m_dims.first.has_value() && !m_dims.second.has_value()) { + throw std::invalid_argument("ShardTensor2dMesh requires at least one dimension to shard"); + } + + int rows = Base::m_mesh_shape.first; + int cols = Base::m_mesh_shape.second; + auto row_dim = m_dims.first; + auto col_dim = m_dims.second; + + std::vector> row_tensors; + + // Shard along rows + if (!row_dim.has_value()) { + row_tensors.reserve(rows); + for (int i = 0; i < rows; ++i) { + row_tensors.push_back(tensor); + } + } else { + row_tensors = chunk(tensor, rows, row_dim.value()); + } + + std::vector> tensor_shards; + tensor_shards.reserve(static_cast(rows * cols)); + // Shard along columns + if (!col_dim.has_value()) { + for (const auto& t : row_tensors) { + for (int i = 0; i < cols; ++i) { + tensor_shards.push_back(t); + } + } + } else { + for (const auto& t : row_tensors) { + auto col_chunks = chunk(t, cols, col_dim.value()); + tensor_shards.insert(tensor_shards.end(), col_chunks.begin(), col_chunks.end()); + } + } + + if (static_cast(tensor_shards.size()) != rows * cols) { + throw std::runtime_error(fmt::format( + "ShardTensor2dMesh: Sharding failed. Number of shards should match the product of the mesh " + "dimensions. 
Size: {}, rows: {}, cols: {}", + tensor_shards.size(), + rows, + cols)); + } + + return tensor_shards; + } + + std::unordered_map config_impl() const { + return { + {"strategy", "shard_2d"}, + {"mesh_shape_y", std::to_string(Base::m_mesh_shape.first)}, + {"mesh_shape_x", std::to_string(Base::m_mesh_shape.second)}}; + } + +private: + std::pair, std::optional> m_dims; +}; + +template +class ConcatMesh2dToTensor : public MeshToXTensor, T> { +public: + using Base = MeshToXTensor, T>; + ConcatMesh2dToTensor( + tt::tt_metal::distributed::MeshShape mesh_shape, const tt::tt_metal::distributed::MeshShape& dims) : + Base(std::move(mesh_shape)), m_dims(dims) { + if (m_dims.first == m_dims.second) { + throw std::invalid_argument("Dimensions in 'dims' must be different"); + } + } + + std::vector> compose_impl(const std::vector>& tensors) const { + int rows = Base::m_mesh_shape.first; + int cols = Base::m_mesh_shape.second; + size_t row_dim = m_dims.first; + size_t col_dim = m_dims.second; + + std::vector> row_concatenated; + row_concatenated.reserve(static_cast(rows)); + + for (int i = 0; i < rows; ++i) { + auto row_start = tensors.begin() + i * cols; + auto row_end = row_start + cols; + std::vector> row_tensors(row_start, row_end); + + auto concatenated_row = core::concatenate(row_tensors, col_dim); + row_concatenated.push_back(std::move(concatenated_row)); + } + + auto result = core::concatenate(row_concatenated, row_dim); + return {result}; + } + +private: + tt::tt_metal::distributed::MeshShape m_dims; +}; + +template +class ReplicateXTensorToMesh : public XTensorToMesh, T> { +public: + using Base = XTensorToMesh, T>; + ReplicateXTensorToMesh(tt::tt_metal::distributed::MeshShape mesh_shape) : Base(std::move(mesh_shape)) { + } + + std::vector> map_impl(const xt::xarray& tensor) const { + int num_devices = Base::get_num_devices(); + std::vector> tensors; + tensors.reserve(static_cast(num_devices)); + for (int i = 0; i < num_devices; ++i) { + tensors.push_back(tensor); // Note: 
this copies the tensor + } + return tensors; + } + + std::unordered_map config_impl() const { + int num_devices = Base::get_num_devices(); + return {{"strategy", "replicate"}, {"replication_factor", std::to_string(num_devices)}}; + } +}; + +template +class ConcatMeshToXTensor : public MeshToXTensor, T> { +public: + using Base = MeshToXTensor, T>; + ConcatMeshToXTensor(tt::tt_metal::distributed::MeshShape mesh_shape, int dim) : + Base(std::move(mesh_shape)), m_concat_dim(dim) { + } + + std::vector> compose_impl(const std::vector>& tensors) const { + return {core::concatenate(tensors, m_concat_dim)}; + } + +private: + int m_concat_dim = 0; +}; + +template +class VectorMeshToXTensor : public MeshToXTensor, T> { +public: + using Base = MeshToXTensor, T>; + VectorMeshToXTensor([[maybe_unused]] tt::tt_metal::distributed::MeshShape mesh_shape) : Base(mesh_shape) { + } + std::vector> compose_impl(const std::vector>& tensors) const { + return tensors; + } +}; + +template +using XTensorToMeshVariant = std::variant, ShardTensor2dMesh, ReplicateXTensorToMesh>; + +template +using MeshToXTensorVariant = std::variant, ConcatMesh2dToTensor, VectorMeshToXTensor>; + +} // namespace ttml::core diff --git a/tt-train/sources/ttml/core/indestructible.hpp b/tt-train/sources/ttml/core/indestructible.hpp new file mode 100644 index 000000000000..eb30d101bd2d --- /dev/null +++ b/tt-train/sources/ttml/core/indestructible.hpp @@ -0,0 +1,40 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include + +namespace ttml::core { + +template +class Indestructible { +public: + template + explicit Indestructible(Args&&... 
args) { + // Construct T in our aligned storage + new (&storage) T(std::forward(args)...); + } + + T& get() { + return *reinterpret_cast(&storage); + } + + const T& get() const { + return *reinterpret_cast(&storage); + } + + // Disable copy and assignment + Indestructible(const Indestructible&) = delete; + Indestructible& operator=(const Indestructible&) = delete; + + // Destructor does NOT call T's destructor. + // This leaves the object "indestructible." + ~Indestructible() = default; + +private: + // A buffer of unsigned char with alignment of T and size of T + alignas(T) unsigned char storage[sizeof(T)]; +}; + +} // namespace ttml::core diff --git a/tt-train/sources/ttml/core/mesh_device.cpp b/tt-train/sources/ttml/core/mesh_device.cpp index f30bbf9b8844..33f3d0265561 100644 --- a/tt-train/sources/ttml/core/mesh_device.cpp +++ b/tt-train/sources/ttml/core/mesh_device.cpp @@ -4,13 +4,11 @@ #include "mesh_device.hpp" -#include - namespace ttml::core { -MeshDevice::MeshDevice([[maybe_unused]] int device_index) : +MeshDevice::MeshDevice(tt::tt_metal::distributed::MeshShape shape) : m_mesh_device(ttnn::distributed::api::open_mesh_device( - ttnn::distributed::MeshShape(1, 1), + shape, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, /* num_command_queues*/ 1, @@ -25,6 +23,7 @@ MeshDevice::MeshDevice([[maybe_unused]] int device_index) : } MeshDevice::~MeshDevice() { + assert(m_mesh_device); ttnn::distributed::api::close_mesh_device(m_mesh_device); } diff --git a/tt-train/sources/ttml/core/mesh_device.hpp b/tt-train/sources/ttml/core/mesh_device.hpp index 1d38bbfe3bb6..490f9d5b45c2 100644 --- a/tt-train/sources/ttml/core/mesh_device.hpp +++ b/tt-train/sources/ttml/core/mesh_device.hpp @@ -4,14 +4,14 @@ #pragma once -#include #include +#include namespace ttml::core { // should I implement pimpl or its fine class MeshDevice { public: - explicit MeshDevice(int device_index); + explicit MeshDevice(tt::tt_metal::distributed::MeshShape shape); MeshDevice(MeshDevice&& 
device) = default; MeshDevice(const MeshDevice&) = delete; diff --git a/tt-train/sources/ttml/core/tt_tensor_utils.cpp b/tt-train/sources/ttml/core/tt_tensor_utils.cpp index d9f20c55ff1b..706c8d98dfc5 100644 --- a/tt-train/sources/ttml/core/tt_tensor_utils.cpp +++ b/tt-train/sources/ttml/core/tt_tensor_utils.cpp @@ -8,13 +8,14 @@ #include #include -#include #include #include #include #include #include +#include "core/xtensor_utils.hpp" + namespace { template @@ -180,6 +181,55 @@ tt::tt_metal::Tensor ones(const ttnn::Shape& shape, ttnn::distributed::MeshDevic return core::full(shape, 1.F, device, dtype); } +template +[[nodiscard]] tt::tt_metal::Tensor from_xtensors_to_host( + const std::vector>& buffers, const std::unordered_map& config) { + std::vector host_owned_buffers; + std::vector host_owned_shapes; + host_owned_buffers.reserve(buffers.size()); + host_owned_shapes.reserve(buffers.size()); + if (buffers.empty()) { + throw std::runtime_error("Cannot create a host buffer from an empty vector of xtensors!"); + } + auto first_shape = buffers.front().shape(); + for (int i = 0; i < buffers.size(); ++i) { + if (buffers[i].shape() != first_shape) { + throw std::runtime_error(fmt::format( + "Cannot create a host buffer from xtensors with different shapes: {} vs {}!", + get_shape_4d(buffers[0]), + get_shape_4d(buffers[i]))); + } + } + for (const auto& buffer : buffers) { + auto shape = create_shape(get_shape_4d(buffer)); + + if constexpr (std::is_same_v) { + auto owned_buffer = + create_owned_buffer_from_vector_of_floats(std::vector(buffer.begin(), buffer.end()), TensorType); + host_owned_buffers.push_back(owned_buffer); + } else { + auto owned_buffer = tt::tt_metal::owned_buffer::create(std::vector(buffer.begin(), buffer.end())); + host_owned_buffers.push_back(owned_buffer); + } + + host_owned_shapes.push_back(shape); + } + auto distributed_tensor_config = get_distributed_tensor_config(config); + auto storage = tt::tt_metal::MultiDeviceHostStorage( + 
distributed_tensor_config, std::move(host_owned_buffers), host_owned_shapes); + + // remove possible paddings from the shape (it conflicts with ROW MAJOR) + auto output = Tensor(std::move(storage), host_owned_shapes[0], TensorType, Layout::ROW_MAJOR); + return output; +} + +template tt::tt_metal::Tensor from_xtensors_to_host( + const std::vector>& buffers, const std::unordered_map& config); +template tt::tt_metal::Tensor from_xtensors_to_host( + const std::vector>& buffers, const std::unordered_map& config); +template tt::tt_metal::Tensor from_xtensors_to_host( + const std::vector>& buffers, const std::unordered_map& config); + template <> tt::tt_metal::Tensor from_vector( const std::vector& buffer, const ttnn::Shape& shape, ttnn::distributed::MeshDevice* device, Layout layout) { @@ -195,17 +245,10 @@ tt::tt_metal::Tensor from_vector( auto owned_buffer = create_owned_buffer_from_vector_of_floats(buffer, data_type); // remove possible paddings from the shape (it conflicts with ROW MAJOR) auto output = tt::tt_metal::Tensor(OwnedStorage{owned_buffer}, logical_shape, data_type, Layout::ROW_MAJOR); - - auto to_device_even_fast = [&]() { - output = ttnn::to_device(output, device, output_mem_config); - if (layout == Layout::TILE) { - output = ttnn::tilize_with_zero_padding(output, output_mem_config, std::nullopt, /* multicore */ true); - } - - return output; - }; - - output = to_device_even_fast(); + output = ttnn::to_device(output, device, output_mem_config); + if (layout == Layout::TILE) { + output = ttnn::tilize_with_zero_padding(output, output_mem_config, std::nullopt, /* multicore */ true); + } return output; } diff --git a/tt-train/sources/ttml/core/tt_tensor_utils.hpp b/tt-train/sources/ttml/core/tt_tensor_utils.hpp index 5d809935ea9e..6775bde4e6ca 100644 --- a/tt-train/sources/ttml/core/tt_tensor_utils.hpp +++ b/tt-train/sources/ttml/core/tt_tensor_utils.hpp @@ -5,9 +5,11 @@ #pragma once #include -#include +#include #include +#include 
"core/distributed_mapping.hpp" + namespace ttml::core { void print_tensor_stats(const tt::tt_metal::Tensor& tensor, const std::string& name); @@ -31,6 +33,10 @@ template ttnn::distributed::MeshDevice* device, Layout layout = Layout::TILE); +template +[[nodiscard]] tt::tt_metal::Tensor from_xtensors_to_host( + const std::vector>& buffers, const std::unordered_map& config); + template [[nodiscard]] std::vector to_vector(const tt::tt_metal::Tensor& tensor); @@ -38,4 +44,49 @@ template [[nodiscard]] ttnn::Shape create_shape(const std::array& args); +template +[[nodiscard]] tt::tt_metal::Tensor from_xtensor( + const xt::xarray& buffer, ttnn::distributed::MeshDevice* device, Layout layout = Layout::TILE) { + auto shape = create_shape(get_shape_4d(buffer)); + auto buffer_view = xtensor_to_span(buffer); + return from_vector(std::vector(buffer_view.begin(), buffer_view.end()), shape, device, layout); +} + +template +[[nodiscard]] xt::xarray to_xtensor(const tt::tt_metal::Tensor& tensor) { + auto vec = to_vector(tensor); + auto shape = tensor.get_shape().logical_shape(); + return span_to_xtensor(std::span(vec.data(), vec.size()), shape); +} + +template +auto to_xtensor(const tt::tt_metal::Tensor& tensor, const MeshToXTensorVariant& composer) { + auto cpu_tensor = tensor.cpu(); + cpu_tensor = cpu_tensor.to(Layout::ROW_MAJOR); + auto cpu_tensors = ttnn::distributed::api::get_device_tensors(cpu_tensor); + std::vector> res; + res.reserve(cpu_tensors.size()); + for (const auto& shard : cpu_tensors) { + res.push_back(to_xtensor(shard)); + } + return std::visit([&res](auto&& arg) { return arg.compose(res); }, composer); +} + +template +tt::tt_metal::Tensor from_xtensor( + const xt::xarray& tensor, + ttnn::distributed::MeshDevice* device, + const XTensorToMeshVariant& composer, + Layout layout = Layout::TILE) { + auto sharded_tensors = std::visit([&tensor](auto&& arg) { return arg.map(tensor); }, composer); + auto config = std::visit([](auto&& arg) { return arg.config(); }, 
composer); + auto output = from_xtensors_to_host(sharded_tensors, config); + MemoryConfig output_mem_config{}; + output = ttnn::to_device(output, device, output_mem_config); + if (layout == Layout::TILE) { + output = ttnn::tilize_with_zero_padding(output, output_mem_config, std::nullopt, /* multicore */ true); + } + return output; +} + } // namespace ttml::core diff --git a/tt-train/sources/ttml/core/ttnn_all_includes.hpp b/tt-train/sources/ttml/core/ttnn_all_includes.hpp index c01c7b804c28..d41cf6eea2fe 100644 --- a/tt-train/sources/ttml/core/ttnn_all_includes.hpp +++ b/tt-train/sources/ttml/core/ttnn_all_includes.hpp @@ -9,7 +9,8 @@ #pragma GCC diagnostic ignored "-Wdeprecated-volatile" #pragma GCC diagnostic ignored "-Wdeprecated-this-capture" -#include // NOLINT +#include // NOLINT +#include #include // NOLINT #include // NOLINT #include // NOLINT @@ -54,8 +55,10 @@ #include // NOLINT #include // NOLINT #include // NOLINT +#include // NOLINT #include // NOLINT #include // NOLINT #include // NOLINT #include // NOLINT + #pragma GCC diagnostic pop diff --git a/tt-train/sources/ttml/core/xtensor_all_includes.hpp b/tt-train/sources/ttml/core/xtensor_all_includes.hpp new file mode 100644 index 000000000000..12bdd2addb8f --- /dev/null +++ b/tt-train/sources/ttml/core/xtensor_all_includes.hpp @@ -0,0 +1,18 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include diff --git a/tt-train/sources/ttml/core/xtensor_utils.cpp b/tt-train/sources/ttml/core/xtensor_utils.cpp new file mode 100644 index 000000000000..96c0d0a7c1ff --- /dev/null +++ b/tt-train/sources/ttml/core/xtensor_utils.cpp @@ -0,0 +1,65 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#include "xtensor_utils.hpp" + +namespace ttml::core { +namespace detail { +template +auto 
vector_to_tuple_helper(const std::vector& v, std::index_sequence) { + return std::make_tuple(v[Indices]...); +} + +template +auto vector_to_tuple(const std::vector& buffer) { + assert(buffer.size() >= N); + return vector_to_tuple_helper(buffer, std::make_index_sequence()); +} + +template +xt::xarray concat_helper(const std::vector>& v, size_t axis = 0) { + constexpr int FIXED_N = N < 2 ? 2 : N; + if (N < 2) { + throw std::runtime_error("Tuple size in concatenate must be greater than 1"); + } + auto tuple = detail::vector_to_tuple(v); + return xt::concatenate(std::move(tuple), axis); +} + +template +consteval auto create_array_impl(std::index_sequence) { + return std::array (*)(const std::vector>& v, size_t axis), sizeof...(I)>{ + concat_helper...}; +} + +template +consteval auto create_array() { + return create_array_impl(std::make_index_sequence()); +} + +} // namespace detail + +template +xt::xarray concatenate(const std::vector>& v, size_t axis) { + constexpr size_t MAX_TUPLE_SIZE = 64; + + if (v.empty()) { + return {}; + } + if (v.size() == 1) { + return v.front(); + } + if (v.size() > MAX_TUPLE_SIZE) { + throw std::runtime_error( + fmt::format("Number of tensors to concatenate exceeds the maximum supported size {}", MAX_TUPLE_SIZE)); + } + constexpr auto table = detail::create_array(); + return (*table[v.size()])(v, axis); +} + +template xt::xarray concatenate(const std::vector>& v, size_t axis); +template xt::xarray concatenate(const std::vector>& v, size_t axis); +template xt::xarray concatenate(const std::vector>& v, size_t axis); +template xt::xarray concatenate(const std::vector>& v, size_t axis); +} // namespace ttml::core diff --git a/tt-train/sources/ttml/core/xtensor_utils.hpp b/tt-train/sources/ttml/core/xtensor_utils.hpp new file mode 100644 index 000000000000..153323f3e328 --- /dev/null +++ b/tt-train/sources/ttml/core/xtensor_utils.hpp @@ -0,0 +1,59 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: 
Apache-2.0 + +#pragma once + +#include +#include +#include +#include + +// TODO: decide if we want to use xarray everwhere or xtensor is ok +/* +Difference between xtensor and xarray: + +xarray : tensor that can be reshaped to any number of dimensions. xtensor : tensor with a number of dimensions +set to N at compile time. xtensor_fixed : tensor whose shape is fixed at compile time. +*/ + +namespace ttml::core { +template +xt::xarray span_to_xtensor(std::span vec, const ttnn::SimpleShape& shape) { + std::vector shape_vec(shape.cbegin(), shape.cend()); + return xt::adapt(vec.data(), vec.size(), xt::no_ownership(), shape_vec); +} +template +auto xtensor_to_span(const xt::xarray& xtensor) { + auto adaptor = xt::adapt(xtensor.data(), xtensor.size(), xt::no_ownership()); + return std::span(adaptor.data(), adaptor.size()); +} + +// TODO: decide if we want to keep this function with E or use the xtensor type directly +template +std::array get_shape_4d(const E& expr) { + const int max_dims = 4; + // TODO: Ensure that E is an xtensor expression + + // Retrieve the shape of the tensor + auto& expr_shape = expr.shape(); + std::array shape4d = {1, 1, 1, 1}; + + size_t dims = expr_shape.size(); + + if (dims > max_dims) { + throw std::runtime_error(fmt::format("Number of dimensions {} greater than max_shape {}", dims, max_dims)); + } + + // Copy the dimensions into the shape array + for (size_t i = 0; i < dims; ++i) { + shape4d[i + max_dims - dims] = static_cast(expr_shape[i]); + } + + return shape4d; +} + +template +xt::xarray concatenate(const std::vector>& v, size_t axis = 0); + +} // namespace ttml::core diff --git a/tt-train/tests/3rd_party/xtensor_test.cpp b/tt-train/tests/3rd_party/xtensor_test.cpp index ddd5c3b63fde..6a5b6317c17e 100644 --- a/tt-train/tests/3rd_party/xtensor_test.cpp +++ b/tt-train/tests/3rd_party/xtensor_test.cpp @@ -4,9 +4,9 @@ #include -#include -#include -#include +#include + +#include "core/xtensor_utils.hpp" TEST(XTensorTest, BasicOperations) { // 
Create an xtensor array @@ -27,3 +27,71 @@ TEST(XTensorTest, BasicOperations) { // Verify the result EXPECT_TRUE(xt::allclose(arr2, expected)); } + +TEST(XTensorTest, SpanToXtensor) { + std::vector data = {1, 2, 3, 4, 5, 6}; + std::span data_span(data.data(), data.size()); + ttnn::SimpleShape shape({2, 3}); + + auto result = ttml::core::span_to_xtensor(data_span, shape); + + // Check shape + EXPECT_EQ(result.shape().size(), 2); + EXPECT_EQ(result.shape()[0], 2); + EXPECT_EQ(result.shape()[1], 3); + + // Check data + int expected_val = 1; + for (size_t i = 0; i < result.shape()[0]; ++i) { + for (size_t j = 0; j < result.shape()[1]; ++j) { + EXPECT_EQ(result(i, j), expected_val++); + } + } +} + +// Test xtensor_to_span +TEST(XTensorTest, XtensorToSpan) { + xt::xarray arr = {{1.0f, 2.0f}, {3.0f, 4.0f}}; + auto span_result = ttml::core::xtensor_to_span(arr); + + EXPECT_EQ(span_result.size(), arr.size()); + + // Check data + size_t index = 0; + for (float val : arr) { + EXPECT_FLOAT_EQ(span_result[index++], val); + } +} + +// Test get_shape_4d +TEST(XTensorTest, GetShape4D) { + // Test a 4D shape + xt::xarray arr_4d = xt::xarray::from_shape({2, 3, 4, 5}); + auto shape4d = ttml::core::get_shape_4d(arr_4d); + EXPECT_EQ(shape4d[0], 2); + EXPECT_EQ(shape4d[1], 3); + EXPECT_EQ(shape4d[2], 4); + EXPECT_EQ(shape4d[3], 5); + + // Test a 2D shape, should zero-pad to the left (or right) as per logic + xt::xarray arr_2d = xt::xarray::from_shape({10, 20}); + auto shape2d = ttml::core::get_shape_4d(arr_2d); + // dims=2, so shape4d = {1, 1, 10, 20} + EXPECT_EQ(shape2d[0], 1); + EXPECT_EQ(shape2d[1], 1); + EXPECT_EQ(shape2d[2], 10); + EXPECT_EQ(shape2d[3], 20); + + // Test a 1D shape + xt::xarray arr_1d = xt::xarray::from_shape({7}); + auto shape1d = ttml::core::get_shape_4d(arr_1d); + // dims=1, so shape4d = {1, 1, 1, 7} + EXPECT_EQ(shape1d[0], 1); + EXPECT_EQ(shape1d[1], 1); + EXPECT_EQ(shape1d[2], 1); + EXPECT_EQ(shape1d[3], 7); + + // Test throwing an exception for >4D + 
xt::xarray arr_5d = xt::xarray::from_shape({2, 2, 2, 2, 2}); + EXPECT_THROW(ttml::core::get_shape_4d(arr_5d), std::runtime_error); +} diff --git a/tt-train/tests/core/distributed_test.cpp b/tt-train/tests/core/distributed_test.cpp new file mode 100644 index 000000000000..e273aaa4973d --- /dev/null +++ b/tt-train/tests/core/distributed_test.cpp @@ -0,0 +1,245 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include + +#include "core/distributed_mapping.hpp" + +template +class MeshOpsTest : public ::testing::Test { +protected: + // Common setup could go here if needed +}; + +using TestTypes = ::testing::Types; +TYPED_TEST_SUITE(MeshOpsTest, TestTypes); + +TYPED_TEST(MeshOpsTest, ChunkBasicNonDivisible3) { + // Create a 1D tensor: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] + // Using TypeParam ensures we test both uint32_t and float. + xt::xarray tensor = xt::arange(10); + + // Chunk into 3 parts along dimension 0 + auto chunks = ttml::core::chunk(tensor, 3, 0); + + ASSERT_EQ(chunks.size(), 3u); + EXPECT_EQ(chunks[0].shape()[0], 4u); // first chunk size 4 + EXPECT_EQ(chunks[1].shape()[0], 4u); // next chunk size 4 + EXPECT_EQ(chunks[2].shape()[0], 2u); // last chunk size 2 +} + +TYPED_TEST(MeshOpsTest, ChunkBasicLessChunksThanProvided) { + // Create a 1D tensor: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,11,12] + xt::xarray tensor = xt::arange(13); + + // Chunk into 6 parts along dimension 0 + auto chunks = ttml::core::chunk(tensor, 6, 0); + + ASSERT_EQ(chunks.size(), 5u); + EXPECT_EQ(chunks[0].shape()[0], 3u); // first chunk size 3 + EXPECT_EQ(chunks[1].shape()[0], 3u); // next chunk size 3 + EXPECT_EQ(chunks[2].shape()[0], 3u); // next chunk size 3 + EXPECT_EQ(chunks[3].shape()[0], 3u); // next chunk size 3 + EXPECT_EQ(chunks[4].shape()[0], 1u); // last chunk size 1 +} + +TYPED_TEST(MeshOpsTest, ShardXTensorToMeshBasicShard) { + tt::tt_metal::distributed::MeshShape mesh_shape = {1, 4}; + + // A simple 1D tensor to 
shard across 4 devices + auto tensor = xt::arange(8); // [0,...,7] + + ttml::core::ShardXTensorToMesh sharder(mesh_shape, 0); + auto shards = sharder.map(tensor); + + // With 4 shards, each shard should have size 2 + ASSERT_EQ(shards.size(), 4u); + for (auto& s : shards) { + EXPECT_EQ(s.size(), 2u); + } +} + +TYPED_TEST(MeshOpsTest, ShardTensor2dMeshTwoDimSharding) { + // Mesh shape: 2x2, total 4 devices + tt::tt_metal::distributed::MeshShape mesh_shape = {2, 2}; + + // Create a 2D tensor shape: (4,4) + auto tensor = xt::arange(16).reshape({4, 4}); + + // Shard along row_dim=0 and col_dim=1 + ttml::core::ShardTensor2dMesh sharder(mesh_shape, {0, 1}); + auto shards = sharder.map(tensor); + + ASSERT_EQ(shards.size(), 4u); + // Check shapes of shards + for (auto& shard : shards) { + EXPECT_EQ(shard.shape()[0], 2u); + EXPECT_EQ(shard.shape()[1], 2u); + } +} + +TYPED_TEST(MeshOpsTest, ReplicateXTensorToMeshReplication) { + tt::tt_metal::distributed::MeshShape mesh_shape = {2, 2}; + int num_devices = mesh_shape.first * mesh_shape.second; // 4 + + auto tensor = xt::arange(4); // [0,1,2,3] + + ttml::core::ReplicateXTensorToMesh replicator(mesh_shape); + auto replicas = replicator.map(tensor); + + ASSERT_EQ(static_cast(replicas.size()), num_devices); + for (const auto& t : replicas) { + EXPECT_TRUE(xt::allclose(t, tensor)); + } +} + +TYPED_TEST(MeshOpsTest, ConcatMesh2dToTensorRecomposition) { + tt::tt_metal::distributed::MeshShape mesh_shape = {2, 2}; + + // Create shards that would come from a 4x4 tensor: + // Expected final tensor: + // [[0,1,2,3], + // [4,5,6,7], + // [8,9,10,11], + // [12,13,14,15]] + // + // Shards (2x2 each): + xt::xarray top_left = {{TypeParam(0), TypeParam(1)}, {TypeParam(4), TypeParam(5)}}; + xt::xarray top_right = {{TypeParam(2), TypeParam(3)}, {TypeParam(6), TypeParam(7)}}; + xt::xarray bot_left = {{TypeParam(8), TypeParam(9)}, {TypeParam(12), TypeParam(13)}}; + xt::xarray bot_right = {{TypeParam(10), TypeParam(11)}, {TypeParam(14), 
TypeParam(15)}}; + + std::vector> shards = {top_left, top_right, bot_left, bot_right}; + + ttml::core::ConcatMesh2dToTensor composer(mesh_shape, {0, 1}); + auto composed = composer.compose(shards); + + xt::xarray expected = { + {TypeParam(0), TypeParam(1), TypeParam(2), TypeParam(3)}, + {TypeParam(4), TypeParam(5), TypeParam(6), TypeParam(7)}, + {TypeParam(8), TypeParam(9), TypeParam(10), TypeParam(11)}, + {TypeParam(12), TypeParam(13), TypeParam(14), TypeParam(15)}}; + + EXPECT_TRUE(xt::allclose(composed[0], expected)); +} + +TYPED_TEST(MeshOpsTest, ConcatMeshToXTensorOneDimConcatenation) { + tt::tt_metal::distributed::MeshShape mesh_shape = {1, 3}; + + // Create a few shards: [0,1], [2,3], [4,5] + xt::xarray s1 = {TypeParam(0), TypeParam(1)}; + xt::xarray s2 = {TypeParam(2), TypeParam(3)}; + xt::xarray s3 = {TypeParam(4), TypeParam(5)}; + + std::vector> shards = {s1, s2, s3}; + ttml::core::ConcatMeshToXTensor composer(mesh_shape, 0); + auto composed = composer.compose(shards); + + xt::xarray expected = { + TypeParam(0), TypeParam(1), TypeParam(2), TypeParam(3), TypeParam(4), TypeParam(5)}; + EXPECT_TRUE(xt::allclose(composed[0], expected)); +} + +TYPED_TEST(MeshOpsTest, VectorMeshToXTensorVectorReturn) { + tt::tt_metal::distributed::MeshShape mesh_shape = {2, 2}; + ttml::core::VectorMeshToXTensor vectorComposer(mesh_shape); + + std::vector> shards = { + xt::xarray({TypeParam(0), TypeParam(1)}), xt::xarray({TypeParam(2), TypeParam(3)})}; + + auto result = vectorComposer.compose(shards); + ASSERT_EQ(result.size(), shards.size()); + for (size_t i = 0; i < shards.size(); ++i) { + EXPECT_TRUE(xt::allclose(result[i], shards[i])); + } +} + +TEST(ConcatenateTest, DefaultAxis) { + xt::xarray a = {{1.0, 2.0}, {3.0, 4.0}}; + xt::xarray b = {{5.0, 6.0}, {7.0, 8.0}}; + std::vector> input = {a, b}; + + xt::xarray result = ttml::core::concatenate(input); // axis=0 by default + xt::xarray expected = {{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}, {7.0, 8.0}}; + + xt::allclose(result, 
expected); +} + +TEST(ConcatenateTest, AxisOne) { + xt::xarray x = {{1, 2, 3}, {4, 5, 6}}; + xt::xarray y = {{7, 8}, {9, 10}}; + std::vector> input = {x, y}; + + xt::xarray result = ttml::core::concatenate(input, 1); + xt::xarray expected = {{1, 2, 3, 7, 8}, {4, 5, 6, 9, 10}}; + + xt::allclose(result, expected); +} + +TEST(ConcatenateTest, MultipleArraysAxis0) { + xt::xarray a = {1.0f, 2.0f}; + xt::xarray b = {3.0f, 4.0f}; + xt::xarray c = {5.0f, 6.0f}; + std::vector> input = {a, b, c}; + + xt::xarray result = ttml::core::concatenate(input, 0); + xt::xarray expected = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; + + xt::allclose(result, expected); +} + +TEST(ConcatenateTest, EmptyArray) { + xt::xarray a = {{1, 2}, {3, 4}}; + xt::xarray b; // Empty + std::vector> input = {a, b}; + + EXPECT_ANY_THROW({ xt::xarray result = ttml::core::concatenate(input, 0); }); +} + +TEST(ConcatenateTest, HigherDimensions) { + xt::xarray arr1 = xt::arange(1, 9); // 1 to 8 + arr1.reshape({2, 2, 2}); + xt::xarray arr2 = xt::arange(9, 17); // 9 to 16 + arr2.reshape({2, 2, 2}); + + std::vector> input = {arr1, arr2}; + xt::xarray result = ttml::core::concatenate(input, 0); + + // Expected: shape (4,2,2) with arr1 stacked over arr2 along axis 0 + xt::xarray expected = xt::concatenate(xt::xtuple(arr1, arr2), 0); + + xt::allclose(result, expected); +} + +TEST(ConcatenateTest, HigherAxis) { + xt::xarray arr1 = {{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}; + xt::xarray arr2 = {{{9, 10}, {11, 12}}, {{13, 14}, {15, 16}}}; + // Both have shape (2,2,2) + + std::vector> input = {arr1, arr2}; + xt::xarray result = ttml::core::concatenate(input, 2); + // Expected shape: (2,2,4) + xt::xarray expected = {{{1, 2, 9, 10}, {3, 4, 11, 12}}, {{5, 6, 13, 14}, {7, 8, 15, 16}}}; + + xt::allclose(result, expected); +} + +TYPED_TEST(MeshOpsTest, ConcatenateSameParametersAsCompose) { + tt::tt_metal::distributed::MeshShape mesh_shape = {1, 3}; + + // Create a few shards: [0,1], [2,3], [4,5] + xt::xarray s1 = {TypeParam(0), 
TypeParam(1)}; + xt::xarray s2 = {TypeParam(2), TypeParam(3)}; + xt::xarray s3 = {TypeParam(4), TypeParam(5)}; + + std::vector> shards = {s1, s2, s3}; + ttml::core::ConcatMeshToXTensor composer(mesh_shape, 0); + auto composed = ttml::core::concatenate(shards); + + xt::xarray expected = { + TypeParam(0), TypeParam(1), TypeParam(2), TypeParam(3), TypeParam(4), TypeParam(5)}; + EXPECT_TRUE(xt::allclose(composed, expected)); +} diff --git a/tt-train/tests/core/n300_utils_test.cpp b/tt-train/tests/core/n300_utils_test.cpp new file mode 100644 index 000000000000..7b376356b763 --- /dev/null +++ b/tt-train/tests/core/n300_utils_test.cpp @@ -0,0 +1,167 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include +#include +#include + +#include "autograd/auto_context.hpp" +#include "core/compute_kernel_config.hpp" +#include "core/distributed_mapping.hpp" +#include "core/tt_tensor_utils.hpp" +#include "ttnn/operations/ccl/all_gather/all_gather.hpp" +#include "ttnn/operations/experimental/ccl/all_reduce/all_reduce.hpp" + +auto check_board_is_n300() { + return tt::Cluster::instance().get_board_type(0) == BoardType::N300; +} +class N300UtilsTest : public ::testing::Test { +protected: + void SetUp() override { + if (!check_board_is_n300()) { + GTEST_SKIP() << "Skipping N300 specific tests"; + } + ttml::autograd::ctx().set_mesh_shape({1, 2}); + ttml::autograd::ctx().open_device(); + } + + void TearDown() override { + ttml::autograd::ctx().close_device(); + } +}; + +TEST_F(N300UtilsTest, TestXTensorReplicate) { + auto* device = &ttml::autograd::ctx().get_device(); + auto mesh_shape = device->shape(); + xt::xarray test_data = {30.F, 20.F, 2.F}; + xt::xarray xtensor = test_data.reshape({1, 1, 1, 3}); + ttml::core::XTensorToMeshVariant replicate_composer = ttml::core::ReplicateXTensorToMesh(mesh_shape); + auto tensor = ttml::core::from_xtensor(xtensor, device, replicate_composer); + ttml::core::MeshToXTensorVariant 
identity_composer = ttml::core::VectorMeshToXTensor(mesh_shape); + auto xtensors_back = ttml::core::to_xtensor(tensor, identity_composer); + + EXPECT_TRUE(xt::allclose(xtensor, xtensors_back[0])); + EXPECT_TRUE(xt::allclose(xtensor, xtensors_back[1])); +} + +TEST_F(N300UtilsTest, TestXTensorShardAxis3) { + auto* device = &ttml::autograd::ctx().get_device(); + auto mesh_shape = device->shape(); + + xt::xarray test_data = xt::arange(8); + xt::xarray xtensor = test_data.reshape({1, 1, 2, 4}); + + ttml::core::XTensorToMeshVariant replicate_composer = ttml::core::ShardXTensorToMesh(mesh_shape, 3); + auto tensor = ttml::core::from_xtensor(xtensor, device, replicate_composer); + + ttml::core::MeshToXTensorVariant identity_composer = ttml::core::VectorMeshToXTensor(mesh_shape); + auto xtensors_back = ttml::core::to_xtensor(tensor, identity_composer); + + xt::xarray chunk0 = xt::view(xtensor, xt::all(), xt::all(), xt::all(), xt::range(0, 2)); + xt::xarray chunk1 = xt::view(xtensor, xt::all(), xt::all(), xt::all(), xt::range(2, 4)); + + EXPECT_TRUE(xt::allclose(chunk0, xtensors_back[0])); + EXPECT_TRUE(xt::allclose(chunk1, xtensors_back[1])); +} + +TEST_F(N300UtilsTest, TestXTensorShardAxis2) { + auto* device = &ttml::autograd::ctx().get_device(); + auto mesh_shape = device->shape(); + + xt::xarray test_data = xt::arange(8); + xt::xarray xtensor = test_data.reshape({1, 1, 2, 4}); + + ttml::core::XTensorToMeshVariant replicate_composer = ttml::core::ShardXTensorToMesh(mesh_shape, 2); + auto tensor = ttml::core::from_xtensor(xtensor, device, replicate_composer); + + ttml::core::MeshToXTensorVariant identity_composer = ttml::core::VectorMeshToXTensor(mesh_shape); + auto xtensors_back = ttml::core::to_xtensor(tensor, identity_composer); + + xt::xarray chunk0 = xt::view(xtensor, xt::all(), xt::all(), xt::range(0, 1), xt::all()); + xt::xarray chunk1 = xt::view(xtensor, xt::all(), xt::all(), xt::range(1, 2), xt::all()); + + EXPECT_TRUE(xt::allclose(chunk0, xtensors_back[0])); + 
EXPECT_TRUE(xt::allclose(chunk1, xtensors_back[1])); +} + +TEST_F(N300UtilsTest, TestXTensorReplicateAllReduce) { + auto* device = &ttml::autograd::ctx().get_device(); + auto mesh_shape = device->shape(); + + xt::xarray xtensor = xt::random::rand({32 * 32}, -0.05, 0.05).reshape({1, 1, 32, 32}); + + ttml::core::XTensorToMeshVariant replicate_composer = ttml::core::ReplicateXTensorToMesh(mesh_shape); + auto tensor = ttml::core::from_xtensor(xtensor, device, replicate_composer); + + auto sum_tensor = ttnn::experimental::all_reduce( + tensor, ttnn::operations::reduction::ReduceType::Sum, 1, std::nullopt, ttnn::ccl::Topology::Ring); + ttml::core::MeshToXTensorVariant identity_composer = ttml::core::VectorMeshToXTensor(mesh_shape); + + auto xtensors_back = ttml::core::to_xtensor(sum_tensor, identity_composer); + auto reduced_tensor = xtensor + xtensor; + + std::cout << "xtensors_back[0]: " << xtensors_back[0] << std::endl; + std::cout << "xtensors_back[1]: " << xtensors_back[1] << std::endl; + std::cout << "reduced_tensor: " << reduced_tensor << std::endl; + EXPECT_TRUE(xt::allclose(reduced_tensor, xtensors_back[0], /*rtol=*/1e-3, /*atol=*/1e-2)); + EXPECT_TRUE(xt::allclose(reduced_tensor, xtensors_back[1], /*rtol=*/1e-3, /*atol=*/1e-2)); +} + +TEST_F(N300UtilsTest, TestXTensorShardAxis2AddScalar) { + auto* device = &ttml::autograd::ctx().get_device(); + auto mesh_shape = device->shape(); + float scalar = 10.F; + xt::xarray test_data = xt::arange(8); + xt::xarray xtensor = test_data.reshape({1, 1, 2, 4}); + + ttml::core::XTensorToMeshVariant shard_composer = ttml::core::ShardXTensorToMesh(mesh_shape, 2); + auto tensor = ttml::core::from_xtensor(xtensor, device, shard_composer); + auto out_tensor = ttnn::add(tensor, scalar); + ttml::core::MeshToXTensorVariant identity_composer = ttml::core::VectorMeshToXTensor(mesh_shape); + auto xtensors_back = ttml::core::to_xtensor(out_tensor, identity_composer); + + xt::xarray chunk0 = xt::view(xtensor, xt::all(), xt::all(), 
xt::range(0, 1), xt::all()); + xt::xarray chunk1 = xt::view(xtensor, xt::all(), xt::all(), xt::range(1, 2), xt::all()); + + EXPECT_TRUE(xt::allclose(chunk0 + scalar, xtensors_back[0])); + EXPECT_TRUE(xt::allclose(chunk1 + scalar, xtensors_back[1])); +} + +TEST_F(N300UtilsTest, TestXTensorShardAxis3Matmul) { + xt::random::seed(42); + auto* device = &ttml::autograd::ctx().get_device(); + auto mesh_shape = device->shape(); + + xt::xarray xtensor_a = xt::random::rand({128 * 64}, -0.005, 0.005).reshape({1, 1, 128, 64}); + xt::xarray xtensor_b = xt::random::rand({256 * 64}, -0.005, 0.005).reshape({1, 1, 64, 256}); + + ttml::core::XTensorToMeshVariant replicate_composer2 = ttml::core::ShardXTensorToMesh(mesh_shape, 2); + ttml::core::XTensorToMeshVariant replicate_composer3 = ttml::core::ShardXTensorToMesh(mesh_shape, 3); + auto tensor_a = ttml::core::from_xtensor(xtensor_a, device, replicate_composer3); + auto tensor_b = ttml::core::from_xtensor(xtensor_b, device, replicate_composer3); + + auto gathered_ta = + ttnn::all_gather(tensor_a, 3 /*, {0, 4}, 1 ,std::nullopt, std::nullopt, std::nullopt, std::nullopt*/); + fmt::print("gathered_ta shape: {}\n", gathered_ta.get_shape().logical_shape()); + auto mul_tensor = ttnn::matmul( + gathered_ta, + tensor_b, + false, + false, + /* memory_config */ std::nullopt, + /* dtype */ std::nullopt, + /* program_config */ std::nullopt, + /* activation */ std::nullopt, + /* compute_kernel_config */ ttml::core::ComputeKernelConfig::precise(), + /* core_grid */ ttnn::CoreGrid{7, 8}, + /* output_tile */ std::nullopt); + ttml::core::MeshToXTensorVariant composer = ttml::core::ConcatMeshToXTensor(mesh_shape, 3); + auto xtensors_back = ttml::core::to_xtensor(mul_tensor, composer); + xt::xarray mul_res = xt::linalg::dot(xtensor_a, xtensor_b); + + // (128, 64) X (64, 256) => (128, 256) + EXPECT_TRUE(xt::allclose(mul_res, xtensors_back[0], /*rtol=*/1e-3, /*atol=*/1e-2)); +} diff --git a/tt-train/tests/core/tensor_utils_test.cpp 
b/tt-train/tests/core/tensor_utils_test.cpp index 196cfb8fff23..72e518de0912 100644 --- a/tt-train/tests/core/tensor_utils_test.cpp +++ b/tt-train/tests/core/tensor_utils_test.cpp @@ -6,12 +6,11 @@ #include #include -#include #include #include "autograd/auto_context.hpp" -#include "core/device.hpp" #include "core/tt_tensor_utils.hpp" +#include "core/xtensor_utils.hpp" TEST(TensorUtilsTest, TestFloatToFromTensorEven) { auto* device = &ttml::autograd::ctx().get_device(); @@ -212,3 +211,32 @@ TEST(TensorUtilsTest, TestZerosLike) { EXPECT_EQ(val, 0.F); } } + +TEST(TensorUtilsTest, TestFloatXtensor) { + auto* device = &ttml::autograd::ctx().get_device(); + std::vector test_data = {30.F, 20.F, 2.F}; + + auto shape = ttml::core::create_shape({1, 1, 1, 3}); + + xt::xarray xtensor = + ttml::core::span_to_xtensor(std::span{test_data.data(), test_data.size()}, shape.logical_shape()); + auto tensor = ttml::core::from_xtensor(xtensor, device); + + auto xtensor_back = ttml::core::to_xtensor(tensor); + + EXPECT_TRUE(xt::allclose(xtensor, xtensor_back)); +} + +TEST(TensorUtilsTest, TestUint32XTensor) { + auto* device = &ttml::autograd::ctx().get_device(); + std::vector test_data = {30, 20, 2}; + + auto shape = ttml::core::create_shape({1, 1, 1, 3}); + xt::xarray xtensor = + ttml::core::span_to_xtensor(std::span{test_data.data(), test_data.size()}, shape.logical_shape()); + auto tensor = ttml::core::from_xtensor(xtensor, device); + + auto xtensor_back = ttml::core::to_xtensor(tensor); + + EXPECT_TRUE(xt::allclose(xtensor, xtensor_back)); +} From dc5407b325aad4ca9ad6782197054b1a04b89aca Mon Sep 17 00:00:00 2001 From: Oleg Milyutin Date: Mon, 9 Dec 2024 17:02:01 -0500 Subject: [PATCH 17/59] #0: Migrate usage of ttnn::numpy::zeros in test_tensor_utils (#15847) ### Ticket #14974 ### Problem description https://github.com/tenstorrent/tt-metal/pull/15671 removes `ttnn::numpy::zeros`, it was re-based without updating the newly introduced call in `test_tensor_utils.cpp`. 
### What's changed Migrated `ttnn::numpy::zeros` to `ttnn::zeros` in `test_tensor_utils.cpp` ### Checklist - [X] New/Existing tests provide coverage for changes --- tests/tt_eager/ops/test_tensor_utils.cpp | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/tests/tt_eager/ops/test_tensor_utils.cpp b/tests/tt_eager/ops/test_tensor_utils.cpp index 1121b455b2a4..0b6c2e3d376a 100644 --- a/tests/tt_eager/ops/test_tensor_utils.cpp +++ b/tests/tt_eager/ops/test_tensor_utils.cpp @@ -11,14 +11,11 @@ #include "ttnn/tensor/host_buffer/types.hpp" #include "ttnn/tensor/tensor.hpp" #include "ttnn/tensor/tensor.hpp" -#include "ttnn/operations/numpy/functions.hpp" +#include "ttnn/operations/creation.hpp" #include "ttnn/tensor/types.hpp" #include "ttnn/tensor/tensor_utils.hpp" -using std::vector; -using tt::tt_metal::Tensor; -using namespace tt::tt_metal; -static vector> ref_weight_in = { +static std::vector> ref_weight_in = { { 16140, 16151, 16183, 16216, 16154, 16219, 16139, 16216, 16088, 16159, 16165, 16068, 16096, 16024, 16228, 15720, 16246, 16011, 16068, 16116, 16202, 16207, 16135, 16117, 16145, 16073, 16236, 16214, 15761, 16044, 15794, 16165, @@ -246,7 +243,7 @@ static vector> ref_weight_in = { } }; -static vector> ref_weight_out = { +static std::vector> ref_weight_out = { {16140, 16151, 16183, 16216, 16154, 16219, 16139, 16216, 16088, 16156, 15971, 16157, 16069, 16241, 16231, 16174, 16102, 16056, 16250, 15716, 16154, 16102, 16189, 15523, 15648, 16098, 16016, 15972, 16228, 16243, 16174, 16100, 16101, 16216, 16250, 16179, 16206, 16137, 16180, 16101, 15821, 15819, 16235, 16052, 16182, 15912, 16128, 16159, @@ -419,9 +416,11 @@ static vector> ref_weight_out = { 15832, 15895, 16234, 16062, 16231, 16173, 16122, 16016, 16187, 15560, 16229, 16046, 16243, 16219, 15849, 16135, }}; -static vector weight_tensor_shape = {{8, 8, 3, 3}, {10, 10, 3, 3}, {12, 8, 3, 3}, {8, 15, 3, 3}}; -static vector bias_tensor_shape = {{1, 1, 1, 32}, {1, 1, 1, 60}, {12, 1, 
1, 320}, {8, 1, 1, 48}}; -static vector shards = {8, 3, 5, 4}; +static std::vector weight_tensor_shape = { + {8, 8, 3, 3}, {10, 10, 3, 3}, {12, 8, 3, 3}, {8, 15, 3, 3}}; +static std::vector bias_tensor_shape = { + {1, 1, 1, 32}, {1, 1, 1, 60}, {12, 1, 1, 320}, {8, 1, 1, 48}}; +static std::vector shards = {8, 3, 5, 4}; template static uint32_t compare_out_with_ref(const owned_buffer::Buffer& out_buf, T& ref) { @@ -447,7 +446,7 @@ static uint32_t compare_out_with_ref(const owned_buffer::Buffer& out_b static void test_convert_conv_weight_tensor_to_tiled_layout_block_sharded() { tt::log_info(tt::LogTest, "Running {}", __func__); for (auto i = 0; i < weight_tensor_shape.size(); i++) { - auto input_tensor = ttnn::numpy::zeros(weight_tensor_shape[i]); + auto input_tensor = ttnn::zeros(weight_tensor_shape[i]); auto input_buffer = owned_buffer::get_as(input_tensor); for (auto j = 0; j < input_buffer.size(); j++) { input_buffer[j] = ref_weight_in[i][j]; From 7b3c14903360fbfedaa15a76ab41884812a762f2 Mon Sep 17 00:00:00 2001 From: Colman Glagovich <114512306+cglagovichTT@users.noreply.github.com> Date: Mon, 9 Dec 2024 17:12:19 -0500 Subject: [PATCH 18/59] Llama3-Vision feature-complete (batch 32 / paged attention / tracing) (#15830) This commit enables features for the Llama-Vision model: Batch > 1 support Tracing Paged attention New RoPE In addition, it refactors the generation code for Llama Text and Llama Vision. This is to help vLLM integration for all models in this shared impl. 
--- .github/workflows/t3000-demo-tests-impl.yaml | 2 +- .../workflows/t3000-frequent-tests-impl.yaml | 4 +- .github/workflows/t3000-unit-tests-impl.yaml | 4 +- models/demos/llama3/demo/demo.py | 36 +-- .../demos/llama3/demo/multimodal_demo_chat.py | 4 +- .../demos/llama3/demo/multimodal_demo_text.py | 4 +- .../demos/llama3/demo/simple_vision_demo.py | 156 +++++----- models/demos/llama3/lt | 2 +- .../multimodal/test_llama_cross_attention.py | 40 ++- ..._llama_cross_attention_transformer_text.py | 48 ++- .../multimodal/test_llama_cross_block.py | 32 +- .../multimodal/test_llama_image_attention.py | 2 +- .../multimodal/test_llama_image_block.py | 2 +- .../test_llama_image_transformer.py | 2 +- .../multimodal/test_llama_vision_model.py | 154 --------- .../tests/test_interleaved_to_sharded.py | 2 +- .../demos/llama3/tests/test_llama_accuracy.py | 33 +- .../llama3/tests/test_llama_attention.py | 5 +- .../tests/test_llama_attention_prefill.py | 2 +- .../demos/llama3/tests/test_llama_decoder.py | 5 +- .../tests/test_llama_decoder_prefill.py | 2 +- models/demos/llama3/tests/test_llama_model.py | 18 +- .../llama3/tests/test_llama_model_prefill.py | 16 +- .../vision_generator.py => generator.py} | 291 +++++++++++++++--- models/demos/llama3/tt/generator_vllm.py | 77 +++++ models/demos/llama3/tt/llama_attention.py | 61 +++- models/demos/llama3/tt/llama_common.py | 47 +++ models/demos/llama3/tt/llama_decoder.py | 6 +- models/demos/llama3/tt/llama_embedding.py | 2 +- models/demos/llama3/tt/llama_model.py | 185 ++++++++++- models/demos/llama3/tt/llama_rope.py | 16 +- models/demos/llama3/tt/model_config.py | 36 +-- .../tt/multimodal/llama_cross_attention.py | 36 +-- .../llama_cross_attention_transformer_text.py | 42 ++- .../llama3/tt/multimodal/llama_cross_block.py | 5 + .../tt/multimodal/llama_image_transformer.py | 6 +- .../tt/multimodal/llama_vision_model.py | 189 ++++++------ models/perf/benchmarking_utils.py | 18 ++ tests/scripts/t3000/run_t3000_demo_tests.sh | 2 +- 
tests/scripts/t3000/run_t3000_unit_tests.sh | 8 +- .../misc/test_scaled_dot_product_attention.py | 39 ++- .../sdpa/device/kernels/compute/sdpa.cpp | 4 +- .../kernels/dataflow/reader_interleaved.cpp | 36 +-- .../kernels/dataflow/writer_interleaved.cpp | 6 +- .../transformer/sdpa/device/sdpa_op.cpp | 41 +-- .../sdpa/device/sdpa_program_factory.cpp | 120 ++++---- 46 files changed, 1112 insertions(+), 736 deletions(-) delete mode 100644 models/demos/llama3/tests/multimodal/test_llama_vision_model.py rename models/demos/llama3/tt/{multimodal/vision_generator.py => generator.py} (59%) create mode 100644 models/demos/llama3/tt/generator_vllm.py diff --git a/.github/workflows/t3000-demo-tests-impl.yaml b/.github/workflows/t3000-demo-tests-impl.yaml index f71636bdb153..9ad4ab1b818c 100644 --- a/.github/workflows/t3000-demo-tests-impl.yaml +++ b/.github/workflows/t3000-demo-tests-impl.yaml @@ -16,7 +16,7 @@ jobs: test-group: [ { name: "t3k_falcon40b_tests", arch: wormhole_b0, cmd: run_t3000_falcon40b_tests, timeout: 50, owner_id: U053W15B6JF}, #Djordje Ivanovic { name: "t3k_llama3_tests", arch: wormhole_b0, cmd: run_t3000_llama3_tests, timeout: 30, owner_id: U03PUAKE719}, # Miguel Tairum - # { name: "t3k_llama3_vision_tests", arch: wormhole_b0, cmd: run_t3000_llama3_vision_tests, timeout: 30, owner_id: U03FJB5TM5Y}, #Colman Glagovich + { name: "t3k_llama3_vision_tests", arch: wormhole_b0, cmd: run_t3000_llama3_vision_tests, timeout: 30, owner_id: U03FJB5TM5Y}, #Colman Glagovich { name: "t3k_llama3_70b_tests", arch: wormhole_b0, cmd: run_t3000_llama3_70b_tests, timeout: 30, owner_id: U03FJB5TM5Y}, #Colman Glagovich { name: "t3k_falcon7b_tests", arch: wormhole_b0, cmd: run_t3000_falcon7b_tests, timeout: 90, owner_id: U05RWH3QUPM}, #Salar Hosseini { name: "t3k_mixtral_tests", arch: wormhole_b0, cmd: run_t3000_mixtral_tests, timeout: 50, owner_id: U03PUAKE719}, # Miguel Tairum diff --git a/.github/workflows/t3000-frequent-tests-impl.yaml 
b/.github/workflows/t3000-frequent-tests-impl.yaml index 542e85187c69..fde2ede1652e 100644 --- a/.github/workflows/t3000-frequent-tests-impl.yaml +++ b/.github/workflows/t3000-frequent-tests-impl.yaml @@ -18,8 +18,8 @@ jobs: { name: "t3k ethernet tests", arch: wormhole_b0, cmd: run_t3000_ethernet_tests, timeout: 60, owner_id: ULMEPM2MA}, #Sean Nijjar { name: "t3k trace stress tests", arch: wormhole_b0, cmd: run_t3000_trace_stress_tests, timeout: 120, owner_id: U03NG0A5ND7}, #Aditya Saigal { name: "t3k falcon40b tests", arch: wormhole_b0, cmd: run_t3000_falcon40b_tests, timeout: 120, owner_id: U04S2UV6L8N}, #Sofija Jovic - # { name: "t3k llama3.2-vision tests", arch: wormhole_b0, cmd: run_t3000_llama3.2-11b-vision_freq_tests, timeout: 60, owner_id: U03FJB5TM5Y}, #Colman Glagovich - # { name: "t3k n300 mesh llama3.2-vision tests", arch: wormhole_b0, cmd: run_t3000_spoof_n300_llama3.2-11b-vision_freq_tests, timeout: 60, owner_id: U03FJB5TM5Y}, #Colman Glagovich + { name: "t3k llama3.2-vision tests", arch: wormhole_b0, cmd: run_t3000_llama3.2-11b-vision_freq_tests, timeout: 60, owner_id: U03FJB5TM5Y}, #Colman Glagovich + { name: "t3k n300 mesh llama3.2-vision tests", arch: wormhole_b0, cmd: run_t3000_spoof_n300_llama3.2-11b-vision_freq_tests, timeout: 60, owner_id: U03FJB5TM5Y}, #Colman Glagovich { name: "t3k llama3 tests", arch: wormhole_b0, cmd: run_t3000_llama3_tests, timeout: 45, owner_id: U03PUAKE719}, #Miguel Tairum Cruz { name: "t3k llama2_70b tests", arch: wormhole_b0, cmd: run_t3000_llama2_70b_tests, timeout: 45, owner_id: U03FJB5TM5Y}, #Colman Glagovich # { name: "t3k llama3_70b tests", arch: wormhole_b0, cmd: run_t3000_llama3_70b_tests, timeout: 45, owner_id: U03FJB5TM5Y}, #Colman Glagovich # FIXME issue #14934 diff --git a/.github/workflows/t3000-unit-tests-impl.yaml b/.github/workflows/t3000-unit-tests-impl.yaml index f05ee8e78108..303de478fd71 100644 --- a/.github/workflows/t3000-unit-tests-impl.yaml +++ b/.github/workflows/t3000-unit-tests-impl.yaml @@ 
-20,8 +20,8 @@ jobs: { name: "t3k falcon40b tests", arch: wormhole_b0, cmd: run_t3000_falcon40b_tests, timeout: 30, owner_id: U053W15B6JF}, #Djordje Ivanovic { name: "t3k llama3-small tests", arch: wormhole_b0, cmd: run_t3000_llama3-small_tests, timeout: 30, owner_id: U03PUAKE719}, #Miguel Tairum Cruz { name: "t3k llama3.2-11b tests", arch: wormhole_b0, cmd: run_t3000_llama3.2-11b_tests, timeout: 30, owner_id: U03PUAKE719}, #Miguel Tairum Cruz - # { name: "t3k llama3.2-11b-vision tests", arch: wormhole_b0, cmd: run_t3000_llama3.2-11b-vision_unit_tests, timeout: 30, owner_id: U03FJB5TM5Y}, #Colman Glagovich - # { name: "t3k n300 mesh llama3.2-11b-vision tests", arch: wormhole_b0, cmd: run_t3000_spoof_n300_llama3.2-11b-vision_unit_tests, timeout: 30, owner_id: U03FJB5TM5Y}, #Colman Glagovich + { name: "t3k llama3.2-11b-vision tests", arch: wormhole_b0, cmd: run_t3000_llama3.2-11b-vision_unit_tests, timeout: 30, owner_id: U03FJB5TM5Y}, #Colman Glagovich + { name: "t3k n300 mesh llama3.2-11b-vision tests", arch: wormhole_b0, cmd: run_t3000_spoof_n300_llama3.2-11b-vision_unit_tests, timeout: 30, owner_id: U03FJB5TM5Y}, #Colman Glagovich { name: "t3k llama3.1-70b tests", arch: wormhole_b0, cmd: run_t3000_llama3.1-70b_tests, timeout: 30, owner_id: U03PUAKE719}, #Miguel Tairum Cruz { name: "t3k mixtral tests", arch: wormhole_b0, cmd: run_t3000_mixtral_tests, timeout: 30, owner_id: U03PUAKE719}, #Miguel Tairum Cruz { name: "t3k grok tests", arch: wormhole_b0, cmd: run_t3000_grok_tests, timeout: 30, owner_id: U03HY7MK4BT}, #Mark O'Connor diff --git a/models/demos/llama3/demo/demo.py b/models/demos/llama3/demo/demo.py index a828830d3309..f3b5b998fcb1 100644 --- a/models/demos/llama3/demo/demo.py +++ b/models/demos/llama3/demo/demo.py @@ -26,7 +26,6 @@ ) from models.demos.llama3.tt.llama_model import TtTransformer from models.demos.llama3.tt.llama_embedding import TtLlamaEmbedding -from models.demos.llama3.tt.llama_rope import TtLlamaRotarySetup from 
models.demos.t3000.llama2_70b.reference.llama.llama31_8b.tokenizer import Tokenizer from models.demos.llama3.tt.model_config import TtModelArgs @@ -227,6 +226,7 @@ def run_llama3_demo( optimizations=optimizations, max_seq_len=max_seq_len, ) + tokenizer = Tokenizer(model_args.tokenizer_path) # Check max sequence length compatibility with model and architecture. Refer to README for more information @@ -259,34 +259,13 @@ def run_llama3_demo( ), "T3K only supports a max context length of 128k tokens for Llama3.1-8B and Llama3.2-11B" if llama_model_name == "3.1-70B": assert tt_device_name in ["T3K", "TG"], "Llama3.1-70B is only supported on T3K or TG" + assert max_seq_len <= 64 * 1024, "T3K only supports a max context length of 64k tokens for Llama3.1-70B" logger.info("Loading weights...") profiler.start("weight_loading") state_dict = model_args.load_state_dict() profiler.end("weight_loading") - # Setup RoPE transformation matrices - rope_setup = TtLlamaRotarySetup( - mesh_device, - batch_size, - model_args.head_dim, - model_args.max_seq_len, - model_args.rope_theta, - model_args.use_scaled_rope, - ) - transformation_mats_decode = rope_setup.get_trans_mats() - - transformation_mats_prefill_torch = get_rot_transformation_mat(model_args.head_dim) - transformation_mats_prefill = ttnn.from_torch( - transformation_mats_prefill_torch, - dtype=ttnn.bfloat16, - layout=ttnn.TILE_LAYOUT, - device=mesh_device, - memory_config=ttnn.DRAM_MEMORY_CONFIG, - mesh_mapper=ttnn.ReplicateTensorToMesh(mesh_device), - ) - transformation_mats = {"decode": transformation_mats_decode, "prefill": transformation_mats_prefill} - page_table_tt = None if paged_attention: @@ -314,7 +293,6 @@ def run_llama3_demo( dtype=dtype, state_dict=state_dict, weight_cache_path=model_args.weight_cache_path(dtype), - transformation_mats=transformation_mats, paged_attention_config=paged_attention_config, ) tt_embd = TtLlamaEmbedding( @@ -384,7 +362,7 @@ def run_llama3_demo( :, decoding_pos[batch_id] :, : ] = 0 # 
Zero out the tokens after the prefill length - prefill_input = model_args.prepare_inputs_ttnn_prefill( + prefill_input = model_args.prepare_residual_tensor_prefill( pt_prefill_input[batch_id], ) @@ -476,7 +454,7 @@ def run_llama3_demo( ) # Get cos/sin matrices for the current position of each user - rot_mats, rot_mat_idxs = rope_setup.get_rot_mats(current_pos, return_rot_idxs=True) + rot_mats, rot_mat_idxs = tt_model.rope_setup.get_rot_mats(current_pos, return_rot_idxs=True) # Compile logger.info(f"Compiling model trace...") decode_input = ttnn.unsqueeze_to_4D(tt_embd(tt_out_tok)) @@ -519,7 +497,7 @@ def run_llama3_demo( decode_input = ttnn.unsqueeze_to_4D(tt_embd(tt_out_tok)) decode_input = ttnn.to_memory_config(decode_input, tt_model.args.model_config["DECODE_RESIDUAL_MEMCFG"]) - rot_mats = rope_setup.get_rot_mats(rot_mat_idxs) + rot_mats = tt_model.rope_setup.get_rot_mats(rot_mat_idxs) tt_out = tt_model( decode_input, current_pos_tensor, @@ -562,7 +540,7 @@ def run_llama3_demo( # Reset the current position and output token tensors for the real decode run ttnn.copy_host_to_device_tensor(current_pos_reset, current_pos_tensor) ttnn.copy_host_to_device_tensor(tt_out_tok_reset, tt_out_tok) - rot_mat_idxs_reset = rope_setup.get_rot_idxs(current_pos, on_host=True) + rot_mat_idxs_reset = tt_model.rope_setup.get_rot_idxs(current_pos, on_host=True) ttnn.copy_host_to_device_tensor(rot_mat_idxs_reset, rot_mat_idxs) profiler.end(f"capture_trace_{batch_idx}") @@ -591,7 +569,7 @@ def run_llama3_demo( # TODO This is required for now since we cannot ttnn.plus_one(rot_mat_idxs) while it being uint32. 
# If this tensor is int32, it won't be supported by ttnn.embedding current_pos += 1 - rot_mat_idxs_updated = rope_setup.get_rot_idxs(current_pos, on_host=True) + rot_mat_idxs_updated = tt_model.rope_setup.get_rot_idxs(current_pos, on_host=True) ttnn.copy_host_to_device_tensor(rot_mat_idxs_updated, rot_mat_idxs) # Write to host diff --git a/models/demos/llama3/demo/multimodal_demo_chat.py b/models/demos/llama3/demo/multimodal_demo_chat.py index ca3d5b498e3e..ac7c5a60b2e2 100644 --- a/models/demos/llama3/demo/multimodal_demo_chat.py +++ b/models/demos/llama3/demo/multimodal_demo_chat.py @@ -21,7 +21,7 @@ IMG_PATH = Path(resource_filename("llama_models", "scripts/resources/")) -from models.demos.llama3.tt.multimodal.vision_generator import LlamaVision +from models.demos.llama3.tt.generator import LlamaGenerator from models.demos.llama3.demo.simple_vision_demo import create_multimodal_model @@ -67,7 +67,7 @@ def test_llama_multimodal_demo_chat( model_args, model = create_multimodal_model(mesh_device, max_batch_size=max_batch_size, max_seq_len=max_seq_len) tokenizer = Tokenizer(model_path=tokenizer_path) formatter = ChatFormat(tokenizer) - generator = LlamaVision(model, model_args, mesh_device, tokenizer=tokenizer, formatter=formatter) + generator = LlamaGenerator(model, model_args, mesh_device, tokenizer=tokenizer, formatter=formatter) # image understanding dialogs = [] diff --git a/models/demos/llama3/demo/multimodal_demo_text.py b/models/demos/llama3/demo/multimodal_demo_text.py index 2029c43458b2..4bea26c781b1 100644 --- a/models/demos/llama3/demo/multimodal_demo_text.py +++ b/models/demos/llama3/demo/multimodal_demo_text.py @@ -23,7 +23,7 @@ IMG_PATH = Path(resource_filename("llama_models", "scripts/resources/")) from models.demos.llama3.demo.simple_vision_demo import create_multimodal_model -from models.demos.llama3.tt.multimodal.vision_generator import LlamaVision +from models.demos.llama3.tt.generator import LlamaGenerator @pytest.mark.parametrize( @@ -73,7 
+73,7 @@ def test_llama_multimodal_demo_text( model_args, model = create_multimodal_model(mesh_device, max_batch_size=max_batch_size, max_seq_len=max_seq_len) tokenizer = Tokenizer(model_path=tokenizer_path) formatter = ChatFormat(tokenizer) - generator = LlamaVision(model, model_args, mesh_device, tokenizer=tokenizer, formatter=formatter) + generator = LlamaGenerator(model, model_args, mesh_device, tokenizer=tokenizer, formatter=formatter) with open(IMG_PATH / "dog.jpg", "rb") as f: img = PIL_Image.open(f).convert("RGB") diff --git a/models/demos/llama3/demo/simple_vision_demo.py b/models/demos/llama3/demo/simple_vision_demo.py index b4946c3eecf1..cda3c2ed9577 100644 --- a/models/demos/llama3/demo/simple_vision_demo.py +++ b/models/demos/llama3/demo/simple_vision_demo.py @@ -23,10 +23,10 @@ import ttnn import time -from models.demos.llama3.tt.multimodal.vision_generator import LlamaVision +from models.demos.llama3.tt.generator import LlamaGenerator -def get_sampler(temperature, top_p, tokenizer): +def get_batch_sampler(temperature, top_p, tokenizer): def sample(logits): if temperature > 0: probs = torch.softmax(logits[:, -1] / temperature, dim=-1) @@ -34,15 +34,14 @@ def sample(logits): else: next_token = torch.argmax(logits[:, -1], dim=-1) - next_token = next_token.reshape(-1) - token = next_token[0].item() - text = tokenizer.decode(next_token.tolist()) - return token, text + next_tokens = next_token.reshape(-1) + texts = [tokenizer.decode([next_tokens[i].item()]) for i in range(len(next_tokens))] + return next_tokens, texts return sample -def create_multimodal_model(mesh_device, max_batch_size, max_seq_len, dtype=ttnn.bfloat16): +def create_multimodal_model(mesh_device, max_batch_size, max_seq_len, dtype=ttnn.bfloat16, use_paged_kv_cache=False): from models.demos.llama3.tt.multimodal.llama_vision_model import CrossAttentionTransformer from models.demos.llama3.tt.model_config import TtModelArgs @@ -56,6 +55,7 @@ def create_multimodal_model(mesh_device, 
max_batch_size, max_seq_len, dtype=ttnn weight_cache_path=tt_model_args.weight_cache_path(dtype), dtype=dtype, configuration=tt_model_args, + use_paged_kv_cache=use_paged_kv_cache, ) return tt_model_args, model @@ -70,32 +70,30 @@ def create_multimodal_model(mesh_device, max_batch_size, max_seq_len, dtype=ttnn indirect=True, ) @pytest.mark.parametrize( - "warmup_iters", - (0, 1), - ids=["cold", "warm"], + "test_type,max_seq_len", + (("normal", 512),), + ids=["normal"], ) @pytest.mark.parametrize( - "test_case", + "warmup_iters, enable_trace, max_batch_size", [ - "normal", + (0, False, 1), # batch1-notrace + (0, True, 1), # batch1-trace + (0, True, 32), # batch32-trace ], -) -@pytest.mark.parametrize( - "enable_trace", - (False, True), - ids=["no_trace", "yes_trace"], + ids=["batch1-notrace", "batch1-trace", "batch32-trace"], ) @pytest.mark.parametrize("device_params", [{"trace_region_size": 14951424, "num_command_queues": 2}], indirect=True) def test_llama_multimodal_demo_text( mesh_device, warmup_iters, - test_case, enable_trace, + max_batch_size, + test_type, + max_seq_len, temperature: float = 0, top_p: float = 0.9, - max_seq_len: int = 512, - max_batch_size: int = 1, - max_gen_len: Optional[int] = 200, + max_gen_len: Optional[int] = 500, model_parallel_size: Optional[int] = None, ): """ @@ -107,7 +105,7 @@ def test_llama_multimodal_demo_text( mesh_device.enable_program_cache() mesh_device.enable_async(True) model_args, model = create_multimodal_model(mesh_device, max_batch_size=max_batch_size, max_seq_len=max_seq_len) - generator = LlamaVision(model, model_args, mesh_device) + generator = LlamaGenerator(model, model_args, mesh_device) tokenizer = Tokenizer(model_path=tokenizer_path) formatter = ChatFormat(tokenizer) @@ -132,96 +130,106 @@ def test_llama_multimodal_demo_text( [UserMessage(content=[ImageMedia(image=ocr_image), "What is the full text of this image? 
Do OCR"])], [UserMessage(content=[ImageMedia(image=clutter), "What objects are in this image?"])], ] + if len(dialogs) < max_batch_size: + dialogs *= max_batch_size // len(dialogs) - sampler = get_sampler(temperature, top_p, tokenizer) - - for iter_num in range(warmup_iters + 1): - for dialog in dialogs: - for msg in dialog: - print(f"{msg.role.capitalize()}: {msg.content}\n") + assert len(dialogs) % max_batch_size == 0 + num_batches = len(dialogs) // max_batch_size - if iter_num <= warmup_iters: - logger.info(f"Warmup iteration {iter_num}") + sampler = get_batch_sampler(temperature, top_p, tokenizer) - model_input = formatter.encode_dialog_prompt(dialog, tool_prompt_format=False) + for iter_num in range(warmup_iters + 1): + logger.info(f"Iteration {iter_num}") + for batch_idx in range(num_batches): + batch_dialogs = dialogs[batch_idx * max_batch_size : (batch_idx + 1) * max_batch_size] + for dialog in batch_dialogs: + for msg in dialog: + print(f"{msg.role.capitalize()}: {msg.content}\n") + batch_model_input = [ + formatter.encode_dialog_prompt(dialog, tool_prompt_format=False) for dialog in batch_dialogs + ] # Do initial prefill - vision_images = model_input.vision.images - vision_mask = model_input.vision.mask - prompt_tokens = model_input.tokens - prefill_len = len(prompt_tokens) - total_len = prefill_len + max_gen_len # Prepares mask for full length of output - # Create tokens tensor + vision_images = [model_input.vision.images for model_input in batch_model_input] + vision_mask = [model_input.vision.mask for model_input in batch_model_input] + prompt_tokens = [model_input.tokens for model_input in batch_model_input] + # Get max length of prompts in batch + prefill_lens = torch.tensor([len(tokens) for tokens in prompt_tokens], dtype=torch.long) + total_lens = prefill_lens + max_gen_len + + # Create padded tokens tensor for batch pad_id = tokenizer.pad_id - bsz = 1 - tokens = torch.full((bsz, total_len), pad_id, dtype=torch.long) - tokens[0, : 
len(prompt_tokens)] = torch.tensor(prompt_tokens, dtype=torch.long) + bsz = len(prompt_tokens) + tokens = torch.full((bsz, max(total_lens)), pad_id, dtype=torch.long) + + # Fill in actual tokens for each sequence in batch + for i, seq in enumerate(prompt_tokens): + tokens[i, : len(seq)] = torch.tensor(seq, dtype=torch.long) + prefill_start = time.perf_counter() - prompt_tokens_tensor = torch.tensor(prompt_tokens, dtype=torch.long).reshape(1, -1) # B, S - ( - xattn_caches, - cross_attention_masks, - full_text_row_masked_out_mask, - logits, - ) = generator.prefill_forward_single_user( + batch_logits, batch_xattn_masks, batch_text_masks = generator.prefill_forward( vision_images, vision_mask, - prompt_tokens_tensor, + tokens, xattn_caches, - user_id=0, - total_len=total_len, - prefill_len=prefill_len, + total_lens, + prefill_lens, ) - prefill_end = time.perf_counter() - - next_token, text = sampler(logits) - tokens[0, prefill_len] = next_token + prefill_end = time.perf_counter() + next_tokens, next_texts = sampler(batch_logits) + for i, (next_token, next_text) in enumerate(zip(next_tokens, next_texts)): + tokens[i, prefill_lens[i]] = next_token + print(f"Next tokens: {next_tokens}") + print(f"Next texts: {next_texts}") decode_times = [] for gen_idx in range(max_gen_len - 1): decode_start = time.perf_counter() - position_id = prefill_len + gen_idx - next_token_tensor = torch.tensor([next_token], dtype=torch.long).reshape(1, 1) # B, S + position_id = prefill_lens + gen_idx + next_token_tensor = next_tokens.reshape(max_batch_size, 1) if enable_trace: logits = generator.easy_trace( position_id, next_token_tensor, - cross_attention_masks, - full_text_row_masked_out_mask, + batch_xattn_masks, + batch_text_masks, xattn_caches, ) else: logits = generator.decode_forward( position_id, next_token_tensor, - cross_attention_masks, - full_text_row_masked_out_mask, + batch_xattn_masks, + batch_text_masks, xattn_caches, ) - next_token, text = sampler(logits) + next_tokens, next_texts 
= sampler(logits) # Update next token - tokens[0, position_id + 1] = next_token + tokens[torch.arange(max_batch_size), position_id + 1] = next_tokens decode_end = time.perf_counter() decode_times.append(decode_end - decode_start) - if text in ["<|eot_id|>", "<|eom_id|>"]: - break - - # Log full text output + # Disable checking for eot until I have more robust code for batch > 1 + # if text in ["<|eot_id|>", "<|eom_id|>"]: + # break + # Log full text output for each user in batch vision_tokens = [tokenizer.special_tokens["<|image|>"], 128256] - # Remove <|image|> tokens since they break the tokenizer - tokens_out = [ - t if t not in vision_tokens else tokenizer.pad_id for t in tokens[0].tolist()[: position_id + 2] - ] - text = tokenizer.decode(tokens_out) - logger.info(f"Full text: {text}") + + for user_id in range(max_batch_size): + # Remove <|image|> tokens since they break the tokenizer + tokens_out = [ + t if t not in vision_tokens else tokenizer.pad_id + for t in tokens[user_id].tolist()[: position_id[user_id] + 2] + ] + text = tokenizer.decode(tokens_out) + logger.info(f"User {user_id} full text: {text}") prefill_time_ms = (prefill_end - prefill_start) * 1000 logger.info(f"Prefill time: {prefill_time_ms:.2f} ms") decode_time_ms = sum(decode_times) / (gen_idx + 1) * 1000 - logger.info(f"Decode time: {decode_time_ms:.2f} ms") + logger.info(f"Average decode time per token: {decode_time_ms:.2f} ms") # ttnn.release_trace(generator.mesh_device, trace_id) diff --git a/models/demos/llama3/lt b/models/demos/llama3/lt index 594568609ba1..8f68983a2b6b 100755 --- a/models/demos/llama3/lt +++ b/models/demos/llama3/lt @@ -733,7 +733,7 @@ def run_entry_command(entry, screen_lock, output_entries, screen_needs_update): command_shortcuts = { "demo": "pytest models/demos/llama3/demo/demo.py -k performance-batch-1", "demo-32": "pytest models/demos/llama3/demo/demo.py -k performance-batch-32", - "demo-long": "pytest models/demos/llama3/demo/demo.py -k long", + "demo-long": "pytest 
models/demos/llama3/demo/demo.py -k performance-long", "attention": "pytest models/demos/llama3/tests/test_llama_attention.py", "attention-prefill": "pytest models/demos/llama3/tests/test_llama_attention_prefill.py", "mlp": "pytest models/demos/llama3/tests/test_llama_mlp.py", diff --git a/models/demos/llama3/tests/multimodal/test_llama_cross_attention.py b/models/demos/llama3/tests/multimodal/test_llama_cross_attention.py index cc34f091e170..462a004b1330 100644 --- a/models/demos/llama3/tests/multimodal/test_llama_cross_attention.py +++ b/models/demos/llama3/tests/multimodal/test_llama_cross_attention.py @@ -34,9 +34,10 @@ ) @pytest.mark.parametrize( "batch", - (1,), + (1, 2), ids=[ "batch_1", + "batch_2", ], ) def test_llama_cross_attention_inference(text_seq_len, batch, mesh_device, reset_seeds, ensure_gc): @@ -46,6 +47,7 @@ def test_llama_cross_attention_inference(text_seq_len, batch, mesh_device, reset mesh_device.enable_async(True) model_args = TtModelArgs(mesh_device) + model_args.max_seq_len = text_seq_len state_dict = torch.load(model_args.consolidated_weights_path, map_location=torch.device("cpu")) # Ref model needs partial state dict, but our models use full state dict keys as cached weight names @@ -91,12 +93,15 @@ def test_llama_cross_attention_inference(text_seq_len, batch, mesh_device, reset """ pt_xattn_cache = reference_model.compute_xattn_kv_cache(pt_xattn_tokens) pt_xattn_cache_chunks = torch.chunk(pt_xattn_cache, 2, dim=0) - pt_xattn_cache_chunks = [x.view(batch, n_heads, vision_seq_len, head_dim) for x in pt_xattn_cache] + # slice out repeated KV heads + pt_xattn_cache_chunks = [ + x.view(batch, n_heads, vision_seq_len, head_dim)[:, :: n_heads // n_kv_heads] for x in pt_xattn_cache + ] # Preallocate K and V caches tt_xattn_cache = [ ttnn.from_torch( - torch.zeros(batch, n_heads, vision_seq_len, head_dim), + torch.zeros(batch, n_kv_heads, vision_seq_len, head_dim), device=mesh_device, layout=ttnn.TILE_LAYOUT, 
memory_config=ttnn.DRAM_MEMORY_CONFIG, @@ -109,9 +114,10 @@ def test_llama_cross_attention_inference(text_seq_len, batch, mesh_device, reset """ Test forward, prefill and decode! """ - for i in range(10): - seq_len = text_seq_len if i == 0 else 1 + n_iter = 10 + for i in range(n_iter): mode = "prefill" if i == 0 else "decode" + seq_len = text_seq_len if mode == "prefill" else 1 pt_x = (torch.rand(batch, seq_len, dim) * 2) - 1 tt_x = pt_x.clone() @@ -150,18 +156,18 @@ def test_llama_cross_attention_inference(text_seq_len, batch, mesh_device, reset if mode == "prefill": outputs = [] for b in range(batch): - tt_tensor_xattn_tokens = model_args.prepare_inputs_ttnn_prefill( + tt_tensor_xattn_tokens = model_args.prepare_residual_tensor_prefill( tt_xattn_tokens[b : b + 1], force_replicated=True, ) - tt_tensor_x = model_args.prepare_inputs_ttnn_prefill( + tt_tensor_x = model_args.prepare_residual_tensor_prefill( tt_x[b : b + 1], force_replicated=True, ) tt_xattn_mask = ttnn.from_torch( - xattn_mask_expand[b : b + 1], + xattn_mask[b : b + 1], device=mesh_device, - dtype=ttnn.bfloat8_b, + dtype=ttnn.bfloat4_b, layout=ttnn.TILE_LAYOUT, memory_config=ttnn.DRAM_MEMORY_CONFIG, mesh_mapper=ttnn.ReplicateTensorToMesh(mesh_device), @@ -169,7 +175,7 @@ def test_llama_cross_attention_inference(text_seq_len, batch, mesh_device, reset tt_full_text_mask = ttnn.from_torch( full_text_mask_expand[b : b + 1], device=mesh_device, - dtype=ttnn.bfloat8_b, + dtype=ttnn.bfloat4_b, layout=ttnn.TILE_LAYOUT, memory_config=ttnn.DRAM_MEMORY_CONFIG, mesh_mapper=ttnn.ReplicateTensorToMesh(mesh_device), @@ -190,18 +196,17 @@ def test_llama_cross_attention_inference(text_seq_len, batch, mesh_device, reset tt_output_torch = torch.cat(outputs, dim=0).view(batch, seq_len, dim) else: - tt_x = model_args.prepare_inputs_ttnn_decode( + tt_x = model_args.prepare_residual_tensor_decode( tt_x, - ttnn.DRAM_MEMORY_CONFIG, + model_args.model_config["SHARDED_ATTN_INPUT_MEMCFG"], force_replicated=True, ) - tt_x = 
ttnn.interleaved_to_sharded(tt_x, model_args.model_config["SHARDED_ATTN_INPUT_MEMCFG"]) xattn_mask_expand = xattn_mask_expand.permute(2, 0, 1, 3).contiguous() tt_xattn_mask = ttnn.from_torch( xattn_mask_expand, device=mesh_device, - dtype=ttnn.bfloat8_b, + dtype=ttnn.bfloat4_b, layout=ttnn.TILE_LAYOUT, memory_config=ttnn.DRAM_MEMORY_CONFIG, mesh_mapper=ttnn.ReplicateTensorToMesh(mesh_device), @@ -218,7 +223,7 @@ def test_llama_cross_attention_inference(text_seq_len, batch, mesh_device, reset tt_full_text_mask = ttnn.from_torch( full_text_mask_expand, device=mesh_device, - dtype=ttnn.bfloat8_b, + dtype=ttnn.bfloat4_b, layout=ttnn.TILE_LAYOUT, memory_config=ttnn.DRAM_MEMORY_CONFIG, mesh_mapper=ttnn.ReplicateTensorToMesh(mesh_device), @@ -239,7 +244,7 @@ def test_llama_cross_attention_inference(text_seq_len, batch, mesh_device, reset mode=mode, ) - tt_output_torch = ttnn.to_torch(tt_out, mesh_composer=ttnn.ConcatMeshToTensor(mesh_device, dim=1)) + tt_output_torch = ttnn.to_torch(tt_out, mesh_composer=ttnn.ConcatMeshToTensor(mesh_device, dim=-1)) tt_output_torch = tt_output_torch[:, :, :batch, :].reshape(batch, seq_len, dim) passing, pcc_message = comp_pcc(pt_out, tt_output_torch, pcc_required) @@ -251,12 +256,13 @@ def test_llama_cross_attention_inference(text_seq_len, batch, mesh_device, reset tt_xattn_cache_torch = [ ttnn.to_torch(x, mesh_composer=ttnn.ConcatMeshToTensor(mesh_device, dim=1)).view( batch, - n_heads, + n_kv_heads, vision_seq_len, head_dim, ) for x in tt_xattn_cache ] + for pt, tt in zip(pt_xattn_cache_chunks, tt_xattn_cache_torch): passing, pcc_message = comp_pcc(pt, tt, pcc_required) diff --git a/models/demos/llama3/tests/multimodal/test_llama_cross_attention_transformer_text.py b/models/demos/llama3/tests/multimodal/test_llama_cross_attention_transformer_text.py index 7448601b8ce4..1d9da2fbcca3 100644 --- a/models/demos/llama3/tests/multimodal/test_llama_cross_attention_transformer_text.py +++ 
b/models/demos/llama3/tests/multimodal/test_llama_cross_attention_transformer_text.py @@ -75,6 +75,7 @@ def test_llama_cross_attention_transformer_text_inference( dim = model_args.dim head_dim = model_args.head_dim n_heads = model_args.n_heads + n_kv_heads = model_args.n_kv_heads reference_model = llama_reference_mod.CrossAttentionTransformerText(args=model_args) reference_model.setup_cache(model_args.max_batch_size, torch.float32) reference_model.load_state_dict(partial_state_dict) @@ -107,15 +108,18 @@ def test_llama_cross_attention_transformer_text_inference( # unstack k/v pt_xattn_cache_chunks = [torch.chunk(x, 2, dim=1) for x in pt_xattn_cache_chunks] pt_xattn_cache_chunks = [x for xx in pt_xattn_cache_chunks for x in xx] - pt_xattn_cache_chunks = [x.view(batch, n_heads, vision_seq_len, head_dim) for x in pt_xattn_cache_chunks] + pt_xattn_cache_chunks = [ + x.view(batch, n_heads, vision_seq_len, head_dim)[:, :: n_heads // n_kv_heads] for x in pt_xattn_cache_chunks + ] # Iterate over batch # Preallocate K and V caches tt_xattn_cache = tt_model.setup_cache(max_batch_size=batch) # Test forward pass of the model - n_iter = 10 + prev_pos = 0 + n_iter = 10 # tokens = torch.randint(100, 1000, (batch, text_seq_len+n_iter), dtype=torch.long)#, device="cuda" tokens = torch.randint(0, model_args.vocab_size, (batch, text_seq_len + n_iter), dtype=torch.long) for i in range(n_iter): @@ -177,17 +181,17 @@ def test_llama_cross_attention_transformer_text_inference( if mode == "prefill": outputs = [] for b in range(batch): - tt_tensor_vision_tokens = model_args.prepare_inputs_ttnn_prefill( + tt_tensor_vision_tokens = model_args.prepare_residual_tensor_prefill( tt_vision_tokens[b : b + 1], force_replicated=True, ) - tt_h = model_args.prepare_inputs_ttnn_prefill( + tt_h = model_args.prepare_residual_tensor_prefill( h[b : b + 1], ) tt_xattn_mask = ttnn.from_torch( - xattn_mask_expand[b : b + 1], + xattn_mask[b : b + 1], device=mesh_device, - dtype=ttnn.bfloat8_b, + 
dtype=ttnn.bfloat4_b, layout=ttnn.TILE_LAYOUT, memory_config=ttnn.DRAM_MEMORY_CONFIG, mesh_mapper=ttnn.ReplicateTensorToMesh(mesh_device), @@ -195,7 +199,7 @@ def test_llama_cross_attention_transformer_text_inference( tt_full_text_mask_expand_1NSH = ttnn.from_torch( full_text_mask_expand_1NSH[b : b + 1], device=mesh_device, - dtype=ttnn.bfloat8_b, + dtype=ttnn.bfloat4_b, layout=ttnn.TILE_LAYOUT, memory_config=ttnn.DRAM_MEMORY_CONFIG, mesh_mapper=ttnn.ReplicateTensorToMesh(mesh_device), @@ -203,7 +207,7 @@ def test_llama_cross_attention_transformer_text_inference( tt_full_text_mask_expand_11SD = ttnn.from_torch( full_text_mask_expand_11SD[b : b + 1], device=mesh_device, - dtype=ttnn.bfloat8_b, + dtype=ttnn.bfloat4_b, layout=ttnn.TILE_LAYOUT, memory_config=ttnn.DRAM_MEMORY_CONFIG, mesh_mapper=ttnn.ShardTensorToMesh(mesh_device, dim=-1), @@ -212,16 +216,6 @@ def test_llama_cross_attention_transformer_text_inference( rot_mats = get_prefill_rot_mat( model_args.head_dim, model_args.max_seq_len, mesh_device, seq_len=seq_len ) - transformation_mat_torch = get_rot_transformation_mat(model_args.head_dim) - transformation_mats = ttnn.as_tensor( - transformation_mat_torch, - dtype=ttnn.bfloat16, - layout=ttnn.TILE_LAYOUT, - device=mesh_device, - mesh_mapper=ttnn.ReplicateTensorToMesh(mesh_device), - memory_config=ttnn.DRAM_MEMORY_CONFIG, - ) - tt_out = tt_model( tt_h, xattn_mask=tt_xattn_mask, @@ -229,8 +223,7 @@ def test_llama_cross_attention_transformer_text_inference( full_text_row_masked_out_mask_11SD=tt_full_text_mask_expand_11SD, xattn_caches=tt_xattn_cache, current_pos=None, - rot_mat=rot_mats, - transformation_mats=transformation_mats, + rot_mats=rot_mats, user_id=b, mode=mode, text_only_inference=TEXT_ONLY, @@ -245,7 +238,7 @@ def test_llama_cross_attention_transformer_text_inference( pcc_required = prefill_pcc_required else: - tt_h = model_args.prepare_inputs_ttnn_decode( + tt_h = model_args.prepare_residual_tensor_decode( h, 
model_args.model_config["DECODE_RESIDUAL_MEMCFG"], ) @@ -265,14 +258,14 @@ def test_llama_cross_attention_transformer_text_inference( model_args.num_devices, start_pos=cur_pos - 1, ) - - transformation_mats = None + tt_rope_id = tt_model.rope_setup.get_rot_idxs(position_ids) + rot_mats = tt_model.rope_setup.get_rot_mats(tt_rope_id) xattn_mask_expand = xattn_mask_expand.permute(2, 0, 1, 3).contiguous() tt_xattn_mask = ttnn.from_torch( xattn_mask_expand, device=mesh_device, - dtype=ttnn.bfloat8_b, + dtype=ttnn.bfloat4_b, layout=ttnn.TILE_LAYOUT, memory_config=ttnn.DRAM_MEMORY_CONFIG, mesh_mapper=ttnn.ReplicateTensorToMesh(mesh_device), @@ -289,7 +282,7 @@ def test_llama_cross_attention_transformer_text_inference( tt_full_text_mask_expand_1NSH = ttnn.from_torch( full_text_mask_expand_1NSH, device=mesh_device, - dtype=ttnn.bfloat8_b, + dtype=ttnn.bfloat4_b, layout=ttnn.TILE_LAYOUT, memory_config=ttnn.DRAM_MEMORY_CONFIG, mesh_mapper=ttnn.ReplicateTensorToMesh(mesh_device), @@ -311,8 +304,7 @@ def test_llama_cross_attention_transformer_text_inference( full_text_row_masked_out_mask_11SD=None, xattn_caches=tt_xattn_cache, current_pos=tt_position_id, - rot_mat=rot_mats, - transformation_mats=transformation_mats, + rot_mats=rot_mats, mode=mode, text_only_inference=TEXT_ONLY, ) @@ -332,7 +324,7 @@ def test_llama_cross_attention_transformer_text_inference( tt_xattn_cache_torch = [ ttnn.to_torch(x, mesh_composer=ttnn.ConcatMeshToTensor(mesh_device, dim=1)).view( batch, - n_heads, + n_kv_heads, vision_seq_len, head_dim, ) diff --git a/models/demos/llama3/tests/multimodal/test_llama_cross_block.py b/models/demos/llama3/tests/multimodal/test_llama_cross_block.py index 96637e5090c7..3f6a9253e5d2 100644 --- a/models/demos/llama3/tests/multimodal/test_llama_cross_block.py +++ b/models/demos/llama3/tests/multimodal/test_llama_cross_block.py @@ -30,9 +30,10 @@ ) @pytest.mark.parametrize( "batch", - (1,), + (1, 2), ids=[ "batch_1", + "batch_2", ], ) def 
test_llama_cross_attention_transformer_block_inference( @@ -57,6 +58,7 @@ def test_llama_cross_attention_transformer_block_inference( dim = model_args.dim head_dim = model_args.head_dim n_heads = model_args.n_heads + n_kv_heads = model_args.n_kv_heads reference_model = llama_reference_mod.CrossAttentionTransformerBlock(args=model_args, layer_id=0, no_ffn=False) reference_model.load_state_dict(partial_state_dict) @@ -83,12 +85,14 @@ def test_llama_cross_attention_transformer_block_inference( """ pt_xattn_cache = reference_model.compute_xattn_kv_cache(pt_xattn_tokens) pt_xattn_cache_chunks = torch.chunk(pt_xattn_cache, 2, dim=0) - pt_xattn_cache_chunks = [x.view(batch, n_heads, vision_seq_len, head_dim) for x in pt_xattn_cache] + pt_xattn_cache_chunks = [ + x.view(batch, n_heads, vision_seq_len, head_dim)[:, :: n_heads // n_kv_heads] for x in pt_xattn_cache + ] # Preallocate K and V caches tt_xattn_cache = [ ttnn.from_torch( - torch.zeros(batch, n_heads, vision_seq_len, head_dim), + torch.zeros(batch, n_kv_heads, vision_seq_len, head_dim), device=mesh_device, layout=ttnn.TILE_LAYOUT, memory_config=ttnn.DRAM_MEMORY_CONFIG, @@ -145,17 +149,17 @@ def test_llama_cross_attention_transformer_block_inference( if mode == "prefill": outputs = [] for b in range(batch): - tt_tensor_xattn_tokens = model_args.prepare_inputs_ttnn_prefill( + tt_tensor_xattn_tokens = model_args.prepare_residual_tensor_prefill( tt_xattn_tokens[b : b + 1], force_replicated=True, ) - tt_tensor_x = model_args.prepare_inputs_ttnn_prefill( + tt_tensor_x = model_args.prepare_residual_tensor_prefill( tt_x[b : b + 1], ) tt_xattn_mask = ttnn.from_torch( - xattn_mask_expand[b : b + 1], + xattn_mask[b : b + 1], device=mesh_device, - dtype=ttnn.bfloat8_b, + dtype=ttnn.bfloat4_b, layout=ttnn.TILE_LAYOUT, memory_config=ttnn.DRAM_MEMORY_CONFIG, mesh_mapper=ttnn.ReplicateTensorToMesh(mesh_device), @@ -163,7 +167,7 @@ def test_llama_cross_attention_transformer_block_inference( tt_full_text_mask_expand_1NSH = 
ttnn.from_torch( full_text_mask_expand_1NSH[b : b + 1], device=mesh_device, - dtype=ttnn.bfloat8_b, + dtype=ttnn.bfloat4_b, layout=ttnn.TILE_LAYOUT, memory_config=ttnn.DRAM_MEMORY_CONFIG, mesh_mapper=ttnn.ReplicateTensorToMesh(mesh_device), @@ -171,7 +175,7 @@ def test_llama_cross_attention_transformer_block_inference( tt_full_text_mask_expand_11SD = ttnn.from_torch( full_text_mask_expand_11SD[b : b + 1], device=mesh_device, - dtype=ttnn.bfloat8_b, + dtype=ttnn.bfloat4_b, layout=ttnn.TILE_LAYOUT, memory_config=ttnn.DRAM_MEMORY_CONFIG, mesh_mapper=ttnn.ShardTensorToMesh(mesh_device, dim=-1), @@ -193,15 +197,15 @@ def test_llama_cross_attention_transformer_block_inference( tt_output_torch = torch.cat(outputs, dim=0).view(batch, seq_len, dim) else: - tt_x = model_args.prepare_inputs_ttnn_decode( + tt_x = model_args.prepare_residual_tensor_decode( tt_x, - ttnn.DRAM_MEMORY_CONFIG, + model_args.model_config["DECODE_RESIDUAL_MEMCFG"], ) xattn_mask_expand = xattn_mask_expand.permute(2, 0, 1, 3).contiguous() tt_xattn_mask = ttnn.from_torch( xattn_mask_expand, device=mesh_device, - dtype=ttnn.bfloat8_b, + dtype=ttnn.bfloat4_b, layout=ttnn.TILE_LAYOUT, memory_config=ttnn.DRAM_MEMORY_CONFIG, mesh_mapper=ttnn.ReplicateTensorToMesh(mesh_device), @@ -218,7 +222,7 @@ def test_llama_cross_attention_transformer_block_inference( tt_full_text_mask_expand_1NSH = ttnn.from_torch( full_text_mask_expand_1NSH, device=mesh_device, - dtype=ttnn.bfloat8_b, + dtype=ttnn.bfloat4_b, layout=ttnn.TILE_LAYOUT, memory_config=ttnn.DRAM_MEMORY_CONFIG, mesh_mapper=ttnn.ReplicateTensorToMesh(mesh_device), @@ -252,7 +256,7 @@ def test_llama_cross_attention_transformer_block_inference( tt_xattn_cache_torch = [ ttnn.to_torch(x, mesh_composer=ttnn.ConcatMeshToTensor(mesh_device, dim=1)).view( batch, - n_heads, + n_kv_heads, vision_seq_len, head_dim, ) diff --git a/models/demos/llama3/tests/multimodal/test_llama_image_attention.py b/models/demos/llama3/tests/multimodal/test_llama_image_attention.py index 
844937a518bc..3d9e69771457 100644 --- a/models/demos/llama3/tests/multimodal/test_llama_image_attention.py +++ b/models/demos/llama3/tests/multimodal/test_llama_image_attention.py @@ -73,7 +73,7 @@ def test_llama_attention_inference(batch, num_chunks, mesh_device, use_program_c mask = encoder_utils.build_encoder_attention_mask(pt_block_input, ar, ntok, num_chunks, 1) pt_block_input = pt_block_input.reshape(batch, -1, dim) - attention_input = model_args.prepare_inputs_ttnn_prefill( + attention_input = model_args.prepare_residual_tensor_prefill( tt_attention_input.view(num_chunks, ntok, dim), force_replicated=True, ) diff --git a/models/demos/llama3/tests/multimodal/test_llama_image_block.py b/models/demos/llama3/tests/multimodal/test_llama_image_block.py index 001aa5188282..23096202e292 100644 --- a/models/demos/llama3/tests/multimodal/test_llama_image_block.py +++ b/models/demos/llama3/tests/multimodal/test_llama_image_block.py @@ -84,7 +84,7 @@ def test_llama_block_inference(batch, num_chunks, mesh_device, gated, use_progra mask = encoder_utils.build_encoder_attention_mask(pt_block_input, ar, ntok, num_chunks, 1) pt_block_input = pt_block_input.reshape(batch, -1, dim) - attention_input = model_args.prepare_inputs_ttnn_prefill( + attention_input = model_args.prepare_residual_tensor_prefill( tt_attention_input.view(num_chunks, ntok, dim), force_replicated=True, ) diff --git a/models/demos/llama3/tests/multimodal/test_llama_image_transformer.py b/models/demos/llama3/tests/multimodal/test_llama_image_transformer.py index 03f3310a0e32..502736ac7907 100644 --- a/models/demos/llama3/tests/multimodal/test_llama_image_transformer.py +++ b/models/demos/llama3/tests/multimodal/test_llama_image_transformer.py @@ -113,7 +113,7 @@ def test_llama_image_transformer_inference( mask = encoder_utils.build_encoder_attention_mask(pt_block_input, ar, ntok, num_chunks, 1) pt_block_input = pt_block_input.reshape(batch, -1, dim) - attention_input = model_args.prepare_inputs_ttnn_prefill( 
+ attention_input = model_args.prepare_residual_tensor_prefill( tt_attention_input.view(num_chunks, ntok, dim), force_replicated=True, ) diff --git a/models/demos/llama3/tests/multimodal/test_llama_vision_model.py b/models/demos/llama3/tests/multimodal/test_llama_vision_model.py deleted file mode 100644 index f55a47891aca..000000000000 --- a/models/demos/llama3/tests/multimodal/test_llama_vision_model.py +++ /dev/null @@ -1,154 +0,0 @@ -# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. - -# SPDX-License-Identifier: Apache-2.0 -from pathlib import Path -from typing import Optional -from loguru import logger - -from PIL import Image as PIL_Image -from termcolor import cprint - -import llama_models.llama3.reference_impl.generation as llama_reference_generation - -from llama_models.llama3.api.datatypes import ImageMedia - -from models.utility_functions import ( - comp_pcc, - comp_allclose, -) - -THIS_DIR = Path(__file__).parent.parent.parent.resolve() / "reference/llama_models/models/scripts/" - -import torch -import pytest -import os -import ttnn - - -def create_multimodal_model(model_args, mesh_device, dtype=ttnn.bfloat16): - from models.demos.llama3.tt.multimodal.llama_vision_model import CrossAttentionTransformer - from models.demos.llama3.tt.model_config import TtModelArgs - - tt_model_args = TtModelArgs(mesh_device) - checkpoint = torch.load(tt_model_args.consolidated_weights_path, map_location="cpu", weights_only=True) - model = CrossAttentionTransformer( - model_args, - mesh_device, - checkpoint, - weight_cache_path=tt_model_args.weight_cache_path(dtype), - dtype=dtype, - configuration=tt_model_args, - ) - model.setup_cache(model_args.max_batch_size, torch.float32) - return model - - -@pytest.mark.parametrize( - "mesh_device", - [ - {"N150": (1, 1), "N300": (1, 2), "T3K": (1, 8), "TG": (8, 4)}.get( - os.environ.get("FAKE_DEVICE"), len(ttnn.get_device_ids()) - ) - ], - indirect=True, -) -def test_llama_vision_model( - mesh_device, - temperature: float = 0, - 
max_seq_len: int = 512, - max_batch_size: int = 4, - max_gen_len: Optional[int] = 50, - model_parallel_size: Optional[int] = None, -): - """ - This test runs the Llama3.2 vision model on CPU and TT concurrently. - It does not use teacher forcing and compares output logits at every token. - """ - mesh_device.enable_program_cache() - mesh_device.enable_async(True) - ckpt_dir = os.environ["LLAMA_DIR"] - tokenizer_path = str(Path(ckpt_dir) / "tokenizer.model") - - logger.info(f"Creating reference model from checkpoint in '{ckpt_dir}'") - generator_pt = llama_reference_generation.Llama.build( - ckpt_dir, - tokenizer_path=tokenizer_path, - max_seq_len=max_seq_len, - max_batch_size=max_batch_size, - model_parallel_size=model_parallel_size, - ) - - generator_tt = llama_reference_generation.Llama(generator_pt.model, generator_pt.tokenizer, generator_pt.args) - logger.info(f"Creating TT model on {len(mesh_device.get_devices())} devices") - model = create_multimodal_model(generator_tt.args, mesh_device) - generator_tt.model = model - - # with open(THIS_DIR / "resources/dog.jpg", "rb") as f: - # img = PIL_Image.open(f).convert("RGB") - - # with open(THIS_DIR / "resources/pasta.jpeg", "rb") as f: - # img2 = PIL_Image.open(f).convert("RGB") - - with open(THIS_DIR / "resources/ocr_image.jpeg", "rb") as f: - ocr_image = PIL_Image.open(f).convert("RGB") - - # with open(THIS_DIR / "resources/clutter.jpeg", "rb") as f: - # clutter = PIL_Image.open(f).convert("RGB") - - interleaved_contents = [ - # text only - # "The color of the sky is blue but sometimes it can also be", - # image understanding - # [ImageMedia(image=img), "If I had to write a haiku for this one"], - # [ImageMedia(image=img2), "Couting the number of individual spaghetti strands in this image"], - [ImageMedia(image=ocr_image), "The full text in this image is as follows"], - # [ImageMedia(image=clutter), "The count of vases, books, and miscellaneous items in this image is"], - ] - - for content in interleaved_contents: 
- logger.info(f"Generating text for content: {content}") - model_input = generator_pt.formatter.encode_content(content) - gen_pt = generator_pt.generate( - model_input, max_gen_len=max_gen_len, temperature=temperature, return_logits=True - ) - gen_tt = generator_tt.generate( - model_input, max_gen_len=max_gen_len, temperature=temperature, return_logits=True - ) - - for out_idx, (token_pt, token_tt) in enumerate(zip(gen_pt, gen_tt)): - logger.info(f"Comparing output token {out_idx}") - out_pt, out_tt = token_pt[1], token_tt[1] - out_pt = out_pt[0, -1] - out_tt = out_tt[0, -1] - passing, pcc_message = comp_pcc(out_pt, out_tt, 0.90) - print(f"PCC: {pcc_message}") - # Check shapes of logprobs - - ref_argmax = torch.argmax(out_pt).item() - ref_logprob = out_pt[ref_argmax].item() - ref_token = generator_pt.tokenizer.decode([ref_argmax]) - - # Reference model: top-5 tokens - ref_top5_vals, ref_top5_idxs = torch.topk(out_pt, 5) - ref_top5_tokens = [generator_pt.tokenizer.decode([idx.item()]) for idx in ref_top5_idxs] - ref_top5_logprobs = ref_top5_vals.tolist() - - # Test model: top-5 tokens - top5_vals, top5_idxs = torch.topk(out_tt, 5) - top5_tokens = [generator_pt.tokenizer.decode([idx.item()]) for idx in top5_idxs] - top5_logprobs = top5_vals.tolist() - - def entropy(logits): - probs = torch.softmax(logits, dim=-1) - return -(probs * torch.log(probs)).sum().item() - - # Print the information - print(f"Token Position {out_idx}:") - print(f" Reference | Test") - print(f" Entropy: {entropy(out_pt):.4f} | {entropy(out_tt):.4f}") - print(f" Top-5 Tokens:") - for rank in range(5): - print( - f" {rank+1}. 
Token='{ref_top5_tokens[rank]}' @ {ref_top5_logprobs[rank]:.2f} | '{top5_tokens[rank]}' @ {top5_logprobs[rank]:.2f}" - ) - print() diff --git a/models/demos/llama3/tests/test_interleaved_to_sharded.py b/models/demos/llama3/tests/test_interleaved_to_sharded.py index b69d7d2459b4..9edc9a89dd03 100644 --- a/models/demos/llama3/tests/test_interleaved_to_sharded.py +++ b/models/demos/llama3/tests/test_interleaved_to_sharded.py @@ -80,7 +80,7 @@ def test_llama_decoder_inference(mesh_device, use_program_cache, reset_seeds): mesh_mapper=ttnn.ReplicateTensorToMesh(mesh_device), ) - decode_input = model_args.prepare_inputs_ttnn_decode( + decode_input = model_args.prepare_residual_tensor_decode( tt_decode_input, ttnn.L1_MEMORY_CONFIG, ) diff --git a/models/demos/llama3/tests/test_llama_accuracy.py b/models/demos/llama3/tests/test_llama_accuracy.py index 2ae973a907d7..b19cb086066d 100644 --- a/models/demos/llama3/tests/test_llama_accuracy.py +++ b/models/demos/llama3/tests/test_llama_accuracy.py @@ -9,13 +9,11 @@ import ttnn from models.demos.llama3.tt.llama_common import ( get_prefill_rot_mat, - get_rot_transformation_mat, HostEmbedding, PagedAttentionConfig, ) from models.demos.llama3.tt.llama_model import TtTransformer from models.demos.llama3.tt.model_config import TtModelArgs, LlamaOptimizations -from models.demos.llama3.tt.llama_rope import TtLlamaRotarySetup from models.demos.t3000.llama2_70b.reference.llama.llama31_8b.tokenizer import Tokenizer from models.demos.llama3.demo.demo import preprocess_inputs_prefill from pathlib import Path @@ -141,28 +139,6 @@ def test_tt_model_accuracy( N = prefill_len + decode_len input_ids = reference_tokens[:, : N + 1] # Shape [1, N+1] - # Setup RoPE transformation matrices - rope_setup = TtLlamaRotarySetup( - mesh_device, - model_args.max_batch_size, - model_args.head_dim, - model_args.max_seq_len, - model_args.rope_theta, - model_args.use_scaled_rope, - ) - transformation_mats_decode = rope_setup.get_trans_mats() - - 
transformation_mats_prefill_torch = get_rot_transformation_mat(model_args.head_dim) - transformation_mats_prefill = ttnn.from_torch( - transformation_mats_prefill_torch, - dtype=ttnn.bfloat16, - layout=ttnn.TILE_LAYOUT, - device=mesh_device, - memory_config=ttnn.DRAM_MEMORY_CONFIG, - mesh_mapper=ttnn.ReplicateTensorToMesh(mesh_device), - ) - transformation_mats = {"decode": transformation_mats_decode, "prefill": transformation_mats_prefill} - page_table_tt = None paged_attention_config = None @@ -193,7 +169,6 @@ def test_tt_model_accuracy( dtype=dtype, state_dict=state_dict, weight_cache_path=model_args.weight_cache_path(dtype), - transformation_mats=transformation_mats, paged_attention_config=paged_attention_config, ) # Initialize embedding @@ -226,7 +201,7 @@ def test_tt_model_accuracy( model_args.head_dim, model_args.max_seq_len, mesh_device, seq_len=prefill_lens[0] ) - prefill_input = model_args.prepare_inputs_ttnn_prefill( + prefill_input = model_args.prepare_residual_tensor_prefill( pt_prefill_input[batch_id], ) @@ -256,7 +231,7 @@ def test_tt_model_accuracy( ) # Get cos/sin matrices for the current position of each user - rot_mats = rope_setup.get_rot_mats(current_pos) + rot_mats = tt_model.rope_setup.get_rot_mats(current_pos) # Print table header logger.info(f"{'Progress':<15}{'Correct':<8}{'True':<15}{'Actual':<15}{'Top 5 Predictions':<75}") @@ -276,7 +251,7 @@ def test_tt_model_accuracy( # Get embedding pt_decode_input = embd(ref_token).view(1, 1, -1) # Prepare input for TT model - decode_input = model_args.prepare_inputs_ttnn_decode( + decode_input = model_args.prepare_residual_tensor_decode( pt_decode_input, model_args.model_config["DECODE_RESIDUAL_MEMCFG"], ) @@ -309,7 +284,7 @@ def test_tt_model_accuracy( # Update rot_mats for next iteration current_pos += 1 - rot_mats = rope_setup.get_rot_mats(current_pos) + rot_mats = tt_model.rope_setup.get_rot_mats(current_pos) # Get reference top5 tokens and probabilities for this position ref_top5_tokens = 
top5_tokens[prefill_len + i] diff --git a/models/demos/llama3/tests/test_llama_attention.py b/models/demos/llama3/tests/test_llama_attention.py index 8690b91d3b9a..edb9ac99a43b 100644 --- a/models/demos/llama3/tests/test_llama_attention.py +++ b/models/demos/llama3/tests/test_llama_attention.py @@ -100,8 +100,7 @@ def test_llama_attention_inference( model_args.use_scaled_rope, ) - transformation_mats = rope_setup.get_trans_mats() - transformation_mats = {"decode": transformation_mats} + transformation_mats = rope_setup.get_both_trans_mats() page_table_tt = None paged_attention_config = None @@ -158,7 +157,7 @@ def test_llama_attention_inference( tt_attention_input = pt_attention_input.clone() - attention_input = model_args.prepare_inputs_ttnn_decode( + attention_input = model_args.prepare_residual_tensor_decode( tt_attention_input, model_args.model_config["SHARDED_ATTN_INPUT_MEMCFG"], force_replicated=True, diff --git a/models/demos/llama3/tests/test_llama_attention_prefill.py b/models/demos/llama3/tests/test_llama_attention_prefill.py index ef33adc44819..4335bdb4ee1a 100644 --- a/models/demos/llama3/tests/test_llama_attention_prefill.py +++ b/models/demos/llama3/tests/test_llama_attention_prefill.py @@ -141,7 +141,7 @@ def test_llama_attention_inference( pt_attention_input = (torch.rand(batch_size, max_seq_len, model_args.dim) * 2) - 1 tt_attention_input = pt_attention_input.clone() - attention_input = model_args.prepare_inputs_ttnn_prefill( + attention_input = model_args.prepare_residual_tensor_prefill( tt_attention_input, force_replicated=True, ) diff --git a/models/demos/llama3/tests/test_llama_decoder.py b/models/demos/llama3/tests/test_llama_decoder.py index 5d24d3b42989..316c811aaf3e 100644 --- a/models/demos/llama3/tests/test_llama_decoder.py +++ b/models/demos/llama3/tests/test_llama_decoder.py @@ -94,8 +94,7 @@ def test_llama_decoder_inference( model_args.rope_theta, model_args.use_scaled_rope, ) - transformation_mats = rope_setup.get_trans_mats() - 
transformation_mats = {"decode": transformation_mats} + transformation_mats = rope_setup.get_both_trans_mats() # Prepare page table for paged attention page_table_tt = None @@ -155,7 +154,7 @@ def test_llama_decoder_inference( pt_decode_input = (torch.rand(batch_size, seqlen, model_args.dim) * 2) - 1 tt_decode_input = pt_decode_input.clone() - decode_input = model_args.prepare_inputs_ttnn_decode( + decode_input = model_args.prepare_residual_tensor_decode( tt_decode_input, # ttnn.DRAM_MEMORY_CONFIG, model_args.model_config["DECODE_RESIDUAL_MEMCFG"], diff --git a/models/demos/llama3/tests/test_llama_decoder_prefill.py b/models/demos/llama3/tests/test_llama_decoder_prefill.py index 0c40e21b773e..622e67f91b41 100644 --- a/models/demos/llama3/tests/test_llama_decoder_prefill.py +++ b/models/demos/llama3/tests/test_llama_decoder_prefill.py @@ -142,7 +142,7 @@ def test_llama_decoder_inference( logger.info(f"[Decoder] Generating token {i}") pt_decode_input = (torch.rand(batch_size, max_seq_len, model_args.dim) * 2) - 1 tt_decode_input = pt_decode_input.clone() - decode_input = model_args.prepare_inputs_ttnn_prefill( + decode_input = model_args.prepare_residual_tensor_prefill( tt_decode_input, ) positions = torch.LongTensor(range(max_seq_len)) diff --git a/models/demos/llama3/tests/test_llama_model.py b/models/demos/llama3/tests/test_llama_model.py index cd425579a23e..37e0e4384192 100644 --- a/models/demos/llama3/tests/test_llama_model.py +++ b/models/demos/llama3/tests/test_llama_model.py @@ -15,7 +15,6 @@ ) from models.demos.llama3.tt.model_config import TtModelArgs, LlamaOptimizations from models.demos.llama3.tt.llama_model import TtTransformer -from models.demos.llama3.tt.llama_rope import TtLlamaRotarySetup from models.demos.t3000.llama2_70b.reference.llama.llama31_8b.model import Transformer from models.demos.t3000.llama2_70b.reference.llama.llama31_8b.tokenizer import Tokenizer from models.utility_functions import ( @@ -191,18 +190,6 @@ def 
test_llama_model_inference( generation_start_pos = 0 generation_length = iterations - # Setup RoPE transformation matrices - rope_setup = TtLlamaRotarySetup( - mesh_device, - model_args.max_batch_size, - model_args.head_dim, - model_args.max_seq_len, - model_args.rope_theta, - model_args.use_scaled_rope, - ) - transformation_mats = rope_setup.get_trans_mats() - transformation_mats = {"decode": transformation_mats} - page_table_tt = None paged_attention_config = None @@ -234,7 +221,6 @@ def test_llama_model_inference( dtype=dtype, state_dict=state_dict, weight_cache_path=model_args.weight_cache_path(dtype), - transformation_mats=transformation_mats, paged_attention_config=paged_attention_config, ) logger.info("Model and caches loaded.") @@ -269,13 +255,13 @@ def test_llama_model_inference( for i in range(generation_length): logger.info(f"[Llama3 Model] Generating token {i}") - decode_input = model_args.prepare_inputs_ttnn_decode( + decode_input = model_args.prepare_residual_tensor_decode( tt_decode_input, model_args.model_config["DECODE_RESIDUAL_MEMCFG"], ) # Get cos/sin matrices for the current position of each user - rot_mats = rope_setup.get_rot_mats(current_pos) + rot_mats = tt_model.rope_setup.get_rot_mats(current_pos) # Run TT model tt_out = tt_model( diff --git a/models/demos/llama3/tests/test_llama_model_prefill.py b/models/demos/llama3/tests/test_llama_model_prefill.py index 934c91d5746c..e30c25cc8f47 100644 --- a/models/demos/llama3/tests/test_llama_model_prefill.py +++ b/models/demos/llama3/tests/test_llama_model_prefill.py @@ -93,7 +93,7 @@ def test_llama_model_inference( pcc = 0.91 # TODO Look on improving PCC else: # performance mode assert optimizations == LlamaOptimizations.performance - pcc = 0.87 # TODO Look on improving PCC + pcc = 0.869 # TODO Look on improving PCC mesh_device.enable_async(True) @@ -143,17 +143,6 @@ def test_llama_model_inference( # pre-compute the rotational embedding matrix and send to device rot_mats = 
get_prefill_rot_mat(model_args.head_dim, model_args.max_seq_len, mesh_device, seq_len=seq_len) - transformation_mat_torch = get_rot_transformation_mat(model_args.head_dim) - transformation_mats_prefill = ttnn.as_tensor( - transformation_mat_torch, - dtype=ttnn.bfloat16, - layout=ttnn.TILE_LAYOUT, - device=mesh_device, - memory_config=ttnn.DRAM_MEMORY_CONFIG, - mesh_mapper=ttnn.ReplicateTensorToMesh(mesh_device), - ) - transformation_mats = {"prefill": transformation_mats_prefill} - # Setup page table page_table_tt = None paged_attention_config = None @@ -185,7 +174,6 @@ def test_llama_model_inference( dtype=dtype, state_dict=state_dict, weight_cache_path=model_args.weight_cache_path(dtype), - transformation_mats=transformation_mats, paged_attention_config=paged_attention_config, ) @@ -200,7 +188,7 @@ def test_llama_model_inference( tt_prefill_input = pt_prefill_input - tt_prefill_input = model_args.prepare_inputs_ttnn_prefill( + tt_prefill_input = model_args.prepare_residual_tensor_prefill( pt_prefill_input, ) for i in range(1): diff --git a/models/demos/llama3/tt/multimodal/vision_generator.py b/models/demos/llama3/tt/generator.py similarity index 59% rename from models/demos/llama3/tt/multimodal/vision_generator.py rename to models/demos/llama3/tt/generator.py index b00fbf3ff739..c42450e48d3f 100644 --- a/models/demos/llama3/tt/multimodal/vision_generator.py +++ b/models/demos/llama3/tt/generator.py @@ -1,8 +1,10 @@ # SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
# SPDX-License-Identifier: Apache-2.0 + import ttnn import torch +from loguru import logger from llama_models.llama3.api.datatypes import ( InterleavedTextMedia, @@ -15,10 +17,16 @@ TokenResult, sample_top_p, ) +from models.demos.llama3.tt.llama_common import ( + copy_host_to_device, + get_padded_prefill_len, + num_blocks_in_seq, + get_block_size, +) -class LlamaVision: - def __init__(self, model, model_args, mesh_device, vllm=False, tokenizer=None, formatter=None): +class LlamaGenerator: + def __init__(self, model, model_args, mesh_device, tokenizer=None, formatter=None): """ Creating a LlamaVision wrapper requires only a mesh_device and model_args. With model_args you have the checkpoint location, can specify max batch size @@ -32,10 +40,133 @@ def __init__(self, model, model_args, mesh_device, vllm=False, tokenizer=None, f self.model = model self.model_args = model_args self.mesh_device = mesh_device - self.vllm = vllm self.tokenizer = tokenizer self.formatter = formatter + def prefill_forward_text(self, tokens: torch.Tensor, page_table=None, kv_cache=None, prompt_lens=None): + batch, batch_seq_len = tokens.shape + output_logits = torch.zeros(batch, 1, self.model_args.vocab_size) + prompt_lens = prompt_lens if prompt_lens is not None else torch.tensor([batch_seq_len] * batch) + + if page_table is not None: + assert isinstance( + page_table, torch.Tensor + ), "page_table must be a torch.Tensor when passing into prefill_forward" + + for user_id in range(batch): + seq_len = prompt_lens[user_id] + last_token_idx = seq_len - 1 + + prefill_seq_len = get_padded_prefill_len(seq_len) + prefill_ids = torch.cat( + [tokens[user_id : user_id + 1, :seq_len], torch.zeros(1, prefill_seq_len - seq_len).long()], dim=-1 + ) + if page_table is not None: + page_table_user = self._get_prefill_user_page_table(page_table, kv_cache, seq_len) + + logits = self.prefill_forward_single_user_text( + prefill_ids, + page_table=page_table_user if page_table is not None else None, + 
user_id=user_id, + last_token_idx=last_token_idx, + kv_cache=kv_cache, + ) + + # Since we give unpadded_seq_len, only the tile containing the last token is returned + output_logits[user_id] = logits + + return output_logits + + def prefill_forward_single_user_text(self, tokens, page_table, user_id, last_token_idx, kv_cache=None): + prefill_input, rot_mats_prefill, page_table_tt = self.model.prepare_inputs_prefill( + tokens, + page_table=page_table, + ) + + tt_logits = self.model.ttnn_prefill_forward( + prefill_input, + rot_mats=rot_mats_prefill, + user_id=user_id, + page_table=page_table_tt, + get_last_token=(last_token_idx // 32) * 32, + ) + + logits = self.model.process_output_prefill(tt_logits, last_token_idx=(last_token_idx % 32)) + + return logits + + def decode_forward_text( + self, + tokens, + current_pos, + page_table=None, + ): + """ + Performs text decode step. + Returns logits + """ + tt_tokens, tt_current_pos, tt_rot_mats, tt_page_table = self.model.prepare_inputs_decode( + tokens, current_pos, page_table + ) + + tt_logits = self.model.ttnn_decode_forward( + tt_tokens, + tt_current_pos, + rot_mats=tt_rot_mats, + page_table=tt_page_table, + ) + + logits = self.model.process_output_decode(tt_logits) + return logits + + def capture_trace_text( + self, + tokens, + current_pos, + page_table=None, + ): + """ + Captures a trace for the decode_forward method. 
+ """ + + # Compile run + self.decode_forward_text(tokens, current_pos, page_table) + + # Get inputs ready for trace run + host_inputs = self.model.prepare_decode_inputs_host(tokens, current_pos, page_table) + + device_inputs = copy_host_to_device(host_inputs, mesh_device=self.mesh_device) + + trace_id = ttnn.begin_trace_capture(self.mesh_device, cq_id=0) + transformed_inputs = self.model.transform_decode_inputs_device(*device_inputs) + tt_out_trace = self.model.ttnn_decode_forward(*transformed_inputs) + + ttnn.end_trace_capture(self.mesh_device, trace_id, cq_id=0) + + return trace_id, tt_out_trace, *device_inputs + + def decode_forward_trace_text( + self, + trace_id, + device_inputs, + tt_out_trace, + tokens, + current_pos, + page_table=None, + ): + host_inputs = self.model.prepare_decode_inputs_host(tokens, current_pos, page_table) + + device_inputs = copy_host_to_device( + host_tensors=host_inputs, + device_tensors=device_inputs, + ) + + ttnn.execute_trace(self.mesh_device, trace_id, cq_id=0, blocking=False) + + logits = self.model.process_output_decode(tt_out_trace) + + return logits + def prefill_forward_single_user( self, vision_images, @@ -45,28 +176,37 @@ def prefill_forward_single_user( user_id, total_len, prefill_len, + page_table=None, + kv_cache=None, ): """ Performs vision encode step then text prefill. 
Returns (xattn_caches, cross_attention_masks, full_text_row_masked_out_mask, logits) """ B = tokens.shape[0] + last_token_idx = prefill_len - 1 vision_tokens, cross_attention_masks, full_text_row_masked_out_mask = self.model.compute_vision_tokens_masks( batch_images=[vision_images], batch_masks=[vision_mask], total_len=total_len, ) + if page_table is not None: + page_table = self._get_prefill_user_page_table(page_table, kv_cache, prefill_len) + ( tt_h, tt_xattn_mask, tt_full_text_mask_expand_1NSH, tt_full_text_mask_expand_11SD, - tt_position_id, rot_mats, - transformation_mats, + tt_page_table, ) = self.model.prepare_inputs_prefill( - tokens, cross_attention_masks, full_text_row_masked_out_mask, prefill_len=prefill_len + tokens, + cross_attention_masks, + full_text_row_masked_out_mask, + prefill_len=prefill_len, + page_table=page_table, ) tt_logits = self.model.ttnn_prefill_forward( @@ -75,24 +215,76 @@ def prefill_forward_single_user( tt_full_text_mask_expand_1NSH, tt_full_text_mask_expand_11SD, xattn_caches, - tt_position_id, rot_mats, - transformation_mats, user_id, vision_tokens, + page_table=tt_page_table, + kv_cache=kv_cache, + get_last_token=(last_token_idx // 32) * 32, ) - logits = self.model.process_output_prefill(tt_logits, B, prefill_len) + del tt_page_table + + logits = self.model.process_output_prefill(tt_logits, B, last_token_idx=(last_token_idx % 32)) return xattn_caches, cross_attention_masks, full_text_row_masked_out_mask, logits + def prefill_forward( + self, + vision_images, + vision_masks, + tokens: torch.Tensor, + xattn_caches, + total_lens, + prompt_lens, + page_table=None, + kv_cache=None, + ): + """ + Batched version of prefill_forward_single_user for vision model. 
+ """ + batch, batch_seq_len = tokens.shape + output_logits = torch.zeros(batch, 1, self.model_args.vocab_size) + output_xattn_masks = [] + output_full_text_row_masked_out_masks = [] + + for user_id in range(batch): + print(f"Prefilling User {user_id}") + seq_len = prompt_lens[user_id] + ( + xattn_caches, + cross_attention_masks, + full_text_row_masked_out_mask, + logits, + ) = self.prefill_forward_single_user( + vision_images=vision_images[user_id], + vision_mask=vision_masks[user_id], + tokens=tokens[user_id : user_id + 1, :seq_len], # Keep batch dimension + xattn_caches=xattn_caches, + user_id=user_id, + total_len=total_lens[user_id], + prefill_len=seq_len, + page_table=page_table, + kv_cache=kv_cache, + ) + output_logits[user_id] = logits + output_xattn_masks.append(cross_attention_masks) + output_full_text_row_masked_out_masks.append(full_text_row_masked_out_mask) + + logger.info(f"Finished prefill for all users up to {batch_seq_len} tokens, Starting decode...") + + return output_logits, output_xattn_masks, output_full_text_row_masked_out_masks + def decode_forward( self, - position_id, + start_pos, tokens, cross_attention_masks, full_text_row_masked_out_mask, xattn_caches, + page_table=None, + kv_cache=None, + prompt_lens=None, ): """ Performs text decode step. @@ -101,19 +293,18 @@ def decode_forward( # forward_decode should be traced callable # decorator does compilation, capture, execute - # B = 1 # TODO: Only supports batch=1 right now! Might make tokens input a tensor. 
B, S = tokens.shape + assert S == 1 ( tt_h, tt_xattn_mask, tt_full_text_mask_expand_1NSH, - _, tt_position_id, - rot_mats, - _, + tt_rot_mats, + tt_page_table, ) = self.model.prepare_inputs_decode( - tokens, cross_attention_masks, full_text_row_masked_out_mask, position_id=position_id + tokens, cross_attention_masks, full_text_row_masked_out_mask, position_id=start_pos, page_table=page_table ) tt_logits = self.model.ttnn_decode_forward( @@ -122,7 +313,9 @@ def decode_forward( tt_full_text_mask_expand_1NSH, xattn_caches, tt_position_id, - rot_mats, + tt_rot_mats, + page_table=tt_page_table, + kv_cache=kv_cache, ) logits = self.model.process_output_decode(tt_logits, B, S) @@ -143,10 +336,9 @@ def capture_trace( tt_h, tt_xattn_mask, tt_full_text_mask_expand_1NSH, - _, tt_position_id, - rot_mats, - _, + tt_rot_mats, + tt_page_table, ) = self.model.prepare_inputs_decode( tokens, cross_attention_masks, full_text_row_masked_out_mask, position_id=position_id ) @@ -158,7 +350,7 @@ def capture_trace( tt_full_text_mask_expand_1NSH, xattn_caches, tt_position_id, - rot_mats, + tt_rot_mats, ) # Get inputs ready for trace run @@ -166,9 +358,8 @@ def capture_trace( tt_h, tt_xattn_mask, tt_full_text_mask_expand_1NSH, - _, tt_position_id, - rot_mats, + tt_rope_id, _, ) = self.model.prepare_decode_inputs_host( tokens, cross_attention_masks, full_text_row_masked_out_mask, position_id @@ -179,9 +370,10 @@ def capture_trace( tt_xattn_mask, tt_full_text_mask_expand_1NSH, tt_position_id, - rot_mats, - ) = self.model.copy_host_to_device( - (tt_h, tt_xattn_mask, tt_full_text_mask_expand_1NSH, tt_position_id, rot_mats) + tt_rope_id, + ) = copy_host_to_device( + (tt_h, tt_xattn_mask, tt_full_text_mask_expand_1NSH, tt_position_id, tt_rope_id), + mesh_device=self.mesh_device, ) trace_id = ttnn.begin_trace_capture(self.mesh_device, cq_id=0) @@ -189,36 +381,30 @@ def capture_trace( B = tokens.shape[0] # Do on-device transformations of inputs before forward ( - tt_h, + tt_h_transform, + 
tt_rot_mats, tt_xattn_mask_transform, tt_full_text_mask_expand_1NSH_transform, ) = self.model.transform_decode_inputs_device( tt_h, + tt_rope_id, tt_xattn_mask, tt_full_text_mask_expand_1NSH, B=B, ) tt_logits_rm = self.model.ttnn_decode_forward( - tt_h, + tt_h_transform, tt_xattn_mask_transform, tt_full_text_mask_expand_1NSH_transform, xattn_caches, tt_position_id, - rot_mats, + tt_rot_mats, ) ttnn.end_trace_capture(self.mesh_device, trace_id, cq_id=0) - return ( - trace_id, - tt_logits_rm, - tt_h_trace_input, - tt_xattn_mask, - tt_full_text_mask_expand_1NSH, - tt_position_id, - rot_mats, - ) + return trace_id, tt_logits_rm, tt_h, tt_xattn_mask, tt_full_text_mask_expand_1NSH, tt_position_id, tt_rope_id def decode_forward_trace( self, @@ -233,28 +419,27 @@ def decode_forward_trace( trace_xattn_mask, trace_full_text_mask_expand_1NSH, trace_position_id, - trace_rot_mats, + trace_rope_id, ): ( tt_h, tt_xattn_mask, tt_full_text_mask_expand_1NSH, - _, tt_position_id, - rot_mats, + tt_rope_id, _, ) = self.model.prepare_decode_inputs_host( tokens, cross_attention_masks, full_text_row_masked_out_mask, position_id=position_id ) - self.model.copy_host_to_device( - host_tensors=(tt_h, tt_xattn_mask, tt_full_text_mask_expand_1NSH, tt_position_id, rot_mats), + copy_host_to_device( + host_tensors=(tt_h, tt_xattn_mask, tt_full_text_mask_expand_1NSH, tt_position_id, tt_rope_id), device_tensors=( trace_h, trace_xattn_mask, trace_full_text_mask_expand_1NSH, trace_position_id, - trace_rot_mats, + trace_rope_id, ), ) @@ -284,7 +469,7 @@ def easy_trace( tt_xattn_mask, tt_full_text_mask_expand_1NSH, tt_position_id, - rot_mats, + tt_rope_id, ) = self.capture_trace( position_id, tokens, @@ -298,7 +483,7 @@ def easy_trace( "tt_xattn_mask": tt_xattn_mask, "tt_full_text_mask_expand_1NSH": tt_full_text_mask_expand_1NSH, "tt_position_id": tt_position_id, - "rot_mats": rot_mats, + "tt_rope_id": tt_rope_id, } self.trace_outputs = { "tt_logits_rm": tt_logits_rm, @@ -316,7 +501,7 @@ def easy_trace( 
self.trace_inputs["tt_xattn_mask"], self.trace_inputs["tt_full_text_mask_expand_1NSH"], self.trace_inputs["tt_position_id"], - self.trace_inputs["rot_mats"], + self.trace_inputs["tt_rope_id"], ) def generate( @@ -351,6 +536,8 @@ def generate( prefill_len=prefill_len, ) + logits = logits.view(1, 1, self.model_args.max_vocab_size) + def sample(logits): if temperature > 0: probs = torch.softmax(logits[:, -1] / temperature, dim=-1) @@ -368,14 +555,14 @@ def sample(logits): ) for gen_idx in range(max_gen_len - 1): - position_id = prefill_len + gen_idx + position_id = torch.tensor([prefill_len + gen_idx]) next_token_tensor = next_token.reshape(1, 1) # B, S logits = self.decode_forward( position_id, next_token_tensor, - cross_attention_masks, - full_text_row_masked_out_mask, + [cross_attention_masks], + [full_text_row_masked_out_mask], xattn_caches, ) @@ -442,3 +629,9 @@ def text_completion( generation = self.tokenizer.decode(tokens) return CompletionPrediction(generation=generation) + + def _get_prefill_user_page_table(self, page_table, kv_cache, prefill_len): + # Ensure page_table is not padded with extra blocks for paged_fill_cache to work properly + block_size = get_block_size(kv_cache) + num_blocks = num_blocks_in_seq(prefill_len, block_size) + return page_table[:, :num_blocks] diff --git a/models/demos/llama3/tt/generator_vllm.py b/models/demos/llama3/tt/generator_vllm.py new file mode 100644 index 000000000000..f962b2801b1a --- /dev/null +++ b/models/demos/llama3/tt/generator_vllm.py @@ -0,0 +1,77 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+ +# SPDX-License-Identifier: Apache-2.0 + +from typing import List, Union +import torch +import PIL +from llama_models.llama3.api.chat_format import create_vision_mask + +from models.demos.llama3.tt.generator import LlamaGenerator +from models.demos.llama3.demo.simple_vision_demo import create_multimodal_model + +from vllm.inputs import INPUT_REGISTRY, DecoderOnlyInputs, EncoderDecoderInputs, InputContext + + +def input_processor_for_mllama(ctx: InputContext, inputs: Union[DecoderOnlyInputs, EncoderDecoderInputs]): + """ + Based on vllm.model_executor.models.mllama.py::input_processor_for_mllama(). + Note that vLLM's input_processor_for_mllama performs additional processing to handle chunking which we do not yet support. + """ + + # Move encoder_prompt to prompt. If the user does not explicitly provide separate + # encoder and decoder prompts, vLLM by default will treat the prompt as the encoder prompt. + # For the block manager to allocate enough blocks and add them to the block table, the decoder prompt + # must contain the full text prompt. 
+ if inputs.get("prompt") is None: + inputs["prompt"] = inputs["encoder_prompt"] + inputs["prompt_token_ids"] = inputs["encoder_prompt_token_ids"] + + return inputs + + +@INPUT_REGISTRY.register_input_processor(input_processor_for_mllama) +class TtMllamaForConditionalGeneration(LlamaGenerator): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + self.MLLAMA_IMAGE_TOKEN_ID = 128256 + self.max_gen_len = self.model_args.max_seq_len - 1 # TODO: double check what this should be + + @classmethod + def initialize_vllm_model(cls, hf_config, mesh_device, max_batch_size): + max_seq_len = 512 # TODO: Increase to 131072 once it's verified to work + model_args, model = create_multimodal_model(mesh_device, max_batch_size, max_seq_len, use_paged_kv_cache=True) + return cls(model, model_args, mesh_device) + + @property + def cache_path(self): + return self.model_args.model_cache_path + + def prefill_forward( + self, + tokens: torch.Tensor, + images: List[PIL.Image.Image], + xattn_caches, + start_pos, + page_table: torch.Tensor = None, + kv_cache=None, + prompt_lens=None, + ): + """ + Replaces prefill_forward from LlamaGenerator with a version that supports mask creation. 
+ """ + batch = tokens.shape[0] + + vision_images = [] + vision_masks = [] + total_lens = [] + for user_id in range(batch): + vision_images.append([images[user_id]]) + prompt_tokens = [int(tokens[user_id, i]) for i in range(prompt_lens[user_id])] + vision_masks.append(create_vision_mask(prompt_tokens, self.MLLAMA_IMAGE_TOKEN_ID)) + total_lens.append(prompt_lens[user_id] + self.max_gen_len) + + return super().prefill_forward( + vision_images, vision_masks, tokens, xattn_caches, total_lens, prompt_lens, page_table, kv_cache + ) diff --git a/models/demos/llama3/tt/llama_attention.py b/models/demos/llama3/tt/llama_attention.py index a925044554ac..b93438e469de 100644 --- a/models/demos/llama3/tt/llama_attention.py +++ b/models/demos/llama3/tt/llama_attention.py @@ -23,6 +23,7 @@ def __init__( transformation_mats, configuration, paged_attention_config=None, + use_paged_kv_cache=False, ): super().__init__() @@ -56,6 +57,7 @@ def __init__( self.ccl_topology = configuration.ccl_topology() self.is_multichip = configuration.is_multichip + self.layer_num = layer_num layer_name = configuration.get_state_dict_prefix(self.__class__.__name__, layer_num) if configuration.dummy_weights or (weight_cache_path is None): cache_name = lambda _: None @@ -144,6 +146,17 @@ def __init__( cache_file_name=cache_name("wo_height_sharded"), ) + if not use_paged_kv_cache: + # vLLM provides its own kv cache + self.init_kv_cache(configuration, weight_cache_path) + + self.scale = self.head_dim**-0.5 + + def init_kv_cache(self, configuration, weight_cache_path): + """ + Generates empty KV cache and pushed to device memory + """ + if self.paged_attention_config: cache_k = torch.zeros( ( @@ -194,14 +207,13 @@ def __init__( for k_or_v in [cache_k, cache_v] ] - self.scale = self.head_dim**-0.5 - def forward_decode( self, x: ttnn.Tensor, current_pos, rot_mats=None, page_table=None, + kv_cache=None, ) -> ttnn.Tensor: """ x: (seq_len, 1, batch, dim) @@ -262,8 +274,12 @@ def forward_decode( ### # KV update 
### - keys = self.layer_past[0] - values = self.layer_past[1] + if kv_cache: + keys = kv_cache[self.layer_num][0] + values = kv_cache[self.layer_num][1] + else: + keys = self.layer_past[0] + values = self.layer_past[1] # k_heads, [seqlen, n_kv_heads, bsz, head_dim] # v_heads [seqlen, n_kv_heads, bsz, head_dim] # keys, [max_batch_size, n_kv_heads // configuration.num_devices, max_seq_len, head_dim] @@ -272,9 +288,6 @@ def forward_decode( values, v_heads_1BKD, update_idxs_tensor=current_pos, page_table=page_table ) - self.layer_past[0] = keys - self.layer_past[1] = values - ttnn.deallocate(k_heads_1BKD) ttnn.deallocate(v_heads_1BKD) @@ -362,7 +375,7 @@ def forward_decode( dense_out_sharded = ttnn.to_memory_config(dense_out_sharded, self.model_config["DECODE_RESIDUAL_MEMCFG"]) return dense_out_sharded - def forward_prefill(self, x_11SH, rot_mats, user_id: int = 0, page_table=None): + def forward_prefill(self, x_11SH, rot_mats, user_id: int = 0, page_table=None, kv_cache=None): seq_len = x_11SH.shape[-2] assert seq_len % 128 == 0 and seq_len > 0, "Seqlen must be divisible by 128" ### @@ -425,7 +438,10 @@ def forward_prefill(self, x_11SH, rot_mats, user_id: int = 0, page_table=None): ttnn.deallocate(k_heads_1KSD_pre_rot) # Fill KV-Cache - keys_BKSD, values_BKSD = self.layer_past[0], self.layer_past[1] + if kv_cache: + keys_BKSD, values_BKSD = kv_cache[self.layer_num][0], kv_cache[self.layer_num][1] + else: + keys_BKSD, values_BKSD = self.layer_past[0], self.layer_past[1] k_heads_1KSD_8b = ttnn.typecast(k_heads_1KSD, dtype=ttnn.bfloat8_b) v_heads_1VSD_8b = ttnn.typecast(v_heads_1VSD, dtype=ttnn.bfloat8_b) @@ -451,8 +467,14 @@ def forward_prefill(self, x_11SH, rot_mats, user_id: int = 0, page_table=None): ttnn.deallocate(v_heads_1VSD) if page_table: - ttnn.experimental.paged_fill_cache(keys_BKSD, k_fill, page_table, batch_idx=user_id) - ttnn.experimental.paged_fill_cache(values_BKSD, v_fill, page_table, batch_idx=user_id) + # In the case that the tokens have been padded 
along the seq len dimension, we need to fill the cache with the unpadded k/v values. + # Assume that the page table does not have padding, so we can use it to get the unpadded page len. + block_size = keys_BKSD.shape[2] + page_len = page_table.shape[1] * block_size + k_fill_sliced = k_fill[:, :, :page_len, :] if page_len < k_fill.shape[2] else k_fill + v_fill_sliced = v_fill[:, :, :page_len, :] if page_len < v_fill.shape[2] else v_fill + ttnn.experimental.paged_fill_cache(keys_BKSD, k_fill_sliced, page_table, batch_idx=user_id) + ttnn.experimental.paged_fill_cache(values_BKSD, v_fill_sliced, page_table, batch_idx=user_id) else: ttnn.fill_cache( keys_BKSD, @@ -469,8 +491,6 @@ def forward_prefill(self, x_11SH, rot_mats, user_id: int = 0, page_table=None): ttnn.deallocate(k_fill) ttnn.deallocate(v_fill) - self.layer_past = [keys_BKSD, values_BKSD] - # SDPA # reshaping to put group in batch dim to do sdpa on 8 MQAs in parallel @@ -550,8 +570,17 @@ def forward_prefill(self, x_11SH, rot_mats, user_id: int = 0, page_table=None): else: return output_11SH - def forward(self, x, current_pos, rot_mats=None, user_id=0, mode="decode", page_table=None): + def forward( + self, + x, + current_pos, + rot_mats=None, + user_id=0, + mode="decode", + page_table=None, + kv_cache=None, + ): if mode == "prefill": - return self.forward_prefill(x, rot_mats, user_id, page_table) + return self.forward_prefill(x, rot_mats, user_id, page_table=page_table, kv_cache=kv_cache) else: - return self.forward_decode(x, current_pos, rot_mats, page_table) + return self.forward_decode(x, current_pos, rot_mats, page_table=page_table, kv_cache=kv_cache) diff --git a/models/demos/llama3/tt/llama_common.py b/models/demos/llama3/tt/llama_common.py index b9b5484cb899..fd7f368557f4 100644 --- a/models/demos/llama3/tt/llama_common.py +++ b/models/demos/llama3/tt/llama_common.py @@ -209,6 +209,27 @@ def num_to_core_range_set(x): ) +def copy_host_to_device(host_tensors, device_tensors=None, mesh_device=None): + """ 
+ Helper function which copies host tensors to device tensors. + If no device_tensors are provided, it creates new device tensors and returns them. + """ + if device_tensors is None: + assert mesh_device is not None, "mesh_device is required when device_tensors is None" + ret = [] + for i in range(len(host_tensors)): + on_device = ttnn.to_device(host_tensors[i], device=mesh_device) if host_tensors[i] else None + ret.append(on_device) + return ret + else: + for i in range(len(host_tensors)): + if host_tensors[i] is None: + assert device_tensors[i] is None + continue + ttnn.copy_host_to_device_tensor(host_tensors[i], device_tensors[i]) + return device_tensors + + def calculate_hidden_dim(dim, ffn_dim_multiplier, multiple_of): """Helper function based on logic used in reference model: https://github.com/meta-llama/llama-models/blob/e4a6ed52a142bb9b5106dcbf48e41f97f8e7378e/models/llama3/reference_impl/model.py#L227C7-L231C83 @@ -295,3 +316,29 @@ def sample_host(tt_input, mesh_device, temperature=0.6, top_p=0.08, on_host=True ), pt_out, ) + + +def get_padded_prefill_len(seq_len): + """ + If seq_len is less than 32, pad to 32 + If seq_len is more than 32, pad to whichever is smaller: a power of 2 or a multiple of 1024 + TODO: Generalize for max_mm_seq_len different from 1024 + """ + if seq_len <= 32: + return 32 + pow_2_pad = nearest_pow_2(seq_len) + mult_1024_pad = 1024 * math.ceil(seq_len / 1024) + min_extended_pad = min(pow_2_pad, mult_1024_pad) + return min_extended_pad + + +def get_block_size(kv_cache): + return kv_cache[0][0].shape[2] + + +def num_blocks_in_seq(seq_len, block_size): + return math.ceil(seq_len / block_size) + + +def nearest_pow_2(x): + return 2 ** math.ceil(math.log2(x)) diff --git a/models/demos/llama3/tt/llama_decoder.py b/models/demos/llama3/tt/llama_decoder.py index e5edfce889a6..ad1bdf9b59ab 100644 --- a/models/demos/llama3/tt/llama_decoder.py +++ b/models/demos/llama3/tt/llama_decoder.py @@ -20,6 +20,7 @@ def __init__( weight_cache_path, 
transformation_mats, paged_attention_config=None, + use_paged_kv_cache=False, ): super().__init__() @@ -48,6 +49,7 @@ def __init__( transformation_mats=transformation_mats, configuration=args, paged_attention_config=paged_attention_config, + use_paged_kv_cache=use_paged_kv_cache, ) self.feed_forward = TtLlamaMLP( mesh_device=mesh_device, @@ -97,6 +99,7 @@ def forward( user_id=0, mode="decode", page_table=None, + kv_cache=None, ) -> ttnn.Tensor: # x is fractured across devices and interleaved in DRAM (for prefill) and sharded in L1 (for decode) skip_mem_cfg = self.model_config["DECODE_RESIDUAL_MEMCFG"] if mode == "decode" else ttnn.DRAM_MEMORY_CONFIG @@ -112,7 +115,8 @@ def forward( rot_mats, user_id, mode, - page_table, + page_table=page_table, + kv_cache=kv_cache, ) # Here x and attn_out are both fractured across devices h = ttnn.add(x, attn_out, memory_config=skip_mem_cfg) diff --git a/models/demos/llama3/tt/llama_embedding.py b/models/demos/llama3/tt/llama_embedding.py index 89b6fb1b3f04..6259c17619f3 100644 --- a/models/demos/llama3/tt/llama_embedding.py +++ b/models/demos/llama3/tt/llama_embedding.py @@ -23,7 +23,7 @@ def __init__( base_name = args.get_state_dict_prefix("", None) + "tok_embeddings.weight" torch_weight = self.state_dict[base_name].unsqueeze(0).unsqueeze(0) - cache_name = weight_cache_path / base_name + cache_name = None if args.dummy_weights else weight_cache_path / base_name self.weights = ttnn.as_tensor( torch_weight, dtype=dtype, diff --git a/models/demos/llama3/tt/llama_model.py b/models/demos/llama3/tt/llama_model.py index e04ed2c4cf8c..9c55182115f8 100644 --- a/models/demos/llama3/tt/llama_model.py +++ b/models/demos/llama3/tt/llama_model.py @@ -14,6 +14,9 @@ from models.common.lightweightmodule import LightweightModule from models.demos.llama3.tt.distributed_norm import DistributedNorm from models.demos.llama3.tt.lm_head import LMHead +from models.demos.llama3.tt.llama_common import copy_host_to_device, get_prefill_rot_mat, HostEmbedding 
+from models.demos.llama3.tt.llama_rope import TtLlamaRotarySetup +from models.demos.llama3.tt.llama_embedding import TtLlamaEmbedding class TtTransformer(LightweightModule): @@ -24,7 +27,6 @@ def __init__( mesh_device, state_dict, weight_cache_path, - transformation_mats, paged_attention_config=None, ): super().__init__() @@ -38,6 +40,24 @@ def __init__( self.grid_size = self.args.max_grid_size state_dict_prefix = args.get_state_dict_prefix("", None) + self.embd = TtLlamaEmbedding( + mesh_device=mesh_device, + args=args, + weight_cache_path=args.weight_cache_path(dtype), + state_dict=state_dict, + dtype=ttnn.bfloat16, # Row major layout requires bfloat16 + ) + + self.rope_setup = TtLlamaRotarySetup( + mesh_device, + args.max_batch_size, + args.head_dim, + args.max_seq_len, + args.rope_theta, + args.use_scaled_rope, + ) + self.trans_mats_dict = self.rope_setup.get_both_trans_mats() + self.layers = [ TtTransformerBlock( args=args, @@ -46,7 +66,7 @@ def __init__( state_dict=state_dict, weight_cache_path=weight_cache_path, layer_num=i, - transformation_mats=transformation_mats, + transformation_mats=self.trans_mats_dict, paged_attention_config=paged_attention_config, ) for i in range(self.n_layers) @@ -76,6 +96,167 @@ def __init__( weight_cache_path=weight_cache_path, ) + def prepare_inputs_prefill(self, tokens, page_table=None): + """ + Inputs are torch tensors or python types. This function returns ttnn + tensors on device. 
+ TODO: Debate whether this function is responsible for padding + """ + + tokens = tokens.reshape(1, 1, 1, -1) + S = tokens.shape[-1] + + tokens = ttnn.from_torch( + tokens, + device=self.mesh_device, + dtype=ttnn.uint32, + layout=ttnn.ROW_MAJOR_LAYOUT, + mesh_mapper=ttnn.ReplicateTensorToMesh(self.mesh_device), + ) + + tokens_embd = self.embd(tokens) + + tt_rot_mats_prefill = get_prefill_rot_mat( + self.args.head_dim, self.args.max_seq_len, self.mesh_device, seq_len=S + ) + + if page_table is not None: + tt_page_table = ttnn.from_torch( + page_table, + device=self.mesh_device, + dtype=ttnn.int32, + layout=ttnn.ROW_MAJOR_LAYOUT, + mesh_mapper=ttnn.ReplicateTensorToMesh(self.mesh_device), + ) + else: + tt_page_table = None + + return tokens_embd, tt_rot_mats_prefill, tt_page_table + + def prepare_inputs_decode(self, *inputs): + """ + Inputs are torch tensors or python types. This function returns ttnn + tensors on device. + Its implementation can take advantage of a few other functions which the + model must implement. + """ + host_inputs = self.prepare_decode_inputs_host(*inputs) + device_inputs = copy_host_to_device(host_inputs, mesh_device=self.mesh_device) # Helper function + transformed_device_inputs = self.transform_decode_inputs_device(*device_inputs) + return transformed_device_inputs + + def prepare_decode_inputs_host(self, tokens, current_pos, page_table=None): + """ + Inputs are torch tensors or python types. Outputs are ttnn tensors on host. 
+ NOTE: Tokens and current_pos are padded to batch + """ + B = tokens.shape[-1] + assert current_pos.shape[0] == B, "Batch size mismatch" + assert B == self.args.max_batch_size, "Batch size must be equal to max_batch_size" + + tokens = ttnn.from_torch( + tokens, + device=None, + dtype=ttnn.uint32, + mesh_mapper=ttnn.ReplicateTensorToMesh(self.mesh_device), + ) + + rope_idxs = self.rope_setup.get_rot_idxs(current_pos, on_host=True) + current_pos_tt = ttnn.from_torch( + current_pos, + device=None, + dtype=ttnn.int32, + mesh_mapper=ttnn.ReplicateTensorToMesh(self.mesh_device), + ) + + if page_table is not None: + page_table = ttnn.from_torch( + page_table, + device=None, + dtype=ttnn.int32, + mesh_mapper=ttnn.ReplicateTensorToMesh(self.mesh_device), + ) + return tokens, current_pos_tt, rope_idxs, page_table + + def transform_decode_inputs_device(self, tokens, current_pos, rope_idxs, page_table=None): + """ + Inputs are ttnn tensors on device. This function applies any on-device + transformations which should happen before forward decode. + For example: tilize, reshape, shard. + Return transformed device tensors + + Get rope sin/cos + Embed tokens + """ + tt_rot_mats = self.rope_setup.get_rot_mats(rope_idxs) + tt_tokens = self.embd(tokens) + tt_tokens = ttnn.unsqueeze_to_4D(tt_tokens) + return tt_tokens, current_pos, tt_rot_mats, page_table + + def process_output_prefill(self, tt_out, last_token_idx): + """ + Input is ttnn device tensor of logits. Output is torch logits tensor. + NOTE: In this model, prefill always uses get_last_token + """ + logits = ttnn.to_torch(tt_out, mesh_composer=ttnn.ConcatMeshToTensor(self.mesh_device, -1))[ + 0, 0, last_token_idx, : + ] + return logits + + def process_output_decode(self, tt_out): + """ + Input is ttnn device tensor of logits. 
Output is torch logits tensor + """ + if self.args.num_devices > 1: + tt_out = ttnn.all_gather(tt_out, dim=3, num_links=1, topology=ttnn.Topology.Linear) + tt_out_rm = ttnn.untilize(tt_out, use_multicore=True) + if self.args.num_devices > 1: + return ttnn.to_torch(ttnn.get_device_tensors(tt_out_rm)[0]).float() + else: + return ttnn.to_torch(tt_out_rm).float() + + def ttnn_prefill_forward( + self, + x, + rot_mats, + user_id, + page_table=None, + get_last_token=-1, + ): + """ + This method will take device tensors and any other args to run forward. + It returns ttnn device tensors. + """ + return self.forward( + x, + current_pos=None, + rot_mats=rot_mats, + transformation_mats=None, + user_id=user_id, + mode="prefill", + page_table=page_table, + get_last_token=get_last_token, + ) + + def ttnn_decode_forward( + self, + x, + current_pos, + rot_mats, + page_table=None, + ): + """ + This method will take device tensors and any other args to run forward. + It returns ttnn device tensors. + """ + return self.forward( + x, + current_pos, + rot_mats=rot_mats, + mode="decode", + page_table=page_table, + ) + def forward( self, x: ttnn.Tensor, diff --git a/models/demos/llama3/tt/llama_rope.py b/models/demos/llama3/tt/llama_rope.py index 576ce982e8c5..c1b982308bc3 100644 --- a/models/demos/llama3/tt/llama_rope.py +++ b/models/demos/llama3/tt/llama_rope.py @@ -87,9 +87,21 @@ def __init__( mesh_mapper=ReplicateTensorToMesh(device) if self.is_mesh_device else None, ) - def get_trans_mats(self): + # TODO: Colman, should this be TILE_SIZE or head_dim? Why should it be different for prefill and decode? 
+ prefill_trans_mat_torch = get_rot_transformation_mat(dhead=head_dim) + self.transformation_mat_prefill = ttnn.from_torch( + prefill_trans_mat_torch, + device=device, + layout=ttnn.TILE_LAYOUT, + dtype=datatype, + memory_config=ttnn.DRAM_MEMORY_CONFIG, + mesh_mapper=ReplicateTensorToMesh(device) if self.is_mesh_device else None, + ) + + def get_both_trans_mats(self): assert self.transformation_mat is not None, "Transformation matrix not initialized" - return self.transformation_mat + assert self.transformation_mat_prefill is not None, "Prefill Transformation matrix not initialized" + return {"decode": self.transformation_mat, "prefill": self.transformation_mat_prefill} def get_rot_idxs(self, position_idxs, on_host=False): assert isinstance(position_idxs, torch.Tensor), "Position ids must be a torch tensor" diff --git a/models/demos/llama3/tt/model_config.py b/models/demos/llama3/tt/model_config.py index c3f9f385e7ac..4ddb684fe9c5 100644 --- a/models/demos/llama3/tt/model_config.py +++ b/models/demos/llama3/tt/model_config.py @@ -563,7 +563,7 @@ def find_largest_divisor(n, max_divisor=8): fuse_batch=False, ) self.model_config["VISION_XATTN_DENSE_PROGCFG"] = lambda seq_len: self.matmul_config( - m=seq_len, + m=min(seq_len, 1024), k=self.dim // self.num_devices, n=self.dim, grid_size=(8, 8), @@ -589,23 +589,21 @@ def find_largest_divisor(n, max_divisor=8): fuse_batch=seq_len <= max_seq, ) - xattn_cache_y_cores = ( - 16 // self.num_devices - ) # Based on seqlen, this formula gives us a valid number of y cores - xattn_cache_x_cores = 8 - self.model_config["XATTN_KV_PREFILL_MEM_CFG"] = lambda seq_len: ttnn.create_sharded_memory_config( - # using n_heads since xattn repeats KV to match Q - ( - nearest_32( - (self.n_heads // self.num_devices) * seq_len // (xattn_cache_y_cores * xattn_cache_x_cores) + def _get_xattn_kv_prefill_mem_cfg(seq_len): + M = (self.n_kv_heads // self.num_devices) * seq_len + cores_x, cores_y = self.find_grid(M // self.tile_size) + return 
ttnn.create_sharded_memory_config( + ( + nearest_32(M // (cores_x * cores_y)), + self.head_dim, ), - self.head_dim, - ), - ttnn.CoreGrid(y=xattn_cache_y_cores, x=xattn_cache_x_cores), - ttnn.ShardStrategy.HEIGHT, - ttnn.ShardOrientation.ROW_MAJOR, - use_height_and_width_as_shard_shape=True, - ) + ttnn.CoreGrid(y=cores_y, x=cores_x), + ttnn.ShardStrategy.HEIGHT, + ttnn.ShardOrientation.ROW_MAJOR, + use_height_and_width_as_shard_shape=True, + ) + + self.model_config["XATTN_KV_PREFILL_MEM_CFG"] = _get_xattn_kv_prefill_mem_cfg self.VISION_MAX_MM_SEQ = nearest_32(self.vision_chunk_ntok) # RMS NORM @@ -648,7 +646,7 @@ def ccl_topology(self): return ttnn.Topology.Linear return None - def prepare_inputs_ttnn_decode(self, x, input_mem_cfg, force_replicated=False, on_host=False): + def prepare_residual_tensor_decode(self, x, input_mem_cfg, force_replicated=False, on_host=False): """ Prepare inputs for decode mode. x: (batch, seq, dim) @@ -698,7 +696,7 @@ def prepare_inputs_ttnn_decode(self, x, input_mem_cfg, force_replicated=False, o x = ttnn.to_layout(x, layout=ttnn.TILE_LAYOUT) return x - def prepare_inputs_ttnn_prefill(self, x_bsh, force_replicated=False): + def prepare_residual_tensor_prefill(self, x_bsh, force_replicated=False): """ Prepare inputs for prefill mode. 
x: (batch, seq, hidden_dim) diff --git a/models/demos/llama3/tt/multimodal/llama_cross_attention.py b/models/demos/llama3/tt/multimodal/llama_cross_attention.py index d7032fd59ba9..5aa338a012b9 100644 --- a/models/demos/llama3/tt/multimodal/llama_cross_attention.py +++ b/models/demos/llama3/tt/multimodal/llama_cross_attention.py @@ -187,12 +187,7 @@ def compute_xattn_kv_cache(self, xattn_tokens, user_id, xattn_cache): xk = self.k_norm(xk, mode="decode") - # NOTE: Doing repeat in xattn_cache generation to avoid massive overhead in forward - xk = ttnn.repeat_interleave(xk, self.n_local_heads // self.n_local_kv_heads, dim=1) - xv = ttnn.repeat_interleave(xv, self.n_local_heads // self.n_local_kv_heads, dim=1) - k_cache, v_cache = xattn_cache - # Work around fill_cache memory constraint by making these sharded k_fill = ttnn.interleaved_to_sharded(xk, self.model_config["XATTN_KV_PREFILL_MEM_CFG"](seqlen_y)) v_fill = ttnn.interleaved_to_sharded(xv, self.model_config["XATTN_KV_PREFILL_MEM_CFG"](seqlen_y)) @@ -312,27 +307,22 @@ def forward_prefill( xq = self.q_norm(xq, mode="prefill") - scores = ttnn.matmul( - xq, - ttnn.transpose(k_cache_user, -1, -2), - dtype=ttnn.bfloat16, - memory_config=ttnn.DRAM_MEMORY_CONFIG, - compute_kernel_config=self.compute_kernel_config_hifi2, - program_config=self.model_config["VISION_XATTN_SCORE_PROGCFG"](seq_len, cache_seq_len), + program_config = ttnn.SDPAProgramConfig( + compute_with_storage_grid_size=self.mesh_device.compute_with_storage_grid_size(), + q_chunk_size=128, + k_chunk_size=128, + exp_approx_mode=False, ) - scores = ttnn.multiply(scores, self.scale) - # WARNING: This add is buggy if xattn_mask has to be broadcasted to n_local_heads. 
Workaround is to broadcast on host side - scores = ttnn.add(scores, xattn_mask) - scores = ttnn.softmax(scores, dim=-1, numeric_stable=True) - - output = ttnn.matmul( - scores, + output = ttnn.transformer.scaled_dot_product_attention( + xq, + k_cache_user, v_cache_user, - dtype=ttnn.bfloat16, - memory_config=ttnn.DRAM_MEMORY_CONFIG, - compute_kernel_config=self.compute_kernel_config_hifi4, - program_config=self.model_config["VISION_XATTN_OUTPUT_PROGCFG"](seq_len, cache_seq_len), + is_causal=False, + attn_mask=xattn_mask, + scale=self.scale, + program_config=program_config, + compute_kernel_config=self.compute_kernel_config_sdpa, ) # WARNING: this broadcast is also broken, must broadcast on host diff --git a/models/demos/llama3/tt/multimodal/llama_cross_attention_transformer_text.py b/models/demos/llama3/tt/multimodal/llama_cross_attention_transformer_text.py index f657abb86726..a7ce9def430e 100644 --- a/models/demos/llama3/tt/multimodal/llama_cross_attention_transformer_text.py +++ b/models/demos/llama3/tt/multimodal/llama_cross_attention_transformer_text.py @@ -5,12 +5,15 @@ import math import ttnn import torch +from tqdm import tqdm from models.demos.llama3.tt.llama_decoder import TtTransformerBlock from models.demos.llama3.tt.multimodal.llama_cross_block import TtLlamaCrossAttentionTransformerBlock from models.demos.llama3.tt.distributed_norm import DistributedNorm from models.common.rmsnorm import RMSNorm import ttnn from models.common.lightweightmodule import LightweightModule +from models.demos.llama3.tt.llama_embedding import TtLlamaEmbedding +from models.demos.llama3.tt.llama_rope import TtLlamaRotarySetup from models.utility_functions import ( nearest_32, @@ -41,6 +44,7 @@ def __init__( weight_cache_path, dtype, configuration, + use_paged_kv_cache=False, ): super().__init__() self.vocab_size = configuration.vocab_size @@ -116,12 +120,31 @@ def __init__( self.num_frozen_embeddings = self.tok_embeddings.num_embeddings self._thresh = 
self.num_frozen_embeddings - 1 + self.rope_setup = TtLlamaRotarySetup( + mesh_device, + configuration.max_batch_size, + configuration.head_dim, + configuration.max_seq_len, + configuration.rope_theta, + configuration.use_scaled_rope, + ) + self.trans_mats_dict = self.rope_setup.get_both_trans_mats() + # transformer blocks self.layers = [] self.cross_attention_layers = [] - for i in range(configuration.n_layers): + for i in tqdm(range(configuration.n_layers), desc="Loading text transformer layers"): layer_id = i - block = TtTransformerBlock(configuration, mesh_device, dtype, state_dict, layer_id, weight_cache_path) + block = TtTransformerBlock( + configuration, + mesh_device, + dtype, + state_dict, + layer_id, + weight_cache_path, + transformation_mats=self.trans_mats_dict, + use_paged_kv_cache=use_paged_kv_cache, + ) self.layers.append(block) if layer_id in self.fusion_schedule: xa_layer_id = self.fusion_schedule.index(layer_id) @@ -224,7 +247,7 @@ def setup_cache(self, max_batch_size): [ ttnn.from_torch( torch.zeros( - max_batch_size, self.configuration.n_heads, vision_seq_len, self.configuration.head_dim + max_batch_size, self.configuration.n_kv_heads, vision_seq_len, self.configuration.head_dim ), device=self.mesh_device, layout=ttnn.TILE_LAYOUT, @@ -247,14 +270,14 @@ def forward( full_text_row_masked_out_mask_11SD: ttnn.Tensor, xattn_caches, current_pos, - rot_mat=None, - transformation_mats=None, + rot_mats=None, user_id=0, mode="decode", page_table=None, - # get_last_token=-1, + kv_cache=None, text_only_inference=False, vision_tokens=None, + get_last_token=-1, ): for idx, ( layer, @@ -275,12 +298,15 @@ def forward( h = layer( h, current_pos, - rot_mat=rot_mat, - transformation_mats=transformation_mats, + rot_mats=rot_mats, user_id=user_id, mode=mode, + page_table=page_table, + kv_cache=kv_cache, ) + if get_last_token != -1: + h = ttnn.slice(h, (0, 0, get_last_token, 0), (1, 1, get_last_token + 32, h.shape[-1])) h = self.norm(h, mode=mode) # TODO: Switch to 
using dram-sharded LM head and remove this diff --git a/models/demos/llama3/tt/multimodal/llama_cross_block.py b/models/demos/llama3/tt/multimodal/llama_cross_block.py index 1761bc7ac664..9d8c3760af04 100644 --- a/models/demos/llama3/tt/multimodal/llama_cross_block.py +++ b/models/demos/llama3/tt/multimodal/llama_cross_block.py @@ -126,6 +126,11 @@ def forward( user_id=0, vision_tokens=None, ): + skip_mem_cfg = self.model_config["DECODE_RESIDUAL_MEMCFG"] if mode == "decode" else ttnn.DRAM_MEMORY_CONFIG + assert ( + x_11SH.memory_config() == skip_mem_cfg + ), f"decoder input memcfg mismatch: {x_11SH.memory_config()} != {skip_mem_cfg}" + attn_out = self.attention( x_11SH=self.attention_norm(x_11SH, mode=mode), xattn_mask=xattn_mask, diff --git a/models/demos/llama3/tt/multimodal/llama_image_transformer.py b/models/demos/llama3/tt/multimodal/llama_image_transformer.py index ea86d3027488..e9bef2377ed6 100644 --- a/models/demos/llama3/tt/multimodal/llama_image_transformer.py +++ b/models/demos/llama3/tt/multimodal/llama_image_transformer.py @@ -2,10 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List, Optional -import torch +from tqdm import tqdm -import ttnn from models.utility_functions import ( nearest_32, ) @@ -41,7 +39,7 @@ def __init__( configuration=configuration, gated=gated, ) - for i in range(layers) + for i in tqdm(range(layers), desc=f"Loading vision transformer layers") ] def forward(self, x, return_intermediate=None, mask=None): diff --git a/models/demos/llama3/tt/multimodal/llama_vision_model.py b/models/demos/llama3/tt/multimodal/llama_vision_model.py index 96149d5a0f9d..0b1f36fd6f4e 100644 --- a/models/demos/llama3/tt/multimodal/llama_vision_model.py +++ b/models/demos/llama3/tt/multimodal/llama_vision_model.py @@ -29,6 +29,7 @@ get_prefill_rot_mat, get_rot_transformation_mat, get_single_rot_mat, + copy_host_to_device, ) from models.utility_functions import ( nearest_32, @@ -128,6 +129,7 @@ def __init__( weight_cache_path, dtype, 
configuration, + use_paged_kv_cache=False, ) -> None: super().__init__() @@ -159,6 +161,7 @@ def __init__( weight_cache_path=configuration.weight_cache_path(ttnn.bfloat8_b), dtype=ttnn.bfloat8_b, configuration=configuration, + use_paged_kv_cache=use_paged_kv_cache, ) self.image_res = configuration.vision_chunk_size self.max_num_chunks = configuration.vision_max_num_chunks @@ -268,7 +271,6 @@ def compute_vision_tokens_masks( def validate_inputs(self, tokens, position_ids): batch, seq_len = tokens.shape[:2] - assert batch == 1, f"Only batch 1 is supported, got {batch}" assert ( seq_len <= self.configuration.max_seq_len ), f"Sequence length {seq_len} exceeds max sequence length {self.configuration.max_seq_len}" @@ -279,7 +281,9 @@ def prepare_inputs_common(self, position_ids, tokens): h = self.text_model.get_partially_trainable_embedding(tokens) return h - def prepare_inputs_prefill(self, tokens, cross_attention_masks, full_text_row_masked_out_mask, prefill_len): + def prepare_inputs_prefill( + self, tokens, cross_attention_masks, full_text_row_masked_out_mask, prefill_len, page_table=None + ): B = tokens.shape[0] assert B == 1, f"Only batch 1 is supported, got {B}" S = tokens.shape[1] @@ -287,26 +291,16 @@ def prepare_inputs_prefill(self, tokens, cross_attention_masks, full_text_row_ma h = self.prepare_inputs_common(position_ids, tokens) padded_seq_len = _get_padded_prefill_seqlen(S) - tt_position_id = ttnn.from_torch( - position_ids, - device=self.mesh_device, - dtype=ttnn.int32, - layout=ttnn.ROW_MAJOR_LAYOUT, - memory_config=ttnn.DRAM_MEMORY_CONFIG, - mesh_mapper=ttnn.ReplicateTensorToMesh(self.mesh_device), - ) - xattn_mask = cross_attention_masks[:, :, position_ids] - xattn_mask_expand = xattn_mask.expand(-1, self.configuration.n_heads // self.configuration.num_devices, -1, -1) - xattn_mask_expand = torch.nn.functional.pad( - xattn_mask_expand, - (0, 0, 0, padded_seq_len - xattn_mask_expand.shape[2]), + xattn_mask = torch.nn.functional.pad( + xattn_mask, + (0, 
0, 0, padded_seq_len - xattn_mask.shape[2]), "constant", get_negative_inf_value(torch.float32), ) tt_xattn_mask = ttnn.from_torch( - xattn_mask_expand, + xattn_mask, device=self.mesh_device, dtype=ttnn.bfloat16, layout=ttnn.ROW_MAJOR_LAYOUT, @@ -314,6 +308,7 @@ def prepare_inputs_prefill(self, tokens, cross_attention_masks, full_text_row_ma mesh_mapper=ttnn.ReplicateTensorToMesh(self.mesh_device), ) tt_xattn_mask = ttnn.to_layout(tt_xattn_mask, ttnn.TILE_LAYOUT) + tt_xattn_mask = ttnn.typecast(tt_xattn_mask, ttnn.bfloat4_b) full_text_mask = full_text_row_masked_out_mask[:, :, position_ids] full_text_mask = torch.nn.functional.pad( @@ -331,65 +326,75 @@ def prepare_inputs_prefill(self, tokens, cross_attention_masks, full_text_row_ma mesh_mapper=ttnn.ReplicateTensorToMesh(self.mesh_device), ) tt_full_text_mask_expand_1NSH = ttnn.to_layout(tt_full_text_mask_expand_1NSH, ttnn.TILE_LAYOUT) + tt_full_text_mask_expand_1NSH = ttnn.typecast(tt_full_text_mask_expand_1NSH, ttnn.bfloat4_b) h = torch.nn.functional.pad(h, (0, 0, 0, padded_seq_len - h.shape[1]), "constant", 0) - tt_h = self.configuration.prepare_inputs_ttnn_prefill( + tt_h = self.configuration.prepare_residual_tensor_prefill( h, ) rot_mats = get_prefill_rot_mat( self.configuration.head_dim, self.configuration.max_seq_len, self.mesh_device, seq_len=S ) - transformation_mat_torch = get_rot_transformation_mat(self.configuration.head_dim) - transformation_mats = ttnn.as_tensor( - transformation_mat_torch, - dtype=ttnn.bfloat16, - layout=ttnn.TILE_LAYOUT, - device=self.mesh_device, - mesh_mapper=ttnn.ReplicateTensorToMesh(self.mesh_device), - memory_config=ttnn.DRAM_MEMORY_CONFIG, - ) full_text_mask_expand_11SD = full_text_mask.expand(-1, -1, -1, self.configuration.dim) tt_full_text_mask_expand_11SD = ttnn.from_torch( full_text_mask_expand_11SD, device=self.mesh_device, - dtype=ttnn.bfloat8_b, + dtype=ttnn.bfloat4_b, layout=ttnn.TILE_LAYOUT, memory_config=ttnn.DRAM_MEMORY_CONFIG, 
mesh_mapper=ttnn.ShardTensorToMesh(self.mesh_device, dim=-1), ) + if isinstance(page_table, torch.Tensor): + # Support vLLM tensor page_table input + page_table = ttnn.as_tensor( + page_table, + device=self.mesh_device, + memory_config=ttnn.DRAM_MEMORY_CONFIG, + dtype=ttnn.int32, + layout=ttnn.ROW_MAJOR_LAYOUT, + mesh_mapper=ttnn.ReplicateTensorToMesh(self.mesh_device), + ) + return ( tt_h, tt_xattn_mask, tt_full_text_mask_expand_1NSH, tt_full_text_mask_expand_11SD, - tt_position_id, rot_mats, - transformation_mats, + page_table, ) - def prepare_inputs_decode(self, tokens, cross_attention_masks, full_text_row_masked_out_mask, position_id): + def prepare_inputs_decode( + self, tokens, cross_attention_masks, full_text_row_masked_out_mask, position_id, page_table=None + ): ( tt_h, tt_xattn_mask, tt_full_text_mask_expand_1NSH, - _tt_full_text_mask_expand_11SD, tt_position_id, - rot_mats, - _transformation_mats, - ) = self.prepare_decode_inputs_host(tokens, cross_attention_masks, full_text_row_masked_out_mask, position_id) + tt_rope_id, + tt_page_table, + ) = self.prepare_decode_inputs_host( + tokens, cross_attention_masks, full_text_row_masked_out_mask, position_id, page_table=page_table + ) ( tt_h, tt_xattn_mask, tt_full_text_mask_expand_1NSH, tt_position_id, - rot_mats, - ) = self.copy_host_to_device((tt_h, tt_xattn_mask, tt_full_text_mask_expand_1NSH, tt_position_id, rot_mats)) + tt_rope_id, + tt_page_table, + ) = copy_host_to_device( + (tt_h, tt_xattn_mask, tt_full_text_mask_expand_1NSH, tt_position_id, tt_rope_id, tt_page_table), + mesh_device=self.mesh_device, + ) - tt_h, tt_xattn_mask, tt_full_text_mask_expand_1NSH = self.transform_decode_inputs_device( + tt_h, tt_rot_mats, tt_xattn_mask, tt_full_text_mask_expand_1NSH = self.transform_decode_inputs_device( tt_h, + tt_rope_id, tt_xattn_mask, tt_full_text_mask_expand_1NSH, B=tokens.shape[0], @@ -399,34 +404,35 @@ def prepare_inputs_decode(self, tokens, cross_attention_masks, full_text_row_mas tt_h, tt_xattn_mask, 
tt_full_text_mask_expand_1NSH, - _tt_full_text_mask_expand_11SD, tt_position_id, - rot_mats, - _transformation_mats, + tt_rot_mats, + tt_page_table, ) - def prepare_decode_inputs_host(self, tokens, cross_attention_masks, full_text_row_masked_out_mask, position_id): + def prepare_decode_inputs_host( + self, tokens, cross_attention_masks, full_text_row_masked_out_mask, position_id, page_table=None + ): B = tokens.shape[0] assert ( B == self.configuration.max_batch_size ), f"Batch size must match max batch size. Got {B}, expected {self.configuration.max_batch_size}" - position_ids = torch.tensor([position_id], dtype=torch.long) - h = self.prepare_inputs_common(position_ids, tokens) - tt_h = self.configuration.prepare_inputs_ttnn_decode( + h = self.prepare_inputs_common(position_id, tokens) + tt_h = self.configuration.prepare_residual_tensor_decode( h, - None, # on_host tensors have no memory_config + None, on_host=True, ) tt_position_id = ttnn.from_torch( - position_ids, + position_id, device=None, dtype=ttnn.int32, layout=ttnn.ROW_MAJOR_LAYOUT, mesh_mapper=ttnn.ReplicateTensorToMesh(self.mesh_device), ) - xattn_mask = cross_attention_masks[:, :, position_ids] + tt_rope_id = self.text_model.rope_setup.get_rot_idxs(position_id, on_host=True) + xattn_mask = torch.cat([cross_attention_masks[i][:, :, position_id[i]] for i in range(B)], dim=1).unsqueeze(0) xattn_mask_expand = xattn_mask.expand(-1, self.configuration.n_heads // self.configuration.num_devices, -1, -1) xattn_mask_expand = xattn_mask_expand.transpose(1, 2).contiguous() @@ -438,7 +444,9 @@ def prepare_decode_inputs_host(self, tokens, cross_attention_masks, full_text_ro mesh_mapper=ttnn.ReplicateTensorToMesh(self.mesh_device), ) - full_text_mask = full_text_row_masked_out_mask[:, :, position_ids] + full_text_mask = torch.cat( + [full_text_row_masked_out_mask[i][:, :, position_id[i]] for i in range(B)], dim=1 + ).unsqueeze(0) full_text_mask_expand_1NSH = full_text_mask.expand( -1, self.configuration.n_heads // 
self.configuration.num_devices, -1, self.configuration.head_dim ) @@ -451,44 +459,25 @@ def prepare_decode_inputs_host(self, tokens, cross_attention_masks, full_text_ro mesh_mapper=ttnn.ReplicateTensorToMesh(self.mesh_device), ) - rot_mats, _ = get_single_rot_mat( - self.configuration.head_dim, - self.mesh_device, - self.configuration.num_devices, - start_pos=position_ids.item() - 1, # TODO: Change function to support decode batch > 1 - # TODO: B must match max_batch_size, be careful - on_host=True, - ) - - transformation_mats = None - tt_full_text_mask_expand_11SD = None + if isinstance(page_table, torch.Tensor): + # Support vLLM tensor page_table input + page_table = ttnn.as_tensor( + page_table, + dtype=ttnn.int32, + layout=ttnn.ROW_MAJOR_LAYOUT, + mesh_mapper=ttnn.ReplicateTensorToMesh(self.mesh_device), + ) return ( tt_h, tt_xattn_mask, tt_full_text_mask_expand_1NSH, - tt_full_text_mask_expand_11SD, tt_position_id, - rot_mats, - transformation_mats, + tt_rope_id, + page_table, ) - def copy_host_to_device(self, host_tensors, device_tensors=None): - """ - Helper function which copies host tensors to device tensors - """ - if device_tensors is None: - ret = [] - for i in range(len(host_tensors)): - on_device = ttnn.to_device(host_tensors[i], device=self.mesh_device) - ret.append(on_device) - return ret - else: - for i in range(len(host_tensors)): - ttnn.copy_host_to_device_tensor(host_tensors[i], device_tensors[i]) - return device_tensors - - def transform_decode_inputs_device(self, tt_h, tt_xattn_mask, tt_full_text_mask_expand_1NSH, B): + def transform_decode_inputs_device(self, tt_h, tt_rope_id, tt_xattn_mask, tt_full_text_mask_expand_1NSH, B): """ Does any transformations on device tensors which are necessary before ttnn_decode_forward """ @@ -499,6 +488,8 @@ def transform_decode_inputs_device(self, tt_h, tt_xattn_mask, tt_full_text_mask_ tt_h = ttnn.to_memory_config(tt_h, self.configuration.model_config["DECODE_RESIDUAL_MEMCFG"]) + tt_rot_mats = 
self.text_model.rope_setup.get_rot_mats(tt_rope_id) + tt_xattn_mask = ttnn.to_layout(tt_xattn_mask, ttnn.TILE_LAYOUT) tt_xattn_mask = ttnn.reshape( tt_xattn_mask, @@ -531,12 +522,11 @@ def transform_decode_inputs_device(self, tt_h, tt_xattn_mask, tt_full_text_mask_ ), ) - return (tt_h, tt_xattn_mask, tt_full_text_mask_expand_1NSH) + return (tt_h, tt_rot_mats, tt_xattn_mask, tt_full_text_mask_expand_1NSH) - def process_output_prefill(self, tt_out, B, S): - padded_seq_len = _get_padded_prefill_seqlen(S) + def process_output_prefill(self, tt_out, B, last_token_idx): tt_out = ttnn.to_torch(ttnn.get_device_tensors(tt_out)[0]).float() - tt_out = tt_out[0].reshape(B, padded_seq_len, -1)[:, :S, :] + tt_out = tt_out[0, 0, last_token_idx, :] return tt_out def process_output_decode(self, tt_out, B, S): @@ -554,6 +544,8 @@ def forward( text_only_inference: bool = False, user_id=0, vision_tokens=None, + page_table=None, + kv_cache=None, ) -> torch.Tensor: """ This method takes torch tensors in, returns torch tensors. @@ -573,12 +565,13 @@ def forward( tt_full_text_mask_expand_11SD, tt_position_id, rot_mats, - transformation_mats, + tt_page_table, ) = prepare_fn( tokens, cross_attention_masks, full_text_row_masked_out_mask, pos_arg, + page_table=page_table, ) logits = self.text_model.forward( @@ -588,10 +581,11 @@ def forward( full_text_row_masked_out_mask_11SD=tt_full_text_mask_expand_11SD, xattn_caches=xattn_caches, current_pos=tt_position_id, - rot_mat=rot_mats, - transformation_mats=transformation_mats, + rot_mats=rot_mats, user_id=user_id, mode=mode, + page_table=tt_page_table, + kv_cache=kv_cache, text_only_inference=text_only_inference, vision_tokens=vision_tokens, ) @@ -607,11 +601,12 @@ def ttnn_prefill_forward( full_text_mas_expand_1NSH, full_text_mask_expand_11SD, xattn_caches, - position_id, rot_mats, - transformation_mats, user_id, vision_tokens, + page_table=None, + kv_cache=None, + get_last_token=-1, ): """ This method runs prefill forward. 
It takes ttnn tensors in, returns ttnn tensors. @@ -622,12 +617,14 @@ def ttnn_prefill_forward( full_text_row_masked_out_mask_1NSH=full_text_mas_expand_1NSH, full_text_row_masked_out_mask_11SD=full_text_mask_expand_11SD, xattn_caches=xattn_caches, - current_pos=position_id, - rot_mat=rot_mats, - transformation_mats=transformation_mats, + current_pos=None, + rot_mats=rot_mats, user_id=user_id, mode="prefill", + page_table=page_table, + kv_cache=kv_cache, vision_tokens=vision_tokens, + get_last_token=get_last_token, ) tt_out = ttnn.to_layout(logits, ttnn.ROW_MAJOR_LAYOUT) return tt_out @@ -640,6 +637,8 @@ def ttnn_decode_forward( xattn_caches, position_id, rot_mats, + page_table=None, + kv_cache=None, ): """ This method runs decode forward. It takes ttnn tensors in, returns ttnn tensors. @@ -651,8 +650,10 @@ def ttnn_decode_forward( full_text_row_masked_out_mask_11SD=None, xattn_caches=xattn_caches, current_pos=position_id, - rot_mat=rot_mats, + rot_mats=rot_mats, mode="decode", + page_table=page_table, + kv_cache=kv_cache, ) tt_out = ttnn.to_layout(logits, ttnn.ROW_MAJOR_LAYOUT) return tt_out @@ -720,11 +721,11 @@ def _pad_masks( def _get_padded_prefill_seqlen(seq_len): """ If seq_len is less than 128, pad to 128 - If seq_len is more than 128, pad to whichever is smaller: a power of 2 or a multiple of 1024 + If seq_len is more than 128, pad to whichever is smaller: a power of 2 or a multiple of 2048 (text model requires mult of 2048, while vision allows mult of 1024) """ if seq_len < 128: return 128 else: - mult_1024 = 1024 * math.ceil(seq_len / 1024) + mult_2k = 2048 * math.ceil(seq_len / 2048) pow_2 = 2 ** math.ceil(math.log2(seq_len)) - return min(mult_1024, pow_2) + return min(mult_2k, pow_2) diff --git a/models/perf/benchmarking_utils.py b/models/perf/benchmarking_utils.py index 5ca7ae269c8c..8136c4ef0c1e 100644 --- a/models/perf/benchmarking_utils.py +++ b/models/perf/benchmarking_utils.py @@ -16,6 +16,24 @@ def __init__(self): self.start_times = dict() 
self.end_times = dict() + def __call__(self, step_name: str, iteration: int = 0): + # Return a context manager for this step + return self.StepContext(self, step_name, iteration) + + class StepContext: + def __init__(self, profiler, step_name: str, iteration: int): + self.profiler = profiler + self.step_name = step_name + self.iteration = iteration + + def __enter__(self): + self.profiler.start(self.step_name, self.iteration) + return self.profiler + + def __exit__(self, exc_type, exc_val, exc_tb): + self.profiler.end(self.step_name, self.iteration) + return False + def start(self, step_name: str, iteration: int = 0): self.start_times[(iteration, step_name)] = datetime.now(tz=pytz.UTC) diff --git a/tests/scripts/t3000/run_t3000_demo_tests.sh b/tests/scripts/t3000/run_t3000_demo_tests.sh index 805fa83e97b7..627769a5a9eb 100755 --- a/tests/scripts/t3000/run_t3000_demo_tests.sh +++ b/tests/scripts/t3000/run_t3000_demo_tests.sh @@ -93,7 +93,7 @@ run_t3000_llama3_vision_tests() { pip install -r models/demos/llama3/requirements.txt for fake_device in "$n300" "$t3k"; do - FAKE_DEVICE=$fake_device LLAMA_DIR=$llama11b WH_ARCH_YAML=$wh_arch_yaml pytest -n auto models/demos/llama3/demo/simple_vision_demo.py -k "cold and yes_trace" --timeout 600; fail+=$? + FAKE_DEVICE=$fake_device LLAMA_DIR=$llama11b WH_ARCH_YAML=$wh_arch_yaml pytest -n auto models/demos/llama3/demo/simple_vision_demo.py -k "batch1-trace" --timeout 600; fail+=$? echo "LOG_METAL: Llama3 vision tests for $fake_device completed" done diff --git a/tests/scripts/t3000/run_t3000_unit_tests.sh b/tests/scripts/t3000/run_t3000_unit_tests.sh index 6b33b853a079..60c671c6e83b 100755 --- a/tests/scripts/t3000/run_t3000_unit_tests.sh +++ b/tests/scripts/t3000/run_t3000_unit_tests.sh @@ -197,8 +197,8 @@ run_t3000_llama3.2-11b-vision_unit_tests() { LLAMA_DIR=$llama11b WH_ARCH_YAML=$wh_arch_yaml pytest -n auto models/demos/llama3/tests/multimodal/test_llama_image_mlp.py ; fail+=$? 
LLAMA_DIR=$llama11b WH_ARCH_YAML=$wh_arch_yaml pytest -n auto models/demos/llama3/tests/multimodal/test_llama_image_attention.py ; fail+=$? LLAMA_DIR=$llama11b WH_ARCH_YAML=$wh_arch_yaml pytest -n auto models/demos/llama3/tests/multimodal/test_llama_image_block.py ; fail+=$? - LLAMA_DIR=$llama11b WH_ARCH_YAML=$wh_arch_yaml pytest -n auto models/demos/llama3/tests/multimodal/test_llama_cross_attention.py ; fail+=$? - LLAMA_DIR=$llama11b WH_ARCH_YAML=$wh_arch_yaml pytest -n auto models/demos/llama3/tests/multimodal/test_llama_cross_block.py ; fail+=$? + LLAMA_DIR=$llama11b WH_ARCH_YAML=$wh_arch_yaml pytest -n auto models/demos/llama3/tests/multimodal/test_llama_cross_attention.py -k "batch_1" ; fail+=$? + LLAMA_DIR=$llama11b WH_ARCH_YAML=$wh_arch_yaml pytest -n auto models/demos/llama3/tests/multimodal/test_llama_cross_block.py -k "batch_1" ; fail+=$? LLAMA_DIR=$llama11b WH_ARCH_YAML=$wh_arch_yaml pytest -n auto models/demos/llama3/tests/multimodal/test_llama_conv2d_patch.py ; fail+=$? LLAMA_DIR=$llama11b WH_ARCH_YAML=$wh_arch_yaml pytest -n auto models/demos/llama3/tests/multimodal/test_llama_class_embedding.py ; fail+=$? LLAMA_DIR=$llama11b WH_ARCH_YAML=$wh_arch_yaml pytest -n auto models/demos/llama3/tests/multimodal/test_llama_tile_position_embedding.py ; fail+=$? @@ -232,8 +232,8 @@ run_t3000_spoof_n300_llama3.2-11b-vision_unit_tests() { FAKE_DEVICE=$fake_device LLAMA_DIR=$llama11b WH_ARCH_YAML=$wh_arch_yaml pytest -n auto models/demos/llama3/tests/multimodal/test_llama_image_mlp.py ; fail+=$? FAKE_DEVICE=$fake_device LLAMA_DIR=$llama11b WH_ARCH_YAML=$wh_arch_yaml pytest -n auto models/demos/llama3/tests/multimodal/test_llama_image_attention.py ; fail+=$? FAKE_DEVICE=$fake_device LLAMA_DIR=$llama11b WH_ARCH_YAML=$wh_arch_yaml pytest -n auto models/demos/llama3/tests/multimodal/test_llama_image_block.py ; fail+=$? 
- FAKE_DEVICE=$fake_device LLAMA_DIR=$llama11b WH_ARCH_YAML=$wh_arch_yaml pytest -n auto models/demos/llama3/tests/multimodal/test_llama_cross_attention.py ; fail+=$? - FAKE_DEVICE=$fake_device LLAMA_DIR=$llama11b WH_ARCH_YAML=$wh_arch_yaml pytest -n auto models/demos/llama3/tests/multimodal/test_llama_cross_block.py ; fail+=$? + FAKE_DEVICE=$fake_device LLAMA_DIR=$llama11b WH_ARCH_YAML=$wh_arch_yaml pytest -n auto models/demos/llama3/tests/multimodal/test_llama_cross_attention.py -k "batch_1" ; fail+=$? + FAKE_DEVICE=$fake_device LLAMA_DIR=$llama11b WH_ARCH_YAML=$wh_arch_yaml pytest -n auto models/demos/llama3/tests/multimodal/test_llama_cross_block.py -k "batch_1" ; fail+=$? FAKE_DEVICE=$fake_device LLAMA_DIR=$llama11b WH_ARCH_YAML=$wh_arch_yaml pytest -n auto models/demos/llama3/tests/multimodal/test_llama_conv2d_patch.py ; fail+=$? FAKE_DEVICE=$fake_device LLAMA_DIR=$llama11b WH_ARCH_YAML=$wh_arch_yaml pytest -n auto models/demos/llama3/tests/multimodal/test_llama_class_embedding.py ; fail+=$? FAKE_DEVICE=$fake_device LLAMA_DIR=$llama11b WH_ARCH_YAML=$wh_arch_yaml pytest -n auto models/demos/llama3/tests/multimodal/test_llama_tile_position_embedding.py ; fail+=$? 
diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_scaled_dot_product_attention.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_scaled_dot_product_attention.py index 98bca8dc1a3f..736b2f2db825 100644 --- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_scaled_dot_product_attention.py +++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_scaled_dot_product_attention.py @@ -183,8 +183,10 @@ def test_sdpa_tt_with_program_cache(device, b, nh, nkv, s, d, q_chunk_size, k_ch assert device.num_program_cache_entries() == 1 -def run_sdpa_noncausal(device, b, nh, nkv, s, d, q_chunk_size, k_chunk_size, dtype): +def run_sdpa_noncausal(device, b, nh, nkv, sq, d, q_chunk_size, k_chunk_size, dtype, sk=None): torch.manual_seed(1234) + if sk is None: + sk = sq program_config = ttnn.SDPAProgramConfig( compute_with_storage_grid_size=device.compute_with_storage_grid_size(), @@ -200,16 +202,16 @@ def run_sdpa_noncausal(device, b, nh, nkv, s, d, q_chunk_size, k_chunk_size, dty packer_l1_acc=False, ) - Q = fa_rand(b, nh, s, d) - K = fa_rand(b, nkv, s, d) - V = fa_rand(b, nkv, s, d) + Q = fa_rand(b, nh, sq, d) + K = fa_rand(b, nkv, sk, d) + V = fa_rand(b, nkv, sk, d) # Generate random non-causal attention mask mask = torch.bernoulli( torch.full( ( b, - s, - s, + sq, + sk, ), 0.25, ) @@ -240,8 +242,8 @@ def run_sdpa_noncausal(device, b, nh, nkv, s, d, q_chunk_size, k_chunk_size, dty if nkv > 1 and nkv != nh: assert nh % nkv == 0 - K = K.reshape(b, nkv, 1, s, d).repeat(1, 1, nh // nkv, 1, 1).reshape(b, nh, s, d) - V = V.reshape(b, nkv, 1, s, d).repeat(1, 1, nh // nkv, 1, 1).reshape(b, nh, s, d) + K = K.reshape(b, nkv, 1, sk, d).repeat(1, 1, nh // nkv, 1, 1).reshape(b, nh, sk, d) + V = V.reshape(b, nkv, 1, sk, d).repeat(1, 1, nh // nkv, 1, 1).reshape(b, nh, sk, d) gt = torch.nn.functional.scaled_dot_product_attention(Q, K, V, is_causal=False, attn_mask=mask) @@ -274,3 +276,24 @@ def test_sdpa_noncausal(device, b, nh, nkv, s, d, q_chunk_size, 
k_chunk_size, dt pytest.skip("Bad PCC for small chunks") ttnn.device.DisablePersistentKernelCache() run_sdpa_noncausal(device, b, nh, nkv, s, d, q_chunk_size, k_chunk_size, dtype) + + +@skip_for_blackhole("Mismatching on BH, see #12349") +@pytest.mark.skipif(is_watcher_enabled(), reason="Kernel OOM with watcher enabled") +@skip_for_grayskull("Unsupported in GS since L1 runs OOM with most configs") +@pytest.mark.parametrize("dtype", [ttnn.bfloat8_b, ttnn.bfloat16], ids=["bfp8", "bf16"]) +@pytest.mark.parametrize("q_chunk_size", [128, 256], ids=["q128", "q256"]) +@pytest.mark.parametrize("k_chunk_size", [128, 256], ids=["k128", "k256"]) +@pytest.mark.parametrize( + "b, nh, nkv, sq, sk, d", + ( + [1, 8, 1, 4096, 2048, 128], + # [1, 4, 4, 128*1024, 6528, 128], # Llama-Vision long seq + [1, 4, 1, 2048, 6528, 128], # Llama-Vision + ), +) +def test_sdpa_noncausal_unequal_seqlen(device, b, nh, nkv, sq, sk, d, q_chunk_size, k_chunk_size, dtype): + if (sq % q_chunk_size != 0) or (sk % k_chunk_size != 0): + pytest.skip("s must be divisible by q_chunk_size and k_chunk_size") + ttnn.device.DisablePersistentKernelCache() + run_sdpa_noncausal(device, b, nh, nkv, sq, d, q_chunk_size, k_chunk_size, dtype, sk=sk) diff --git a/ttnn/cpp/ttnn/operations/transformer/sdpa/device/kernels/compute/sdpa.cpp b/ttnn/cpp/ttnn/operations/transformer/sdpa/device/kernels/compute/sdpa.cpp index 54e2bd1b4037..976364d32cb0 100644 --- a/ttnn/cpp/ttnn/operations/transformer/sdpa/device/kernels/compute/sdpa.cpp +++ b/ttnn/cpp/ttnn/operations/transformer/sdpa/device/kernels/compute/sdpa.cpp @@ -336,7 +336,7 @@ void MAIN { constexpr uint32_t B = get_compile_time_arg_val(0); constexpr uint32_t NQH = get_compile_time_arg_val(1); constexpr uint32_t NKH = get_compile_time_arg_val(2); - constexpr uint32_t St = get_compile_time_arg_val(3); + constexpr uint32_t Skt = get_compile_time_arg_val(3); constexpr uint32_t DHt = get_compile_time_arg_val(4); constexpr uint32_t Sq_chunk_t = get_compile_time_arg_val(5); 
constexpr uint32_t q_num_chunks = get_compile_time_arg_val(6); @@ -419,7 +419,7 @@ void MAIN { if constexpr (is_causal) { q_high_idx = q_low_idx + Sq_chunk_t; } else { - q_high_idx = St; + q_high_idx = Skt; } cb_wait_front(cb_q_in, q_chunk_tiles); diff --git a/ttnn/cpp/ttnn/operations/transformer/sdpa/device/kernels/dataflow/reader_interleaved.cpp b/ttnn/cpp/ttnn/operations/transformer/sdpa/device/kernels/dataflow/reader_interleaved.cpp index 3309205fa34e..8b945b404e81 100644 --- a/ttnn/cpp/ttnn/operations/transformer/sdpa/device/kernels/dataflow/reader_interleaved.cpp +++ b/ttnn/cpp/ttnn/operations/transformer/sdpa/device/kernels/dataflow/reader_interleaved.cpp @@ -14,15 +14,16 @@ void kernel_main() { constexpr uint32_t B = get_compile_time_arg_val(0); constexpr uint32_t NQH = get_compile_time_arg_val(1); constexpr uint32_t NKH = get_compile_time_arg_val(2); - constexpr uint32_t St = get_compile_time_arg_val(3); - constexpr uint32_t DHt = get_compile_time_arg_val(4); - constexpr uint32_t Sq_chunk_t = get_compile_time_arg_val(5); - constexpr uint32_t q_num_chunks = get_compile_time_arg_val(6); - constexpr uint32_t Sk_chunk_t = get_compile_time_arg_val(7); - constexpr uint32_t k_num_chunks = get_compile_time_arg_val(8); - constexpr uint32_t num_cores = get_compile_time_arg_val(9); - constexpr uint32_t is_causal = get_compile_time_arg_val(10) == 1; - constexpr uint32_t use_provided_mask = get_compile_time_arg_val(11) == 1; + constexpr uint32_t Sqt = get_compile_time_arg_val(3); + constexpr uint32_t Skt = get_compile_time_arg_val(4); + constexpr uint32_t DHt = get_compile_time_arg_val(5); + constexpr uint32_t Sq_chunk_t = get_compile_time_arg_val(6); + constexpr uint32_t q_num_chunks = get_compile_time_arg_val(7); + constexpr uint32_t Sk_chunk_t = get_compile_time_arg_val(8); + constexpr uint32_t k_num_chunks = get_compile_time_arg_val(9); + constexpr uint32_t num_cores = get_compile_time_arg_val(10); + constexpr uint32_t is_causal = get_compile_time_arg_val(11) == 1; 
+ constexpr uint32_t use_provided_mask = get_compile_time_arg_val(12) == 1; const uint32_t q_addr = get_arg_val(0); const uint32_t k_addr = get_arg_val(1); @@ -82,9 +83,9 @@ void kernel_main() { uint32_t barrier_count = 0; for (uint32_t nb = local_batch_start; nb < local_batch_end; ++nb) { - const uint32_t q_batch_offset = nb * NQH * St * DHt; - const uint32_t kv_batch_offset = nb * NKH * St * DHt; - const uint32_t mask_batch_offset = nb * St * St; + const uint32_t q_batch_offset = nb * NQH * Sqt * DHt; + const uint32_t kv_batch_offset = nb * NKH * Skt * DHt; + const uint32_t mask_batch_offset = nb * Sqt * Skt; for (uint32_t nq = local_nh_start; nq < local_nh_end; ++nq) { for (uint32_t q_iter = 0; q_iter < q_chunks_per_core; ++q_iter) { uint32_t q_chunk; @@ -100,7 +101,7 @@ void kernel_main() { q_chunk = local_q_start + q_iter; #endif - uint32_t q_head_offset = nq * St * DHt; + uint32_t q_head_offset = nq * Sqt * DHt; uint32_t q_chunk_offset = q_chunk * Sq_chunk_t * DHt; q_tile_id = q_batch_offset + q_head_offset + q_chunk_offset; @@ -129,11 +130,11 @@ void kernel_main() { if constexpr (is_causal) { q_high_idx = q_low_idx + Sq_chunk_t; } else { - q_high_idx = St; + q_high_idx = Skt; } const uint32_t kv_head = nq / q_heads_per_kv; - const uint32_t kv_head_offset = kv_head * St * DHt; + const uint32_t kv_head_offset = kv_head * Skt * DHt; // loop while k_low < q_high for (uint32_t k_chunk = 0; (k_chunk * Sk_chunk_t) < q_high_idx; ++k_chunk) { @@ -171,8 +172,7 @@ void kernel_main() { cb_reserve_back(cb_mask_in, mask_chunk_tiles); uint32_t mask_write_ptr = get_write_ptr(cb_mask_in); barrier_count = 0; - mask_tile_id = mask_batch_offset + q_chunk * Sq_chunk_t * St /*row_offset*/ + - k_chunk * Sk_chunk_t /*col_offset*/; + mask_tile_id = mask_batch_offset + q_chunk * Sq_chunk_t * Skt /*row_offset*/ + k_chunk * Sk_chunk_t /*col_offset*/; for (uint32_t row = 0; row < Sq_chunk_t; ++row) { for (uint32_t col = 0; col < Sk_chunk_t; ++col) { noc_async_read_tile(mask_tile_id, 
mask_reader, mask_write_ptr); @@ -185,7 +185,7 @@ void kernel_main() { } // Strid along columns to get to next row mask_tile_id -= Sk_chunk_t; - mask_tile_id += St; + mask_tile_id += Skt; } noc_async_read_barrier(); cb_push_back(cb_mask_in, mask_chunk_tiles); diff --git a/ttnn/cpp/ttnn/operations/transformer/sdpa/device/kernels/dataflow/writer_interleaved.cpp b/ttnn/cpp/ttnn/operations/transformer/sdpa/device/kernels/dataflow/writer_interleaved.cpp index 8d5d7d4f673b..5cf07e576e20 100644 --- a/ttnn/cpp/ttnn/operations/transformer/sdpa/device/kernels/dataflow/writer_interleaved.cpp +++ b/ttnn/cpp/ttnn/operations/transformer/sdpa/device/kernels/dataflow/writer_interleaved.cpp @@ -136,7 +136,7 @@ void kernel_main() { constexpr uint32_t B = get_compile_time_arg_val(0); constexpr uint32_t NQH = get_compile_time_arg_val(1); constexpr uint32_t NKH = get_compile_time_arg_val(2); - constexpr uint32_t St = get_compile_time_arg_val(3); + constexpr uint32_t Sqt = get_compile_time_arg_val(3); constexpr uint32_t DHt = get_compile_time_arg_val(4); constexpr uint32_t Sq_chunk_t = get_compile_time_arg_val(5); constexpr uint32_t q_num_chunks = get_compile_time_arg_val(6); @@ -184,7 +184,7 @@ void kernel_main() { uint32_t out_tile_id = 0; for (uint32_t nb = local_batch_start; nb < local_batch_end; ++nb) { - const uint32_t q_batch_offset = nb * NQH * St * DHt; + const uint32_t q_batch_offset = nb * NQH * Sqt * DHt; for (uint32_t nq = local_nh_start; nq < local_nh_end; ++nq) { for (uint32_t q_iter = 0; q_iter < q_chunks_per_core; ++q_iter) { uint32_t q_chunk; @@ -200,7 +200,7 @@ void kernel_main() { q_chunk = local_q_start + q_iter; #endif - uint32_t q_head_offset = nq * St * DHt; + uint32_t q_head_offset = nq * Sqt * DHt; uint32_t q_chunk_offset = q_chunk * Sq_chunk_t * DHt; out_tile_id = q_batch_offset + q_head_offset + q_chunk_offset; diff --git a/ttnn/cpp/ttnn/operations/transformer/sdpa/device/sdpa_op.cpp b/ttnn/cpp/ttnn/operations/transformer/sdpa/device/sdpa_op.cpp index 
0708eb6645d1..5b3981bedfef 100644 --- a/ttnn/cpp/ttnn/operations/transformer/sdpa/device/sdpa_op.cpp +++ b/ttnn/cpp/ttnn/operations/transformer/sdpa/device/sdpa_op.cpp @@ -69,50 +69,35 @@ void ScaledDotProductAttention::validate( const auto B = q_shape[0]; const auto nqh = q_shape[1]; const auto nkv = k_shape[1]; - const auto S = q_shape[2]; + const auto Sq = q_shape[2]; const auto DH = q_shape[3]; + const auto Sk = k_shape[2]; + if (this->is_causal) { + TT_FATAL(Sq == Sk, "Causal SDPA requires Q and K to have the same sequence length. Got Q: {}, K: {}", Sq, Sk); + } TT_FATAL(k_shape[0] == B && v_shape[0] == B, "K and V batch must match. Got K: {}, V: {}", k_shape[0], v_shape[0]); TT_FATAL(v_shape[1] == nkv, "K and V num_heads must match. Got K: {}, V: {}", k_shape[1], v_shape[1]); - TT_FATAL( - k_shape[2] == S && v_shape[2] == S, - "K and V sequence length must match. Got K: {}, V: {}", - k_shape[2], - v_shape[2]); - TT_FATAL( - k_shape[3] == DH && v_shape[3] == DH, - "K and V hidden dim must match. Got K: {}, V: {}", - k_shape[3], - v_shape[3]); - TT_FATAL( - nqh >= nkv && nqh % nkv == 0, - "Q num_heads must be >= K num_heads and divisible by K num_heads. Got Q: {}, K: {}", - nqh, - nkv); + TT_FATAL(v_shape[2] == Sk, "K and V sequence length must match. Got K: {}, V: {}", k_shape[2], v_shape[2]); + TT_FATAL(k_shape[3] == DH && v_shape[3] == DH, "K and V hidden dim must match. Got K: {}, V: {}", k_shape[3], v_shape[3]); + TT_FATAL(nqh >= nkv && nqh % nkv == 0, "Q num_heads must be >= K num_heads and divisible by K num_heads. 
Got Q: {}, K: {}", nqh, nkv); if (mask_option.has_value()) { const auto mask_shape = mask_option.value().get_legacy_shape(); TT_FATAL(mask_shape[0] == B, "Mask batch dim must match Q batch dim"); TT_FATAL(mask_shape[1] == 1, "Mask num_heads must be 1 to be broadcasted across all heads"); - TT_FATAL(mask_shape[2] == S, "Mask sequence length must match Q sequence length"); - TT_FATAL(mask_shape[3] == S, "Mask sequence length must match Q sequence length"); + TT_FATAL(mask_shape[2] == Sq, "Mask sequence length must match Q sequence length"); + TT_FATAL(mask_shape[3] == Sk, "Mask sequence length must match K sequence length"); } if (this->program_config.has_value()) { auto q_chunk_size = program_config->q_chunk_size; auto k_chunk_size = program_config->k_chunk_size; - TT_FATAL( - q_shape[-2] % q_chunk_size == 0, - "q_chunk_size must divide q_shape[-2]. Got q_chunk_size: {}, q_shape[-2]: {}", - q_chunk_size, - q_shape[-2]); - TT_FATAL( - k_shape[-2] % k_chunk_size == 0, - "k_chunk_size must divide k_shape[-2]. Got k_chunk_size: {}, k_shape[-2]: {}", - k_chunk_size, - k_shape[-2]); + TT_FATAL(Sq % q_chunk_size == 0, "q_chunk_size must divide q_shape[-2]. Got q_chunk_size: {}, q_shape[-2]: {}", q_chunk_size, q_shape[-2]); + TT_FATAL(Sk % k_chunk_size == 0, "k_chunk_size must divide k_shape[-2]. 
Got k_chunk_size: {}, k_shape[-2]: {}", k_chunk_size, k_shape[-2]); + } } diff --git a/ttnn/cpp/ttnn/operations/transformer/sdpa/device/sdpa_program_factory.cpp b/ttnn/cpp/ttnn/operations/transformer/sdpa/device/sdpa_program_factory.cpp index 70eede0127c1..9278d02c8122 100644 --- a/ttnn/cpp/ttnn/operations/transformer/sdpa/device/sdpa_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/transformer/sdpa/device/sdpa_program_factory.cpp @@ -41,24 +41,28 @@ operation::ProgramWithCallbacks sdpa_multi_core( const auto q_shape = input_tensor_q.get_legacy_shape(); const auto k_shape = input_tensor_k.get_legacy_shape(); - const uint32_t B = q_shape[0], NQH = q_shape[1], S = q_shape[2], DH = q_shape[3]; + const uint32_t B = q_shape[0], NQH = q_shape[1], Sq = q_shape[2], DH = q_shape[3]; + const uint32_t Sk = k_shape[2]; const uint32_t NKH = k_shape[1]; - const uint32_t St = S / TILE_HEIGHT; + const uint32_t Sqt = Sq / TILE_HEIGHT; + const uint32_t Skt = Sk / TILE_HEIGHT; const uint32_t DHt = DH / TILE_WIDTH; const uint32_t Sq_chunk_t = q_chunk_size / TILE_HEIGHT; const uint32_t Sk_chunk_t = k_chunk_size / TILE_HEIGHT; - const uint32_t q_num_chunks = S / q_chunk_size; - const uint32_t k_num_chunks = S / k_chunk_size; + const uint32_t q_num_chunks = Sq / q_chunk_size; + const uint32_t k_num_chunks = Sk / k_chunk_size; const bool use_provided_mask = attn_mask.has_value(); // log_debug all of the above tt::log_debug("B: {}", B); tt::log_debug("NQH: {}", NQH); - tt::log_debug("S: {}", S); + tt::log_debug("Sq: {}", Sq); + tt::log_debug("Sk: {}", Sk); tt::log_debug("DH: {}", DH); - tt::log_debug("St: {}", St); + tt::log_debug("Sqt: {}", Sqt); + tt::log_debug("Skt: {}", Skt); tt::log_debug("DHt: {}", DHt); tt::log_debug("Sq_chunk_t: {}", Sq_chunk_t); tt::log_debug("Sk_chunk_t: {}", Sk_chunk_t); @@ -216,60 +220,64 @@ operation::ProgramWithCallbacks sdpa_multi_core( scale_union.f = scale.value_or(1.0f); std::vector reader_compile_time_args = {// interleaved accessor args - B, - NQH, - 
NKH, - St, - DHt, - Sq_chunk_t, - q_num_chunks, - Sk_chunk_t, - k_num_chunks, - num_cores, - (std::uint32_t)is_causal, - (std::uint32_t)use_provided_mask}; + B, + NQH, + NKH, + Sqt, + Skt, + DHt, + Sq_chunk_t, + q_num_chunks, + Sk_chunk_t, + k_num_chunks, + num_cores, + (std::uint32_t)is_causal, + (std::uint32_t)use_provided_mask + }; std::vector writer_compile_time_args = {// interleaved accessor args - B, - NQH, - NKH, - St, - DHt, - Sq_chunk_t, - q_num_chunks, - Sk_chunk_t, - k_num_chunks, - packed_identity_scalar, - scale_union.u, - num_cores, - (std::uint32_t)is_causal, - (std::uint32_t)use_provided_mask}; + B, + NQH, + NKH, + Sqt, + DHt, + Sq_chunk_t, + q_num_chunks, + Sk_chunk_t, + k_num_chunks, + packed_identity_scalar, + scale_union.u, + num_cores, + (std::uint32_t)is_causal, + (std::uint32_t)use_provided_mask + }; std::vector compute_compile_time_args = {// matmul args - B, - NQH, - NKH, - St, - DHt, - Sq_chunk_t, - q_num_chunks, - Sk_chunk_t, - k_num_chunks, - qk_in0_block_w, - qk_out_subblock_w, - qk_out_subblock_h, - qk_in0_num_subblocks, - qk_in1_num_subblocks, - qk_num_blocks, - out_in0_block_w, - out_out_subblock_w, - out_out_subblock_h, - out_in0_num_subblocks, - out_in1_num_subblocks, - out_num_blocks, - num_cores, - (std::uint32_t)is_causal, - (std::uint32_t)use_provided_mask}; + B, + NQH, + NKH, + Skt, + DHt, + Sq_chunk_t, + q_num_chunks, + Sk_chunk_t, + k_num_chunks, + qk_in0_block_w, + qk_out_subblock_w, + qk_out_subblock_h, + qk_in0_num_subblocks, + qk_in1_num_subblocks, + qk_num_blocks, + out_in0_block_w, + out_out_subblock_w, + out_out_subblock_h, + out_in0_num_subblocks, + out_in1_num_subblocks, + out_num_blocks, + num_cores, + (std::uint32_t)is_causal, + (std::uint32_t)use_provided_mask + }; std::map defines; defines["STATS_GRANULARITY"] = std::to_string(stats_granularity); From 9818f7fc8f431556915bc96b2374fab57323c9f3 Mon Sep 17 00:00:00 2001 From: Milan Kordic <78221808+milank94@users.noreply.github.com> Date: Mon, 9 Dec 2024 17:27:30 
-0500 Subject: [PATCH 19/59] Update README, instructions, and setup script for Llama 3.1 70B on LoudBox for v0 release (#15820) ### What's changed - Updated root README to modify LLM table to include vLLM release version. - Updated Llama 3.1 70B on LoudBox README with instructions for running vLLM server. - Add "one-line" script for new users to run Llama 3.1 70B + vLLM server demo from end-to-end including build environment, setup weights, and launch server. ### Checklist - [x] Post commit CI passes --- README.md | 63 +++-- models/demos/t3000/llama3_70b/README.md | 123 +++++++-- models/demos/t3000/llama3_70b/setup_llama.sh | 250 +++++++++++++++++++ 3 files changed, 394 insertions(+), 42 deletions(-) create mode 100644 models/demos/t3000/llama3_70b/setup_llama.sh diff --git a/README.md b/README.md index 2765715fea1d..d06621a08fa1 100644 --- a/README.md +++ b/README.md @@ -21,29 +21,34 @@ --- ## LLMs -| Model | Batch | Hardware | ttft (ms) | t/s/u | Target
t/s/u | t/s | Release | -|---------------------------------------------------------------|-------|----------------------------------------------------------|----------|-------|-----------------|--------|---------------------------------------------------------------------------| -| [Falcon7B-decode](./models/demos/ttnn_falcon7b) | 32 | [e150](https://tenstorrent.com/hardware/grayskull) | | 4.2 | 4.4 | 134.4 | | -| [Falcon7B](./models/demos/wormhole/falcon7b) | 32 | [n150](https://tenstorrent.com/hardware/wormhole) | 71 | 17.6 | 26 | 563.2 | [v0.53.0-rc44](https://github.com/tenstorrent/tt-metal/tree/v0.53.0-rc44) | -| [Mistral-7B](./models/demos/wormhole/mistral7b) | 32 | [n150](https://tenstorrent.com/hardware/wormhole) | | 9.9 | 25 | 316.8 | [v0.51.0-rc28](https://github.com/tenstorrent/tt-metal/tree/v0.51.0-rc28) | -| [Mamba-2.8B](./models/demos/wormhole/mamba) | 32 | [n150](https://tenstorrent.com/hardware/wormhole) | 48 | 12.3 | 41 | 393.6 | [v0.51.0-rc26](https://github.com/tenstorrent/tt-metal/tree/v0.51.0-rc26) | -| [LLaMA-3.1-8B](./models/demos/llama3) | 1 | [n150](https://tenstorrent.com/hardware/wormhole) | 202 | 28.6 | 23 | 28.6 | [v0.53.1-rc7](https://github.com/tenstorrent/tt-metal/tree/v0.53.1-rc7) | -| [LLaMA-3.2-1B](./models/demos/llama3) | 1 | [n150](https://tenstorrent.com/hardware/wormhole) | 71 | 90.8 | 160 | 90.8 | [v0.53.1-rc7](https://github.com/tenstorrent/tt-metal/tree/v0.53.1-rc7) | -| [LLaMA-3.2-3B](./models/demos/llama3) | 1 | [n150](https://tenstorrent.com/hardware/wormhole) | 112 | 49.1 | 60 | 49.1 | [v0.53.1-rc7](https://github.com/tenstorrent/tt-metal/tree/v0.53.1-rc7) | -| [Falcon7B (DP=8)](./models/demos/t3000/falcon7b) | 256 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 97 | 14.6 | 26 | 3737.6 | [v0.53.0-rc44](https://github.com/tenstorrent/tt-metal/tree/v0.53.0-rc44) | -| [LLaMA-3.1-70B (TP=8)](./models/demos/t3000/llama3_70b) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 190 | 15.1 | 20 | 
483.2 | [v0.53.0-rc36](https://github.com/tenstorrent/tt-metal/tree/v0.53.0-rc36) | -| [Falcon40B (TP=8)](./models/demos/t3000/falcon40b) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | | 5.3 | 36 | 169.6 | [v0.53.1-rc7](https://github.com/tenstorrent/tt-metal/tree/v0.53.1-rc7) | -| [Mixtral7Bx8 (TP=8)](./models/demos/t3000/mixtral8x7b) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 230 | 14.6 | 33 | 467.2 | [v0.53.0-rc44](https://github.com/tenstorrent/tt-metal/tree/v0.53.0-rc44) | -| [Falcon7B (DP=32)](./models/demos/tg/falcon7b) | 1024 | [Galaxy](https://tenstorrent.com/hardware/galaxy) | 242 | 4.4 | 26 | 4505.6 | [v0.53.0-rc33](https://github.com/tenstorrent/tt-metal/tree/v0.53.0-rc33) | -| [LLaMA-3.1-70B (DP=4, TP=8)](./models/demos/t3000/llama3_70b) | 128 | [Galaxy](https://tenstorrent.com/hardware/galaxy) | 190 | 14.3 | 20 | 1835.5 | [v0.52.0-rc31](https://github.com/tenstorrent/tt-metal/tree/v0.52.0-rc31) | -> **Last Update:** December 2, 2024 +| Model | Batch | Hardware | ttft (ms) | t/s/u | Target
t/s/u | t/s | TT-Metalium Release | vLLM Tenstorrent Repo Release | +|---------------------------------------------------------------|-------|----------------------------------------------------------|-----------|-------|-----------------|--------|---------------------------------------------------|---------------------------------------------------------------------------------------------------| +| [Falcon 7B (decode only)](./models/demos/ttnn_falcon7b) | 32 | [e150](https://tenstorrent.com/hardware/grayskull) | | 4.2 | 4.4 | 134.4 | | | +| [Falcon 7B](./models/demos/wormhole/falcon7b) | 32 | [n150](https://tenstorrent.com/hardware/wormhole) | 71 | 17.6 | 26 | 563.2 | [v0.53.0-rc44](https://github.com/tenstorrent/tt-metal/tree/v0.53.0-rc44) | | +| [Mistral 7B](./models/demos/wormhole/mistral7b) | 32 | [n150](https://tenstorrent.com/hardware/wormhole) | | 9.9 | 25 | 316.8 | [v0.51.0-rc28](https://github.com/tenstorrent/tt-metal/tree/v0.51.0-rc28) | | +| [Mamba 2.8B](./models/demos/wormhole/mamba) | 32 | [n150](https://tenstorrent.com/hardware/wormhole) | 48 | 12.3 | 41 | 393.6 | [v0.51.0-rc26](https://github.com/tenstorrent/tt-metal/tree/v0.51.0-rc26) | | +| [Llama 3.1 8B](./models/demos/llama3) | 1 | [n150](https://tenstorrent.com/hardware/wormhole) | 202 | 28.6 | 23 | 28.6 | [v0.53.1-rc7](https://github.com/tenstorrent/tt-metal/tree/v0.53.1-rc7) | | +| [Llama 3.2 1B](./models/demos/llama3) | 1 | [n150](https://tenstorrent.com/hardware/wormhole) | 71 | 90.8 | 160 | 90.8 | [v0.53.1-rc7](https://github.com/tenstorrent/tt-metal/tree/v0.53.1-rc7) | | +| [Llama 3.2 3B](./models/demos/llama3) | 1 | [n150](https://tenstorrent.com/hardware/wormhole) | 112 | 49.1 | 60 | 49.1 | [v0.53.1-rc7](https://github.com/tenstorrent/tt-metal/tree/v0.53.1-rc7) | | +| [Falcon 7B (DP=8)](./models/demos/t3000/falcon7b) | 256 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 97 | 14.6 | 26 | 3737.6 | [v0.53.0-rc44](https://github.com/tenstorrent/tt-metal/tree/v0.53.0-rc44) | | 
+| [Llama 3.1 70B (TP=8)](./models/demos/t3000/llama3_70b) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 190 | 15.1 | 20 | 483.2 | [v0.53.0-rc36](https://github.com/tenstorrent/tt-metal/tree/v0.53.0-rc36) | [384f179](https://github.com/tenstorrent/vllm/tree/384f1790c3be16e1d1b10de07252be2e66d00935) | +| [Falcon 40B (TP=8)](./models/demos/t3000/falcon40b) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | | 5.3 | 36 | 169.6 | [v0.53.1-rc7](https://github.com/tenstorrent/tt-metal/tree/v0.53.1-rc7) | | +| [Mixtral 8x7B (TP=8)](./models/demos/t3000/mixtral8x7b) | 32 | [QuietBox](https://tenstorrent.com/hardware/tt-quietbox) | 230 | 14.6 | 33 | 467.2 | [v0.53.0-rc44](https://github.com/tenstorrent/tt-metal/tree/v0.53.0-rc44) | | +| [Falcon 7B (DP=32)](./models/demos/tg/falcon7b) | 1024 | [Galaxy](https://tenstorrent.com/hardware/galaxy) | 242 | 4.4 | 26 | 4505.6 | [v0.53.0-rc33](https://github.com/tenstorrent/tt-metal/tree/v0.53.0-rc33) | | +| [Llama 3.1 70B (DP=4, TP=8)](./models/demos/t3000/llama3_70b) | 128 | [Galaxy](https://tenstorrent.com/hardware/galaxy) | 190 | 14.3 | 20 | 1835.5 | [v0.52.0-rc31](https://github.com/tenstorrent/tt-metal/tree/v0.52.0-rc31) | | + +> **Last Update:** December 7, 2024 +> > **Notes:** +> +> - ttft = time to first token | t/s/u = tokens/second/user | t/s = tokens/second; where t/s = t/s/u * batch. > - TP = Tensor Parallel, DP = Data Parallel; Defines parallelization factors across multiple devices. > - The reported LLM performance is for an input sequence length (number of rows filled in the KV cache) of 128 for all models except Mamba (which can accept any sequence length). > - The t/s/u reported is the throughput of the first token generated after prefill, i.e. 1 / inter token latency. 
## CNNs + | Model | Batch | Hardware | fps | Target fps | Release | |-----------------------------------------------------------------------------|-------|----------------------------------------------------------|---------|------------|-------------| | [ResNet-50 (224x224)](./models/demos/grayskull/resnet50) | 20 | [e150](https://tenstorrent.com/hardware/grayskull) | 5,100 | 10,000 | | @@ -55,11 +60,11 @@ | [ViT (224x224)](./models/demos/grayskull/vit) | 9 | [e150](https://tenstorrent.com/hardware/grayskull) | 1,360 | 2,000 | | | [ViT (224x224)](./models/demos/wormhole/vit) | 8 | [n150](https://tenstorrent.com/hardware/wormhole) | 912 | 1,600 | | | [Stable Diffusion 1.4 (512x512)](./models/demos/wormhole/stable_diffusion) | 1 | [n150](https://tenstorrent.com/hardware/wormhole) | 0.167 | 0.3 | | -| [Yolo V4 (320x320)](./models/demos/yolov4) | 1 | [n150](https://tenstorrent.com/hardware/wormhole) | 95 | 300 | | -| [Segformer Semantic Segmentation (512x512)](./models/demos/segformer) | 1 | [n150](https://tenstorrent.com/hardware/wormhole) | 90 | 300 | | - +| [YOLOv4 (320x320)](./models/demos/yolov4) | 1 | [n150](https://tenstorrent.com/hardware/wormhole) | 95 | 300 | | +| [SegFormer Semantic Segmentation (512x512)](./models/demos/segformer) | 1 | [n150](https://tenstorrent.com/hardware/wormhole) | 90 | 300 | | ## NLPs + | Model | Batch | Hardware | sen/sec | Target sen/sec | Release | |-----------------------------------------------------|-------|----------------------------------------------------|---------|----------------|---------| | [BERT-Large](./models/demos/metal_BERT_large_11/) | 12 | [e150](https://tenstorrent.com/hardware/grayskull) | 370 | 410 | | @@ -68,9 +73,11 @@ | [Bloom](.models/demos/grayskull/functional_bloom) | | [e150](https://tenstorrent.com/hardware/grayskull) | 70 | | | ## Model Updates + For the latest model updates and features, please see [MODEL_UPDATES.md](models/MODEL_UPDATES.md) ## TT-NN Tech Reports + - [Advanced Performance 
Optimizations for Models](./tech_reports/AdvancedPerformanceOptimizationsForModels/AdvancedPerformanceOptimizationsForModels.md) (updated Dec 4th) - [Programming Mesh of Devices](./tech_reports/Programming%20Mesh%20of%20Devices/Programming%20Mesh%20of%20Devices%20with%20TT-NN.md) (updated Sept 9th) - [ViT Implementation in TT-NN on GS](./tech_reports/ViT-TTNN/vit.md) (updated Sept 22nd) @@ -78,8 +85,8 @@ For the latest model updates and features, please see [MODEL_UPDATES.md](models/ - [YOLOv4 Implementation in TT-NN on WH](./tech_reports/YoloV4-TTNN/yolov4.md) (updated November 8th) ## Benchmarks -- [Matrix Multiply FLOPS on WH](./tech_reports/GEMM_FLOPS/GEMM_FLOPS.md) (updated November 13th) +- [Matrix Multiply FLOPS on WH](./tech_reports/GEMM_FLOPS/GEMM_FLOPS.md) (updated November 13th) --- @@ -89,7 +96,6 @@ For the latest model updates and features, please see [MODEL_UPDATES.md](models/ **TT-Metalium** is our low-level programming model, enabling kernel development for Tenstorrent hardware. -

[Programming Guide](./METALIUM_GUIDE.md) | [API Reference](https://docs.tenstorrent.com/tt-metalium/latest/tt_metal/apis/index.html) @@ -102,6 +108,7 @@ For the latest model updates and features, please see [MODEL_UPDATES.md](models/ Get started with [simple kernels](https://docs.tenstorrent.com/tt-metalium/latest/tt_metal/examples/index.html). ## TT-Metalium Tech Reports + - [Matrix Engine](./tech_reports/matrix_engine/matrix_engine.md) (updated Sept 6th) - [Data Formats](./tech_reports/data_formats/data_formats.md) (updated Sept 7th) - [Reconfiguring Data Formats](./tech_reports/data_formats/reconfig_data_format.md) (updated Oct 17th) @@ -113,24 +120,36 @@ Get started with [simple kernels](https://docs.tenstorrent.com/tt-metalium/lates - [CNNs on TT Architectures](./tech_reports/CNNs/ttcnn.md) (updated Sept 6th) - [Ethernet and Multichip Basics](./tech_reports/EthernetMultichip/BasicEthernetGuide.md) (Updated Sept 20th) - [Collective Communication Library (CCL)](./tech_reports/EthernetMultichip/CclDeveloperGuide.md) (Updated Sept 20th) -- [Blackhole Bring-Up Prgramming Guide](./tech_reports/Blackhole/BlackholeBringUpProgrammingGuide.md) (Updated Oct 30th) +- [Blackhole Bring-Up Programming Guide](./tech_reports/Blackhole/BlackholeBringUpProgrammingGuide.md) (Updated Oct 30th) ## TT-Metalium Programming Examples + ### Hello World + - [Hello World! Compute Kernel](./tech_reports/prog_examples/hello_world_compute/hello_world_compute.md) - [Hello World! 
Data Movement Kernel](./tech_reports/prog_examples/hello_world_data_movement/hello_world_data_movement.md) + ### Add Integers + - [Add 2 Integers in Baby RiscV](./tech_reports/prog_examples/add_2_integers_in_riscv/add_2_integers_in_riscv.md) - [Add 2 Integers in Compute Kernel](./tech_reports/prog_examples/add_2_integers_in_compute/add_2_integers_in_compute.md) + ### Simple Tensor Manipulation + - [Sharding](./tech_reports/prog_examples/shard_data_rm/shard_data_rm.md) - [Padding](./tech_reports/prog_examples/pad_multi_core/pad_multi_core.md) + ### DRAM Data Movement + - [Dram Loopback Data Movement](./tech_reports/prog_examples/dram_loopback/dram_loopback.md) + ### Eltwise + - [Eltwise Unary OP in Vector Engine (SFPU)](./tech_reports/prog_examples/eltwise_sfpu/eltwise_sfpu.md) - [Eltwise Binary OP in Matrix Engine (FPU)](./tech_reports/prog_examples/eltwise_binary/eltwise_binary.md) + ### Matmul + - [Matmul OP on a Single_core](./tech_reports/prog_examples/matmul_single_core/matmul_single_core.md) - [Matmul OP on Multi_core (Basic)](./tech_reports/prog_examples/matmul_multi_core/matmul_multi_core.md) - [Matmul Multi_core Reuse (Optimized)](./tech_reports/prog_examples/matmul_multi_core_optimized/data_reuse.md) diff --git a/models/demos/t3000/llama3_70b/README.md b/models/demos/t3000/llama3_70b/README.md index 6555cd36dbfc..80f344040d41 100644 --- a/models/demos/t3000/llama3_70b/README.md +++ b/models/demos/t3000/llama3_70b/README.md @@ -1,32 +1,74 @@ -# Llama3-70B Demo +# Llama3/3.1-70B Demo + +## Table of Contents + +- [One command run](#one-command-run) +- [How to Run](#how-to-run) + - [Running the demo from TT-Metalium](#running-the-demo-from-tt-metalium) + - [Serving the model from vLLM](#serving-the-model-from-vllm) + +## One command run + +```bash +chmod +x ./models/demos/t3000/llama3_70b/setup_llama.sh && ./models/demos/t3000/llama3_70b/setup_llama.sh +``` + +Where, `TT_METAL_COMMIT_SHA_OR_TAG` and `TT_VLLM_COMMIT_SHA_OR_TAG` are found in the root 
[README](/README.md#llms) under "Release" version, respectively. + +Example: + +```bash +./models/demos/t3000/llama3_70b/setup_llama.sh llama-3.1-70b-instruct v0.53.0-rc36 384f1790c3be16e1d1b10de07252be2e66d00935 +``` + +Follow prompts as they come up in CLI to select appropriate weights for Llama 3.1 70B Instruct. + +Prerequisites: + +- Submit request to access weights from Meta: [Llama Downloads](https://www.llama.com/llama-downloads) +- Submit permissions on HuggingFace and have a HF personal access token: [Llama 3.1 70B Instruct](https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct) + +Steps run: + +- Setup environment +- Build `tt-metal` +- Download Llama 3.1 70B Instruct weights +- Install vLLM +- Deploy vLLM server ## How to Run -1. **Download the Llama3-70B weights from Meta (https://llama.meta.com/):** +Note: This guide requires the installation / build of `tt-metal`. Please refer to the [installation instructions](/INSTALLING.md) for the release corresponding to [README](/README.md#llms). + +1. **Download the Llama3/3.1-70B weights from Meta ():** 2. **Repack the weights:** + ```bash # This concatenates the sharded checkpoints and makes it easier for us to load. python models/demos/t3000/llama2_70b/scripts/repack_weights.py ``` + Note: Use `5` for `chunk_size`. Once the weights are repacked, move the `params.json` file from the `checkpoint_dir` to the `repacked_output_dir`. -### Running the Demo +### Running the demo from TT-Metalium After setting up the repacked weights and tokenizer, you can run the demo using the commands below: 1. **Prepare the weight cache directory:** + ```bash # Make a directory for us to cache weights into. This speeds up subsequent runs. mkdir ``` 2. 
**Set up environment variables:** + ```bash export LLAMA3_CKPT_DIR= - export LLAMA3_TOKENIZER_PATH= # Path needs to include the tokenizer.model file + export LLAMA3_TOKENIZER_PATH=/tokenizer.model # Path needs to include the tokenizer.model file export LLAMA3_CACHE_PATH= export WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml @@ -38,13 +80,11 @@ After setting up the repacked weights and tokenizer, you can run the demo using # export LLAMA3_CKPT_DIR="/home/llama-data-repacked/llama-3-70b/" # export LLAMA3_TOKENIZER_PATH="/home/llama-data-repacked/tokenizer.model" # export LLAMA3_CACHE_PATH="/home/llama-data-cache/weights-cache" - - ``` 3. **Run the demo:** - NOTE: Run the following comand twice. + Note: Run the following command twice. 1. The first run will cache the weights. This will take some time. 2. The second run will use the cached weights, thereby running much faster. @@ -58,31 +98,74 @@ After setting up the repacked weights and tokenizer, you can run the demo using The above demo does not achieve peak performance because we log outputs to the screen. The following perf test will print an accurate end-to-end throughput number. For best performance, ensure that tt-metal is built in release mode (default), and ensure the host's CPU frequency governors are set to `performance` -- instructions for setting the frequency governor vary by machine. This performance test runs with sequence length 128 and batch size 32. + ```bash pytest -svv models/demos/t3000/llama2_70b/tests/test_llama_perf_decode.py::test_Llama_perf_host[wormhole_b0-True-device_params0-gen128-llama3] ``` -## Details +#### Details Supported context lengths and batch sizes for the Llama3.1-70B demo are as follows: | Context Length | Max Batch Size | -|----------------|------------| -| 2k | 32 | -| 8k | 16 | -| 128k | 1 | +|----------------|----------------| +| 2k | 32 | +| 8k | 16 | +| 128k | 1 | - **Input File:** Uses `./demo/data/multi_prompt.json`. 
- **Model Configuration:** Utilizes a pretrained model. - **Hardware Requirements:** Runs on an 8-chip T3000 machine using tensor parallelism. The host machine must have at least 512 GB of memory. - **Demo arguments:** - - `context: [short_context, long_context, 128k_context]`: Select between short context (batch 32, sequence_length 2k) and long context (batch 16, sequence length 8k) and full context (batch 1, sequence length 128k) - - `ground_truth: [check_disabled, check_enabled]`: Enable or disable ground truth checking, used for testing - - `sampling: [greedy, sampling]`: Select between greedy decoding and top-k/top-p sampling - - `implementation: [tt-70b-T3000]`: Run the 70B model on the Tenstorrent backend - - `num_layers: [1L, 2L, 10L, 80L]`: Select 80L to run the full model - - `decode_only: [decode_only, prefill_decode]`: Use `prefill_decode`. Alternately, `decode_only` implements prefill via decode. - - `chat: [text_completion, chat_completion]`: Run in text_completion mode for the pretrained model or chat_completion for the finetuned model - - `llama_version: [llama3, llama2]`: Select the Llama3 model + - `context: [short_context, long_context, 128k_context]`: Select between short context (batch 32, sequence_length 2k) and long context (batch 16, sequence length 8k) and full context (batch 1, sequence length 128k) + - `ground_truth: [check_disabled, check_enabled]`: Enable or disable ground truth checking, used for testing + - `sampling: [greedy, sampling]`: Select between greedy decoding and top-k/top-p sampling + - `implementation: [tt-70b-T3000]`: Run the 70B model on the Tenstorrent backend + - `num_layers: [1L, 2L, 10L, 80L]`: Select 80L to run the full model + - `decode_only: [decode_only, prefill_decode]`: Use `prefill_decode`. Alternately, `decode_only` implements prefill via decode. 
+ - `chat: [text_completion, chat_completion]`: Run in text_completion mode for the pretrained model or chat_completion for the finetuned model + - `llama_version: [llama3, llama2]`: Select the Llama3 model Ensure you follow these guidelines to successfully run the Llama3-70B demo. + +### Serving the model from vLLM + +1. Complete Step 1 and Step 2 of [Running the Demo from TT-Metalium](#running-the-demo-from-tt-metalium) + +2. **Install vLLM** + + ```bash + # Installing from within `tt-metal` + export VLLM_TARGET_DEVICE="tt" + git clone https://github.com/tenstorrent/vllm.git + cd vllm + git checkout TT_VLLM_COMMIT_SHA_OR_TAG + pip install -e . + cd .. + ``` + + > **Note:** TT_VLLM_COMMIT_SHA_OR_TAG is the vLLM Release version from [README](/README.md#llms) + +3. **Running the server** + + ```bash + python vllm/examples/server_example_tt.py + ``` + +4. **Interact with server** + + In a separate terminal window, run: + + ```bash + curl http://localhost:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "meta-llama/Meta-Llama-3.1-70B", + "prompt": "Write a poem about RISC-V", + "max_tokens": 128, + "temperature": 1, + "top_p": 0.9, + "top_k": 10, + "stream": false + }' + ``` diff --git a/models/demos/t3000/llama3_70b/setup_llama.sh b/models/demos/t3000/llama3_70b/setup_llama.sh new file mode 100644 index 000000000000..636ce070b2b4 --- /dev/null +++ b/models/demos/t3000/llama3_70b/setup_llama.sh @@ -0,0 +1,250 @@ +#!/bin/bash +# SPDX-License-Identifier: Apache-2.0 +# +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC +# +# Purpose: Setup and deploy Llama 3.1 70B Instruct model with dependencies. + +set -euo pipefail + +# Function to display usage information +usage() { + cat < + +Description: + This script sets up and deploys the Llama model along with its dependencies. + +Arguments: + The type of model to deploy. 
Supported options: + - llama-3.1-70b-instruct + - llama-3.1-70b + - llama-3.1-8b-instruct + - llama-3.1-8b + - llama-3-70b-instruct + - llama-3-70b + - llama-3-8b-instruct + - llama-3-8b + The commit SHA or tag to use for TT_METAL. + The commit SHA or tag to use for vLLM. + +Options: + -h, --help Display this help message. + +Examples: + # Deploy the llama-3.1-70b-instruct model + $0 llama-3.1-70b-instruct main dev + + # Deploy with specific commit SHAs + $0 llama-3.1-70b-instruct v0.53.0-rc36 384f1790c3be16e1d1b10de07252be2e66d00935 + +EOF + exit 0 +} + +# helper +if [[ "$1" == "-h" || "$1" == "--help" ]]; then + usage +fi + +# Require commit SHA or tag for TT_METAL and vLLM +TT_METAL_COMMIT_SHA_OR_TAG=${2:-""} +TT_VLLM_COMMIT_SHA_OR_TAG=${3:-""} + +# Ensure required arguments are passed +if [[ -z "${TT_METAL_COMMIT_SHA_OR_TAG}" || -z "${TT_VLLM_COMMIT_SHA_OR_TAG}" ]]; then + echo "❌ Error: Both TT_METAL_COMMIT_SHA_OR_TAG and TT_VLLM_COMMIT_SHA_OR_TAG are required." + usage +fi + +# Defined variables +DEFAULT_PERSISTENT_VOLUME_ROOT=~/persistent_volume +DEFAULT_LLAMA_REPO=~/llama-models + +# functions +error_exit() { + echo "⛔ Error: $1" >&2 + exit 1 +} + +print_step() { + echo -e "\n👉 $1...\n" +} + +setup_model_environment() { + print_step "Setting up model environment for $1" + case "$1" in + "llama-3.1-70b-instruct") + MODEL="llama-3.1-70b-instruct" + META_MODEL_NAME="Meta-Llama-3.1-70B-Instruct" + META_DIR_FILTER="llama3_1" + REPACKED=1 + ;; + "llama-3.1-70b") + MODEL="llama-3.1-70b" + META_MODEL_NAME="Meta-Llama-3.1-70B" + META_DIR_FILTER="llama3_1" + REPACKED=1 + ;; + "llama-3.1-8b-instruct") + MODEL="llama-3.1-8b-instruct" + META_MODEL_NAME="Meta-Llama-3.1-8B-Instruct" + META_DIR_FILTER="llama3_1" + REPACKED=0 + ;; + "llama-3.1-8b") + MODEL_NAME="llama-3.1-8b" + META_MODEL_NAME="Meta-Llama-3.1-8B" + META_DIR_FILTER="llama3_1" + REPACKED=0 + ;; + "llama-3-70b-instruct") + MODEL="llama-3-70b-instruct" + META_MODEL_NAME="Meta-Llama-3-70B-Instruct" + 
META_DIR_FILTER="llama3" + REPACKED=1 + ;; + "llama-3-70b") + MODEL="llama-3-70b" + META_MODEL_NAME="Meta-Llama-3-70B" + META_DIR_FILTER="llama3" + REPACKED=1 + ;; + "llama-3-8b-instruct") + MODEL="llama-3-8b-instruct" + META_MODEL_NAME="Meta-Llama-3-8B-Instruct" + META_DIR_FILTER="llama3" + REPACKED=0 + ;; + "llama-3-8b") + MODEL="llama-3-8b" + META_MODEL_NAME="Meta-Llama-3-8B" + META_DIR_FILTER="llama3" + REPACKED=0 + ;; + *) + echo "⛔ Invalid model choice." + usage + exit 1 + ;; + esac + + if [ "${REPACKED}" -eq 1 ]; then + echo "REPACKED is enabled." + REPACKED_STR="repacked-" + else + echo "REPACKED is disabled." + REPACKED_STR="" + fi +} + +setup_environment() { + print_step "Setting up environment" + export LLAMA3_CKPT_DIR="${DEFAULT_PERSISTENT_VOLUME_ROOT}/model_weights/${REPACKED_STR}${MODEL}" + export LLAMA3_TOKENIZER_PATH="${LLAMA3_CKPT_DIR}/tokenizer.model" + export LLAMA3_CACHE_PATH="${DEFAULT_PERSISTENT_VOLUME_ROOT}/tt_metal_cache/cache_${REPACKED_STR}${MODEL}" + export ARCH_NAME=wormhole_b0 + export TT_METAL_HOME=$(pwd) + export PYTHONPATH=$(pwd) + echo "Environment variables set." +} + +check_and_build_tt_metal() { + print_step "Checking and building tt-metal" + pushd "${TT_METAL_HOME}" >/dev/null + if [[ ! -d "python_env" ]]; then + git checkout "${TT_METAL_COMMIT_SHA_OR_TAG}" + git submodule update --init --recursive + git submodule foreach 'git lfs fetch --all && git lfs pull' + ./build_metal.sh + ./create_venv.sh + source python_env/bin/activate + pip install -r models/demos/t3000/llama2_70b/reference/llama/requirements.txt + else + echo "🔔 tt-metal Python environment already exists. Skipping build." + source python_env/bin/activate + fi + popd >/dev/null +} + +clone_repo() { + local REPO_PATH=$1 + local REPO_URL=$2 + local COMMIT_HASH=$3 + + print_step "Cloning Llama repository" + if [[ ! 
-d "${REPO_PATH}" ]]; then + git clone "${REPO_URL}" "${REPO_PATH}" + pushd "${REPO_PATH}" >/dev/null + git checkout "${COMMIT_HASH}" + popd >/dev/null + else + echo "🔔 Repository already exists at ${REPO_PATH}, skipping clone." + fi +} + +setup_weights() { + print_step "Setting up weights" + local LLAMA_REPO=$1 + local LLAMA_DIR="${LLAMA_REPO}/models/${META_DIR_FILTER}" + local LLAMA_WEIGHTS_DIR="${LLAMA_DIR}/${META_MODEL_NAME}" + local WEIGHTS_DIR="${LLAMA3_CKPT_DIR}" + + mkdir -p "${WEIGHTS_DIR}" "${LLAMA3_CACHE_PATH}" + + if [[ -d "${LLAMA_WEIGHTS_DIR}" && -n "$(ls -A "${LLAMA_WEIGHTS_DIR}")" ]]; then + echo "Weights already downloaded in ${LLAMA_WEIGHTS_DIR}" + else + print_step "Downloading weights" + pushd "${LLAMA_DIR}" >/dev/null + [[ -x "./download.sh" ]] && ./download.sh || error_exit "Download script not found!" + popd >/dev/null + fi + + huggingface-cli login + + if [ "${REPACKED}" -eq 1 ]; then + print_step "Repacking weights" + source python_env/bin/activate + cp "${LLAMA_WEIGHTS_DIR}/tokenizer.model" "${WEIGHTS_DIR}/tokenizer.model" + cp "${LLAMA_WEIGHTS_DIR}/params.json" "${WEIGHTS_DIR}/params.json" + python models/demos/t3000/llama2_70b/scripts/repack_weights.py "${LLAMA_WEIGHTS_DIR}" "${WEIGHTS_DIR}" 5 + else + cp -rf "${LLAMA_WEIGHTS_DIR}" "${WEIGHTS_DIR}" + fi + + echo "🔔 Using weights directory ${WEIGHTS_DIR}" +} + +install_vllm() { + print_step "Installing vLLM" + if [[ ! -d "vllm" ]]; then + source python_env/bin/activate + export VLLM_TARGET_DEVICE="tt" + git clone https://github.com/tenstorrent/vllm.git + pushd vllm >/dev/null + git checkout "${TT_VLLM_COMMIT_SHA_OR_TAG}" + pip install -e . + popd >/dev/null + else + echo "🔔 vLLM already installed. Skipping install." + fi +} + +deploy_server() { + print_step "Deploying Llama server" + source python_env/bin/activate + export WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml + python vllm/examples/server_example_tt.py + echo "✅ Deployment complete! Interact via http://localhost:8000." 
+} + +# ---- MAIN ---- +MODEL_TYPE=$1 +setup_model_environment "$MODEL_TYPE" +setup_environment +check_and_build_tt_metal +clone_repo "${DEFAULT_LLAMA_REPO}" "https://github.com/meta-llama/llama-models.git" "685ac4c107c75ce8c291248710bf990a876e1623" +setup_weights "${DEFAULT_LLAMA_REPO}" +install_vllm +deploy_server From e3526deadbcac2c7f1d4c4719635050f4f85e99d Mon Sep 17 00:00:00 2001 From: Oleg Milyutin Date: Mon, 9 Dec 2024 17:36:58 -0500 Subject: [PATCH 20/59] #15061: Refactor utilities related to Mesh infra (#15757) ### Ticket #15061 ### What's changed * Refactor `DistributedTensorConfig` in it's own header * Use typed `struct` to represent `MeshShape` and `MeshOffset` ### Checklist - [X] [Post commit CI passes](https://github.com/tenstorrent/tt-metal/actions/runs/12210236362) - [X] New/Existing tests provide coverage for changes --- conftest.py | 2 +- .../tt_metal/distributed/test_distributed.cpp | 2 +- .../unit_tests/gtests/test_ccl_on_galaxy.cpp | 2 +- .../sources/ttml/core/distributed_mapping.hpp | 20 +++---- tt-train/tests/core/distributed_test.cpp | 2 +- tt_metal/distributed/mesh_device.cpp | 48 ++++++++-------- tt_metal/distributed/mesh_device.hpp | 5 +- tt_metal/distributed/mesh_device_view.cpp | 4 +- tt_metal/distributed/mesh_device_view.hpp | 5 +- ttnn/CMakeLists.txt | 1 + ttnn/cpp/ttnn/distributed/api.cpp | 50 ++++++++-------- ttnn/cpp/ttnn/distributed/api.hpp | 10 +++- .../ttnn/distributed/distributed_pybind.cpp | 48 ++++++++++++++-- .../distributed/distributed_tensor_config.cpp | 57 +++++++++++++++++++ .../distributed/distributed_tensor_config.hpp | 43 ++++++++++++++ ttnn/cpp/ttnn/distributed/types.hpp | 2 + ttnn/cpp/ttnn/tensor/tensor.cpp | 2 +- ttnn/cpp/ttnn/tensor/tensor.hpp | 1 + ttnn/cpp/ttnn/tensor/tensor_ops.cpp | 8 +-- ttnn/cpp/ttnn/tensor/tensor_utils.hpp | 35 ------------ ttnn/cpp/ttnn/tensor/types.cpp | 52 +---------------- ttnn/cpp/ttnn/tensor/types.hpp | 26 +-------- ttnn/ttnn/__init__.py | 1 + ttnn/ttnn/distributed/distributed.py | 9 
++- ttnn/ttnn/types.py | 15 +---- 25 files changed, 245 insertions(+), 205 deletions(-) create mode 100644 ttnn/cpp/ttnn/distributed/distributed_tensor_config.cpp create mode 100644 ttnn/cpp/ttnn/distributed/distributed_tensor_config.hpp diff --git a/conftest.py b/conftest.py index f1a753ca2432..6e43d1a64999 100644 --- a/conftest.py +++ b/conftest.py @@ -257,7 +257,7 @@ def pcie_mesh_device(request, silicon_arch_name, silicon_arch_wormhole_b0, devic mesh_shape=ttnn.MeshShape(2, 2), dispatch_core_config=dispatch_core_config, **device_params, - offset=(0, 1), + offset=ttnn.MeshOffset(0, 1), mesh_type=ttnn.MeshType.Ring, ) diff --git a/tests/tt_metal/distributed/test_distributed.cpp b/tests/tt_metal/distributed/test_distributed.cpp index 26df7dbcc788..45afeece3be5 100644 --- a/tests/tt_metal/distributed/test_distributed.cpp +++ b/tests/tt_metal/distributed/test_distributed.cpp @@ -46,7 +46,7 @@ TEST(MeshDeviceSuite, Test1x1SystemMeshInitialize) { auto& sys = tt::tt_metal::distributed::SystemMesh::instance(); auto config = - tt::tt_metal::distributed::MeshDeviceConfig({1, 1}, std::pair(0, 0), {}, MeshType::RowMajor); + tt::tt_metal::distributed::MeshDeviceConfig(MeshShape(1, 1), MeshOffset(0, 0), {}, MeshType::RowMajor); EXPECT_NO_THROW({ auto mesh = tt::tt_metal::distributed::MeshDevice::create( diff --git a/tests/ttnn/unit_tests/gtests/test_ccl_on_galaxy.cpp b/tests/ttnn/unit_tests/gtests/test_ccl_on_galaxy.cpp index 6ab86fc8c8e3..abecf9445cef 100644 --- a/tests/ttnn/unit_tests/gtests/test_ccl_on_galaxy.cpp +++ b/tests/ttnn/unit_tests/gtests/test_ccl_on_galaxy.cpp @@ -216,7 +216,7 @@ TEST(GalaxyTests, TestReduceScatterDeadlock) { auto view = ttnn::MeshDeviceView(*mesh); std::vector ring_devices = view.get_devices_on_row(0); // Tunnel 0 std::vector ring_devices_1 = - view.get_devices_on_column(mesh_shape.second - 1); // Orthogonal to tunnel .. no deadlocks + view.get_devices_on_column(mesh_shape.num_cols - 1); // Orthogonal to tunnel .. 
no deadlocks ring_devices_1 = std::vector(ring_devices_1.begin() + 1, ring_devices_1.end()); std::vector ring_devices_2 = view.get_devices_on_row(7); // Tunnel 7 .. potential deadlocks with lack of buffering diff --git a/tt-train/sources/ttml/core/distributed_mapping.hpp b/tt-train/sources/ttml/core/distributed_mapping.hpp index 102240e51e23..d40644486da7 100644 --- a/tt-train/sources/ttml/core/distributed_mapping.hpp +++ b/tt-train/sources/ttml/core/distributed_mapping.hpp @@ -74,7 +74,7 @@ class XTensorToMesh { tt::tt_metal::distributed::MeshShape m_mesh_shape; size_t get_num_devices() const { - return m_mesh_shape.first * m_mesh_shape.second; + return m_mesh_shape.num_rows * m_mesh_shape.num_cols; } }; @@ -130,8 +130,8 @@ class ShardTensor2dMesh : public XTensorToMesh, T> { throw std::invalid_argument("ShardTensor2dMesh requires at least one dimension to shard"); } - int rows = Base::m_mesh_shape.first; - int cols = Base::m_mesh_shape.second; + int rows = Base::m_mesh_shape.num_rows; + int cols = Base::m_mesh_shape.num_cols; auto row_dim = m_dims.first; auto col_dim = m_dims.second; @@ -178,8 +178,8 @@ class ShardTensor2dMesh : public XTensorToMesh, T> { std::unordered_map config_impl() const { return { {"strategy", "shard_2d"}, - {"mesh_shape_y", std::to_string(Base::m_mesh_shape.first)}, - {"mesh_shape_x", std::to_string(Base::m_mesh_shape.second)}}; + {"mesh_shape_y", std::to_string(Base::m_mesh_shape.num_rows)}, + {"mesh_shape_x", std::to_string(Base::m_mesh_shape.num_cols)}}; } private: @@ -193,16 +193,16 @@ class ConcatMesh2dToTensor : public MeshToXTensor, T> { ConcatMesh2dToTensor( tt::tt_metal::distributed::MeshShape mesh_shape, const tt::tt_metal::distributed::MeshShape& dims) : Base(std::move(mesh_shape)), m_dims(dims) { - if (m_dims.first == m_dims.second) { + if (m_dims.num_rows == m_dims.num_cols) { throw std::invalid_argument("Dimensions in 'dims' must be different"); } } std::vector> compose_impl(const std::vector>& tensors) const { - int rows = 
Base::m_mesh_shape.first; - int cols = Base::m_mesh_shape.second; - size_t row_dim = m_dims.first; - size_t col_dim = m_dims.second; + int rows = Base::m_mesh_shape.num_rows; + int cols = Base::m_mesh_shape.num_cols; + size_t row_dim = m_dims.num_rows; + size_t col_dim = m_dims.num_cols; std::vector> row_concatenated; row_concatenated.reserve(static_cast(rows)); diff --git a/tt-train/tests/core/distributed_test.cpp b/tt-train/tests/core/distributed_test.cpp index e273aaa4973d..0f304788ca38 100644 --- a/tt-train/tests/core/distributed_test.cpp +++ b/tt-train/tests/core/distributed_test.cpp @@ -83,7 +83,7 @@ TYPED_TEST(MeshOpsTest, ShardTensor2dMeshTwoDimSharding) { TYPED_TEST(MeshOpsTest, ReplicateXTensorToMeshReplication) { tt::tt_metal::distributed::MeshShape mesh_shape = {2, 2}; - int num_devices = mesh_shape.first * mesh_shape.second; // 4 + int num_devices = mesh_shape.num_rows * mesh_shape.num_cols; // 4 auto tensor = xt::arange(4); // [0,1,2,3] diff --git a/tt_metal/distributed/mesh_device.cpp b/tt_metal/distributed/mesh_device.cpp index f2f2d0d9e634..dc0275cd26a2 100644 --- a/tt_metal/distributed/mesh_device.cpp +++ b/tt_metal/distributed/mesh_device.cpp @@ -105,7 +105,7 @@ MeshShape SystemMesh::Impl::get_system_mesh_shape(size_t system_num_devices) { TT_FATAL( system_mesh_to_shape.contains(system_num_devices), "Unsupported number of devices: {}", system_num_devices); auto shape = system_mesh_to_shape.at(system_num_devices); - log_debug(LogMetal, "Logical SystemMesh Shape: {}x{}", shape.first, shape.second); + log_debug(LogMetal, "Logical SystemMesh Shape: {}x{}", shape.num_rows, shape.num_cols); return shape; } @@ -293,32 +293,32 @@ std::shared_ptr MeshDevice::create( std::shared_ptr MeshDevice::create_submesh( const MeshShape& submesh_shape, const MeshOffset& offset, MeshType type) { - if (submesh_shape.first <= 0 || submesh_shape.second <= 0) { + if (submesh_shape.num_rows <= 0 || submesh_shape.num_cols <= 0) { TT_THROW( "Invalid submesh shape: ({}, {}). 
Both dimensions must be positive.", - submesh_shape.first, - submesh_shape.second); + submesh_shape.num_rows, + submesh_shape.num_cols); } - if (offset.first < 0 || offset.second < 0) { - TT_THROW("Invalid offset: ({}, {}). Offset must be non-negative.", offset.first, offset.second); + if (offset.row < 0 || offset.col < 0) { + TT_THROW("Invalid offset: ({}, {}). Offset must be non-negative.", offset.row, offset.col); } - if (offset.first + submesh_shape.first > this->mesh_device_shape.first || - offset.second + submesh_shape.second > this->mesh_device_shape.second) { + if (offset.row + submesh_shape.num_rows > this->mesh_device_shape.num_rows || + offset.col + submesh_shape.num_cols > this->mesh_device_shape.num_cols) { TT_THROW( "Submesh ({}x{}) with offset ({}, {}) does not fit within parent mesh ({}x{}).", - submesh_shape.first, - submesh_shape.second, - offset.first, - offset.second, - this->mesh_device_shape.first, - this->mesh_device_shape.second); + submesh_shape.num_rows, + submesh_shape.num_cols, + offset.row, + offset.col, + this->mesh_device_shape.num_rows, + this->mesh_device_shape.num_cols); } auto submesh = std::make_shared(submesh_shape, type, shared_from_this()); - auto start_coordinate = Coordinate{offset.first, offset.second}; - auto end_coordinate = Coordinate{offset.first + submesh_shape.first - 1, offset.second + submesh_shape.second - 1}; + auto start_coordinate = Coordinate{offset.row, offset.col}; + auto end_coordinate = Coordinate{offset.row + submesh_shape.num_rows - 1, offset.col + submesh_shape.num_cols - 1}; submesh->primary_view = std::make_shared(*this, start_coordinate, end_coordinate); submesh->devices = submesh->primary_view->get_devices(); SystemMesh::instance().register_mesh_device(submesh, submesh->devices); @@ -327,10 +327,10 @@ std::shared_ptr MeshDevice::create_submesh( LogMetal, "Instantiating submesh {}: {}x{} with offset: {} {}", submesh->get_mesh_id(), - submesh_shape.first, - submesh_shape.second, - offset.first, - 
offset.second); + submesh_shape.num_rows, + submesh_shape.num_cols, + offset.row, + offset.col); log_trace(LogMetal, "Submesh {} instantiated with {} devices", submesh->get_mesh_id(), submesh->devices); return submesh; @@ -338,8 +338,8 @@ std::shared_ptr MeshDevice::create_submesh( std::vector> MeshDevice::create_submeshes(const MeshShape& submesh_shape, MeshType type) { std::vector> submeshes; - for (int row = 0; row < this->num_rows(); row += submesh_shape.first) { - for (int col = 0; col < this->num_cols(); col += submesh_shape.second) { + for (int row = 0; row < this->num_rows(); row += submesh_shape.num_rows) { + for (int col = 0; col < this->num_cols(); col += submesh_shape.num_cols) { auto submesh = this->create_submesh(submesh_shape, MeshOffset{row, col}, type); submeshes.push_back(submesh); } @@ -413,9 +413,9 @@ CoreCoord MeshDevice::dram_grid_size() const { return this->reference_device()-> tt::ARCH MeshDevice::arch() const { return this->reference_device()->arch(); } -size_t MeshDevice::num_rows() const { return this->mesh_device_shape.first; } +size_t MeshDevice::num_rows() const { return this->mesh_device_shape.num_rows; } -size_t MeshDevice::num_cols() const { return this->mesh_device_shape.second; } +size_t MeshDevice::num_cols() const { return this->mesh_device_shape.num_cols; } MeshShape MeshDevice::shape() const { return this->mesh_device_shape; } diff --git a/tt_metal/distributed/mesh_device.hpp b/tt_metal/distributed/mesh_device.hpp index f4370cb7c58d..e7a0ab22db89 100644 --- a/tt_metal/distributed/mesh_device.hpp +++ b/tt_metal/distributed/mesh_device.hpp @@ -18,7 +18,10 @@ namespace tt::tt_metal::distributed { using DeviceIds = std::vector; using MeshDeviceID = size_t; -using MeshOffset = std::pair; +struct MeshOffset { + size_t row = 0; + size_t col = 0; +}; class MeshDeviceView; struct MeshSubDeviceManagerId; diff --git a/tt_metal/distributed/mesh_device_view.cpp b/tt_metal/distributed/mesh_device_view.cpp index b0c2ef050bea..f9e115f0437f 
100644 --- a/tt_metal/distributed/mesh_device_view.cpp +++ b/tt_metal/distributed/mesh_device_view.cpp @@ -83,7 +83,7 @@ MeshDeviceView::DeviceView MeshDeviceView::get_devices(const Coordinate& start, } MeshDeviceView::DeviceView MeshDeviceView::get_devices(const MeshShape& shape) { - return get_devices({0, 0}, {shape.first - 1, shape.second - 1}); + return get_devices({0, 0}, {shape.num_rows - 1, shape.num_cols - 1}); } std::vector MeshDeviceView::get_devices_on_row(size_t row) const { @@ -128,7 +128,7 @@ bool MeshDeviceView::empty() const noexcept { return devices_.empty(); } size_t MeshDeviceView::size() const noexcept { return devices_.size(); } -std::pair MeshDeviceView::shape() const noexcept { return {num_rows(), num_cols()}; } +MeshShape MeshDeviceView::shape() const noexcept { return {num_rows(), num_cols()}; } bool MeshDeviceView::contains(const Coordinate& coord) const noexcept { return coord.row >= top_left_.row && coord.row <= bottom_right_.row && coord.col >= top_left_.col && diff --git a/tt_metal/distributed/mesh_device_view.hpp b/tt_metal/distributed/mesh_device_view.hpp index 67bda684ebbb..31af7aba3764 100644 --- a/tt_metal/distributed/mesh_device_view.hpp +++ b/tt_metal/distributed/mesh_device_view.hpp @@ -17,7 +17,10 @@ namespace tt::tt_metal::distributed { // Forward declaration of MeshDevice class MeshDevice; -using MeshShape = std::pair; +struct MeshShape { + size_t num_rows = 0; + size_t num_cols = 0; +}; struct Coordinate { size_t row; diff --git a/ttnn/CMakeLists.txt b/ttnn/CMakeLists.txt index 26c658482223..fae0eddc6790 100644 --- a/ttnn/CMakeLists.txt +++ b/ttnn/CMakeLists.txt @@ -8,6 +8,7 @@ set(ALL_TTNN_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/global_semaphore.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/run_operation.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/distributed/api.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/distributed/distributed_tensor_config.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/distributed/distributed_pybind.cpp 
${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/graph/graph_processor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/graph/graph_trace_utils.cpp diff --git a/ttnn/cpp/ttnn/distributed/api.cpp b/ttnn/cpp/ttnn/distributed/api.cpp index e4ab3a5ece1a..14aa7085ff54 100644 --- a/ttnn/cpp/ttnn/distributed/api.cpp +++ b/ttnn/cpp/ttnn/distributed/api.cpp @@ -6,8 +6,10 @@ #include +#include "tt_metal/tt_stl/overloaded.hpp" #include "ttnn/tensor/tensor.hpp" #include "ttnn/tensor/tensor_utils.hpp" +#include "ttnn/distributed/distributed_tensor_config.hpp" #include "tt_metal/distributed/mesh_device.hpp" using namespace tt::tt_metal; @@ -21,7 +23,7 @@ std::shared_ptr open_mesh_device( size_t num_command_queues, const DispatchCoreConfig& dispatch_core_config, MeshType mesh_type, - const std::pair& offset, + const MeshOffset& offset, const std::vector& physical_device_ids) { auto config = MeshDeviceConfig(mesh_shape, offset, physical_device_ids, mesh_type); return MeshDevice::create(config, l1_small_size, trace_region_size, num_command_queues, dispatch_core_config); @@ -58,18 +60,20 @@ std::vector get_device_tensors(const ttnn::Tensor& tensor) { TT_THROW("Expected tensor to be on MultiDeviceHostStorage type!"); } -Tensor aggregate_as_tensor(std::vector& tensor_shards) { +Tensor aggregate_as_tensor( + const std::vector& tensor_shards, const tt::tt_metal::DistributedTensorConfig& config) { TT_ASSERT(tensor_shards.size() > 0, "At least one tensor shard must be provided"); + const auto& reference_shard = tensor_shards.at(0); for (const auto& shard : tensor_shards) { - if (shard.storage_type() != tensor_shards.at(0).storage_type()) { + if (shard.storage_type() != reference_shard.storage_type()) { TT_THROW("All tensor shards must have the same storage type"); } } // Based whether the first tensor shard has OwnedBuffer or Device buffer, // we want to use MultiDeviceHostStorage or MultiDeviceStorage - StorageType storage_type = tensor_shards.at(0).storage_type(); - Tile tile = 
tensor_shards.at(0).get_tensor_spec().tile(); + StorageType storage_type = reference_shard.storage_type(); + Tile tile = reference_shard.get_tensor_spec().tile(); if (storage_type == StorageType::OWNED) { std::vector shapes; std::vector host_owned_buffers; @@ -81,7 +85,7 @@ Tensor aggregate_as_tensor(std::vector& tensor_shards) { TT_THROW( "Error aggregating multichip tensors: Attempting to aggregate tensors with different tiling " "configurations. Device {} has tiling ({}x{}) while device {} has tiling {}x{}.", - tensor_shards.at(0).device()->id(), + reference_shard.device()->id(), tile.get_height(), tile.get_width(), shard.device()->id(), @@ -89,12 +93,12 @@ Tensor aggregate_as_tensor(std::vector& tensor_shards) { shard_tile.get_width()); } } - auto storage = MultiDeviceHostStorage{AllGatherTensor(), std::move(host_owned_buffers), shapes}; + auto storage = MultiDeviceHostStorage{config, std::move(host_owned_buffers), shapes}; return Tensor( std::move(storage), - tensor_shards.at(0).get_legacy_shape(), - tensor_shards.at(0).get_dtype(), - tensor_shards.at(0).get_layout(), + reference_shard.get_legacy_shape(), + reference_shard.get_dtype(), + reference_shard.get_layout(), tile); } else { std::vector ordered_device_ids; @@ -111,7 +115,7 @@ Tensor aggregate_as_tensor(std::vector& tensor_shards) { TT_THROW( "Error aggregating multichip tensors: Attempting to aggregate tensors with different tiling " "configurations. 
Device {} has tiling ({}x{}) while device {} has tiling {}x{}.", - tensor_shards.at(0).device()->id(), + reference_shard.device()->id(), tile.get_height(), tile.get_width(), shard.device()->id(), @@ -119,12 +123,12 @@ Tensor aggregate_as_tensor(std::vector& tensor_shards) { shard_tile.get_width()); } } - auto storage = MultiDeviceStorage{AllGatherTensor(), ordered_device_ids, std::move(device_buffers), shapes}; + auto storage = MultiDeviceStorage{config, ordered_device_ids, std::move(device_buffers), shapes}; return Tensor( std::move(storage), - tensor_shards.at(0).get_legacy_shape(), - tensor_shards.at(0).get_dtype(), - tensor_shards.at(0).get_layout(), + reference_shard.get_legacy_shape(), + reference_shard.get_dtype(), + reference_shard.get_layout(), tile); } } @@ -140,7 +144,7 @@ std::vector get_t3k_physical_device_ids_ring() { return physical_device_ids; } -std::vector distribute_tensor_to_mesh(const Tensor& tensor, MeshDevice& mesh_device) { +std::vector get_mapped_devices(const Tensor& tensor, MeshDevice& mesh_device) { // For multi-device tensors, returns the number of workers capped by the number of buffers // Otherwise, returns all available workes from mesh_device. 
auto get_workers_for_tensor = [&tensor, &mesh_device]() { @@ -151,19 +155,15 @@ std::vector distribute_tensor_to_mesh(const Tensor& tensor, MeshDevice& } return workers; }; - if (mesh_device.get_view() != nullptr and std::holds_alternative(tensor.get_storage())) { const auto& host_storage = std::get(tensor.get_storage()); return std::visit( - [&](const auto& strategy) { - using StrategyType = std::decay_t; - if constexpr (std::is_same_v) { - return mesh_device.get_view()->get_devices(strategy.shard_mesh); - } else { - return get_workers_for_tensor(); - } - }, + tt::stl::overloaded{ + [&](const ShardTensor2D& s) { + return mesh_device.get_view()->get_devices(MeshShape{s.shard_mesh.y, s.shard_mesh.x}); + }, + [&](const auto&) { return get_workers_for_tensor(); }}, host_storage.strategy); } else if (std::holds_alternative(tensor.get_storage())) { return tensor.workers; diff --git a/ttnn/cpp/ttnn/distributed/api.hpp b/ttnn/cpp/ttnn/distributed/api.hpp index cfdd86bba501..23a914a02c92 100644 --- a/ttnn/cpp/ttnn/distributed/api.hpp +++ b/ttnn/cpp/ttnn/distributed/api.hpp @@ -7,6 +7,7 @@ #include #include "ttnn/tensor/tensor.hpp" +#include "ttnn/distributed/distributed_tensor_config.hpp" #include "ttnn/distributed/types.hpp" namespace ttnn::distributed::api { @@ -18,19 +19,22 @@ std::shared_ptr open_mesh_device( size_t num_command_queues, const tt::tt_metal::DispatchCoreConfig& dispatch_core_config, MeshType mesh_type = MeshType::RowMajor, - const std::pair& offset = std::pair(0, 0), + const MeshOffset& offset = MeshOffset(0, 0), const std::vector& physical_device_ids = {}); void close_mesh_device(const std::shared_ptr& mesh_device); +// Given a multi-device tensor, returns a list of individual per-device tensors. std::vector get_device_tensors(const ttnn::Tensor& tensor); -Tensor aggregate_as_tensor(std::vector& tensor_shards); +// Given a list of per-device shards, returns multi-device tensor. 
+Tensor aggregate_as_tensor( + const std::vector& tensor_shards, const tt::tt_metal::DistributedTensorConfig& config); std::vector get_t3k_physical_device_ids_ring(); // Maps a tensor to the set of devices in the device-mesh that the shards will be distributed across. -std::vector distribute_tensor_to_mesh(const Tensor& tensor, MeshDevice& mesh_device); +std::vector get_mapped_devices(const Tensor& tensor, MeshDevice& mesh_device); // Get the distributed tensor config from a tensor. tt::tt_metal::DistributedTensorConfig get_distributed_tensor_config_from_tensor(const Tensor& tensor); diff --git a/ttnn/cpp/ttnn/distributed/distributed_pybind.cpp b/ttnn/cpp/ttnn/distributed/distributed_pybind.cpp index ec7e8e4691f7..43ac6aa3574b 100644 --- a/ttnn/cpp/ttnn/distributed/distributed_pybind.cpp +++ b/ttnn/cpp/ttnn/distributed/distributed_pybind.cpp @@ -3,6 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 #include "ttnn/distributed/distributed_pybind.hpp" +#include #include "ttnn/distributed/api.hpp" #include "ttnn/tensor/tensor_utils.hpp" @@ -20,6 +21,8 @@ namespace py = pybind11; void py_module_types(py::module& module) { py::class_>(module, "MeshDevice"); py::class_(module, "MeshSubDeviceManagerId"); + py::class_(module, "MeshShape", "Struct representing the shape of a mesh device."); + py::class_(module, "MeshOffset", "Struct representing the offset of a mesh device."); } void py_module(py::module& module) { @@ -28,6 +31,37 @@ void py_module(py::module& module) { .value("Ring", MeshType::Ring) .value("Line", MeshType::Line) .export_values(); + + static_cast>(module.attr("MeshShape")) + .def( + py::init([](size_t num_rows, size_t num_cols) { return MeshShape(num_rows, num_cols); }), + "Constructor with specified number of rows and columns.", + py::arg("num_rows"), + py::arg("num_cols")) + .def_readwrite("num_rows", &MeshShape::num_rows, "Number of rows in the mesh.") + .def_readwrite("num_cols", &MeshShape::num_cols, "Number of columns in the mesh.") + .def( + "__repr__", 
+ [](const MeshShape& ms) { + return ""; + }) + .def("__iter__", [](const MeshShape& ms) { return py::iter(py::make_tuple(ms.num_rows, ms.num_cols)); }); + static_cast>(module.attr("MeshOffset")) + .def( + py::init([](size_t row, size_t col) { return MeshOffset(row, col); }), + "Constructor with specified row and column offsets.", + py::arg("row"), + py::arg("col")) + .def_readwrite("row", &MeshOffset::row, "Row offset in the mesh.") + .def_readwrite("col", &MeshOffset::col, "Column offset in the mesh.") + .def( + "__repr__", + [](const MeshOffset& mo) { + return ""; + }) + .def("__iter__", [](const MeshOffset& mo) { return py::iter(py::make_tuple(mo.row, mo.col)); }); + auto py_mesh_device = static_cast>>(module.attr("MeshDevice")); py_mesh_device .def( @@ -36,7 +70,7 @@ void py_module(py::module& module) { size_t trace_region_size, size_t num_command_queues, const DispatchCoreConfig& dispatch_core_config, - const std::pair& offset, + const MeshOffset& offset, const std::vector& physical_device_ids, MeshType mesh_type) { return MeshDevice::create( @@ -134,7 +168,10 @@ void py_module(py::module& module) { R"doc( Disable program cache across all devices in the mesh. )doc") - .def_property_readonly("shape", &MeshDevice::shape, R"doc( + .def_property_readonly( + "shape", + &MeshDevice::shape, + R"doc( Get the shape of the device mesh. Returns: @@ -193,7 +230,6 @@ void py_module(py::module& module) { py::arg("l1_small_size"), py::arg("trace_region_size"), py::arg("num_command_queues"), - py::arg("offset"), py::arg("physical_device_ids"), py::arg("mesh_type"), @@ -233,7 +269,11 @@ void py_module(py::module& module) { Tensor: The shard of the tensor corresponding to the device. 
)doc"); module.def("get_device_tensors", &get_device_tensors, py::arg("tensor"), py::kw_only()); - module.def("aggregate_as_tensor", &aggregate_as_tensor, py::arg("tensors"), py::kw_only()); + module.def( + "aggregate_as_tensor", + [](const std::vector& tensors) -> Tensor { return aggregate_as_tensor(tensors, AllGatherTensor{}); }, + py::arg("tensors"), + py::kw_only()); module.def("get_t3k_physical_device_ids_ring", &get_t3k_physical_device_ids_ring); } diff --git a/ttnn/cpp/ttnn/distributed/distributed_tensor_config.cpp b/ttnn/cpp/ttnn/distributed/distributed_tensor_config.cpp new file mode 100644 index 000000000000..9ae22852fd55 --- /dev/null +++ b/ttnn/cpp/ttnn/distributed/distributed_tensor_config.cpp @@ -0,0 +1,57 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include + +#include "common/assert.hpp" +#include "ttnn/distributed/distributed_tensor_config.hpp" + +namespace tt::tt_metal { +namespace { + +DistributedTensorConfig create_shard_distributed_tensor_config( + const std::unordered_map& metadata) { + return ShardTensor(std::stoi(metadata.at("shard_dim"))); +} +DistributedTensorConfig create_shard_2d_distributed_tensor_config( + const std::unordered_map& metadata) { + return ShardTensor2D(ShardMesh(std::stoi(metadata.at("mesh_shape_y")), std::stoi(metadata.at("mesh_shape_x")))); +} +DistributedTensorConfig create_replicate_distributed_tensor_config( + const std::unordered_map& metadata) { + if (auto it = metadata.find("replication_factor"); it != metadata.end()) { + return ReplicateTensor(std::stoi(it->second)); + } + TT_THROW("Unsupported Replication strategy:"); +} +} // namespace + +DistributedTensorConfig get_distributed_tensor_config(const std::unordered_map& metadata) { + if (auto it = metadata.find("strategy"); it != metadata.end()) { + const std::string& strategy = it->second; + if (strategy == "shard") { + return create_shard_distributed_tensor_config(metadata); + } else if 
(strategy == "shard_2d") { + return create_shard_2d_distributed_tensor_config(metadata); + } else if (strategy == "replicate") { + return create_replicate_distributed_tensor_config(metadata); + } + } + TT_THROW("Unsupported DistributedTensorConfig strategy:"); +} + +bool operator==(const ReplicateTensor& a, const ReplicateTensor& b) { + return a.replication_factor == b.replication_factor; +} +bool operator==(const AllGatherTensor&, const AllGatherTensor&) { + // All instances are considered equal because there are no data members. + return true; +} +bool operator==(const ShardTensor& lhs, const ShardTensor& rhs) { return lhs.shard_dimension == rhs.shard_dimension; } +bool operator==(const ShardTensor2D& lhs, const ShardTensor2D& rhs) { + return lhs.shard_mesh.x == rhs.shard_mesh.x && lhs.shard_mesh.y == rhs.shard_mesh.y; +} + +} // namespace tt::tt_metal diff --git a/ttnn/cpp/ttnn/distributed/distributed_tensor_config.hpp b/ttnn/cpp/ttnn/distributed/distributed_tensor_config.hpp new file mode 100644 index 000000000000..5f67262028ed --- /dev/null +++ b/ttnn/cpp/ttnn/distributed/distributed_tensor_config.hpp @@ -0,0 +1,43 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include +#include + +namespace tt::tt_metal { + +struct ReplicateTensor { + int replication_factor = 1; + ReplicateTensor() = default; + ReplicateTensor(int replication_factor) : replication_factor(replication_factor) {} +}; +bool operator==(const ReplicateTensor&, const ReplicateTensor&); +struct ShardTensor { + int shard_dimension; + ShardTensor(int shard_dimension) : shard_dimension(shard_dimension) {} +}; +bool operator==(const ShardTensor& lhs, const ShardTensor& rhs); + +struct ShardMesh { + std::uint16_t y = 0; + std::uint16_t x = 0; +}; + +struct ShardTensor2D { + ShardMesh shard_mesh; // logic 2D grid that defines the mapping of shards to devices + ShardTensor2D(ShardMesh mesh) : shard_mesh(std::move(mesh)) {} +}; +bool operator==(const ShardTensor2D& lhs, const ShardTensor2D& rhs); + +struct AllGatherTensor {}; +bool operator==(const AllGatherTensor&, const AllGatherTensor&); + +// DistributedTensorConfig is a variant of different ways in which a tensor can be distributed across devices. 
+using DistributedTensorConfig = std::variant; +DistributedTensorConfig get_distributed_tensor_config(const std::unordered_map& metadata); + +} // namespace tt::tt_metal diff --git a/ttnn/cpp/ttnn/distributed/types.hpp b/ttnn/cpp/ttnn/distributed/types.hpp index 557d10c90ec3..bdb17e71ac1f 100644 --- a/ttnn/cpp/ttnn/distributed/types.hpp +++ b/ttnn/cpp/ttnn/distributed/types.hpp @@ -13,6 +13,7 @@ namespace ttnn::distributed { using MeshShape = tt::tt_metal::distributed::MeshShape; +using MeshOffset = tt::tt_metal::distributed::MeshOffset; using DeviceIds = tt::tt_metal::distributed::DeviceIds; using MeshDevice = tt::tt_metal::distributed::MeshDevice; using MeshDeviceView = tt::tt_metal::distributed::MeshDeviceView; @@ -29,6 +30,7 @@ using ttnn::distributed::DeviceIds; using ttnn::distributed::MeshDevice; using ttnn::distributed::MeshDeviceConfig; using ttnn::distributed::MeshDeviceView; +using ttnn::distributed::MeshOffset; using ttnn::distributed::MeshShape; using ttnn::distributed::MeshSubDeviceManagerId; using ttnn::distributed::MeshType; diff --git a/ttnn/cpp/ttnn/tensor/tensor.cpp b/ttnn/cpp/ttnn/tensor/tensor.cpp index d6c80e1e92d4..c6d65bc0aec5 100644 --- a/ttnn/cpp/ttnn/tensor/tensor.cpp +++ b/ttnn/cpp/ttnn/tensor/tensor.cpp @@ -568,7 +568,7 @@ Tensor Tensor::to(Device* target_device, const MemoryConfig& mem_config) const { } Tensor Tensor::to(distributed::MeshDevice* mesh_device, const MemoryConfig& mem_config) const { - std::vector workers_to_use = ttnn::distributed::distribute_tensor_to_mesh(*this, *mesh_device); + std::vector workers_to_use = ttnn::distributed::get_mapped_devices(*this, *mesh_device); return tensor_ops::tensor_to(*this, workers_to_use, mem_config); } diff --git a/ttnn/cpp/ttnn/tensor/tensor.hpp b/ttnn/cpp/ttnn/tensor/tensor.hpp index 7a2976ac8f22..bf86cca99c19 100644 --- a/ttnn/cpp/ttnn/tensor/tensor.hpp +++ b/ttnn/cpp/ttnn/tensor/tensor.hpp @@ -16,6 +16,7 @@ #include "common/test_tiles.hpp" #include "common/tt_backend_api_types.hpp" 
#include "ttnn/common/constants.hpp" +#include "ttnn/distributed/distributed_tensor_config.hpp" #include "ttnn/tensor/types.hpp" #include "ttnn/tensor/tensor_spec.hpp" #include "ttnn/tensor/layout/tensor_layout.hpp" diff --git a/ttnn/cpp/ttnn/tensor/tensor_ops.cpp b/ttnn/cpp/ttnn/tensor/tensor_ops.cpp index 8a46d676dc79..460f8b0d5dbc 100644 --- a/ttnn/cpp/ttnn/tensor/tensor_ops.cpp +++ b/ttnn/cpp/ttnn/tensor/tensor_ops.cpp @@ -181,15 +181,15 @@ Tensor tensor_to(const Tensor& input_tensor, Layout target_layout, distributed:: ZoneScoped; GraphTracker::instance().track_function_start("Tensor::to", input_tensor, target_layout, mesh_device); if (mesh_device) { - auto workers = ttnn::distributed::distribute_tensor_to_mesh(input_tensor, *mesh_device); + auto workers = ttnn::distributed::get_mapped_devices(input_tensor, *mesh_device); TT_FATAL( validate_worker_modes(workers), "All device threads/workers must be running in the same mode (ASYNC or SYNC)"); std::optional distributed_config = std::nullopt; - if (std::holds_alternative(input_tensor.get_storage())) { - auto& host_storage = std::get(input_tensor.get_storage()); - distributed_config = host_storage.strategy; + if (auto* host_storage = std::get_if(&input_tensor.get_storage()); + host_storage != nullptr) { + distributed_config = host_storage->strategy; } Tensor tensor_modified_layout = Tensor(workers.size(), distributed_config); for (int worker_index = 0; worker_index < workers.size(); ++worker_index) { diff --git a/ttnn/cpp/ttnn/tensor/tensor_utils.hpp b/ttnn/cpp/ttnn/tensor/tensor_utils.hpp index 96ce34431b93..3c2565299b96 100644 --- a/ttnn/cpp/ttnn/tensor/tensor_utils.hpp +++ b/ttnn/cpp/ttnn/tensor/tensor_utils.hpp @@ -142,40 +142,6 @@ void insert_buffer_and_shape_for_device( Tensor copy_borrowed_tensor_in_async_mode(Device* worker, const Tensor& tensor); -template -auto get_device_tensors(Device* device, const TensorContainer& input_tensors) { - // Could be Tensor, const Tensor, std::optional, or std::optional - 
using ValueType = typename TensorContainer::value_type; - - // We need a way to extract the underlying Tensor type (const or non-const) from ValueType - // and to decide whether we are dealing with an optional type. - using IsOptional = std::conditional_t< - std::is_same_v> || std::is_same_v>, - std::true_type, - std::false_type>; - using TensorType = std::conditional_t< - std::is_same_v> || std::is_same_v, - Tensor, - const Tensor>; - - // Result container type adjustment based on input type - using ResultType = std::conditional_t, TensorType>; - std::vector transformed_tensors; - - for (const auto& tensor : input_tensors) { - if constexpr (IsOptional::value) { - if (tensor.has_value()) { - transformed_tensors.emplace_back(get_device_tensor(tensor.value(), device)); - } else { - transformed_tensors.emplace_back(std::nullopt); - } - } else { - transformed_tensors.emplace_back(get_device_tensor(tensor, device)); - } - } - return transformed_tensors; -} - inline bool is_tensor_on_device(const ttnn::Tensor& tensor) { return tensor.storage_type() == StorageType::DEVICE; } inline bool is_tensor_on_multi_device(const ttnn::Tensor& tensor) { @@ -196,5 +162,4 @@ inline uint32_t get_batch_size(const T& shape) { } } // namespace tt_metal - } // namespace tt diff --git a/ttnn/cpp/ttnn/tensor/types.cpp b/ttnn/cpp/ttnn/tensor/types.cpp index ccb861007181..6ba3893c8a1f 100644 --- a/ttnn/cpp/ttnn/tensor/types.cpp +++ b/ttnn/cpp/ttnn/tensor/types.cpp @@ -6,9 +6,7 @@ #include "ttnn/tensor/types.hpp" #include "ttnn/tensor/tensor_impl.hpp" -namespace ttnn { - -namespace types { +namespace ttnn::types { const Shape Shape::to_rank(size_t new_rank) const { auto padded_shape = value; @@ -31,42 +29,10 @@ const Shape Shape::to_rank(size_t new_rank) const { return Shape(std::move(new_shape), std::move(new_padded_shape)); } -} // namespace types - -} // namespace ttnn +} // namespace ttnn::types namespace tt::tt_metal { -static DistributedTensorConfig create_shard_distributed_tensor_config( 
- const std::unordered_map& metadata) { - return ShardTensor(std::stoi(metadata.at("shard_dim"))); -} -static DistributedTensorConfig create_shard_2d_distributed_tensor_config( - const std::unordered_map& metadata) { - return ShardTensor2D(ShardMesh(std::stoi(metadata.at("mesh_shape_y")), std::stoi(metadata.at("mesh_shape_x")))); -} -static DistributedTensorConfig create_replicate_distributed_tensor_config( - const std::unordered_map& metadata) { - if (auto it = metadata.find("replication_factor"); it != metadata.end()) { - return ReplicateTensor(std::stoi(it->second)); - } - TT_THROW("Unsupported Replication strategy:"); -} - -DistributedTensorConfig get_distributed_tensor_config(const std::unordered_map& metadata) { - if (auto it = metadata.find("strategy"); it != metadata.end()) { - const std::string& strategy = it->second; - if (strategy == "shard") { - return create_shard_distributed_tensor_config(metadata); - } else if (strategy == "shard_2d") { - return create_shard_2d_distributed_tensor_config(metadata); - } else if (strategy == "replicate") { - return create_replicate_distributed_tensor_config(metadata); - } - } - TT_THROW("Unsupported DistributedTensorConfig strategy:"); -} - tt::DataFormat datatype_to_dataformat_converter(tt::tt_metal::DataType datatype) { switch (datatype) { case tt::tt_metal::DataType::BFLOAT16: return tt::DataFormat::Float16_b; @@ -218,20 +184,6 @@ Array4D LegacyShape::to_array_4D() const { return ret_array; } -bool operator==(const ReplicateTensor& a, const ReplicateTensor& b) { - return a.replication_factor == - b.replication_factor; // All instances are considered equal because there are no data members. -} -bool operator==(const AllGatherTensor&, const AllGatherTensor&) { - return true; // All instances are considered equal because there are no data members. -} -bool operator==(const ShardTensor& lhs, const ShardTensor& rhs) { - return lhs.shard_dimension == rhs.shard_dimension; // Equal if they have the same shard_dimension. 
-} -bool operator==(const ShardTensor2D& lhs, const ShardTensor2D& rhs) { - return lhs.shard_mesh == rhs.shard_mesh; // Equal if they have the same shard_mesh. -} - bool operator==(const tt::tt_metal::LegacyShape& shape_a, const tt::tt_metal::LegacyShape& shape_b) { if (shape_a.rank() != shape_b.rank()) { return false; diff --git a/ttnn/cpp/ttnn/tensor/types.hpp b/ttnn/cpp/ttnn/tensor/types.hpp index ed8c8a95b145..58299ece07bd 100644 --- a/ttnn/cpp/ttnn/tensor/types.hpp +++ b/ttnn/cpp/ttnn/tensor/types.hpp @@ -18,6 +18,7 @@ #include "tt_metal/tt_stl/concepts.hpp" #include "tt_metal/tt_stl/reflection.hpp" #include "tt_metal/tt_stl/span.hpp" +#include "ttnn/distributed/distributed_tensor_config.hpp" #include "ttnn/tensor/host_buffer/types.hpp" #include "ttnn/cpp/ttnn/tensor/enum_types.hpp" @@ -78,31 +79,6 @@ enum class StorageType { MULTI_DEVICE_HOST, // host storage for multi-device context }; -struct AllGatherTensor {}; -bool operator==(const AllGatherTensor &, const AllGatherTensor &); -struct ReplicateTensor { - int replication_factor = 1; - ReplicateTensor() = default; - ReplicateTensor(int replication_factor) : replication_factor(replication_factor) {} -}; -bool operator==(const ReplicateTensor &, const ReplicateTensor &); -struct ShardTensor { - int shard_dimension; - ShardTensor(int shard_dimension) : shard_dimension(shard_dimension) {} -}; -bool operator==(const ShardTensor &lhs, const ShardTensor &rhs); - -using ShardMesh = std::pair; // (y,x) -struct ShardTensor2D { - ShardMesh shard_mesh; // logic 2D grid that defines the mapping of shards to devices - ShardTensor2D(ShardMesh mesh) : shard_mesh(std::move(mesh)) {} -}; -bool operator==(const ShardTensor2D &lhs, const ShardTensor2D &rhs); - -// DistributedTensorConfig is a variant of different ways in which a tensor can be distributed across devices. 
-using DistributedTensorConfig = std::variant; -DistributedTensorConfig get_distributed_tensor_config(const std::unordered_map &metadata); - tt::DataFormat datatype_to_dataformat_converter(DataType datatype); static constexpr std::size_t MAX_NUM_DIMENSIONS = 8; diff --git a/ttnn/ttnn/__init__.py b/ttnn/ttnn/__init__.py index b699ea556af6..8a968ea63ebb 100644 --- a/ttnn/ttnn/__init__.py +++ b/ttnn/ttnn/__init__.py @@ -153,6 +153,7 @@ def manage_config(name, value): WormholeComputeKernelConfig, GrayskullComputeKernelConfig, MeshShape, + MeshOffset, UnaryWithParam, UnaryOpType, BinaryOpType, diff --git a/ttnn/ttnn/distributed/distributed.py b/ttnn/ttnn/distributed/distributed.py index a4f329eb58a3..bda36c5b0f5b 100644 --- a/ttnn/ttnn/distributed/distributed.py +++ b/ttnn/ttnn/distributed/distributed.py @@ -139,7 +139,7 @@ def open_mesh_device( trace_region_size: int = ttnn._ttnn.device.DEFAULT_TRACE_REGION_SIZE, num_command_queues: int = 1, dispatch_core_config: ttnn.DispatchCoreConfig = ttnn.DispatchCoreConfig(), - offset: Tuple[int, int] = (0, 0), + offset: ttnn.MeshOffset = ttnn.MeshOffset(row=0, col=0), physical_device_ids: List[int] = [], mesh_type: "MeshType" = MeshType.RowMajor, ): @@ -152,7 +152,8 @@ def open_mesh_device( trace_region_size (int, optional): Size of the trace region. Defaults to ttnn._ttnn.device.DEFAULT_TRACE_REGION_SIZE. num_command_queues (int, optional): Number of command queues. Defaults to 1. dispatch_core_type (int, optional): Type of dispatch core. Defaults to DispatchCoreType.WORKER. - offset (Tuple[int, int], optional): Offset in logical mesh coordinates for the mesh device. Defaults to (0, 0). + offset (ttnn.MeshOffset, optional): Offset in logical mesh coordinates for the mesh device. Defaults to (0, 0). + physical_device_ids (List[int], optional): List of physical device IDs to use. Defaults to []. mesh_type (MeshType, optional): Defines type of mesh requested. 
Type imposes connectivity constraints and defines device iteration order. Returns: @@ -160,7 +161,7 @@ def open_mesh_device( """ return ttnn._ttnn.multi_device.MeshDevice( - mesh_shape=mesh_shape.as_tuple(), + mesh_shape=mesh_shape, l1_small_size=l1_small_size, trace_region_size=trace_region_size, num_command_queues=num_command_queues, @@ -455,6 +456,8 @@ def compose(self, tensor: ttnn.Tensor) -> "torch.Tensor": return torch.cat(device_shards_converted_to_torch, dim=self.concat_dim) +# TODO: #15061 - Remove this function, as it does not abide to the MeshToTensor interface. +# Instead, lift this implementation to the caller. class ListMeshToTensor(MeshToTensor): def __init__(self, mesh_device: MeshDevice): self.mesh_device = mesh_device diff --git a/ttnn/ttnn/types.py b/ttnn/ttnn/types.py index 0fd3f775313d..b210fe90f5ff 100644 --- a/ttnn/ttnn/types.py +++ b/ttnn/ttnn/types.py @@ -58,25 +58,14 @@ class CoreRange: end: CoreGrid -@dataclasses.dataclass -class MeshShape: - y: int - x: int - - @property - def num_devices(self): - return self.y * self.x - - def as_tuple(self): - return (self.y, self.x) - - class ShardStrategy(Enum): HEIGHT = 1 WIDTH = 2 BLOCK = 3 +MeshShape = ttnn._ttnn.multi_device.MeshShape +MeshOffset = ttnn._ttnn.multi_device.MeshOffset ShardOrientation = ttnn._ttnn.tensor.ShardOrientation ShardMode = ttnn._ttnn.tensor.ShardMode ShardSpec = ttnn._ttnn.tensor.ShardSpec From 8917d6b0a5d3db15de3339d92a1b98befabc6cff Mon Sep 17 00:00:00 2001 From: Oleg Milyutin Date: Mon, 9 Dec 2024 20:55:34 -0500 Subject: [PATCH 21/59] #15833: Handle copying multi-device host storage to 1x1 mesh device (#15845) ### Ticket #15833 ### Problem description See #15833. There is a mismatch with how device tensor is handling 1x1 mesh, as opposed to host tensor for 1x1 mesh. This is a temporary local fix! #15840 tracks a proper fix. ### What's changed Handle copying tensors from 1x1 host storage to a single device. 
### Checklist - [X] [Post commit CI passes](https://github.com/tenstorrent/tt-metal/actions/runs/12244129605) - [X] [T3K unit + frequent tests](https://github.com/tenstorrent/tt-metal/actions/runs/12246012734) - [X] New/Existing tests provide coverage for changes --- .../tensor/test_tensor_prealloc_and_write.py | 81 ++++++++++----- ttnn/cpp/ttnn/tensor/tensor.cpp | 99 ++++++++++--------- 2 files changed, 113 insertions(+), 67 deletions(-) diff --git a/tests/ttnn/unit_tests/tensor/test_tensor_prealloc_and_write.py b/tests/ttnn/unit_tests/tensor/test_tensor_prealloc_and_write.py index a8418376d26d..68df7937879c 100644 --- a/tests/ttnn/unit_tests/tensor/test_tensor_prealloc_and_write.py +++ b/tests/ttnn/unit_tests/tensor/test_tensor_prealloc_and_write.py @@ -11,39 +11,76 @@ from models.utility_functions import is_grayskull +@pytest.mark.parametrize("shape", [(1, 10, 64, 96), (32, 1, 64, 64), (32, 3, 256, 256), (16, 1, 1024, 1024)]) @pytest.mark.parametrize("in_dtype", [ttnn.bfloat16, ttnn.bfloat8_b]) @pytest.mark.parametrize("mem_layout", [ttnn.TensorMemoryLayout.INTERLEAVED]) @pytest.mark.parametrize("memory_location", [ttnn.BufferType.L1, ttnn.BufferType.DRAM]) @pytest.mark.parametrize("tensor_layout", [ttnn.TILE_LAYOUT, ttnn.ROW_MAJOR_LAYOUT]) -@pytest.mark.parametrize( - "enable_async, num_loops", - ((True, 5), (False, 5)), -) +@pytest.mark.parametrize("enable_async", (False, True)) def test_tensor_preallocation_and_write_apis( - num_loops, enable_async, in_dtype, mem_layout, memory_location, tensor_layout, device + enable_async, shape, in_dtype, mem_layout, memory_location, tensor_layout, device ): if in_dtype == ttnn.bfloat8_b and tensor_layout == ttnn.ROW_MAJOR_LAYOUT: pytest.skip("Row Major Layout not supported for Bfp8") torch.manual_seed(0) device.enable_async(enable_async) + + # Preallocate tensor on device + preallocated_tensor = ttnn.allocate_tensor_on_device( + ttnn.Shape(shape), + in_dtype, + tensor_layout, + device, + 
ttnn.MemoryConfig(memory_layout=mem_layout, buffer_type=memory_location), + ) + for loop in range(5): + # Write to prreallocated tensor multiple times + input_tensor_a = torch.randn(shape).bfloat16() + tt_input_tensor_a = ttnn.Tensor(input_tensor_a, in_dtype).to(tensor_layout) + ttnn.copy_host_to_device_tensor(tt_input_tensor_a, preallocated_tensor) + readback = preallocated_tensor.cpu().to(ttnn.ROW_MAJOR_LAYOUT).to_torch() + allclose, output = comp_pcc(readback, input_tensor_a) + assert allclose, f"FAILED: {output}" + + +@pytest.mark.parametrize("shape", [(1, 10, 64, 96), (32, 3, 256, 256)]) +@pytest.mark.parametrize("in_dtype", [ttnn.bfloat16, ttnn.bfloat8_b]) +@pytest.mark.parametrize("mem_layout", [ttnn.TensorMemoryLayout.INTERLEAVED]) +@pytest.mark.parametrize("memory_location", [ttnn.BufferType.L1, ttnn.BufferType.DRAM]) +@pytest.mark.parametrize("tensor_layout", [ttnn.TILE_LAYOUT, ttnn.ROW_MAJOR_LAYOUT]) +@pytest.mark.parametrize("enable_async", (False, True)) +@pytest.mark.parametrize("mesh_device", ((1, 1), 4), indirect=True) +def test_tensor_preallocation_and_write_apis( + enable_async, shape, in_dtype, mem_layout, memory_location, tensor_layout, mesh_device +): + if in_dtype == ttnn.bfloat8_b and tensor_layout == ttnn.ROW_MAJOR_LAYOUT: + pytest.skip("Row Major Layout not supported for Bfp8") + torch.manual_seed(0) + mesh_device.enable_async(enable_async) shapes = [(1, 10, 64, 96), (32, 1, 64, 64), (32, 3, 256, 256), (16, 1, 1024, 1024)] - for tensor_shape in shapes: - # Preallocate tensor on device - preallocated_tensor = ttnn.allocate_tensor_on_device( - ttnn.Shape(tensor_shape), - in_dtype, - tensor_layout, - device, - ttnn.MemoryConfig(memory_layout=mem_layout, buffer_type=memory_location), + # Preallocate tensor on device + preallocated_tensor = ttnn.allocate_tensor_on_device( + ttnn.Shape(shape), + in_dtype, + tensor_layout, + mesh_device, + ttnn.MemoryConfig(memory_layout=mem_layout, buffer_type=memory_location), + ) + for loop in range(5): + # 
Write to prreallocated tensor multiple times + input_tensor_a = torch.randn(shape).bfloat16() + tt_input_tensor_a = ttnn.from_torch( + input_tensor_a, + dtype=in_dtype, + layout=tensor_layout, + mesh_mapper=ttnn.ReplicateTensorToMesh(mesh_device), + ) + ttnn.copy_host_to_device_tensor(tt_input_tensor_a, preallocated_tensor) + readback_tensors = ttnn.to_torch( + preallocated_tensor.cpu().to(ttnn.ROW_MAJOR_LAYOUT), + mesh_composer=ttnn.ListMeshToTensor(mesh_device), ) - for loop in range(num_loops): - # Write to prreallocated tensor multiple times - input_tensor_a = torch.randn(tensor_shape).bfloat16() - tt_input_tensor_a = ttnn.Tensor(input_tensor_a, in_dtype).to(tensor_layout) - ttnn.copy_host_to_device_tensor(tt_input_tensor_a, preallocated_tensor) - readback = preallocated_tensor.cpu().to(ttnn.ROW_MAJOR_LAYOUT).to_torch() - allclose, output = comp_pcc(readback, input_tensor_a) + for readback_tensor in readback_tensors: + allclose, output = comp_pcc(readback_tensor, input_tensor_a) assert allclose, f"FAILED: {output}" - - device.enable_async(False) diff --git a/ttnn/cpp/ttnn/tensor/tensor.cpp b/ttnn/cpp/ttnn/tensor/tensor.cpp index c6d65bc0aec5..6f153e8e6b41 100644 --- a/ttnn/cpp/ttnn/tensor/tensor.cpp +++ b/ttnn/cpp/ttnn/tensor/tensor.cpp @@ -656,21 +656,12 @@ std::vector Tensor::host_page_ordering() { StorageType Tensor::storage_type() const { return std::visit( - [](auto&& storage) -> StorageType { - using T = std::decay_t; - if constexpr (std::is_same_v) { - return StorageType::OWNED; - } else if constexpr (std::is_same_v) { - return StorageType::DEVICE; - } else if constexpr (std::is_same_v) { - return StorageType::BORROWED; - } else if constexpr (std::is_same_v) { - return StorageType::MULTI_DEVICE; - } else if constexpr (std::is_same_v) { - return StorageType::MULTI_DEVICE_HOST; - } else { - raise_unsupported_storage(); - } + tt::stl::overloaded{ + [](const OwnedStorage&) { return StorageType::OWNED; }, + [](const DeviceStorage&) { return 
StorageType::DEVICE; }, + [](const BorrowedStorage&) { return StorageType::BORROWED; }, + [](const MultiDeviceStorage& s) { return StorageType::MULTI_DEVICE; }, + [](const MultiDeviceHostStorage&) { return StorageType::MULTI_DEVICE_HOST; }, }, this->get_storage()); } @@ -873,18 +864,20 @@ Tensor allocate_tensor_on_devices( void write_tensor(const Tensor& host_tensor, Tensor device_tensor, uint8_t cq_id) { // Top level wrapper to copy a host tensor to a preallocated device tensor TT_ASSERT(device_tensor.workers.size(), "Workers must be specified for device_tensor in write_tensor"); + Tensor async_safe_tensor = copy_borrowed_tensor_in_async_mode(device_tensor.workers.at(0), host_tensor); + TT_FATAL( + async_safe_tensor.storage_type() == StorageType::BORROWED or + async_safe_tensor.storage_type() == StorageType::OWNED or + async_safe_tensor.storage_type() == StorageType::MULTI_DEVICE_HOST, + "write_tensor only supports host_tensor to device_tensor data transfer"); + uint32_t host_tensor_ref_count = async_safe_tensor.tensor_attributes->record_main_thread_ref_count(); uint32_t device_tensor_ref_count = device_tensor.tensor_attributes->record_main_thread_ref_count(); for (int worker_index = 0; worker_index < device_tensor.workers.size(); ++worker_index) { auto& worker = device_tensor.workers[worker_index]; worker->push_work([cq_id, worker, worker_index, async_safe_tensor, device_tensor]() mutable { - TT_FATAL( - async_safe_tensor.storage_type() == StorageType::BORROWED or - async_safe_tensor.storage_type() == StorageType::OWNED or - async_safe_tensor.storage_type() == StorageType::MULTI_DEVICE_HOST, - "write_tensor only supports host_tensor to device_tensor data transfer"); TT_FATAL( device_tensor.storage_type() == StorageType::DEVICE or device_tensor.storage_type() == StorageType::MULTI_DEVICE, @@ -895,33 +888,49 @@ void write_tensor(const Tensor& host_tensor, Tensor device_tensor, uint8_t cq_id async_safe_tensor.get_tensor_spec().page_config() == 
device_tensor.get_tensor_spec().page_config(), "Error"); std::visit( - [worker_index, worker, cq_id, &async_safe_tensor](auto&& s) { - void* host_data = nullptr; - using StorageType = std::decay_t; - if constexpr (std::is_same_v) { - if (std::holds_alternative(async_safe_tensor.get_storage())) { - // Handle case when writing borrowed tensor single device tensor (only allowed for sync - // mode) - auto host_storage = std::get(async_safe_tensor.get_storage()); - std::visit([&host_data](auto&& b) { host_data = b.data(); }, host_storage.buffer); - } else { - TT_ASSERT( - std::holds_alternative(async_safe_tensor.get_storage()), - "Unexpected type {}", - tt::stl::get_active_type_name_in_variant(async_safe_tensor.get_storage())); - auto host_storage = std::get(async_safe_tensor.get_storage()); - std::visit([&host_data](auto&& b) { host_data = b.begin(); }, host_storage.get_buffer()); - } - EnqueueWriteBuffer(worker->command_queue(cq_id), s.get_buffer(), host_data, false); - } else if constexpr (std::is_same_v) { + tt::stl::overloaded{ + [worker, worker_index, cq_id, &async_safe_tensor](const DeviceStorage& device_storage) { + // Copying from host to a single device. 
+ void* host_data = std::visit( + tt::stl::overloaded{ + [](BorrowedStorage s) { + return std::visit([](auto&& b) { return b.data(); }, s.buffer); + }, + [](OwnedStorage s) { + return std::visit([](auto&& b) { return static_cast(b.begin()); }, s.buffer); + }, + [](const MultiDeviceHostStorage& host_storage) { + TT_ASSERT( + host_storage.num_buffers() == 1, + "Cannot copy multi-buffer host storage to a single device"); + return std::visit( + [](auto&& b) -> void* { return b.begin(); }, host_storage.get_buffer(0)); + }, + [](auto&&) -> void* { TT_THROW("Unreachable"); }, + }, + async_safe_tensor.get_storage()); + EnqueueWriteBuffer( + worker->command_queue(cq_id), + device_storage.get_buffer(), + host_data, + /*blocking=*/false); + }, + [worker, worker_index, cq_id, &async_safe_tensor](const MultiDeviceStorage& device_storage) { + // Copying from host to multi-device. + TT_ASSERT( + std::holds_alternative(async_safe_tensor.get_storage()), + "Unexpected type {}", + tt::stl::get_active_type_name_in_variant(async_safe_tensor.get_storage())); auto host_storage = std::get(async_safe_tensor.get_storage()); - std::visit( - [worker_index, &host_data](auto&& b) { host_data = b.begin(); }, - host_storage.get_buffer(worker_index)); + void* host_data = std::visit( + [](auto&& b) -> void* { return b.begin(); }, host_storage.get_buffer(worker_index)); EnqueueWriteBuffer( - worker->command_queue(cq_id), s.get_buffer_for_device(worker), host_data, false); - } - }, + worker->command_queue(cq_id), + device_storage.get_buffer_for_device(worker), + host_data, + /*blocking=*/false); + }, + [](auto&& s) { TT_THROW("Unreachable"); }}, device_tensor.get_storage()); }); } From c2d7b099da7b194475ca7e6cf18c2eff496f866c Mon Sep 17 00:00:00 2001 From: Kartik Paigwar <132708568+kpaigwar@users.noreply.github.com> Date: Mon, 9 Dec 2024 21:08:26 -0500 Subject: [PATCH 22/59] #0: added api for generating corerangeset from given subcoregrid (#15841) --- ...m_cores_to_corerangeset_in_subcoregrids.py | 
93 ++++++++++++++ tt_metal/common/work_split.cpp | 117 ++++++++++++++++++ tt_metal/common/work_split.hpp | 5 + ttnn/cpp/pybind11/operations/core.hpp | 6 + ttnn/ttnn/__init__.py | 1 + ttnn/ttnn/core.py | 17 +++ 6 files changed, 239 insertions(+) create mode 100644 tests/tt_eager/python_api_testing/unit_testing/misc/test_num_cores_to_corerangeset_in_subcoregrids.py diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_num_cores_to_corerangeset_in_subcoregrids.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_num_cores_to_corerangeset_in_subcoregrids.py new file mode 100644 index 000000000000..0840e55513db --- /dev/null +++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_num_cores_to_corerangeset_in_subcoregrids.py @@ -0,0 +1,93 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. + +# SPDX-License-Identifier: Apache-2.0 + +import ttnn +import pytest + + +@pytest.mark.parametrize( + "start_core, num_cores, sub_core_grids, row_wise, expected_core_range_set", + [ + # Test Case 1: Basic row-wise scenario with enough cores in sub_core_grids + ( + ttnn.CoreCoord(1, 0), + 32, + ttnn.CoreRangeSet( + [ + ttnn.CoreRange(ttnn.CoreCoord(1, 0), ttnn.CoreCoord(3, 9)), + ttnn.CoreRange(ttnn.CoreCoord(5, 0), ttnn.CoreCoord(6, 9)), + ] + ), + True, + ttnn.CoreRangeSet( + [ + ttnn.CoreRange(ttnn.CoreCoord(1, 0), ttnn.CoreCoord(3, 9)), + ttnn.CoreRange(ttnn.CoreCoord(5, 0), ttnn.CoreCoord(6, 0)), + ] + ), + ), + # Test Case 2: Basic Column-wise processing + ( + ttnn.CoreCoord(1, 0), + 32, + ttnn.CoreRangeSet( + [ + ttnn.CoreRange(ttnn.CoreCoord(1, 0), ttnn.CoreCoord(3, 9)), + ttnn.CoreRange(ttnn.CoreCoord(5, 0), ttnn.CoreCoord(6, 9)), + ] + ), + False, + ttnn.CoreRangeSet( + [ + ttnn.CoreRange(ttnn.CoreCoord(1, 0), ttnn.CoreCoord(3, 9)), + ttnn.CoreRange(ttnn.CoreCoord(5, 0), ttnn.CoreCoord(5, 1)), + ] + ), + ), + # Test Case 3: row-wise scenario with small target cores and start offset + ( + ttnn.CoreCoord(3, 2), + 8, + ttnn.CoreRangeSet( + [ + 
ttnn.CoreRange(ttnn.CoreCoord(1, 0), ttnn.CoreCoord(3, 9)), + ttnn.CoreRange(ttnn.CoreCoord(5, 0), ttnn.CoreCoord(6, 9)), + ] + ), + True, + ttnn.CoreRangeSet( + [ + ttnn.CoreRange(ttnn.CoreCoord(3, 2), ttnn.CoreCoord(3, 2)), + ttnn.CoreRange(ttnn.CoreCoord(1, 3), ttnn.CoreCoord(3, 4)), + ttnn.CoreRange(ttnn.CoreCoord(1, 5), ttnn.CoreCoord(1, 5)), + ] + ), + ), + # Test Case 4: col-wise scenario with small target cores and start offset + ( + ttnn.CoreCoord(1, 8), + 8, + ttnn.CoreRangeSet( + [ + ttnn.CoreRange(ttnn.CoreCoord(1, 0), ttnn.CoreCoord(3, 9)), + ttnn.CoreRange(ttnn.CoreCoord(5, 0), ttnn.CoreCoord(6, 9)), + ] + ), + False, + ttnn.CoreRangeSet( + [ + ttnn.CoreRange(ttnn.CoreCoord(1, 8), ttnn.CoreCoord(1, 9)), + ttnn.CoreRange(ttnn.CoreCoord(2, 0), ttnn.CoreCoord(2, 5)), + ] + ), + ), + ], +) +def test_numcores_to_corerangeset_in_subcoregrids( + start_core, num_cores, sub_core_grids, row_wise, expected_core_range_set +): + output_corerangeset = ttnn.num_cores_to_corerangeset_in_subcoregrids( + start_core, num_cores, sub_core_grids, row_wise=row_wise + ) + assert output_corerangeset.to_json() == expected_core_range_set.to_json() diff --git a/tt_metal/common/work_split.cpp b/tt_metal/common/work_split.cpp index f2a213a1721e..ba687d9d3dab 100644 --- a/tt_metal/common/work_split.cpp +++ b/tt_metal/common/work_split.cpp @@ -148,6 +148,123 @@ CoreRangeSet num_cores_to_corerangeset( return num_cores_to_corerangeset({0, 0}, target_num_cores, grid_size, row_wise); } +CoreRangeSet num_cores_to_corerangeset_in_subcoregrids( + const CoreCoord start_core, + const uint32_t target_num_cores, + const CoreRangeSet& sub_core_grids, + const bool row_wise = false) { + // If target_num_cores is 0 or input_corerangeset is empty, return empty CoreRangeSet + TT_FATAL(target_num_cores > 0, "Target number of cores must be greater than 0"); + TT_FATAL( + target_num_cores <= sub_core_grids.num_cores(), + "Target number of cores {} is greater than total number of available cores {}", + 
target_num_cores, + sub_core_grids.num_cores()); + + // Validate that the start core is contained within the entire CoreRangeSet + TT_FATAL(sub_core_grids.contains(start_core), "Start core must be contained within the input CoreRangeSet"); + + std::vector result_coreranges; + bool start_core_found = false; + CoreCoord current_start_core = start_core; + CoreCoord current_end_core = start_core; + uint32_t remaining_cores = target_num_cores; + + auto process_row_wise = [&](const CoreRange& subcoregrid) { + uint32_t subcoregrid_width = subcoregrid.grid_size().x; + + for (uint32_t y = current_start_core.y; y <= subcoregrid.end_coord.y; ++y) { + if (remaining_cores == 0) { + break; + } + + uint32_t current_width = + std::min(static_cast(subcoregrid.end_coord.x - current_start_core.x + 1), remaining_cores); + + if (current_width < subcoregrid_width) { + if (current_start_core != current_end_core) { + result_coreranges.push_back(CoreRange(current_start_core, current_end_core)); + } + + current_end_core = CoreCoord(current_start_core.x + current_width - 1, y); + remaining_cores -= current_width; + + result_coreranges.push_back( + CoreRange(CoreCoord(current_start_core.x, y), CoreCoord(current_end_core.x, y))); + + current_start_core = CoreCoord(subcoregrid.start_coord.x, y + 1); + current_end_core = current_start_core; + } else { + current_end_core = CoreCoord(subcoregrid.end_coord.x, y); + remaining_cores -= current_width; + } + } + + if (current_start_core != current_end_core) { + result_coreranges.push_back(CoreRange(current_start_core, current_end_core)); + } + }; + + auto process_col_wise = [&](const CoreRange& subcoregrid) { + uint32_t subcoregrid_height = subcoregrid.grid_size().y; + + for (uint32_t x = current_start_core.x; x <= subcoregrid.end_coord.x; ++x) { + if (remaining_cores == 0) { + break; + } + + uint32_t current_height = + std::min(static_cast(subcoregrid.end_coord.y - current_start_core.y + 1), remaining_cores); + + if (current_height < 
subcoregrid_height) { + if (current_start_core != current_end_core) { + result_coreranges.push_back(CoreRange(current_start_core, current_end_core)); + } + + current_end_core = CoreCoord(x, current_start_core.y + current_height - 1); + remaining_cores -= current_height; + + result_coreranges.push_back( + CoreRange(CoreCoord(x, current_start_core.y), CoreCoord(x, current_end_core.y))); + + current_start_core = CoreCoord(x + 1, subcoregrid.start_coord.y); + current_end_core = current_start_core; + } else { + current_end_core = CoreCoord(x, subcoregrid.end_coord.y); + remaining_cores -= current_height; + } + } + + if (current_start_core != current_end_core) { + result_coreranges.push_back(CoreRange(current_start_core, current_end_core)); + } + }; + + // Iterate over subcoregrids and process based on row_wise + for (const auto& subcoregrid : sub_core_grids.ranges()) { + if (subcoregrid.contains(start_core)) { + start_core_found = true; + } else { + if (!start_core_found) { + continue; + } else { + current_start_core = subcoregrid.start_coord; + current_end_core = current_start_core; + } + } + + if (row_wise) { + process_row_wise(subcoregrid); + } else { + process_col_wise(subcoregrid); + } + } + + TT_FATAL(remaining_cores == 0, "Failed to split target number of cores into CoreRangeSet"); + + return CoreRangeSet(std::move(result_coreranges)); +} + std::tuple split_work_to_cores( const CoreCoord grid_size, const uint32_t units_to_divide, const bool row_wise) { ZoneScoped; diff --git a/tt_metal/common/work_split.hpp b/tt_metal/common/work_split.hpp index 39cdec9bf210..2b5ae0ecb9d8 100644 --- a/tt_metal/common/work_split.hpp +++ b/tt_metal/common/work_split.hpp @@ -40,6 +40,11 @@ CoreRangeSet num_cores_to_corerangeset( CoreRangeSet num_cores_to_corerangeset( const uint32_t target_num_cores, const CoreCoord grid_size, const bool row_wise = false); +CoreRangeSet num_cores_to_corerangeset_in_subcoregrids( + const CoreCoord start_core, + const uint32_t target_num_cores, + 
const CoreRangeSet& sub_core_grids, + const bool row_wise = false); // This function takes in the core grid size, as well as the number of units of work to divide between the cores // This function returns the number of cores, the CoreRangeSet of all cores, and then the CoreRangeSet that does // the greater amount of work, and the CoreRangeSet that does less work if work cannot be evenly divided diff --git a/ttnn/cpp/pybind11/operations/core.hpp b/ttnn/cpp/pybind11/operations/core.hpp index 74da55f61da2..489ee330c5e0 100644 --- a/ttnn/cpp/pybind11/operations/core.hpp +++ b/ttnn/cpp/pybind11/operations/core.hpp @@ -363,6 +363,12 @@ void py_module(py::module& module) { "num_cores_to_corerangeset", py::overload_cast(&tt::tt_metal::num_cores_to_corerangeset), R"doc(Create a CoreRangeSet containing the specified number of cores)doc"); + + module.def( + "num_cores_to_corerangeset_in_subcoregrids", + py::overload_cast( + &tt::tt_metal::num_cores_to_corerangeset_in_subcoregrids), + R"doc(Create a CoreRangeSet containing the specified number of cores starting from start_core in given subcoregrids)doc"); } } // namespace core diff --git a/ttnn/ttnn/__init__.py b/ttnn/ttnn/__init__.py index 8a968ea63ebb..041de7280180 100644 --- a/ttnn/ttnn/__init__.py +++ b/ttnn/ttnn/__init__.py @@ -209,6 +209,7 @@ def manage_config(name, value): load_memory_config, dump_stack_trace_on_segfault, num_cores_to_corerangeset, + num_cores_to_corerangeset_in_subcoregrids, ) import ttnn.reflection diff --git a/ttnn/ttnn/core.py b/ttnn/ttnn/core.py index 92d840f78683..a1879be24f94 100644 --- a/ttnn/ttnn/core.py +++ b/ttnn/ttnn/core.py @@ -59,6 +59,23 @@ def num_cores_to_corerangeset( ) +def num_cores_to_corerangeset_in_subcoregrids( + start_core: ttnn.CoreCoord, + target_num_cores: int, + sub_core_grids: ttnn.CoreRangeSet, + row_wise: bool = False, +): + """ + Create a CoreRangeSet containing the specified number of cores starting from start_core in given subcoregrids + """ + return 
ttnn._ttnn.operations.core.num_cores_to_corerangeset_in_subcoregrids( + start_core, + target_num_cores, + sub_core_grids, + row_wise, + ) + + def has_tile_padding(tensor, *, dim=None): if dim is not None: rank = tensor.shape.rank From a0959bac9fba181fe97284769412788b4868ad1c Mon Sep 17 00:00:00 2001 From: Austin Ho Date: Fri, 6 Dec 2024 21:09:52 +0000 Subject: [PATCH 23/59] #0: Add reflection for std::map and std::unordered_map --- tt_metal/tt_stl/reflection.hpp | 129 +++++++++++++++++++++++++++++++-- 1 file changed, 123 insertions(+), 6 deletions(-) diff --git a/tt_metal/tt_stl/reflection.hpp b/tt_metal/tt_stl/reflection.hpp index 42c3aecb6a4b..e0c7cfd5199b 100644 --- a/tt_metal/tt_stl/reflection.hpp +++ b/tt_metal/tt_stl/reflection.hpp @@ -6,7 +6,10 @@ #include +#include +#include #include +#include #include #include #include @@ -14,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -448,6 +452,32 @@ std::ostream& operator<<(std::ostream& os, const std::set& set) { return os; } +template +std::ostream& operator<<(std::ostream& os, const std::map& map) { + os << "{"; + for (auto it = map.begin(); it != map.end(); ++it) { + os << it->first << ": " << it->second; + if (it != map.end()) { + os << ", "; + } + } + os << "}"; + return os; +} + +template +std::ostream& operator<<(std::ostream& os, const std::unordered_map& map) { + os << "{"; + for (auto it = map.begin(); it != map.end(); ++it) { + os << it->first << ": " << it->second; + if (it != map.end()) { + os << ", "; + } + } + os << "}"; + return os; +} + template requires(tt::stl::concepts::Reflectable and not(std::integral or std::is_array::value)) std::ostream& operator<<(std::ostream& os, const T& object) { @@ -978,6 +1008,30 @@ struct fmt::formatter> { } }; +template +struct fmt::formatter> { + constexpr auto parse(format_parse_context& ctx) -> format_parse_context::iterator { return ctx.end(); } + + auto format(const std::map& map, format_context& ctx) const -> 
format_context::iterator { + using tt::stl::reflection::operator<<; + std::stringstream ss; + ss << map; + return fmt::format_to(ctx.out(), "{}", ss.str()); + } +}; + +template +struct fmt::formatter> { + constexpr auto parse(format_parse_context& ctx) -> format_parse_context::iterator { return ctx.end(); } + + auto format(const std::unordered_map& map, format_context& ctx) const -> format_context::iterator { + using tt::stl::reflection::operator<<; + std::stringstream ss; + ss << map; + return fmt::format_to(ctx.out(), "{}", ss.str()); + } +}; + template requires( tt::stl::concepts::Reflectable and not(std::integral or std::is_array::value or @@ -1063,7 +1117,7 @@ inline hash_t hash_object(const T& object) noexcept { fmt::print("Hashing struct {} using compile-time attributes: {}\n", get_type_name(), object); } constexpr auto num_attributes = reflection::detail::get_num_attributes(); - std::size_t hash = 0; + hash_t hash = 0; const auto attribute_values = object.attribute_values(); [&object, &hash, &attribute_values](std::index_sequence) { ( @@ -1074,11 +1128,26 @@ inline hash_t hash_object(const T& object) noexcept { ...); }(std::make_index_sequence{}); return hash; + } else if constexpr (is_specialization_v) { + if constexpr (DEBUG_HASH_OBJECT_FUNCTION) { + fmt::print("Hashing std::tuple of type {}: {}\n", get_type_name(), object); + } + constexpr auto num_elements = std::tuple_size_v; + hash_t hash = 0; + [&object, &hash](std::index_sequence) { + ( + [&object, &hash] { + const auto& element = std::get(object); + hash = hash_objects(hash, element); + }(), + ...); + }(std::make_index_sequence{}); + return hash; } else if constexpr (is_specialization_v) { if constexpr (DEBUG_HASH_OBJECT_FUNCTION) { fmt::print("Hashing std::vector of type {}: {}\n", get_type_name(), object); } - auto hash = 0; + hash_t hash = 0; for (const auto& element : object) { hash = hash_objects(hash, element); } @@ -1087,11 +1156,37 @@ inline hash_t hash_object(const T& object) noexcept { if 
constexpr (DEBUG_HASH_OBJECT_FUNCTION) { fmt::print("Hashing std::set of type {}: {}\n", get_type_name(), object); } - auto hash = 0; + hash_t hash = 0; for (const auto& element : object) { hash = hash_objects(hash, element); } return hash; + } else if constexpr (is_specialization_v) { + if constexpr (DEBUG_HASH_OBJECT_FUNCTION) { + fmt::print("Hashing std::map of type {}: {}\n", get_type_name(), object); + } + hash_t hash = 0; + for (const auto& [key, value] : object) { + hash = hash_objects(hash, key, value); + } + return hash; + } else if constexpr (is_specialization_v) { + if constexpr (DEBUG_HASH_OBJECT_FUNCTION) { + fmt::print("Hashing std::unordered_map of type {}: {}\n", get_type_name(), object); + } + // Sort the unordered map by key to make the hash order invariant + std::vector iterators; + iterators.reserve(object.size()); + for (auto it = object.begin(); it != object.end(); ++it) { + iterators.push_back(it); + } + std::sort(iterators.begin(), iterators.end(), [](const auto& a, const auto& b) { return a->first < b->first; }); + + hash_t hash = 0; + for (const auto& it : iterators) { + hash = hash_objects(hash, it->first, it->second); + } + return hash; } else if constexpr (is_specialization_v) { if constexpr (DEBUG_HASH_OBJECT_FUNCTION) { fmt::print("Hashing std::optional of type {}: {}\n", get_type_name(), object); @@ -1105,7 +1200,7 @@ inline hash_t hash_object(const T& object) noexcept { if constexpr (DEBUG_HASH_OBJECT_FUNCTION) { fmt::print("Hashing struct {} using reflect library: {}\n", get_type_name(), object); } - std::size_t hash = 0; + hash_t hash = 0; reflect::for_each([&hash, &object](auto I) { hash = hash_objects(hash, reflect::get(object)); }, object); return hash; } else { @@ -1335,7 +1430,7 @@ struct to_json_t> { nlohmann::json operator()(const std::map& object) { nlohmann::json json_object = nlohmann::json::object(); for (const auto& [key, value] : object) { - json_object[to_json(key)] = to_json(value); + 
json_object[to_json(key).dump()] = to_json(value); } return json_object; } @@ -1346,7 +1441,29 @@ struct from_json_t> { std::map operator()(const nlohmann::json& json_object) { std::map object; for (const auto& [key, value] : json_object.items()) { - object[from_json(key)] = from_json(value); + object[from_json(nlohmann::json::parse(key))] = from_json(value); + } + return object; + } +}; + +template +struct to_json_t> { + nlohmann::json operator()(const std::unordered_map& object) { + nlohmann::json json_object = nlohmann::json::object(); + for (const auto& [key, value] : object) { + json_object[to_json(key).dump()] = to_json(value); + } + return json_object; + } +}; + +template +struct from_json_t> { + std::map operator()(const nlohmann::json& json_object) { + std::unordered_map object; + for (const auto& [key, value] : json_object.items()) { + object[from_json(nlohmann::json::parse(key))] = from_json(value); } return object; } From c2c2b16be7f0cb306104c4dd1a54d96a40b13c74 Mon Sep 17 00:00:00 2001 From: Austin Ho Date: Tue, 10 Dec 2024 01:00:48 +0000 Subject: [PATCH 24/59] #0: Add support for hashing global cbs, sems --- tt_metal/impl/buffers/global_circular_buffer.cpp | 9 +++++++++ tt_metal/impl/buffers/global_circular_buffer.hpp | 9 +++++++++ tt_metal/impl/buffers/global_semaphore.cpp | 9 +++++++++ tt_metal/impl/buffers/global_semaphore.hpp | 12 ++++++++++++ 4 files changed, 39 insertions(+) diff --git a/tt_metal/impl/buffers/global_circular_buffer.cpp b/tt_metal/impl/buffers/global_circular_buffer.cpp index 4d438e91fcce..2d8760f1af57 100644 --- a/tt_metal/impl/buffers/global_circular_buffer.cpp +++ b/tt_metal/impl/buffers/global_circular_buffer.cpp @@ -159,3 +159,12 @@ uint32_t GlobalCircularBuffer::size() const { return this->size_; } } // namespace v1 } // namespace tt::tt_metal + +namespace std { + +std::size_t hash::operator()( + const tt::tt_metal::v1::experimental::GlobalCircularBuffer& global_circular_buffer) const { + return 
tt::stl::hash::hash_objects_with_default_seed(global_circular_buffer); +} + +} // namespace std diff --git a/tt_metal/impl/buffers/global_circular_buffer.hpp b/tt_metal/impl/buffers/global_circular_buffer.hpp index c263fe47d002..d18ed91e0c47 100644 --- a/tt_metal/impl/buffers/global_circular_buffer.hpp +++ b/tt_metal/impl/buffers/global_circular_buffer.hpp @@ -76,3 +76,12 @@ class GlobalCircularBuffer { } // namespace v1 } // namespace tt::tt_metal + +namespace std { + +template <> +struct hash { + std::size_t operator()(const tt::tt_metal::v1::experimental::GlobalCircularBuffer& global_circular_buffer) const; +}; + +} // namespace std diff --git a/tt_metal/impl/buffers/global_semaphore.cpp b/tt_metal/impl/buffers/global_semaphore.cpp index 807e74a8e103..64d16beb377d 100644 --- a/tt_metal/impl/buffers/global_semaphore.cpp +++ b/tt_metal/impl/buffers/global_semaphore.cpp @@ -77,3 +77,12 @@ void GlobalSemaphore::reset_semaphore_value() { } } // namespace tt::tt_metal + +namespace std { + +std::size_t hash::operator()( + const tt::tt_metal::GlobalSemaphore& global_semaphore) const { + return tt::stl::hash::hash_objects_with_default_seed(global_semaphore); +} + +} // namespace std diff --git a/tt_metal/impl/buffers/global_semaphore.hpp b/tt_metal/impl/buffers/global_semaphore.hpp index 6c2f8d17947f..f6d657998f8e 100644 --- a/tt_metal/impl/buffers/global_semaphore.hpp +++ b/tt_metal/impl/buffers/global_semaphore.hpp @@ -44,6 +44,9 @@ class GlobalSemaphore { void reset_semaphore_value(); + static constexpr auto attribute_names = std::forward_as_tuple("cores", "initial_value"); + const auto attribute_values() const { return std::make_tuple(this->cores_, this->initial_value_); } + private: void setup_buffer(BufferType buffer_type); @@ -59,3 +62,12 @@ class GlobalSemaphore { } // namespace v0 } // namespace tt::tt_metal + +namespace std { + +template <> +struct hash { + std::size_t operator()(const tt::tt_metal::GlobalSemaphore& global_semaphore) const; +}; + +} // 
namespace std From 60373c88f423718b2a87031ac87e502fd2af4d0e Mon Sep 17 00:00:00 2001 From: Andrija Malbasa Date: Tue, 10 Dec 2024 09:07:20 +0100 Subject: [PATCH 25/59] Add rotary embedding sweep (#15743) --- .github/workflows/ttnn-run-sweeps.yaml | 1 + tests/sweep_framework/sweep_utils/utils.py | 22 +++ .../rotary_embedding/rotary_embedding.py | 137 ++++++++++++++++++ 3 files changed, 160 insertions(+) create mode 100644 tests/sweep_framework/sweeps/transformer/rotary_embedding/rotary_embedding.py diff --git a/.github/workflows/ttnn-run-sweeps.yaml b/.github/workflows/ttnn-run-sweeps.yaml index 99ea50f9860f..22ef67bb730c 100644 --- a/.github/workflows/ttnn-run-sweeps.yaml +++ b/.github/workflows/ttnn-run-sweeps.yaml @@ -348,6 +348,7 @@ on: - transformer.split_query_key_value_and_split_heads.split_query_key_value_and_split_heads_kv_input - transformer.attention_softmax.attention_softmax - transformer.attention_softmax.attention_softmax_ + - transformer.rotary_embedding.rotary_embedding - data_movement.stack.stack_pytorch2 - data_movement.repeat.repeat_pytorch2 - data_movement.split.split_pytorch2 diff --git a/tests/sweep_framework/sweep_utils/utils.py b/tests/sweep_framework/sweep_utils/utils.py index 6f2199055d81..ef6349c71e9b 100644 --- a/tests/sweep_framework/sweep_utils/utils.py +++ b/tests/sweep_framework/sweep_utils/utils.py @@ -220,6 +220,28 @@ def gen_split_qkv_heads_spec( } +def gen_rotary_embedding_spec( + input_shape_list, + cache_size_list, + use_token_idx_list=[True, False], +): + for input_shape, cache_size, use_token_idx in itertools.product( + input_shape_list, cache_size_list, use_token_idx_list + ): + input_shape_ = input_shape.copy() + if use_token_idx is True: + token_idx = random.randint(1, cache_size - 1) + input_shape_[0] = 1 + else: + token_idx = None + + yield { + "input_shape": input_shape_, + "cache_size": cache_size, + "token_idx": token_idx, + } + + def gen_complex_tensor(input_shape, low, high, dtype=ttnn.bfloat16): torch_real = 
gen_func_with_cast_tt(partial(torch_random, low=-100, high=100, dtype=torch.float32), dtype)( input_shape diff --git a/tests/sweep_framework/sweeps/transformer/rotary_embedding/rotary_embedding.py b/tests/sweep_framework/sweeps/transformer/rotary_embedding/rotary_embedding.py new file mode 100644 index 000000000000..9c39ab1ae1fd --- /dev/null +++ b/tests/sweep_framework/sweeps/transformer/rotary_embedding/rotary_embedding.py @@ -0,0 +1,137 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. + +# SPDX-License-Identifier: Apache-2.0 + +from typing import Optional, Tuple +from functools import partial + +import torch +import random +import ttnn +from tests.sweep_framework.sweep_utils.utils import gen_shapes, gen_rotary_embedding_spec +from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt + +from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time +from models.utility_functions import torch_random + +# Override the default timeout in seconds for hang detection. +TIMEOUT = 30 + +random.seed(0) + + +# Parameters provided to the test vector generator are defined here. +# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values. +# Each suite has a key name (in this case "suite_1" and "suite_2") which will associate the test vectors to this specific suite of inputs. +# Developers can create their own generator functions and pass them to the parameters as inputs. 
+parameters = { + "nightly": { + "input_spec": gen_rotary_embedding_spec( + input_shape_list=gen_shapes([1, 1, 32, 64], [6, 12, 256, 512], [1, 1, 32, 64], 16), + cache_size_list=[random.randint(1, 2048) for i in range(8)], + ), + "input_dtype": [ttnn.bfloat16, ttnn.bfloat8_b], + "input_layout": [ttnn.ROW_MAJOR_LAYOUT, ttnn.TILE_LAYOUT], + "input_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG], + "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG], + }, +} + + +def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]: + if test_vector["input_layout"] == ttnn.ROW_MAJOR_LAYOUT and test_vector["input_dtype"] == ttnn.bfloat8_b: + return True, "bfloat8_b/bfloat4_b requires TILE_LAYOUT!" + if test_vector["input_spec"]["input_shape"][-1] % 64 != 0: + return True, "Input X dimension (133) must be divisible by 64 for tiling" + if test_vector["input_spec"]["token_idx"] and test_vector["input_spec"]["input_shape"][0] != 1: + return True, "When passing token_idx, sequence length must be 1" + return False, None + + +# This is the run instructions for the test, defined by the developer. +# The run function must take the above-defined parameters as inputs. +# The runner will call this run function with each test vector, and the returned results from this function will be stored. +# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra. 
+def run( + input_spec, + input_dtype, + input_layout, + input_memory_config, + output_memory_config, + *, + device, +) -> list: + data_seed = random.randint(0, 20000000) + torch.manual_seed(data_seed) + + input_shape, cache_size, token_idx = input_spec.values() + seq_length, batch_size, num_heads, head_dim = input_shape + + sin_cos_cache_shape = [1, 1, cache_size, head_dim] + + torch_input_tensor = gen_func_with_cast_tt( + partial(torch_random, low=-100, high=100, dtype=torch.float32), input_dtype + )(input_shape) + torch_cos_cache_tensor = gen_func_with_cast_tt( + partial(torch_random, low=-100, high=100, dtype=torch.float32), input_dtype + )(sin_cos_cache_shape) + torch_sin_cache_tensor = gen_func_with_cast_tt( + partial(torch_random, low=-100, high=100, dtype=torch.float32), input_dtype + )(sin_cos_cache_shape) + + if token_idx: + golden_function = partial(ttnn.get_golden_function(ttnn.experimental.rotary_embedding), token_idx=token_idx) + else: + + def rotate_half(x): + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + def apply_rotary_pos_emb(x, cos_cached, sin_cached, token_idx=None): + seq_len = x.shape[-2] + if token_idx is None: + cos = cos_cached[:, :, :seq_len, ...] + sin = sin_cached[:, :, :seq_len, ...] + else: + cos = cos_cached[:, :, token_idx : token_idx + 1, ...] + sin = sin_cached[:, :, token_idx : token_idx + 1, ...] 
+ + x_embed = (x * cos) + (rotate_half(x) * sin) + return x_embed + + golden_function = apply_rotary_pos_emb + + torch_output_tensor = golden_function(torch_input_tensor, torch_cos_cache_tensor, torch_sin_cache_tensor) + + input_tensor = ttnn.from_torch( + torch_input_tensor, + dtype=input_dtype, + layout=input_layout, + device=device, + memory_config=input_memory_config, + ) + cos_cache_tensor = ttnn.from_torch( + torch_cos_cache_tensor, + dtype=input_dtype, + layout=input_layout, + device=device, + memory_config=input_memory_config, + ) + sin_cache_tensor = ttnn.from_torch( + torch_sin_cache_tensor, + dtype=input_dtype, + layout=input_layout, + device=device, + memory_config=input_memory_config, + ) + + start_time = start_measuring_time() + output_tensor = ttnn.experimental.rotary_embedding( + input_tensor, cos_cache_tensor, sin_cache_tensor, token_idx, memory_config=output_memory_config + ) + e2e_perf = stop_measuring_time(start_time) + + output_tensor = ttnn.to_torch(output_tensor) + + return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf] From a27532012ca611ed8a3eabb39fcca8b8ad8dc869 Mon Sep 17 00:00:00 2001 From: Bryan Wilder Field Lozano Date: Tue, 10 Dec 2024 16:11:01 +0530 Subject: [PATCH 26/59] Move pybind11 to CPM (#15825) --- .gitmodules | 3 --- CMakeLists.txt | 12 ------------ dependencies/CMakeLists.txt | 6 ++++++ tt_metal/third_party/pybind11 | 1 - ttnn/CMakeLists.txt | 2 +- 5 files changed, 7 insertions(+), 17 deletions(-) delete mode 160000 tt_metal/third_party/pybind11 diff --git a/.gitmodules b/.gitmodules index d20bc574cc3a..4029f9918d62 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,3 @@ -[submodule "third_party/pybind11"] - path = tt_metal/third_party/pybind11 - url = https://github.com/pybind/pybind11.git [submodule "third_party/lfs"] path = tt_metal/third_party/lfs url = https://github.com/tenstorrent-metal/lfs.git diff --git a/CMakeLists.txt b/CMakeLists.txt index a4146a61cb94..f929b62c9890 100644 --- 
a/CMakeLists.txt +++ b/CMakeLists.txt @@ -259,18 +259,6 @@ if(ENABLE_TRACY) add_link_options(-rdynamic) endif() -if(WITH_PYTHON_BINDINGS) - # Can't use the `REUSE_FROM` option bc tt_lib and ttnn have different build flags :( - add_library(pch_pybinds INTERFACE) - target_precompile_headers( - pch_pybinds - INTERFACE - ${PROJECT_SOURCE_DIR}/tt_metal/third_party/pybind11/include/pybind11/operators.h - ${PROJECT_SOURCE_DIR}/tt_metal/third_party/pybind11/include/pybind11/pybind11.h - ${PROJECT_SOURCE_DIR}/tt_metal/third_party/pybind11/include/pybind11/stl.h - ) -endif() - ############################################################################################################################ # Build subdirectories ############################################################################################################################ diff --git a/dependencies/CMakeLists.txt b/dependencies/CMakeLists.txt index 7369f655c1cb..e14310435a44 100644 --- a/dependencies/CMakeLists.txt +++ b/dependencies/CMakeLists.txt @@ -85,3 +85,9 @@ CPMAddPackage(NAME fmt GITHUB_REPOSITORY fmtlib/fmt GIT_TAG 11.0.1) ############################################################################################################################ CPMAddPackage(NAME range-v3 GITHUB_REPOSITORY ericniebler/range-v3 GIT_TAG 0.12.0) + +############################################################################################################################ +# pybind11 : https://github.com/pybind/pybind11 +############################################################################################################################ + +CPMAddPackage(NAME pybind11 GITHUB_REPOSITORY pybind/pybind11 GIT_TAG b8f28551cc3a98ea9fbfc15c05b513c8f2d23e84) diff --git a/tt_metal/third_party/pybind11 b/tt_metal/third_party/pybind11 deleted file mode 160000 index b8f28551cc3a..000000000000 --- a/tt_metal/third_party/pybind11 +++ /dev/null @@ -1 +0,0 @@ -Subproject commit b8f28551cc3a98ea9fbfc15c05b513c8f2d23e84 diff 
--git a/ttnn/CMakeLists.txt b/ttnn/CMakeLists.txt index fae0eddc6790..1da988236ab1 100644 --- a/ttnn/CMakeLists.txt +++ b/ttnn/CMakeLists.txt @@ -649,7 +649,7 @@ if(WITH_PYTHON_BINDINGS) list( APPEND TTNN_PUBLIC_LINK_LIBRARIES - pch_pybinds + pybind11::module ${Python3_LIBRARIES} ) endif() From 4bcc79b1ab03b10f707393430f19fbfa06f03a9b Mon Sep 17 00:00:00 2001 From: Miguel Tairum <150826086+mtairum@users.noreply.github.com> Date: Tue, 10 Dec 2024 11:50:41 +0000 Subject: [PATCH 27/59] [Llama3] Add test-accuracy to CI (#15778) --- .../workflows/t3000-frequent-tests-impl.yaml | 1 + models/demos/llama3/PERF.md | 8 ++-- .../single_card/run_single_card_demo_tests.sh | 14 +++++++ .../scripts/t3000/run_t3000_frequent_tests.sh | 39 ++++++++++++++++++- 4 files changed, 57 insertions(+), 5 deletions(-) diff --git a/.github/workflows/t3000-frequent-tests-impl.yaml b/.github/workflows/t3000-frequent-tests-impl.yaml index fde2ede1652e..11a2df7b1465 100644 --- a/.github/workflows/t3000-frequent-tests-impl.yaml +++ b/.github/workflows/t3000-frequent-tests-impl.yaml @@ -21,6 +21,7 @@ jobs: { name: "t3k llama3.2-vision tests", arch: wormhole_b0, cmd: run_t3000_llama3.2-11b-vision_freq_tests, timeout: 60, owner_id: U03FJB5TM5Y}, #Colman Glagovich { name: "t3k n300 mesh llama3.2-vision tests", arch: wormhole_b0, cmd: run_t3000_spoof_n300_llama3.2-11b-vision_freq_tests, timeout: 60, owner_id: U03FJB5TM5Y}, #Colman Glagovich { name: "t3k llama3 tests", arch: wormhole_b0, cmd: run_t3000_llama3_tests, timeout: 45, owner_id: U03PUAKE719}, #Miguel Tairum Cruz + { name: "t3k llama3 accuracy tests", arch: wormhole_b0, cmd: run_t3000_llama3_accuracy_tests, timeout: 45, owner_id: U03PUAKE719}, #Miguel Tairum Cruz { name: "t3k llama2_70b tests", arch: wormhole_b0, cmd: run_t3000_llama2_70b_tests, timeout: 45, owner_id: U03FJB5TM5Y}, #Colman Glagovich # { name: "t3k llama3_70b tests", arch: wormhole_b0, cmd: run_t3000_llama3_70b_tests, timeout: 45, owner_id: U03FJB5TM5Y}, #Colman Glagovich # FIXME 
issue #14934 { name: "t3k mixtral tests", arch: wormhole_b0, cmd: run_t3000_mixtral_tests, timeout: 60, owner_id: U03PUAKE719}, #Miguel Tairum Cruz diff --git a/models/demos/llama3/PERF.md b/models/demos/llama3/PERF.md index dd060a14c1c7..f0dbf00ec4b0 100644 --- a/models/demos/llama3/PERF.md +++ b/models/demos/llama3/PERF.md @@ -12,16 +12,16 @@ This configuration uses bfp4 MLP FF1+FF3 for all models. |-------|--------|-----------|-----------|---------------| | 1b | N150 | 79 | 98 | 90.5 | | 1b | N300 | 81 | 98 | 101.7 | -| 1b | T3K | 81 | 98 | 97.5 | +| 1b | T3K | 81 | 98 | 96.8 | | 3b | N150 | 85 | 96 | 49.0 | | 3b | N300 | 88 | 97 | 56.9 | | 3b | T3K | 88 | 97 | 54.5 | | 8b | N150 | 86 | 98 | 28.4 | | 8b | N300 | 84 | 98 | 38.6 | -| 8b | T3K | 84 | 98 | 52.6 | +| 8b | T3K | 84 | 97 | 52.6 | | 11b | N300 | 86 | 97 | 38.6 | | 11b | T3K | 84 | 98 | 52.6 | -| 70b | T3K | 95 | 100 | 14.3 | +| 70b | T3K | 94 | 100 | 14.3 | ## LlamaOptimizations.accuracy @@ -40,4 +40,4 @@ This configuration uses bfp4 MLP FF1+FF3 only for the 3.1-70B model. | 8b | T3K | 88 | 97 | 49.9 | | 11b | N300 | 90 | 97 | 33.8 | | 11b | T3K | 88 | 97 | 52.6 | -| 70b | T3K | 95 | 100 | 14.5 | +| 70b | T3K | 94 | 100 | 14.5 | diff --git a/tests/scripts/single_card/run_single_card_demo_tests.sh b/tests/scripts/single_card/run_single_card_demo_tests.sh index 053f5b966af4..11a4e96a8954 100755 --- a/tests/scripts/single_card/run_single_card_demo_tests.sh +++ b/tests/scripts/single_card/run_single_card_demo_tests.sh @@ -15,6 +15,20 @@ run_common_func_tests() { # Qwen7B QWEN_DIR=/mnt/MLPerf/tt_dnn-models/qwen/Qwen2-7B-Instruct WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml FAKE_DEVICE=N150 pytest -n auto models/demos/qwen/demo/demo.py -k instruct --timeout 420; fail+=$? 
+ # Llama3 Accuracy tests + # Llama3.2-1B + llama1b=/mnt/MLPerf/tt_dnn-models/llama/Llama3.2-1B-Instruct/ + # Llama3.2-3B + llama3b=/mnt/MLPerf/tt_dnn-models/llama/Llama3.2-3B-Instruct/ + # Llama3.1-8B (11B weights are the same) + llama8b=/mnt/MLPerf/tt_dnn-models/llama/Meta-Llama-3.1-8B-Instruct/ + + # Run Llama3 accuracy tests for 1B, 3B, 8B weights + for llama_dir in "$llama1b" "$llama3b" "$llama8b"; do + LLAMA_DIR=$llama_dir WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/llama3/tests/test_llama_accuracy.py -k perf --timeout 420; fail+=$? + echo "LOG_METAL: Llama3 accuracy tests for $llama_dir completed" + done + #VGG11/VGG16 WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/vgg/demo/demo.py --timeout 600; fail+=$? diff --git a/tests/scripts/t3000/run_t3000_frequent_tests.sh b/tests/scripts/t3000/run_t3000_frequent_tests.sh index 0058a3fc9e3f..3ade2f433552 100755 --- a/tests/scripts/t3000/run_t3000_frequent_tests.sh +++ b/tests/scripts/t3000/run_t3000_frequent_tests.sh @@ -63,7 +63,7 @@ run_t3000_llama3_tests() { # Run test model for llama3 - 1B, 3B, 8B and 11B weights for llama_dir in "$llama1b" "$llama3b" "$llama8b" "$llama11b"; do LLAMA_DIR=$llama_dir WH_ARCH_YAML=$wh_arch_yaml pytest -n auto models/demos/llama3/tests/test_llama_model.py -k full ; fail+=$? - # LLAMA_DIR=$llama_dir WH_ARCH_YAML=$wh_arch_yaml pytest -n auto models/demos/llama3/tests/test_llama_model_prefill.py ; fail+=$? # FIXME Issue #14843 + LLAMA_DIR=$llama_dir WH_ARCH_YAML=$wh_arch_yaml pytest -n auto models/demos/llama3/tests/test_llama_model_prefill.py ; fail+=$? 
echo "LOG_METAL: Llama3 tests for $llama_dir completed" done @@ -96,6 +96,40 @@ run_t3000_llama3_70b_tests() { fi } +run_t3000_llama3_accuracy_tests() { + # Record the start time + fail=0 + start_time=$(date +%s) + + echo "LOG_METAL: Running run_t3000_llama3_accuracy_tests" + + wh_arch_yaml=wormhole_b0_80_arch_eth_dispatch.yaml + # Llama3.2-1B + llama1b=/mnt/MLPerf/tt_dnn-models/llama/Llama3.2-1B-Instruct/ + # Llama3.2-3B + llama3b=/mnt/MLPerf/tt_dnn-models/llama/Llama3.2-3B-Instruct/ + # Llama3.1-8B + llama8b=/mnt/MLPerf/tt_dnn-models/llama/Meta-Llama-3.1-8B-Instruct/ + # Llama3.2-11B + llama11b=/mnt/MLPerf/tt_dnn-models/llama/Llama3.2-11B-Vision-Instruct/ + # Llama3.1-70B + llama70b=/mnt/MLPerf/tt_dnn-models/llama/Llama3.1-70B-Instruct/ + + # Run test accuracy llama3 - 1B, 3B, 8B, 11B and 70B weights + for llama_dir in "$llama1b" "$llama3b" "$llama8b" "$llama11b" "$llama70b"; do + LLAMA_DIR=$llama_dir WH_ARCH_YAML=$wh_arch_yaml pytest -n auto models/demos/llama3/tests/test_llama_accuracy.py -k perf ; fail+=$? 
+ echo "LOG_METAL: Llama3 accuracy tests for $llama_dir completed" + done + + # Record the end time + end_time=$(date +%s) + duration=$((end_time - start_time)) + echo "LOG_METAL: run_t3000_llama3_accuracy_tests $duration seconds to complete" + if [[ $fail -ne 0 ]]; then + exit 1 + fi +} + run_t3000_llama3.2-11b-vision_freq_tests() { # Record the start time fail=0 @@ -277,6 +311,9 @@ run_t3000_tests() { # Run llama3-70b tests run_t3000_llama3_70b_tests + # Run llama3 accuracy tests + run_t3000_llama3_accuracy_tests + # Run Llama3.2-11B Vision tests run_t3000_llama3.2-11b-vision_freq_tests From 969c680cf9e8b63d95ff2aac89b2ced5a872905e Mon Sep 17 00:00:00 2001 From: Borys Bradel <164946524+bbradelTT@users.noreply.github.com> Date: Tue, 10 Dec 2024 09:26:45 -0500 Subject: [PATCH 28/59] #17532: Add relevant tests to matmul trace sweeps (#15850) ### Ticket Link to Github Issue #15732 ### Problem description Need tests for various shapes to verify that they pass after making changes ### What's changed Create the tests based on a trace of a run of the model ### Checklist - [x] Post commit CI passes https://github.com/tenstorrent/tt-metal/actions/runs/12247713568 - [ ] Blackhole Post commit (if applicable) N/A - [ ] Model regression CI testing passes (if applicable) N/A - [ ] Device performance regression CI testing passes (if applicable) N/A - [ ] **(For models and ops writers)** Full [new models](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) tests passes N/A - [x] New/Existing tests provide coverage for changes --- .../sweeps/matmul/short/matmul_traces.py | 258 +++++++++++++++++- 1 file changed, 253 insertions(+), 5 deletions(-) diff --git a/tests/sweep_framework/sweeps/matmul/short/matmul_traces.py b/tests/sweep_framework/sweeps/matmul/short/matmul_traces.py index c66e55488345..0b6c9031553e 100644 --- a/tests/sweep_framework/sweeps/matmul/short/matmul_traces.py +++ b/tests/sweep_framework/sweeps/matmul/short/matmul_traces.py @@ 
-13,6 +13,10 @@ TIMEOUT = 70 +# params contains the shape of the first tensor followed by the second tensor +# Note: the shape of the second tensor starts at int(count / 2). It's easiest +# to reason about if both tensors are the same rank, although some other +# combinations may be valid. parameters = { "default": { "params": [ @@ -111,25 +115,269 @@ (9, 768, 768, 640), (920, 256, 256, 256), ], - } + "core_grid": [False], + }, + "gpt": { + "params": [ + (1, 1, 1, 1, 1, 1, 1, 1), + (1, 1, 1, 1, 1, 1, 1, 2304), + (1, 1, 1, 1, 1, 1, 1, 3072), + (1, 1, 1, 1, 1, 1, 1, 65536), + (1, 1, 1, 1, 1, 1, 1, 768), + (1, 1, 1, 1, 1, 1, 1, 96), + (1, 1, 1, 2304, 1, 1, 2304, 1), + (1, 1, 1, 2304, 1, 1, 2304, 65536), + (1, 1, 1, 2304, 1, 1, 2304, 768), + (1, 1, 1, 3072, 1, 1, 3072, 1), + (1, 1, 1, 3072, 1, 1, 3072, 65536), + (1, 1, 1, 3072, 1, 1, 3072, 768), + (1, 1, 1, 65536, 1, 1, 65536, 2304), + (1, 1, 1, 65536, 1, 1, 65536, 3072), + (1, 1, 1, 65536, 1, 1, 65536, 768), + (1, 1, 1, 65536, 1, 1, 65536, 96), + (1, 1, 1, 768, 1, 1, 768, 1), + (1, 1, 1, 768, 1, 1, 768, 1024), + (1, 1, 1, 768, 1, 1, 768, 2304), + (1, 1, 1, 768, 1, 1, 768, 3072), + (1, 1, 1, 768, 1, 1, 768, 65536), + (1, 1, 1, 768, 1, 1, 768, 768), + (1, 1, 1, 768, 1, 1, 768, 96), + (1, 1, 1, 96, 1, 1, 96, 1), + (1, 1, 1, 96, 1, 1, 96, 65536), + (1, 1, 1, 96, 1, 1, 96, 768), + (1, 1, 1024, 768, 1, 1, 768, 1), + (1, 1, 1024, 768, 1, 1, 768, 1024), + (1, 1, 1024, 768, 1, 1, 768, 2304), + (1, 1, 1024, 768, 1, 1, 768, 3072), + (1, 1, 1024, 768, 1, 1, 768, 65536), + (1, 1, 1024, 768, 1, 1, 768, 768), + (1, 1, 1024, 768, 1, 1, 768, 96), + (1, 1, 2304, 1, 1, 1, 1, 1), + (1, 1, 2304, 1, 1, 1, 1, 2304), + (1, 1, 2304, 1, 1, 1, 1, 3072), + (1, 1, 2304, 1, 1, 1, 1, 65536), + (1, 1, 2304, 1, 1, 1, 1, 768), + (1, 1, 2304, 1, 1, 1, 1, 96), + (1, 1, 2304, 65536, 1, 1, 65536, 1), + (1, 1, 2304, 65536, 1, 1, 65536, 2304), + (1, 1, 2304, 65536, 1, 1, 65536, 3072), + (1, 1, 2304, 65536, 1, 1, 65536, 768), + (1, 1, 2304, 65536, 1, 1, 
65536, 96), + (1, 1, 2304, 768, 1, 1, 768, 1), + (1, 1, 2304, 768, 1, 1, 768, 1024), + (1, 1, 2304, 768, 1, 1, 768, 2304), + (1, 1, 2304, 768, 1, 1, 768, 3072), + (1, 1, 2304, 768, 1, 1, 768, 65536), + (1, 1, 2304, 768, 1, 1, 768, 768), + (1, 1, 2304, 768, 1, 1, 768, 96), + (1, 1, 3072, 1, 1, 1, 1, 1), + (1, 1, 3072, 1, 1, 1, 1, 2304), + (1, 1, 3072, 1, 1, 1, 1, 3072), + (1, 1, 3072, 1, 1, 1, 1, 65536), + (1, 1, 3072, 1, 1, 1, 1, 768), + (1, 1, 3072, 1, 1, 1, 1, 96), + (1, 1, 3072, 65536, 1, 1, 65536, 1), + (1, 1, 3072, 65536, 1, 1, 65536, 2304), + (1, 1, 3072, 65536, 1, 1, 65536, 3072), + (1, 1, 3072, 65536, 1, 1, 65536, 768), + (1, 1, 3072, 65536, 1, 1, 65536, 96), + (1, 1, 3072, 768, 1, 1, 768, 1), + (1, 1, 3072, 768, 1, 1, 768, 1024), + (1, 1, 3072, 768, 1, 1, 768, 2304), + (1, 1, 3072, 768, 1, 1, 768, 3072), + (1, 1, 3072, 768, 1, 1, 768, 65536), + (1, 1, 3072, 768, 1, 1, 768, 768), + (1, 1, 3072, 768, 1, 1, 768, 96), + (1, 1, 65536, 1, 1, 1, 1, 1), + (1, 1, 65536, 1, 1, 1, 1, 2304), + (1, 1, 65536, 1, 1, 1, 1, 3072), + (1, 1, 65536, 1, 1, 1, 1, 65536), + (1, 1, 65536, 1, 1, 1, 1, 768), + (1, 1, 65536, 1, 1, 1, 1, 96), + (1, 1, 65536, 2304, 1, 1, 2304, 1), + (1, 1, 65536, 2304, 1, 1, 2304, 65536), + (1, 1, 65536, 2304, 1, 1, 2304, 768), + (1, 1, 65536, 3072, 1, 1, 3072, 1), + (1, 1, 65536, 3072, 1, 1, 3072, 65536), + (1, 1, 65536, 3072, 1, 1, 3072, 768), + (1, 1, 65536, 768, 1, 1, 768, 1), + (1, 1, 65536, 768, 1, 1, 768, 1024), + (1, 1, 65536, 768, 1, 1, 768, 2304), + (1, 1, 65536, 768, 1, 1, 768, 3072), + (1, 1, 65536, 768, 1, 1, 768, 65536), + (1, 1, 65536, 768, 1, 1, 768, 768), + (1, 1, 65536, 768, 1, 1, 768, 96), + (1, 1, 65536, 96, 1, 1, 96, 65536), + (1, 1, 65536, 96, 1, 1, 96, 768), + (1, 1, 768, 1, 1, 1, 1, 1), + (1, 1, 768, 1, 1, 1, 1, 2304), + (1, 1, 768, 1, 1, 1, 1, 3072), + (1, 1, 768, 1, 1, 1, 1, 65536), + (1, 1, 768, 1, 1, 1, 1, 768), + (1, 1, 768, 1, 1, 1, 1, 96), + (1, 1, 768, 1024, 1, 1, 1024, 768), + (1, 1, 768, 2304, 1, 1, 2304, 1), + (1, 1, 
768, 2304, 1, 1, 2304, 65536), + (1, 1, 768, 2304, 1, 1, 2304, 768), + (1, 1, 768, 3072, 1, 1, 3072, 1), + (1, 1, 768, 3072, 1, 1, 3072, 65536), + (1, 1, 768, 3072, 1, 1, 3072, 768), + (1, 1, 768, 65536, 1, 1, 65536, 1), + (1, 1, 768, 65536, 1, 1, 65536, 2304), + (1, 1, 768, 65536, 1, 1, 65536, 3072), + (1, 1, 768, 65536, 1, 1, 65536, 768), + (1, 1, 768, 65536, 1, 1, 65536, 96), + (1, 1, 768, 768, 1, 1, 768, 1), + (1, 1, 768, 768, 1, 1, 768, 1024), + (1, 1, 768, 768, 1, 1, 768, 2304), + (1, 1, 768, 768, 1, 1, 768, 3072), + (1, 1, 768, 768, 1, 1, 768, 65536), + (1, 1, 768, 768, 1, 1, 768, 768), + (1, 1, 768, 768, 1, 1, 768, 96), + (1, 1, 768, 96, 1, 1, 96, 1), + (1, 1, 768, 96, 1, 1, 96, 65536), + (1, 1, 768, 96, 1, 1, 96, 768), + (1, 1, 96, 1, 1, 1, 1, 1), + (1, 1, 96, 1, 1, 1, 1, 2304), + (1, 1, 96, 1, 1, 1, 1, 3072), + (1, 1, 96, 1, 1, 1, 1, 65536), + (1, 1, 96, 1, 1, 1, 1, 768), + (1, 1, 96, 1, 1, 1, 1, 96), + (1, 1, 96, 65536, 1, 1, 65536, 1), + (1, 1, 96, 65536, 1, 1, 65536, 2304), + (1, 1, 96, 65536, 1, 1, 65536, 3072), + (1, 1, 96, 65536, 1, 1, 65536, 768), + (1, 1, 96, 65536, 1, 1, 65536, 96), + (1, 1, 96, 768, 1, 1, 768, 1), + (1, 1, 96, 768, 1, 1, 768, 1024), + (1, 1, 96, 768, 1, 1, 768, 2304), + (1, 1, 96, 768, 1, 1, 768, 3072), + (1, 1, 96, 768, 1, 1, 768, 65536), + (1, 1, 96, 768, 1, 1, 768, 768), + (1, 1, 96, 768, 1, 1, 768, 96), + (1, 64, 1024, 768, 1, 1, 768, 1), + (1, 64, 1024, 768, 1, 1, 768, 2304), + (1, 64, 1024, 768, 1, 1, 768, 3072), + (1, 64, 1024, 768, 1, 1, 768, 65536), + (1, 64, 1024, 768, 1, 1, 768, 768), + (1, 64, 1024, 768, 1, 1, 768, 96), + (1, 64, 768, 1024, 1, 1, 1024, 768), + (1, 64, 768, 1024, 1, 64, 1024, 768), + (64, 1, 1, 1024, 1, 1, 1024, 768), + (64, 1, 1, 1024, 64, 1, 1024, 1), + (64, 1, 1, 1024, 64, 1, 1024, 2304), + (64, 1, 1, 1024, 64, 1, 1024, 3072), + (64, 1, 1, 1024, 64, 1, 1024, 768), + (64, 1, 1, 1024, 64, 1, 1024, 96), + (64, 1, 1, 768, 1, 1, 768, 1), + (64, 1, 1, 768, 1, 1, 768, 1024), + (64, 1, 1, 768, 1, 1, 768, 
2304), + (64, 1, 1, 768, 1, 1, 768, 3072), + (64, 1, 1, 768, 1, 1, 768, 65536), + (64, 1, 1, 768, 1, 1, 768, 768), + (64, 1, 1, 768, 1, 1, 768, 96), + (64, 1, 1, 768, 64, 1, 768, 1), + (64, 1, 1, 768, 64, 1, 768, 1024), + (64, 1, 1024, 1, 1, 1, 1, 2304), + (64, 1, 1024, 1, 1, 1, 1, 3072), + (64, 1, 1024, 1, 1, 1, 1, 768), + (64, 1, 1024, 1, 1, 1, 1, 96), + (64, 1, 1024, 1, 64, 1, 1, 1024), + (64, 1, 1024, 1, 64, 1, 1, 768), + (64, 1, 1024, 2304, 1, 1, 2304, 65536), + (64, 1, 1024, 2304, 1, 1, 2304, 768), + (64, 1, 1024, 2304, 64, 1, 2304, 1024), + (64, 1, 1024, 3072, 1, 1, 3072, 1), + (64, 1, 1024, 3072, 1, 1, 3072, 65536), + (64, 1, 1024, 3072, 1, 1, 3072, 768), + (64, 1, 1024, 768, 1, 1, 768, 1), + (64, 1, 1024, 768, 1, 1, 768, 1024), + (64, 1, 1024, 768, 1, 1, 768, 2304), + (64, 1, 1024, 768, 1, 1, 768, 3072), + (64, 1, 1024, 768, 1, 1, 768, 65536), + (64, 1, 1024, 768, 1, 1, 768, 768), + (64, 1, 1024, 768, 1, 1, 768, 96), + (64, 1, 1024, 768, 64, 1, 768, 1024), + (64, 1, 1024, 96, 1, 1, 96, 65536), + (64, 1, 1024, 96, 1, 1, 96, 768), + (64, 1, 1024, 96, 64, 1, 96, 1024), + (64, 1, 2304, 1024, 1, 1, 1024, 768), + (64, 1, 2304, 1024, 64, 1, 1024, 1), + (64, 1, 2304, 1024, 64, 1, 1024, 2304), + (64, 1, 2304, 1024, 64, 1, 1024, 3072), + (64, 1, 2304, 1024, 64, 1, 1024, 768), + (64, 1, 2304, 1024, 64, 1, 1024, 96), + (64, 1, 3072, 1024, 1, 1, 1024, 768), + (64, 1, 3072, 1024, 64, 1, 1024, 1), + (64, 1, 3072, 1024, 64, 1, 1024, 2304), + (64, 1, 3072, 1024, 64, 1, 1024, 3072), + (64, 1, 3072, 1024, 64, 1, 1024, 768), + (64, 1, 3072, 1024, 64, 1, 1024, 96), + (64, 1, 768, 1, 1, 1, 1, 2304), + (64, 1, 768, 1, 1, 1, 1, 3072), + (64, 1, 768, 1, 1, 1, 1, 768), + (64, 1, 768, 1, 1, 1, 1, 96), + (64, 1, 768, 1, 64, 1, 1, 768), + (64, 1, 768, 1024, 1, 1, 1024, 768), + (64, 1, 768, 1024, 64, 1, 1024, 1), + (64, 1, 768, 1024, 64, 1, 1024, 2304), + (64, 1, 768, 1024, 64, 1, 1024, 3072), + (64, 1, 768, 1024, 64, 1, 1024, 768), + (64, 1, 768, 1024, 64, 1, 1024, 96), + (64, 1, 96, 
1024, 1, 1, 1024, 768), + (64, 1, 96, 1024, 64, 1, 1024, 1), + (64, 1, 96, 1024, 64, 1, 1024, 2304), + (64, 1, 96, 1024, 64, 1, 1024, 3072), + (64, 1, 96, 1024, 64, 1, 1024, 768), + (64, 1, 96, 1024, 64, 1, 1024, 96), + (64, 12, 1, 1024, 1, 1, 1024, 768), + (64, 12, 1, 1024, 64, 12, 1024, 1), + (64, 12, 1, 1024, 64, 12, 1024, 1024), + (64, 12, 1, 1024, 64, 12, 1024, 64), + (64, 12, 1024, 1, 1, 1, 1, 1), + (64, 12, 1024, 1, 1, 1, 1, 2304), + (64, 12, 1024, 1, 1, 1, 1, 3072), + (64, 12, 1024, 1, 1, 1, 1, 768), + (64, 12, 1024, 1, 1, 1, 1, 96), + (64, 12, 1024, 1, 64, 12, 1, 1024), + (64, 12, 1024, 1024, 1, 1, 1024, 768), + (64, 12, 1024, 1024, 64, 12, 1024, 1), + (64, 12, 1024, 1024, 64, 12, 1024, 1024), + (64, 12, 1024, 1024, 64, 12, 1024, 64), + (64, 12, 1024, 64, 64, 12, 64, 1024), + (64, 12, 64, 1024, 1, 1, 1024, 768), + (64, 12, 64, 1024, 64, 12, 1024, 1), + (64, 12, 64, 1024, 64, 12, 1024, 1024), + (64, 12, 64, 1024, 64, 12, 1024, 64), + ], + "core_grid": [True, False], + }, } def run( params, + core_grid, *, device, ) -> list: - [in0_h, in0_w, in1_h, in1_w] = params - torch_input_tensor0 = torch.rand([in0_h, in0_w], dtype=torch.float32) - torch_input_tensor1 = torch.rand([in1_h, in1_w], dtype=torch.float32) + if core_grid == False: + grid = None + else: + grid = device.core_grid + count = len(params) + half = int(count / 2) + shape0 = params[0:half] + shape1 = params[half:count] + torch_input_tensor0 = torch.rand(shape0, dtype=torch.float32) + torch_input_tensor1 = torch.rand(shape1, dtype=torch.float32) torch_output_tensor = torch.matmul(torch_input_tensor0, torch_input_tensor1) input_tensor0 = ttnn.from_torch(torch_input_tensor0, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) input_tensor1 = ttnn.from_torch(torch_input_tensor1, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) start_time = start_measuring_time() - output_tensor = ttnn.matmul(input_tensor0, input_tensor1) + output_tensor = ttnn.matmul(input_tensor0, input_tensor1, 
core_grid=grid) output_tensor = ttnn.to_torch(output_tensor) e2e_perf = stop_measuring_time(start_time) expected_pcc = 0.99 From 1bef3e0ba5e6a2206ad7bfce5214e2eba9610e66 Mon Sep 17 00:00:00 2001 From: Austin Ho Date: Sun, 8 Dec 2024 01:18:58 +0000 Subject: [PATCH 29/59] #15836: Update reads, writes, and synchronize ttnn apis to take in sub device ids --- tests/ttnn/unit_tests/test_sub_device.py | 42 +++++++- ttnn/cpp/pybind11/device.cpp | 23 +++- ttnn/cpp/pybind11/events.cpp | 8 +- ttnn/cpp/pybind11/operations/core.hpp | 31 ++++-- ttnn/cpp/pybind11/pytensor.cpp | 35 +++++- ttnn/cpp/ttnn/events.cpp | 11 +- ttnn/cpp/ttnn/events.hpp | 5 +- ttnn/cpp/ttnn/operations/core/core.cpp | 36 +++++-- ttnn/cpp/ttnn/operations/core/core.hpp | 26 ++++- .../cpp/ttnn/operations/reduction/moe/moe.cpp | 3 +- ttnn/cpp/ttnn/tensor/tensor.cpp | 41 ++++--- ttnn/cpp/ttnn/tensor/tensor.hpp | 27 +++-- ttnn/cpp/ttnn/tensor/tensor_impl.cpp | 102 +++++++++++------- ttnn/cpp/ttnn/tensor/tensor_impl.hpp | 17 ++- ttnn/cpp/ttnn/tensor/tensor_ops.cpp | 64 +++++++---- ttnn/cpp/ttnn/tensor/tensor_ops.hpp | 17 ++- ttnn/ttnn/__init__.py | 2 + ttnn/ttnn/device.py | 3 + ttnn/ttnn/distributed/distributed.py | 12 ++- ttnn/ttnn/operations/core.py | 14 ++- 20 files changed, 378 insertions(+), 141 deletions(-) diff --git a/tests/ttnn/unit_tests/test_sub_device.py b/tests/ttnn/unit_tests/test_sub_device.py index 7d3f93797a7d..be2c81748709 100644 --- a/tests/ttnn/unit_tests/test_sub_device.py +++ b/tests/ttnn/unit_tests/test_sub_device.py @@ -29,7 +29,11 @@ def run_sub_devices(device): sub_device_manager1 = device.create_sub_device_manager([sub_device_1, sub_device_2], 3200) sub_device_manager2 = device.create_sub_device_manager([sub_device_2], 3200) device.load_sub_device_manager(sub_device_manager1) + ttnn.synchronize_devices(device, sub_device_ids=[ttnn.SubDeviceId(1)]) + ttnn.synchronize_devices(device, sub_device_ids=[ttnn.SubDeviceId(0), ttnn.SubDeviceId(1)]) + ttnn.synchronize_devices(device) 
device.load_sub_device_manager(sub_device_manager2) + ttnn.synchronize_devices(device, sub_device_ids=[ttnn.SubDeviceId(0)]) device.clear_loaded_sub_device_manager() device.remove_sub_device_manager(sub_device_manager1) device.remove_sub_device_manager(sub_device_manager2) @@ -48,16 +52,16 @@ def run_sub_devices_program(device): tensix_cores0 = ttnn.CoreRangeSet( { ttnn.CoreRange( - ttnn.CoreCoord(0, 0), - ttnn.CoreCoord(3, 3), + ttnn.CoreCoord(4, 4), + ttnn.CoreCoord(4, 4), ), } ) tensix_cores1 = ttnn.CoreRangeSet( { ttnn.CoreRange( - ttnn.CoreCoord(4, 4), - ttnn.CoreCoord(4, 4), + ttnn.CoreCoord(0, 0), + ttnn.CoreCoord(3, 3), ), } ) @@ -74,8 +78,19 @@ def run_sub_devices_program(device): device=device, memory_config=ttnn.L1_MEMORY_CONFIG, mesh_mapper=inputs_mesh_mapper, + sub_device_ids=[ttnn.SubDeviceId(0)], ) + xt_host = ttnn.from_torch( + x, + dtype=ttnn.bfloat16, + layout=ttnn.TILE_LAYOUT, + mesh_mapper=inputs_mesh_mapper, + sub_device_ids=[ttnn.SubDeviceId(1)], + ) + + ttnn.copy_host_to_device_tensor(xt_host, xt, sub_device_ids=[ttnn.SubDeviceId(1)]) + grid_size = device.compute_with_storage_grid_size() shard_size = [32, 64] shard_scheme = ttnn.TensorMemoryLayout.HEIGHT_SHARDED @@ -83,11 +98,28 @@ def run_sub_devices_program(device): yt = ttnn.interleaved_to_sharded( xt, grid_size, shard_size, shard_scheme, shard_orientation, output_dtype=ttnn.bfloat16 ) - y = ttnn.to_torch(yt, device=device, mesh_composer=output_mesh_composer) + y = ttnn.to_torch(yt, device=device, mesh_composer=output_mesh_composer, sub_device_ids=[ttnn.SubDeviceId(1)]) + + eq = torch.equal(x, y) + assert eq + + y = ttnn.to_torch(yt.cpu(sub_device_ids=[ttnn.SubDeviceId(0)]), mesh_composer=output_mesh_composer) eq = torch.equal(x, y) assert eq + event = ttnn.create_event(device) + + yt2 = ttnn.interleaved_to_sharded( + xt, grid_size, shard_size, shard_scheme, shard_orientation, output_dtype=ttnn.bfloat16 + ) + ttnn.record_event(0, event, [ttnn.SubDeviceId(1)]) + ttnn.wait_for_event(0, 
event) + y2 = ttnn.to_torch(yt2, device=device, mesh_composer=output_mesh_composer, sub_device_ids=[ttnn.SubDeviceId(0)]) + + eq = torch.equal(x, y2) + assert eq + device.clear_loaded_sub_device_manager() device.remove_sub_device_manager(sub_device_manager) diff --git a/ttnn/cpp/pybind11/device.cpp b/ttnn/cpp/pybind11/device.cpp index b60a36ed7adc..1196c1f08e01 100644 --- a/ttnn/cpp/pybind11/device.cpp +++ b/ttnn/cpp/pybind11/device.cpp @@ -100,6 +100,8 @@ void py_device_module_types(py::module& m_device) { py::class_(m_device, "SubDevice", "Class describing a sub-device of a Tenstorrent accelerator device."); + py::class_(m_device, "SubDeviceId", "ID of a sub-device."); + py::class_(m_device, "SubDeviceManagerId", "ID of a sub-device manager."); } @@ -114,6 +116,14 @@ void device_module(py::module& m_device) { The order of cores is Tensix, then Ethernet. )doc"); + auto pySubDeviceId = static_cast>(m_device.attr("SubDeviceId")); + pySubDeviceId.def( + py::init(), + py::arg("id"), + R"doc( + Creates a SubDeviceId object with the given ID. + )doc"); + auto pyDevice = static_cast>>(m_device.attr("Device")); pyDevice .def( @@ -482,10 +492,11 @@ void device_module(py::module& m_device) { m_device.def( "synchronize_device", - [](Device* device, const std::optional cq_id) { + [](Device* device, const std::optional cq_id, const std::vector& sub_device_ids) { // Send finish command to issue queue through worker thread // Worker thread will stall until the device is flushed. - device->push_work([device, cq_id]() mutable { Synchronize(device, cq_id); }); + device->push_work( + [device, cq_id, &sub_device_ids]() mutable { Synchronize(device, cq_id, sub_device_ids); }); // Main thread stalls until worker is complete (full device and worker queue flush). device->synchronize(); }, @@ -493,10 +504,13 @@ void device_module(py::module& m_device) { Synchronize the device with host by waiting for all operations to complete. 
If cq_id is provided then only the operations associated with that cq_id are waited for, otherwise operations for all command queues are waited on. + If the device has been configured with sub-devices, then sub_device_ids can be provided to only wait + for the operations that ran on the specified sub-devices, otherwise all sub-devices (the entire chip) are waited on. Args: device (ttnn.device.Device): The device to synchronize with. cq_id (int, optional): The command queue ID to synchronize. Defaults to `None`. + sub_device_ids (List[ttnn.SubDeviceId], optional): The sub-device IDs to synchronize. Defaults to all sub-devices. Returns: `None`: The op ensures that all operations are completed. @@ -508,7 +522,8 @@ void device_module(py::module& m_device) { >>> ttnn.synchronize_device(device) )doc", py::arg("device"), - py::arg("cq_id") = std::nullopt); + py::arg("cq_id") = std::nullopt, + py::arg("sub_device_ids") = std::vector()); m_device.def("SetLazyCommandQueueMode", &tt::tt_metal::detail::SetLazyCommandQueueMode, R"doc( If set to true, the host does not notify the device that there are commands available other than the FinishCommand. 
Once set to false, all subsequent commands will immediately notify the device @@ -527,6 +542,8 @@ void device_module(py::module& m_device) { m_device.attr("DEFAULT_L1_SMALL_SIZE") = py::int_(DEFAULT_L1_SMALL_SIZE); m_device.attr("DEFAULT_TRACE_REGION_SIZE") = py::int_(DEFAULT_TRACE_REGION_SIZE); + + m_device.attr("DefaultQueueId") = ttnn::DefaultQueueId; } void py_device_module(py::module& module) { diff --git a/ttnn/cpp/pybind11/events.cpp b/ttnn/cpp/pybind11/events.cpp index fdb12668f63c..4ce6d41e644c 100644 --- a/ttnn/cpp/pybind11/events.cpp +++ b/ttnn/cpp/pybind11/events.cpp @@ -31,15 +31,17 @@ void py_module(py::module& module) { module.def( "record_event", - py::overload_cast&>(&record_event), + py::overload_cast&, const std::vector&>(&record_event), py::arg("cq_id"), py::arg("event"), + py::arg("sub_device_ids") = std::vector(), R"doc( Record the completion of commands on this CQ, preceeding this call. Args: cq_id (int): The Command Queue on which event completion will be recorded. event (event): The event used to record completion of preceeding commands. + sub_device_ids (List[ttnn.SubDeviceId], optional): The sub-device IDs to record completion for. Defaults to all sub-devices. )doc"); module.def( @@ -69,9 +71,10 @@ void py_module(py::module& module) { module.def( "record_event", - py::overload_cast(&record_event), + py::overload_cast&>(&record_event), py::arg("cq_id"), py::arg("multi_device_event"), + py::arg("sub_device_ids") = std::vector(), R"doc( Record the completion of commands on this CQ, preceeding this call. @@ -91,6 +94,7 @@ void py_module(py::module& module) { Args: cq_id (int): The Command Queue on which event completion will be recorded. event (event): The event used to record completion of preceeding commands. + sub_device_ids (List[ttnn.SubDeviceId], optional): The sub-device IDs to record completion for. Defaults to all sub-devices. 
)doc"); } diff --git a/ttnn/cpp/pybind11/operations/core.hpp b/ttnn/cpp/pybind11/operations/core.hpp index 489ee330c5e0..db8a7a1970c8 100644 --- a/ttnn/cpp/pybind11/operations/core.hpp +++ b/ttnn/cpp/pybind11/operations/core.hpp @@ -65,19 +65,31 @@ void py_module(py::module& module) { module.def( "to_device", - py::overload_cast&>( - &ttnn::operations::core::to_device), + py::overload_cast< + const ttnn::Tensor&, + Device*, + const std::optional&, + uint8_t, + const std::vector&>(&ttnn::operations::core::to_device), py::arg("tensor"), py::arg("device"), - py::arg("memory_config") = std::nullopt); + py::arg("memory_config") = std::nullopt, + py::arg("cq_id") = ttnn::DefaultQueueId, + py::arg("sub_device_ids") = std::vector()); module.def( "to_device", - py::overload_cast&>( - &ttnn::operations::core::to_device), + py::overload_cast< + const ttnn::Tensor&, + MeshDevice*, + const std::optional&, + uint8_t, + const std::vector&>(&ttnn::operations::core::to_device), py::arg("tensor"), py::arg("device"), py::arg("memory_config") = std::nullopt, + py::arg("cq_id") = ttnn::DefaultQueueId, + py::arg("sub_device_ids") = std::vector(), R"doc( Copy tensor from host to device. @@ -85,6 +97,9 @@ void py_module(py::module& module) { tensor (ttnn.Tensor): The tensor to be copied from host to device. device (ttnn.Device | ttnn.MeshDevice): The target device where the tensor will be copied. memory_config (ttnn.MemoryConfig, optional): The memory configuration to use. Defaults to `None`. + cq_id (int, optional): The command queue ID to use. Defaults to `0`. + sub_device_ids (List[ttnn.SubDeviceId], optional): The sub-device IDs to wait on before writing the tensor to device memory. + If it is not provided, device will stall for all programs of the specified cq to finish before writing the tensor to device memory. Returns: ttnn.Tensor: The device tensor copy. 
@@ -103,6 +118,7 @@ void py_module(py::module& module) { py::arg("blocking") = true, py::kw_only(), py::arg("cq_id") = ttnn::DefaultQueueId, + py::arg("sub_device_ids") = std::vector(), R"doc( Copy tensor from device to host. @@ -112,6 +128,8 @@ void py_module(py::module& module) { Keyword args: cq_id (int, optional): the command queue ID to use. Defaults to `0`. + sub_device_ids (List[ttnn.SubDeviceId], optional): the sub-device IDs to wait on before reading the tensor from device memory. + If it is not provided, device will stall for all programs of the specified cq to finish before reading the tensor from device memory. Returns: ttnn.Tensor: the host tensor copy. @@ -243,7 +261,8 @@ void py_module(py::module& module) { &ttnn::operations::core::copy_host_to_device_tensor, py::arg("host_tensor"), py::arg("device_tensor"), - py::arg("cq_id") = ttnn::DefaultQueueId); + py::arg("cq_id") = ttnn::DefaultQueueId, + py::arg("sub_device_ids") = std::vector()); module.def( "begin_trace_capture", diff --git a/ttnn/cpp/pybind11/pytensor.cpp b/ttnn/cpp/pybind11/pytensor.cpp index 8cd2e3da094d..48a360fb3cb2 100644 --- a/ttnn/cpp/pybind11/pytensor.cpp +++ b/ttnn/cpp/pybind11/pytensor.cpp @@ -919,15 +919,22 @@ void pytensor_module(py::module& m_tensor) { )doc") .def( "to", - py::overload_cast(&Tensor::to, py::const_), + py::overload_cast&>( + &Tensor::to, py::const_), py::arg("device").noconvert(), py::arg("mem_config").noconvert() = MemoryConfig{.memory_layout = TensorMemoryLayout::INTERLEAVED}, + py::arg("cq_id") = ttnn::DefaultQueueId, + py::arg("sub_device_ids") = std::vector(), py::keep_alive<0, 2>(), R"doc( Move TT Tensor from host device to TT accelerator device. Only BFLOAT16 (in ROW_MAJOR or TILE layout) and BFLOAT8_B, BFLOAT4_B (in TILE layout) are supported on device. + ``sub_device_ids`` can be used to specify which specific sub devices to wait on before writing the tensor to device memory. 
+ + If it is not provided, device will stall for all programs of the specified cq to finish before writing the tensor to device memory. + If ``arg1`` is not supplied, default ``MemoryConfig`` with ``interleaved`` set to ``True``. +-----------+-------------------------------------------------+----------------------------+-----------------------+----------+ @@ -937,6 +944,10 @@ void pytensor_module(py::module& m_tensor) { +-----------+-------------------------------------------------+----------------------------+-----------------------+----------+ | arg1 | MemoryConfig of tensor of TT accelerator device | ttnn.MemoryConfig | | No | +-----------+-------------------------------------------------+----------------------------+-----------------------+----------+ + | arg2 | CQ ID of TT accelerator device to use | uint8_t | | No | + +-----------+-------------------------------------------------+----------------------------+-----------------------+----------+ + | arg3 | Sub device IDs to wait on before writing tensor | List[ttnn.SubDeviceId] | | No | + +-----------+-------------------------------------------------+----------------------------+-----------------------+----------+ .. code-block:: python @@ -950,15 +961,22 @@ void pytensor_module(py::module& m_tensor) { )doc") .def( "to", - py::overload_cast(&Tensor::to, py::const_), + py::overload_cast&>( + &Tensor::to, py::const_), py::arg("mesh_device").noconvert(), py::arg("mem_config").noconvert() = MemoryConfig{.memory_layout = TensorMemoryLayout::INTERLEAVED}, + py::arg("cq_id") = ttnn::DefaultQueueId, + py::arg("sub_device_ids") = std::vector(), py::keep_alive<0, 2>(), R"doc( Move TT Tensor from host device to TT accelerator device. Only BFLOAT16 (in ROW_MAJOR or TILE layout) and BFLOAT8_B, BFLOAT4_B (in TILE layout) are supported on device. + ``sub_device_ids`` can be used to specify which specific sub devices to wait on before writing the tensor to device memory. 
+ + If it is not provided, device will stall for all programs of the specified cq to finish before writing the tensor to device memory. + If ``arg1`` is not supplied, default ``MemoryConfig`` with ``interleaved`` set to ``True``. +-----------+-------------------------------------------------+----------------------------+-----------------------+----------+ @@ -968,6 +986,10 @@ void pytensor_module(py::module& m_tensor) { +-----------+-------------------------------------------------+----------------------------+-----------------------+----------+ | arg1 | MemoryConfig of tensor of TT accelerator device | ttnn.MemoryConfig | | No | +-----------+-------------------------------------------------+----------------------------+-----------------------+----------+ + | arg2 | CQ ID of TT accelerator device to use | uint8_t | | No | + +-----------+-------------------------------------------------+----------------------------+-----------------------+----------+ + | arg3 | Sub device IDs to wait before writing tensor | List[ttnn.SubDeviceId] | | No | + +-----------+-------------------------------------------------+----------------------------+-----------------------+----------+ .. code-block:: python @@ -1022,12 +1044,19 @@ void pytensor_module(py::module& m_tensor) { )doc") .def( "cpu", - [](const Tensor& self, bool blocking, uint8_t cq_id) { return self.cpu(blocking, cq_id); }, + [](const Tensor& self, bool blocking, uint8_t cq_id, const std::vector& sub_device_ids) { + return self.cpu(blocking, cq_id, sub_device_ids); + }, py::arg("blocking") = true, py::arg("cq_id") = ttnn::DefaultQueueId, + py::arg("sub_device_ids") = std::vector(), R"doc( Move TT Tensor from TT accelerator device to host device. + ``sub_device_ids`` can be used to specify which specific sub devices to wait on before reading the tensor from device memory. + + If it is not provided, device will stall waiting for all programs of the specified cq to finish before reading the tensor from device memory. + .. 
code-block:: python tt_tensor = tt_tensor.cpu() diff --git a/ttnn/cpp/ttnn/events.cpp b/ttnn/cpp/ttnn/events.cpp index 789cdd36c6e7..a38da21fccb4 100644 --- a/ttnn/cpp/ttnn/events.cpp +++ b/ttnn/cpp/ttnn/events.cpp @@ -29,9 +29,11 @@ std::shared_ptr create_event(Device* device) { return event; } -void record_event(uint8_t cq_id, const std::shared_ptr& event) { +void record_event(uint8_t cq_id, const std::shared_ptr& event, const std::vector& sub_device_ids) { Device* device = event->device; - device->push_work([device, event, cq_id] { EnqueueRecordEvent(device->command_queue(cq_id), event); }); + device->push_work([device, event, cq_id, sub_device_ids] { + EnqueueRecordEvent(device->command_queue(cq_id), event, sub_device_ids); + }); } void wait_for_event(uint8_t cq_id, const std::shared_ptr& event) { @@ -41,9 +43,10 @@ void wait_for_event(uint8_t cq_id, const std::shared_ptr& event) { MultiDeviceEvent create_event(MeshDevice* mesh_device) { return MultiDeviceEvent(mesh_device); } -void record_event(uint8_t cq_id, const MultiDeviceEvent& multi_device_event) { +void record_event( + uint8_t cq_id, const MultiDeviceEvent& multi_device_event, const std::vector& sub_device_ids) { for (auto& event : multi_device_event.events) { - record_event(cq_id, event); + record_event(cq_id, event, sub_device_ids); } } diff --git a/ttnn/cpp/ttnn/events.hpp b/ttnn/cpp/ttnn/events.hpp index 57405fa95261..d4c409338c69 100644 --- a/ttnn/cpp/ttnn/events.hpp +++ b/ttnn/cpp/ttnn/events.hpp @@ -16,11 +16,12 @@ struct MultiDeviceEvent { }; // Single Device APIs std::shared_ptr create_event(Device* device); -void record_event(uint8_t cq_id, const std::shared_ptr& event); +void record_event( + uint8_t cq_id, const std::shared_ptr& event, const std::vector& sub_device_ids = {}); void wait_for_event(uint8_t cq_id, const std::shared_ptr& event); // Multi Device APIs MultiDeviceEvent create_event(MeshDevice* mesh_device); -void record_event(uint8_t cq_id, const MultiDeviceEvent& event); +void 
record_event(uint8_t cq_id, const MultiDeviceEvent& event, const std::vector& sub_device_ids = {}); void wait_for_event(uint8_t cq_id, const MultiDeviceEvent& event); } // namespace ttnn::events diff --git a/ttnn/cpp/ttnn/operations/core/core.cpp b/ttnn/cpp/ttnn/operations/core/core.cpp index 184f6e139f13..90fc3f34908a 100644 --- a/ttnn/cpp/ttnn/operations/core/core.cpp +++ b/ttnn/cpp/ttnn/operations/core/core.cpp @@ -58,25 +58,34 @@ ttnn::Tensor squeeze_from_4D(const ttnn::Tensor& tensor, const int rank) { return ttnn::reshape(tensor, shape.to_rank(rank)); } -ttnn::Tensor to_device(const ttnn::Tensor& tensor, Device* device, const std::optional& memory_config) { +ttnn::Tensor to_device( + const ttnn::Tensor& tensor, + Device* device, + const std::optional& memory_config, + uint8_t cq_id, + const std::vector& sub_device_ids) { auto mem_config = memory_config.value_or(ttnn::DRAM_MEMORY_CONFIG); if (mem_config.is_sharded() and (device->arch() == tt::ARCH::BLACKHOLE)) { - auto interleaved_tensor = tensor.to(device, ttnn::DRAM_MEMORY_CONFIG); + auto interleaved_tensor = tensor.to(device, ttnn::DRAM_MEMORY_CONFIG, cq_id, sub_device_ids); return ttnn::interleaved_to_sharded(ttnn::DefaultQueueId, interleaved_tensor, mem_config, std::nullopt); } else { - return tensor.to(device, memory_config.value_or(ttnn::DRAM_MEMORY_CONFIG)); + return tensor.to(device, memory_config.value_or(ttnn::DRAM_MEMORY_CONFIG), cq_id, sub_device_ids); } } ttnn::Tensor to_device( - const ttnn::Tensor& tensor, MeshDevice* mesh_device, const std::optional& memory_config) { + const ttnn::Tensor& tensor, + MeshDevice* mesh_device, + const std::optional& memory_config, + uint8_t cq_id, + const std::vector& sub_device_ids) { auto mem_config = memory_config.value_or(ttnn::DRAM_MEMORY_CONFIG); // Currently no direct sharded write support in BLACKHOLE due to alignment issue if (mem_config.is_sharded() and (mesh_device->arch() == tt::ARCH::BLACKHOLE)) { - auto interleaved_tensor = tensor.to(mesh_device, 
ttnn::DRAM_MEMORY_CONFIG); + auto interleaved_tensor = tensor.to(mesh_device, ttnn::DRAM_MEMORY_CONFIG, cq_id, sub_device_ids); return ttnn::interleaved_to_sharded(ttnn::DefaultQueueId, interleaved_tensor, mem_config, std::nullopt); } else { - return tensor.to(mesh_device, mem_config); + return tensor.to(mesh_device, mem_config, cq_id, sub_device_ids); } } @@ -100,17 +109,22 @@ ttnn::Tensor allocate_tensor_on_device( shape, data_type, layout, mesh_device->get_devices(), memory_config.value_or(ttnn::DRAM_MEMORY_CONFIG)); } -void copy_host_to_device_tensor(const ttnn::Tensor& host_tensor, ttnn::Tensor device_tensor, uint8_t cq_id) { - tt::tt_metal::write_tensor(std::move(host_tensor), std::move(device_tensor), cq_id); +void copy_host_to_device_tensor( + const ttnn::Tensor& host_tensor, + ttnn::Tensor device_tensor, + uint8_t cq_id, + const std::vector& sub_device_ids) { + tt::tt_metal::write_tensor(std::move(host_tensor), std::move(device_tensor), cq_id, sub_device_ids); } -ttnn::Tensor from_device(const ttnn::Tensor& tensor, bool blocking, uint8_t cq_id) { +ttnn::Tensor from_device( + const ttnn::Tensor& tensor, bool blocking, uint8_t cq_id, const std::vector& sub_device_ids) { // Currently no direct sharded read support in BLACKHOLE due to alignment issue if (tensor.is_sharded() and (tensor.device()->arch() == tt::ARCH::BLACKHOLE)) { auto interleaved_tensor = ttnn::sharded_to_interleaved(cq_id, tensor, ttnn::DRAM_MEMORY_CONFIG, std::nullopt); - return interleaved_tensor.cpu(blocking, cq_id); + return interleaved_tensor.cpu(blocking, cq_id, sub_device_ids); } else { - return tensor.cpu(blocking, cq_id); + return tensor.cpu(blocking, cq_id, sub_device_ids); } } diff --git a/ttnn/cpp/ttnn/operations/core/core.hpp b/ttnn/cpp/ttnn/operations/core/core.hpp index e269e7030b68..d3ce90a4e24f 100644 --- a/ttnn/cpp/ttnn/operations/core/core.hpp +++ b/ttnn/cpp/ttnn/operations/core/core.hpp @@ -24,10 +24,19 @@ ttnn::Tensor unsqueeze_to_4D(const ttnn::Tensor& tensor); 
ttnn::Tensor squeeze_from_4D(const ttnn::Tensor& tensor, const int rank); -ttnn::Tensor to_device(const ttnn::Tensor& tensor, Device* device, const std::optional& memory_config); +ttnn::Tensor to_device( + const ttnn::Tensor& tensor, + Device* device, + const std::optional& memory_config, + uint8_t cq_id = ttnn::DefaultQueueId, + const std::vector& = {}); ttnn::Tensor to_device( - const ttnn::Tensor& tensor, MeshDevice* mesh_device, const std::optional& memory_config); + const ttnn::Tensor& tensor, + MeshDevice* mesh_device, + const std::optional& memory_config, + uint8_t cq_id = ttnn::DefaultQueueId, + const std::vector& = {}); ttnn::Tensor allocate_tensor_on_device( const Shape& shape, @@ -44,9 +53,16 @@ ttnn::Tensor allocate_tensor_on_device( const std::optional& memory_config); void copy_host_to_device_tensor( - const ttnn::Tensor& host_tensor, ttnn::Tensor device_tensor, uint8_t cq_id = ttnn::DefaultQueueId); - -ttnn::Tensor from_device(const ttnn::Tensor& tensor, bool blocking = true, uint8_t cq_id = ttnn::DefaultQueueId); + const ttnn::Tensor& host_tensor, + ttnn::Tensor device_tensor, + uint8_t cq_id = ttnn::DefaultQueueId, + const std::vector& sub_device_ids = {}); + +ttnn::Tensor from_device( + const ttnn::Tensor& tensor, + bool blocking = true, + uint8_t cq_id = ttnn::DefaultQueueId, + const std::vector& sub_device_ids = {}); void deallocate(Tensor& tensor, bool force = true); diff --git a/ttnn/cpp/ttnn/operations/reduction/moe/moe.cpp b/ttnn/cpp/ttnn/operations/reduction/moe/moe.cpp index fcfd1f35e60a..dbf98519483e 100644 --- a/ttnn/cpp/ttnn/operations/reduction/moe/moe.cpp +++ b/ttnn/cpp/ttnn/operations/reduction/moe/moe.cpp @@ -39,9 +39,8 @@ auto MoeOperation::invoke( const uint16_t k, const std::optional& memory_config, std::optional optional_output_tensor) { - constexpr uint8_t DefaultQueueId = 0; return invoke( - DefaultQueueId, + ttnn::DefaultQueueId, input_tensor, expert_mask_tensor, topk_mask_tensor, diff --git a/ttnn/cpp/ttnn/tensor/tensor.cpp 
b/ttnn/cpp/ttnn/tensor/tensor.cpp index 6f153e8e6b41..f4304d33c6a5 100644 --- a/ttnn/cpp/ttnn/tensor/tensor.cpp +++ b/ttnn/cpp/ttnn/tensor/tensor.cpp @@ -559,24 +559,28 @@ const Storage& Tensor::get_storage() const { return this->tensor_attributes->storage; } -Tensor Tensor::to(CommandQueue& queue, const MemoryConfig& mem_config) const { - return tensor_ops::tensor_to(*this, queue.device(), mem_config); +Tensor Tensor::to(Device* target_device, const MemoryConfig& mem_config,uint8_t cq_id, + const std::vector& sub_device_ids) const { + return tensor_ops::tensor_to(*this, target_device, mem_config, cq_id, sub_device_ids); } -Tensor Tensor::to(Device* target_device, const MemoryConfig& mem_config) const { - return tensor_ops::tensor_to(*this, target_device, mem_config); -} - -Tensor Tensor::to(distributed::MeshDevice* mesh_device, const MemoryConfig& mem_config) const { +Tensor Tensor::to(distributed::MeshDevice* mesh_device, const MemoryConfig& mem_config,uint8_t cq_id, + const std::vector& sub_device_ids) const { std::vector workers_to_use = ttnn::distributed::get_mapped_devices(*this, *mesh_device); - return tensor_ops::tensor_to(*this, workers_to_use, mem_config); + return tensor_ops::tensor_to(*this, workers_to_use, mem_config, cq_id, sub_device_ids); } -Tensor Tensor::to(const std::vector& workers, const MemoryConfig& mem_config) const { - return tensor_ops::tensor_to(*this, workers, mem_config); +Tensor Tensor::to( + const std::vector& workers, + const MemoryConfig& mem_config, + uint8_t cq_id, + const std::vector& sub_device_ids) const { + return tensor_ops::tensor_to(*this, workers, mem_config, cq_id, sub_device_ids); } -Tensor Tensor::cpu(bool blocking, uint8_t cq_id) const { return tensor_ops::tensor_cpu(*this, blocking, cq_id); } +Tensor Tensor::cpu(bool blocking, uint8_t cq_id, const std::vector& sub_device_ids) const { + return tensor_ops::tensor_cpu(*this, blocking, cq_id, sub_device_ids); +} Tensor Tensor::cpu_sharded() const { return 
tensor_ops::tensor_cpu_sharded(*this); } @@ -861,7 +865,8 @@ Tensor allocate_tensor_on_devices( return device_tensor; } -void write_tensor(const Tensor& host_tensor, Tensor device_tensor, uint8_t cq_id) { +void write_tensor( + const Tensor& host_tensor, Tensor device_tensor, uint8_t cq_id, const std::vector& sub_device_ids) { // Top level wrapper to copy a host tensor to a preallocated device tensor TT_ASSERT(device_tensor.workers.size(), "Workers must be specified for device_tensor in write_tensor"); @@ -877,7 +882,7 @@ void write_tensor(const Tensor& host_tensor, Tensor device_tensor, uint8_t cq_id for (int worker_index = 0; worker_index < device_tensor.workers.size(); ++worker_index) { auto& worker = device_tensor.workers[worker_index]; - worker->push_work([cq_id, worker, worker_index, async_safe_tensor, device_tensor]() mutable { + worker->push_work([cq_id, worker, worker_index, async_safe_tensor, device_tensor, sub_device_ids]() mutable { TT_FATAL( device_tensor.storage_type() == StorageType::DEVICE or device_tensor.storage_type() == StorageType::MULTI_DEVICE, @@ -889,7 +894,7 @@ void write_tensor(const Tensor& host_tensor, Tensor device_tensor, uint8_t cq_id "Error"); std::visit( tt::stl::overloaded{ - [worker, worker_index, cq_id, &async_safe_tensor](const DeviceStorage& device_storage) { + [worker, worker_index, cq_id, &async_safe_tensor, sub_device_ids](const DeviceStorage& device_storage) { // Copying from host to a single device. 
void* host_data = std::visit( tt::stl::overloaded{ @@ -913,9 +918,10 @@ void write_tensor(const Tensor& host_tensor, Tensor device_tensor, uint8_t cq_id worker->command_queue(cq_id), device_storage.get_buffer(), host_data, - /*blocking=*/false); + /*blocking=*/false, + sub_device_ids); }, - [worker, worker_index, cq_id, &async_safe_tensor](const MultiDeviceStorage& device_storage) { + [worker, worker_index, cq_id, &async_safe_tensor, sub_device_ids](const MultiDeviceStorage& device_storage) { // Copying from host to multi-device. TT_ASSERT( std::holds_alternative(async_safe_tensor.get_storage()), @@ -928,7 +934,8 @@ void write_tensor(const Tensor& host_tensor, Tensor device_tensor, uint8_t cq_id worker->command_queue(cq_id), device_storage.get_buffer_for_device(worker), host_data, - /*blocking=*/false); + /*blocking=*/false, + sub_device_ids); }, [](auto&& s) { TT_THROW("Unreachable"); }}, device_tensor.get_storage()); diff --git a/ttnn/cpp/ttnn/tensor/tensor.hpp b/ttnn/cpp/ttnn/tensor/tensor.hpp index bf86cca99c19..b8b7a993b8a6 100644 --- a/ttnn/cpp/ttnn/tensor/tensor.hpp +++ b/ttnn/cpp/ttnn/tensor/tensor.hpp @@ -141,19 +141,21 @@ struct Tensor { Tensor to( Device* target_device, - const MemoryConfig& mem_config = {.memory_layout = tt::tt_metal::TensorMemoryLayout::INTERLEAVED}) const; + const MemoryConfig& mem_config = {.memory_layout = tt::tt_metal::TensorMemoryLayout::INTERLEAVED}, + uint8_t cq_id = ttnn::DefaultQueueId, + const std::vector& sub_device_ids = {}) const; Tensor to( distributed::MeshDevice* mesh_device, - const MemoryConfig& mem_config = {.memory_layout = tt::tt_metal::TensorMemoryLayout::INTERLEAVED}) const; - - Tensor to( - CommandQueue& queue, - const MemoryConfig& mem_config = {.memory_layout = tt::tt_metal::TensorMemoryLayout::INTERLEAVED}) const; + const MemoryConfig& mem_config = {.memory_layout = tt::tt_metal::TensorMemoryLayout::INTERLEAVED}, + uint8_t cq_id = ttnn::DefaultQueueId, + const std::vector& sub_device_ids = {}) const; Tensor 
to( const std::vector& workers, - const MemoryConfig& mem_config = {.memory_layout = tt::tt_metal::TensorMemoryLayout::INTERLEAVED}) const; + const MemoryConfig& mem_config = {.memory_layout = tt::tt_metal::TensorMemoryLayout::INTERLEAVED}, + uint8_t cq_id = ttnn::DefaultQueueId, + const std::vector& sub_device_ids = {}) const; Tensor to(Layout target_layout, Device* worker = nullptr) const; @@ -164,7 +166,10 @@ struct Tensor { const ttnn::SimpleShape& input_tensor_start, float pad_value) const; - Tensor cpu(bool blocking = true, uint8_t cq_id = ttnn::DefaultQueueId) const; + Tensor cpu( + bool blocking = true, + uint8_t cq_id = ttnn::DefaultQueueId, + const std::vector& sub_device_ids = {}) const; Tensor cpu_sharded() const; @@ -374,7 +379,11 @@ Tensor allocate_tensor_on_devices( const std::vector& devices, const MemoryConfig& memory_config = {.memory_layout = tt::tt_metal::TensorMemoryLayout::INTERLEAVED}, const std::optional& tile = std::nullopt); -void write_tensor(const Tensor& host_tensor, Tensor device_tensor, uint8_t cq_id = ttnn::DefaultQueueId); +void write_tensor( + const Tensor& host_tensor, + Tensor device_tensor, + uint8_t cq_id = ttnn::DefaultQueueId, + const std::vector& sub_device_ids = {}); Tensor set_tensor_id(const Tensor& tensor); diff --git a/ttnn/cpp/ttnn/tensor/tensor_impl.cpp b/ttnn/cpp/ttnn/tensor/tensor_impl.cpp index 0386f6e353c1..dc7545ac0e5b 100644 --- a/ttnn/cpp/ttnn/tensor/tensor_impl.cpp +++ b/ttnn/cpp/ttnn/tensor/tensor_impl.cpp @@ -565,7 +565,11 @@ std::string to_string(const Tensor& tensor, std::optional o // ====================================================================================== template -Tensor to_host_helper(const Tensor& tensor, bool blocking = true, uint8_t cq_id = ttnn::DefaultQueueId) { +Tensor to_host_helper( + const Tensor& tensor, + bool blocking = true, + uint8_t cq_id = ttnn::DefaultQueueId, + tt::stl::Span sub_device_ids = {}) { TT_ASSERT(tensor.is_allocated(), "Buffer must be allocated on device!"); 
auto device_buffer = tensor.device_buffer(); auto device = tensor.device(); @@ -575,7 +579,8 @@ Tensor to_host_helper(const Tensor& tensor, bool blocking = true, uint8_t cq_id const char* TT_METAL_SLOW_DISPATCH_MODE = std::getenv("TT_METAL_SLOW_DISPATCH_MODE"); if (TT_METAL_SLOW_DISPATCH_MODE == nullptr) { data_vec.resize(size_in_bytes / sizeof(T)); - read_data_from_device_buffer(device->command_queue(cq_id), device_buffer, data_vec.data(), blocking); + read_data_from_device_buffer( + device->command_queue(cq_id), device_buffer, data_vec.data(), blocking, sub_device_ids); } else { read_data_from_device_buffer(device_buffer, data_vec); } @@ -584,9 +589,9 @@ Tensor to_host_helper(const Tensor& tensor, bool blocking = true, uint8_t cq_id } template -Tensor to_host(const Tensor& tensor, bool blocking, uint8_t cq_id) { +Tensor to_host(const Tensor& tensor, bool blocking, uint8_t cq_id, tt::stl::Span sub_device_ids) { if (tensor.storage_type() == StorageType::DEVICE) { - return to_host_helper(tensor, blocking, cq_id); + return to_host_helper(tensor, blocking, cq_id, sub_device_ids); } else if (tensor.storage_type() == StorageType::MULTI_DEVICE) { auto devices = get_devices(tensor); Tensor host_tensor(devices.size()); @@ -594,7 +599,7 @@ Tensor to_host(const Tensor& tensor, bool blocking, uint8_t cq_id) { for (int device_index = 0; device_index < devices.size(); ++device_index) { const auto& device = devices[device_index]; auto shard = get_shard_for_device(tensor, device); - shard = to_host_helper(shard, blocking, cq_id); + shard = to_host_helper(shard, blocking, cq_id, sub_device_ids); insert_buffer_and_shape_for_device(device, shard, host_tensor, device_index); } return host_tensor; @@ -603,21 +608,29 @@ Tensor to_host(const Tensor& tensor, bool blocking, uint8_t cq_id) { } } -template Tensor to_host(const Tensor& tensor, bool blocking, uint8_t cq_id); -template Tensor to_host(const Tensor& tensor, bool blocking, uint8_t cq_id); -template Tensor to_host(const Tensor& 
tensor, bool blocking, uint8_t cq_id); -template Tensor to_host(const Tensor& tensor, bool blocking, uint8_t cq_id); -template Tensor to_host(const Tensor& tensor, bool blocking, uint8_t cq_id); -template Tensor to_host(const Tensor& tensor, bool blocking, uint8_t cq_id); +template Tensor to_host( + const Tensor& tensor, bool blocking, uint8_t cq_id, tt::stl::Span sub_device_ids); +template Tensor to_host( + const Tensor& tensor, bool blocking, uint8_t cq_id, tt::stl::Span sub_device_ids); +template Tensor to_host( + const Tensor& tensor, bool blocking, uint8_t cq_id, tt::stl::Span sub_device_ids); +template Tensor to_host( + const Tensor& tensor, bool blocking, uint8_t cq_id, tt::stl::Span sub_device_ids); +template Tensor to_host( + const Tensor& tensor, bool blocking, uint8_t cq_id, tt::stl::Span sub_device_ids); +template Tensor to_host( + const Tensor& tensor, bool blocking, uint8_t cq_id, tt::stl::Span sub_device_ids); template <> -Tensor to_host(const Tensor& tensor, bool blocking, uint8_t cq_id) { - return to_host(tensor, blocking, cq_id); +Tensor to_host( + const Tensor& tensor, bool blocking, uint8_t cq_id, tt::stl::Span sub_device_ids) { + return to_host(tensor, blocking, cq_id, sub_device_ids); } template <> -Tensor to_host(const Tensor& tensor, bool blocking, uint8_t cq_id) { - return to_host(tensor, blocking, cq_id); +Tensor to_host( + const Tensor& tensor, bool blocking, uint8_t cq_id, tt::stl::Span sub_device_ids) { + return to_host(tensor, blocking, cq_id, sub_device_ids); } // ====================================================================================== @@ -662,7 +675,11 @@ Tensor to_host_sharded(const Tensor& tensor) { // ====================================================================================== template typename BufferType> -void write_data_to_device_buffer(CommandQueue& cq, const BufferType& host_buffer, DeviceBuffer device_buffer) { +void write_data_to_device_buffer( + CommandQueue& cq, + const BufferType& host_buffer, + 
DeviceBuffer device_buffer, + tt::stl::Span sub_device_ids) { ZoneScoped; // TODO(arakhmati): can we use generators in this function to go from `data_to_write` to `uint32_data`? // And effectively get rid of any additional allocation @@ -676,12 +693,12 @@ void write_data_to_device_buffer(CommandQueue& cq, const BufferType& host_buf const uint32_t* borrowed_buf_base = static_cast(host_buffer.data()); std::vector owned_copy_vec(borrowed_buf_base, borrowed_buf_base + borrowed_buf_size_words); owned_buffer::Buffer owned_copy(std::make_shared>(owned_copy_vec)); - EnqueueWriteBuffer(cq, device_buffer, owned_copy.get_ptr(), false); + EnqueueWriteBuffer(cq, device_buffer, owned_copy.get_ptr(), false, sub_device_ids); } else if constexpr (std::is_same_v, owned_buffer::Buffer>) { - EnqueueWriteBuffer(cq, device_buffer, host_buffer.get_ptr(), false); + EnqueueWriteBuffer(cq, device_buffer, host_buffer.get_ptr(), false, sub_device_ids); } } else { - EnqueueWriteBuffer(cq, device_buffer, host_buffer.data(), false); + EnqueueWriteBuffer(cq, device_buffer, host_buffer.data(), false, sub_device_ids); } } @@ -699,7 +716,8 @@ DeviceBuffer initialize_data_on_device( BufferType& data_to_write, Device* device, const TensorSpec& tensor_spec, - std::optional> queue = std::nullopt) { + uint8_t cq_id = ttnn::DefaultQueueId, + tt::stl::Span sub_device_ids = {}) { ZoneScoped; TT_ASSERT(device != nullptr); @@ -707,8 +725,7 @@ DeviceBuffer initialize_data_on_device( const char* TT_METAL_SLOW_DISPATCH_MODE = std::getenv("TT_METAL_SLOW_DISPATCH_MODE"); if (TT_METAL_SLOW_DISPATCH_MODE == nullptr) { - write_data_to_device_buffer( - queue.has_value() ? 
queue.value().get() : device->command_queue(), data_to_write, device_buffer); + write_data_to_device_buffer(device->command_queue(cq_id), data_to_write, device_buffer, sub_device_ids); } else { write_data_to_device_buffer(data_to_write, *device_buffer); } @@ -720,13 +737,14 @@ DeviceBuffer to_device_buffer( const Storage& storage, Device* device, const TensorSpec& tensor_spec, - std::optional> queue) { + uint8_t cq_id, + tt::stl::Span sub_device_ids) { return std::visit( - [&device, &tensor_spec, &queue](auto&& storage) -> DeviceBuffer { + [&device, &tensor_spec, cq_id, sub_device_ids](auto&& storage) -> DeviceBuffer { using StorageType = std::decay_t; if constexpr (std::is_same_v or std::is_same_v) { auto data_to_write = host_buffer::get_as(storage.buffer); - return initialize_data_on_device(data_to_write, device, tensor_spec, queue); + return initialize_data_on_device(data_to_write, device, tensor_spec, cq_id, sub_device_ids); } else if constexpr (std::is_same_v) { TT_THROW("Device storage doesn't support to_device_buffer"); } else if constexpr (std::is_same_v) { @@ -749,7 +767,8 @@ Tensor to_device( const Tensor& tensor, Device* target_device, const MemoryConfig& memory_config, - std::optional> queue) { + uint8_t cq_id, + tt::stl::Span sub_device_ids) { TT_FATAL(tensor.storage_type() != StorageType::DEVICE, "Tensor is already on device!"); if (tensor.storage_type() == StorageType::OWNED) { TT_FATAL(tensor.is_allocated(), "Need host buffer on device to exist to copy data to device!"); @@ -759,7 +778,8 @@ Tensor to_device( TensorSpec tensor_spec( tensor.get_logical_shape(), tensor.get_tensor_spec().tensor_layout().with_memory_config(memory_config)); - auto device_buffer = tensor_impl::to_device_buffer(tensor.get_storage(), target_device, tensor_spec, queue); + auto device_buffer = + tensor_impl::to_device_buffer(tensor.get_storage(), target_device, tensor_spec, cq_id, sub_device_ids); return Tensor(DeviceStorage{device_buffer}, tensor_spec); } @@ -767,40 +787,47 
@@ template Tensor to_device( const Tensor& tensor, Device* target_device, const MemoryConfig& memory_config, - std::optional> queue); + uint8_t cq_id, + tt::stl::Span sub_device_ids); template Tensor to_device( const Tensor& tensor, Device* target_device, const MemoryConfig& memory_config, - std::optional> queue); + uint8_t cq_id, + tt::stl::Span sub_device_ids); template Tensor to_device( const Tensor& tensor, Device* target_device, const MemoryConfig& memory_config, - std::optional> queue); + uint8_t cq_id, + tt::stl::Span sub_device_ids); template Tensor to_device( const Tensor& tensor, Device* target_device, const MemoryConfig& memory_config, - std::optional> queue); + uint8_t cq_id, + tt::stl::Span sub_device_ids); template Tensor to_device( const Tensor& tensor, Device* target_device, const MemoryConfig& memory_config, - std::optional> queue); + uint8_t cq_id, + tt::stl::Span sub_device_ids); template Tensor to_device( const Tensor& tensor, Device* target_device, const MemoryConfig& memory_config, - std::optional> queue); + uint8_t cq_id, + tt::stl::Span sub_device_ids); template <> Tensor to_device( const Tensor& tensor, Device* target_device, const MemoryConfig& memory_config, - std::optional> queue) { - return to_device(tensor, target_device, memory_config, queue); + uint8_t cq_id, + tt::stl::Span sub_device_ids) { + return to_device(tensor, target_device, memory_config, cq_id, sub_device_ids); } template <> @@ -808,8 +835,9 @@ Tensor to_device( const Tensor& tensor, Device* target_device, const MemoryConfig& memory_config, - std::optional> queue) { - return to_device(tensor, target_device, memory_config, queue); + uint8_t cq_id, + tt::stl::Span sub_device_ids) { + return to_device(tensor, target_device, memory_config, cq_id, sub_device_ids); } // ====================================================================================== diff --git a/ttnn/cpp/ttnn/tensor/tensor_impl.hpp b/ttnn/cpp/ttnn/tensor/tensor_impl.hpp index 5a0ec30ecddf..87c34bdb1998 
100644 --- a/ttnn/cpp/ttnn/tensor/tensor_impl.hpp +++ b/ttnn/cpp/ttnn/tensor/tensor_impl.hpp @@ -167,8 +167,12 @@ DeviceBuffer allocate_buffer_on_device(Device* device, const TensorSpec& tensor_ template inline void read_data_from_device_buffer( - CommandQueue& cq, DeviceBuffer device_buffer, void* host_buffer_data, bool blocking) { - EnqueueReadBuffer(cq, device_buffer, host_buffer_data, blocking); + CommandQueue& cq, + DeviceBuffer device_buffer, + void* host_buffer_data, + bool blocking, + tt::stl::Span sub_device_ids = {}) { + EnqueueReadBuffer(cq, device_buffer, host_buffer_data, blocking, sub_device_ids); } template @@ -181,7 +185,11 @@ inline void read_data_from_device_buffer(DeviceBuffer device_buffer, std::vector // ====================================================================================== template -Tensor to_host(const Tensor& tensor, bool blocking = true, uint8_t cq_id = ttnn::DefaultQueueId); +Tensor to_host( + const Tensor& tensor, + bool blocking = true, + uint8_t cq_id = ttnn::DefaultQueueId, + tt::stl::Span sub_device_ids = {}); template Tensor to_host_sharded(const Tensor& tensor); @@ -191,7 +199,8 @@ Tensor to_device( const Tensor& tensor, Device* target_device, const MemoryConfig& memory_config, - std::optional> queue); + uint8_t cq_id = ttnn::DefaultQueueId, + tt::stl::Span sub_device_ids = {}); template Tensor to_layout(const Tensor& tensor, Layout target_layout); diff --git a/ttnn/cpp/ttnn/tensor/tensor_ops.cpp b/ttnn/cpp/ttnn/tensor/tensor_ops.cpp index 460f8b0d5dbc..f40690d2a445 100644 --- a/ttnn/cpp/ttnn/tensor/tensor_ops.cpp +++ b/ttnn/cpp/ttnn/tensor/tensor_ops.cpp @@ -24,7 +24,12 @@ namespace tt::tt_metal::tensor_ops { -Tensor tensor_to(const Tensor& input_tensor, Device* target_device, const MemoryConfig& mem_config) { +Tensor tensor_to( + const Tensor& input_tensor, + Device* target_device, + const MemoryConfig& mem_config, + uint8_t cq_id, + const std::vector& sub_device_ids) { ZoneScoped; 
GraphTracker::instance().track_function_start("Tensor::to", input_tensor, target_device, mem_config); // Tensor can be using borrowed storage. If so, when running in async mode, copy this tensor to owned storage. @@ -35,7 +40,12 @@ Tensor tensor_to(const Tensor& input_tensor, Device* target_device, const Memory // Record main thread ref count for tensors before pushing to queue. uint32_t device_tensor_ref_count = device_tensor.tensor_attributes->record_main_thread_ref_count(); uint32_t original_tensor_ref_count = async_safe_tensor.tensor_attributes->record_main_thread_ref_count(); - target_device->push_work([async_safe_tensor, device_tensor, mem_config, target_device]() mutable { + target_device->push_work([async_safe_tensor, + device_tensor, + mem_config, + target_device, + cq_id, + sub_device_ids]() mutable { if (async_safe_tensor.storage_type() == StorageType::DEVICE) { TT_ASSERT(async_safe_tensor.device() == target_device && "Currently do not support moving between devices"); device_tensor.populate_buffers_and_metadata(async_safe_tensor); @@ -46,7 +56,7 @@ Tensor tensor_to(const Tensor& input_tensor, Device* target_device, const Memory async_safe_tensor.get_dtype(), async_safe_tensor.get_layout()); auto local_tensor = - tensor_impl::to_device_wrapper(async_safe_tensor, target_device, mem_config, std::nullopt); + tensor_impl::to_device_wrapper(async_safe_tensor, target_device, mem_config, cq_id, sub_device_ids); // Populate device tensor device_tensor.populate_buffers_and_metadata(local_tensor); } @@ -61,7 +71,12 @@ Tensor tensor_to(const Tensor& input_tensor, Device* target_device, const Memory return device_tensor; } -Tensor tensor_to(const Tensor& input_tensor, const std::vector& workers, const MemoryConfig& mem_config) { +Tensor tensor_to( + const Tensor& input_tensor, + const std::vector& workers, + const MemoryConfig& mem_config, + uint8_t cq_id, + const std::vector& sub_device_ids) { ZoneScoped; GraphTracker::instance().track_function_start("Tensor::to", 
input_tensor, workers, mem_config); TT_FATAL( @@ -72,10 +87,17 @@ Tensor tensor_to(const Tensor& input_tensor, const std::vector& workers uint32_t num_workers = workers.size(); for (int worker_index = 0; worker_index < workers.size(); ++worker_index) { auto& worker = workers[worker_index]; - worker->push_work([worker, input_tensor, device_tensor, mem_config, num_workers, worker_index]() mutable { + worker->push_work([worker, + input_tensor, + device_tensor, + mem_config, + num_workers, + worker_index, + cq_id, + sub_device_ids]() mutable { auto shard = get_shard_for_device(input_tensor, worker, worker_index); if (shard.storage_type() == StorageType::OWNED) { - shard = tensor_impl::to_device_wrapper(shard, worker, mem_config, std::nullopt); + shard = tensor_impl::to_device_wrapper(shard, worker, mem_config, cq_id, sub_device_ids); } insert_buffer_and_shape_for_device(worker, shard, device_tensor, worker_index); uint32_t num_workers_completed = (device_tensor.tensor_attributes->num_workers_completed)++; @@ -93,7 +115,8 @@ Tensor tensor_to(const Tensor& input_tensor, const std::vector& workers return device_tensor; } -Tensor tensor_cpu(const Tensor& input_tensor, bool blocking, uint8_t cq_id) { +Tensor tensor_cpu( + const Tensor& input_tensor, bool blocking, uint8_t cq_id, const std::vector& sub_device_ids) { ZoneScoped; GraphTracker::instance().track_function_start("Tensor::cpu", input_tensor, blocking); auto workers = input_tensor.get_workers(blocking); @@ -111,19 +134,20 @@ Tensor tensor_cpu(const Tensor& input_tensor, bool blocking, uint8_t cq_id) { uint32_t original_tensor_ref_count = input_tensor.tensor_attributes->record_main_thread_ref_count(); for (int worker_index = 0; worker_index < workers.size(); worker_index++) { auto target_device = workers[worker_index]; - target_device->push_work([host_tensor, blocking, target_device, input_tensor, worker_index, cq_id]() mutable { - TT_ASSERT( - input_tensor.storage_type() == StorageType::DEVICE or - 
input_tensor.storage_type() == StorageType::MULTI_DEVICE, - "Can only use worker queue for cpu call if tensor is on device."); - auto shard = get_shard_for_device(input_tensor, target_device); - shard = tensor_impl::to_host_wrapper(shard, blocking, cq_id); - insert_buffer_and_shape_for_device(target_device, shard, host_tensor, worker_index); - uint32_t num_workers_completed = (host_tensor.tensor_attributes->num_workers_completed)++; - if (not num_workers_completed) { - host_tensor.set_tensor_spec(input_tensor.get_tensor_spec()); - } - }); + target_device->push_work( + [host_tensor, blocking, target_device, input_tensor, worker_index, cq_id, sub_device_ids]() mutable { + TT_ASSERT( + input_tensor.storage_type() == StorageType::DEVICE or + input_tensor.storage_type() == StorageType::MULTI_DEVICE, + "Can only use worker queue for cpu call if tensor is on device."); + auto shard = get_shard_for_device(input_tensor, target_device); + shard = tensor_impl::to_host_wrapper(shard, blocking, cq_id, sub_device_ids); + insert_buffer_and_shape_for_device(target_device, shard, host_tensor, worker_index); + uint32_t num_workers_completed = (host_tensor.tensor_attributes->num_workers_completed)++; + if (not num_workers_completed) { + host_tensor.set_tensor_spec(input_tensor.get_tensor_spec()); + } + }); } if (blocking) { diff --git a/ttnn/cpp/ttnn/tensor/tensor_ops.hpp b/ttnn/cpp/ttnn/tensor/tensor_ops.hpp index 98f8103c151c..b8edff425f8b 100644 --- a/ttnn/cpp/ttnn/tensor/tensor_ops.hpp +++ b/ttnn/cpp/ttnn/tensor/tensor_ops.hpp @@ -20,15 +20,26 @@ class Device; namespace tt::tt_metal::tensor_ops { -Tensor tensor_to(const Tensor& input_tensor, Device* target_device, const MemoryConfig& mem_config); +Tensor tensor_to( + const Tensor& input_tensor, + Device* target_device, + const MemoryConfig& mem_config, + uint8_t cq_id, + const std::vector& sub_device_ids); -Tensor tensor_to(const Tensor& input_tensor, const std::vector& workers, const MemoryConfig& mem_config); +Tensor tensor_to( 
+ const Tensor& input_tensor, + const std::vector& workers, + const MemoryConfig& mem_config, + uint8_t cq_id, + const std::vector& sub_device_ids); Tensor tensor_to(const Tensor& input_tensor, Layout target_layout, Device* worker); Tensor tensor_to(const Tensor& input_tensor, Layout target_layout, distributed::MeshDevice* mesh_device); -Tensor tensor_cpu(const Tensor& input_tensor, bool blocking, uint8_t cq_id); +Tensor tensor_cpu( + const Tensor& input_tensor, bool blocking, uint8_t cq_id, const std::vector& sub_device_ids); Tensor tensor_cpu_sharded(const Tensor& input_tensor); diff --git a/ttnn/ttnn/__init__.py b/ttnn/ttnn/__init__.py index 041de7280180..4f613ca11ef8 100644 --- a/ttnn/ttnn/__init__.py +++ b/ttnn/ttnn/__init__.py @@ -187,7 +187,9 @@ def manage_config(name, value): format_output_tensor, pad_to_tile_shape, SubDevice, + SubDeviceId, SubDeviceManagerId, + DefaultQueueId, init_device_compute_kernel_config, ) diff --git a/ttnn/ttnn/device.py b/ttnn/ttnn/device.py index b8de80cd87aa..6cbfaa85ead7 100644 --- a/ttnn/ttnn/device.py +++ b/ttnn/ttnn/device.py @@ -162,6 +162,9 @@ def is_blackhole(device=None): pad_to_tile_shape = ttnn._ttnn.device.pad_to_tile_shape SubDevice = ttnn._ttnn.device.SubDevice +SubDeviceId = ttnn._ttnn.device.SubDeviceId SubDeviceManagerId = ttnn._ttnn.device.SubDeviceManagerId +DefaultQueueId = ttnn._ttnn.device.DefaultQueueId + __all__ = [] diff --git a/ttnn/ttnn/distributed/distributed.py b/ttnn/ttnn/distributed/distributed.py index bda36c5b0f5b..65a902d11cf8 100644 --- a/ttnn/ttnn/distributed/distributed.py +++ b/ttnn/ttnn/distributed/distributed.py @@ -209,19 +209,23 @@ def create_mesh_device( close_mesh_device(mesh_device) -def synchronize_devices(devices: Union["ttnn.Device", "ttnn.MeshDevice"], queue_id: Optional[int] = None) -> None: +def synchronize_devices( + devices: Union["ttnn.Device", "ttnn.MeshDevice"], + queue_id: Optional[int] = ttnn.DefaultQueueId, + sub_device_ids: List[ttnn.SubDeviceId] = [], +) -> None: """ - 
synchronize_devices(devices: Union[ttnn.Device, ttnn.MeshDevice], queue_id: Optional[int] = None) -> None: + synchronize_devices(devices: Union[ttnn.Device, ttnn.MeshDevice], queue_id: Optional[int] = None, sub_device_ids: List[ttnn.SubDeviceId] = []) -> None: Synchronize the devices with host by waiting for all operations to complete. If queue_id is provided then only the operations associated with that queue_id are waited for, otherwise operations for all command queues are waited on. """ if isinstance(devices, ttnn.Device): - ttnn._ttnn.device.synchronize_device(devices, queue_id) + ttnn._ttnn.device.synchronize_device(devices, queue_id, sub_device_ids) else: for device in devices.get_device_ids(): - ttnn._ttnn.device.synchronize_device(devices.get_device(device), queue_id) + ttnn._ttnn.device.synchronize_device(devices.get_device(device), queue_id, sub_device_ids) class TensorToMesh: diff --git a/ttnn/ttnn/operations/core.py b/ttnn/ttnn/operations/core.py index 3eeda3a90b6a..24480037a3ff 100644 --- a/ttnn/ttnn/operations/core.py +++ b/ttnn/ttnn/operations/core.py @@ -4,7 +4,7 @@ import math import pathlib -from typing import Any, Callable, Dict, Optional, Tuple, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import torch import ttnn.decorators @@ -158,6 +158,8 @@ def from_torch( device: Optional[ttnn.Device] = None, memory_config: Optional[ttnn.MemoryConfig] = None, mesh_mapper: Optional[ttnn.TensorToMesh] = None, + cq_id: Optional[int] = ttnn.DefaultQueueId, + sub_device_ids: List[ttnn.SubDeviceId] = [], ) -> ttnn.Tensor: """ Converts the `torch.Tensor` tensor into a `ttnn.Tensor`. For bfloat8_b or bfloat4_b format, the function itself is called twice, @@ -176,6 +178,8 @@ def from_torch( device (ttnn.Device, optional): the desired `ttnn` device. Defaults to `None`. memory_config (ttnn.MemoryConfig, optional): The desired `ttnn` memory configuration. Defaults to `None`. 
mesh_mapper (ttnn.TensorToMesh, optional): The desired `ttnn` mesh mapper. Defaults to `None`. + cq_id (int, optional): The command queue ID to use. Defaults to `0`. + sub_device_ids (List[ttnn.SubDeviceId], optional): The sub-device IDs to wait on. Defaults to all sub-devices. Returns: ttnn.Tensor: The resulting `ttnn` tensor. @@ -225,7 +229,7 @@ def from_torch( if device is not None: if memory_config is None: memory_config = ttnn.DRAM_MEMORY_CONFIG - tensor = ttnn.to_device(tensor, device, memory_config=memory_config) + tensor = ttnn.to_device(tensor, device, memory_config=memory_config, cq_id=cq_id, sub_device_ids=sub_device_ids) if shape_with_padding is not None and shape_with_padding != tensor.shape and mesh_mapper is None: tensor = ttnn.reshape(tensor, shape_with_padding) @@ -262,7 +266,8 @@ def to_torch( torch_rank: Optional[int] = None, mesh_composer: Optional[ttnn.MeshToTensor] = None, device: Optional[ttnn.Device] = None, - cq_id: Optional[int] = 0, + cq_id: Optional[int] = ttnn.DefaultQueueId, + sub_device_ids: List[ttnn.SubDeviceId] = [], ) -> "torch.Tensor": """ Converts the `ttnn.Tensor` tensor into a `torch.Tensor`. It does not call to_layout for bfloat8_b or bfloat4_b as we now convert @@ -278,6 +283,7 @@ def to_torch( mesh_composer (ttnn.MeshToTensor, optional): The desired `ttnn` mesh composer. Defaults to `None`. device (ttnn.Device, optional): The `ttnn` device of the input tensor. Defaults to `None`. cq_id (int, optional): The command queue ID to use. Defaults to `0`. + sub_device_ids (List[ttnn.SubDeviceId], optional): The sub-device IDs to wait on. Defaults to all sub-devices. Returns: torch.Tensor: The converted `torch` tensor. 
@@ -290,7 +296,7 @@ def to_torch( [ 0.9023, -0.5820, 0.5312]], dtype=torch.bfloat16) """ if ttnn.is_tensor_storage_on_device(tensor): - tensor = ttnn.from_device(tensor, cq_id=cq_id) + tensor = ttnn.from_device(tensor, cq_id=cq_id, sub_device_ids=sub_device_ids) if (tensor.layout != ttnn.ROW_MAJOR_LAYOUT) and not ( tensor.dtype == ttnn.bfloat8_b or tensor.dtype == ttnn.bfloat4_b From d1a90141331dd127bd06b2a8e0123941f1fb2a1a Mon Sep 17 00:00:00 2001 From: Austin Ho Date: Sun, 8 Dec 2024 20:01:25 +0000 Subject: [PATCH 30/59] #15836: Add api passing different sub device configurations per device in a mesh_device --- tests/ttnn/unit_tests/test_sub_device.py | 30 +++++++++++++------ tt_metal/distributed/mesh_device.cpp | 18 +++++++++++ tt_metal/distributed/mesh_device.hpp | 2 ++ .../ttnn/distributed/distributed_pybind.cpp | 20 +++++++++++++ 4 files changed, 61 insertions(+), 9 deletions(-) diff --git a/tests/ttnn/unit_tests/test_sub_device.py b/tests/ttnn/unit_tests/test_sub_device.py index be2c81748709..f7bfb20401a6 100644 --- a/tests/ttnn/unit_tests/test_sub_device.py +++ b/tests/ttnn/unit_tests/test_sub_device.py @@ -7,7 +7,7 @@ import ttnn -def run_sub_devices(device): +def run_sub_devices(device, replicate_sub_devices=False): tensix_cores0 = ttnn.CoreRangeSet( { ttnn.CoreRange( @@ -26,8 +26,14 @@ def run_sub_devices(device): ) sub_device_1 = ttnn.SubDevice([tensix_cores0]) sub_device_2 = ttnn.SubDevice([tensix_cores1]) - sub_device_manager1 = device.create_sub_device_manager([sub_device_1, sub_device_2], 3200) - sub_device_manager2 = device.create_sub_device_manager([sub_device_2], 3200) + sub_devices_1 = [sub_device_1, sub_device_2] + sub_devices_2 = [sub_device_2] + if replicate_sub_devices: + num_devices = 1 if isinstance(device, ttnn.Device) else device.get_num_devices() + sub_devices_1 = [sub_devices_1] * num_devices + sub_devices_2 = [sub_devices_2] * num_devices + sub_device_manager1 = device.create_sub_device_manager(sub_devices_1, 3200) + 
sub_device_manager2 = device.create_sub_device_manager(sub_devices_2, 3200) device.load_sub_device_manager(sub_device_manager1) ttnn.synchronize_devices(device, sub_device_ids=[ttnn.SubDeviceId(1)]) ttnn.synchronize_devices(device, sub_device_ids=[ttnn.SubDeviceId(0), ttnn.SubDeviceId(1)]) @@ -39,7 +45,7 @@ def run_sub_devices(device): device.remove_sub_device_manager(sub_device_manager2) -def run_sub_devices_program(device): +def run_sub_devices_program(device, replicate_sub_devices=False): is_mesh_device = isinstance(device, ttnn.MeshDevice) if is_mesh_device: inputs_mesh_mapper = ttnn.ShardTensorToMesh(device, dim=0) @@ -67,7 +73,11 @@ def run_sub_devices_program(device): ) sub_device_1 = ttnn.SubDevice([tensix_cores0]) sub_device_2 = ttnn.SubDevice([tensix_cores1]) - sub_device_manager = device.create_sub_device_manager([sub_device_1, sub_device_2], 3200) + sub_devices = [sub_device_1, sub_device_2] + if replicate_sub_devices: + num_devices = 1 if isinstance(device, ttnn.Device) else device.get_num_devices() + sub_devices = [sub_devices] * num_devices + sub_device_manager = device.create_sub_device_manager(sub_devices, 3200) device.load_sub_device_manager(sub_device_manager) x = torch.randn(num_devices, 1, 64, 64, dtype=torch.bfloat16) @@ -130,8 +140,9 @@ def test_sub_devices(device, enable_async_mode): @pytest.mark.parametrize("enable_async_mode", (False, True), indirect=True) -def test_sub_devices_mesh(mesh_device, enable_async_mode): - run_sub_devices(mesh_device) +@pytest.mark.parametrize("replicate_sub_devices", (False, True)) +def test_sub_devices_mesh(mesh_device, replicate_sub_devices, enable_async_mode): + run_sub_devices(mesh_device, replicate_sub_devices) @pytest.mark.parametrize("enable_async_mode", (False, True), indirect=True) @@ -140,5 +151,6 @@ def test_sub_device_program(device, enable_async_mode): @pytest.mark.parametrize("enable_async_mode", (False, True), indirect=True) -def test_sub_device_program_mesh(mesh_device, enable_async_mode): - 
run_sub_devices_program(mesh_device) +@pytest.mark.parametrize("replicate_sub_devices", (False, True)) +def test_sub_device_program_mesh(mesh_device, replicate_sub_devices, enable_async_mode): + run_sub_devices_program(mesh_device, replicate_sub_devices) diff --git a/tt_metal/distributed/mesh_device.cpp b/tt_metal/distributed/mesh_device.cpp index dc0275cd26a2..6971abd948ee 100644 --- a/tt_metal/distributed/mesh_device.cpp +++ b/tt_metal/distributed/mesh_device.cpp @@ -489,6 +489,24 @@ MeshSubDeviceManagerId MeshDevice::create_sub_device_manager(tt::stl::Span>& mesh_sub_devices, DeviceAddr local_l1_size) { + MeshSubDeviceManagerId mesh_sub_device_manager_id(*this); + TT_FATAL(mesh_sub_devices.size() == this->num_devices(), "Number of devices does not match number of sub-device configurations"); + for (uint32_t i = 0; i < this->num_devices(); i++) { + auto* device = this->devices[i]; + auto& sub_device_manager_id = mesh_sub_device_manager_id.sub_device_manager_ids[i]; + tt::stl::Span sub_devices(mesh_sub_devices[i]); + device->push_work([device, sub_devices, local_l1_size, &sub_device_manager_id]() { + sub_device_manager_id = device->create_sub_device_manager(sub_devices, local_l1_size); + }); + } + for (auto* device : this->devices) { + device->synchronize(); + } + return mesh_sub_device_manager_id; +} + void MeshDevice::load_sub_device_manager(MeshSubDeviceManagerId mesh_sub_device_manager_id) { for (uint32_t i = 0; i < this->num_devices(); i++) { auto* device = this->devices[i]; diff --git a/tt_metal/distributed/mesh_device.hpp b/tt_metal/distributed/mesh_device.hpp index e7a0ab22db89..a7727fb97bd9 100644 --- a/tt_metal/distributed/mesh_device.hpp +++ b/tt_metal/distributed/mesh_device.hpp @@ -137,6 +137,8 @@ class MeshDevice : public std::enable_shared_from_this { MeshSubDeviceManagerId create_sub_device_manager( tt::stl::Span sub_devices, DeviceAddr local_l1_size); + MeshSubDeviceManagerId create_sub_device_manager( + const std::vector>& mesh_sub_devices, 
DeviceAddr local_l1_size); void load_sub_device_manager(MeshSubDeviceManagerId mesh_sub_device_manager_id); void clear_loaded_sub_device_manager(); void remove_sub_device_manager(MeshSubDeviceManagerId mesh_sub_device_manager_id); diff --git a/ttnn/cpp/ttnn/distributed/distributed_pybind.cpp b/ttnn/cpp/ttnn/distributed/distributed_pybind.cpp index 43ac6aa3574b..ed946f23d9b3 100644 --- a/ttnn/cpp/ttnn/distributed/distributed_pybind.cpp +++ b/ttnn/cpp/ttnn/distributed/distributed_pybind.cpp @@ -190,6 +190,26 @@ void py_module(py::module& module) { Args: sub_devices (List[ttnn.SubDevice]): The sub-devices to include in the sub-device manager. + This configuration will be used for each device in the MeshDevice. + local_l1_size (int): The size of the local allocators of each sub-device. The global allocator will be shrunk by this amount. + + Returns: + MeshSubDeviceManagerId: The ID of the created sub-device manager. + )doc") + .def( + "create_sub_device_manager", + [](MeshDevice& self, + const std::vector>& mesh_sub_devices, + DeviceAddr local_l1_size) { return self.create_sub_device_manager(mesh_sub_devices, local_l1_size); }, + py::arg("sub_devices"), + py::arg("local_l1_size"), + R"doc( + Creates a sub-device manager for the given mesh device. + + Args: + mesh_sub_devices (List[List[ttnn.SubDevice]]): The sub-devices to include in the sub-device manager. + Each element of the outer list will be used to configure the corresponding device in the MeshDevice. + This means that the individual devices in the MeshDevice may have different configurations. local_l1_size (int): The size of the local allocators of each sub-device. The global allocator will be shrunk by this amount. 
Returns: From befa6c8d33f1df92f352d35af229d220b6bd3af3 Mon Sep 17 00:00:00 2001 From: Austin Ho Date: Tue, 10 Dec 2024 14:18:28 +0000 Subject: [PATCH 31/59] #0: Fix recursive bug in hash function of global sem, cb --- tt_metal/impl/buffers/global_circular_buffer.cpp | 2 +- tt_metal/impl/buffers/global_semaphore.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tt_metal/impl/buffers/global_circular_buffer.cpp b/tt_metal/impl/buffers/global_circular_buffer.cpp index 2d8760f1af57..df8df656ac36 100644 --- a/tt_metal/impl/buffers/global_circular_buffer.cpp +++ b/tt_metal/impl/buffers/global_circular_buffer.cpp @@ -164,7 +164,7 @@ namespace std { std::size_t hash::operator()( const tt::tt_metal::v1::experimental::GlobalCircularBuffer& global_circular_buffer) const { - return tt::stl::hash::hash_objects_with_default_seed(global_circular_buffer); + return tt::stl::hash::hash_objects_with_default_seed(global_circular_buffer.attribute_values()); } } // namespace std diff --git a/tt_metal/impl/buffers/global_semaphore.cpp b/tt_metal/impl/buffers/global_semaphore.cpp index 64d16beb377d..f080ab23b065 100644 --- a/tt_metal/impl/buffers/global_semaphore.cpp +++ b/tt_metal/impl/buffers/global_semaphore.cpp @@ -82,7 +82,7 @@ namespace std { std::size_t hash::operator()( const tt::tt_metal::GlobalSemaphore& global_semaphore) const { - return tt::stl::hash::hash_objects_with_default_seed(global_semaphore); + return tt::stl::hash::hash_objects_with_default_seed(global_semaphore.attribute_values()); } } // namespace std From 3b12f7633b0c4c23c138f9dc520b6959338f6d9b Mon Sep 17 00:00:00 2001 From: Juan Camilo Vega Date: Tue, 10 Dec 2024 11:30:52 -0500 Subject: [PATCH 32/59] #15075: Adding assert for completely invalid reshapes (#15846) ### Ticket https://github.com/tenstorrent/tt-metal/issues/15075 ### Problem description We add a validation check for reshape when the volumes do not match as the embedding issue preventing this is now solved We deleted the reshape 
host fallback code that is no longer needed ### Checklist - [x] Post commit CI passes https://github.com/tenstorrent/tt-metal/actions/runs/12190290996 - [ ] Blackhole Post commit (if applicable) - [ ] Model regression CI testing passes (if applicable) - [ ] Device performance regression CI testing passes (if applicable) - [ ] **(For models and ops writers)** Full [new models](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) tests passes - [ ] New/Existing tests provide coverage for changes --------- Co-authored-by: yugi957 --- .../data_movement/reshape_view/reshape.cpp | 32 +------------------ 1 file changed, 1 insertion(+), 31 deletions(-) diff --git a/ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape.cpp b/ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape.cpp index 8fb90cbc7614..904fdf88a1f1 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape.cpp @@ -52,36 +52,6 @@ ttnn::Tensor convert_tile_to_rm( (tensor.get_dtype() == DataType::BFLOAT8_B) ? 
ttnn::typecast(new_tensor, tensor.get_dtype()) : new_tensor; return new_tensor; } -ttnn::Tensor host_reshape(const ttnn::Tensor& tensor, const ttnn::Shape& shape) { - //This function is due to embedding issue 15558, once the issue is fixed we want to delete it - tt::log_warning("host_reshape is deprecated and will be removed in the near future"); - if (!ttnn::has_storage_type_of(tensor, ttnn::StorageType::DEVICE)) { - return tensor.reshape(shape); - } - auto tensor_shape = tensor.shape(); - auto layout = tensor.layout(); - auto device = tensor.device(); - auto memory_config = tensor.memory_config(); - auto host_tensor = tensor.cpu(); - auto rm_tensor = ttnn::to_layout(host_tensor, ttnn::ROW_MAJOR_LAYOUT, std::nullopt, std::nullopt, (Device*)nullptr); - - if (tensor_shape.has_tile_padding()) { - ttnn::Tensor slice_input; - auto host_tensor_4d = unsqueeze_to_4D(rm_tensor); - auto tensor_shape_4d = host_tensor_4d.shape(); - ttnn::SmallVector begins({0, 0, 0, 0}); - ttnn::SmallVector ends( - {tensor_shape_4d[0], tensor_shape_4d[1], tensor_shape_4d[2], tensor_shape_4d[3]}); - ttnn::SmallVector step({1, 1, 1, 1}); - host_tensor_4d = ttnn::slice(host_tensor_4d, begins, ends, step, std::nullopt); - host_tensor = squeeze_from_4D(host_tensor_4d, tensor_shape.rank()); - } - auto host_reshape_tensor = rm_tensor.reshape(shape); - auto final_layout_tensor = - ttnn::to_layout(host_reshape_tensor, layout, std::nullopt, std::nullopt, (Device*)nullptr); - auto device_tensor = ttnn::data_transfer_to_device(final_layout_tensor, device, memory_config); - return device_tensor; -} //Wrapper to turn the ND-> MD problem into 3D->3D for tiled and 2D->2D for Row Major @@ -399,7 +369,7 @@ ttnn::Tensor ReshapeViewOperation::invoke( return tensor.reshape(shape); } //This is a completely incorrect test but it is due to issue 15558 - return detail::host_reshape(tensor, shape); + TT_FATAL(false, "Attempting to reshape between two shapes with different volumes"); } // Catch-all // Do the reshape in 
row-major From b32ae29f09d9510d48b7ce43d8b152c7d9d9bd10 Mon Sep 17 00:00:00 2001 From: Eyon Land <41128502+eyonland@users.noreply.github.com> Date: Tue, 10 Dec 2024 10:47:56 -0600 Subject: [PATCH 33/59] #0: Updating CB enum to CBIndex for binary_ng eltwise op (#15798) ### Problem description Needed to update the CB enum to use CBIndex ### What's changed CB enums are now using CBIndex ### Checklist - [x] Post commit CI passes (https://github.com/tenstorrent/tt-metal/actions/runs/12207327134) - [ ] Blackhole Post commit (if applicable) - [ ] Model regression CI testing passes (if applicable) - [ ] Device performance regression CI testing passes (if applicable) - [ ] New/Existing tests provide coverage for changes --- .../eltwise/binary_ng/device/binary_ng_program_factory.cpp | 6 +++--- .../binary_ng/device/kernels/compute/eltwise_binary.cpp | 6 +++--- .../device/kernels/compute/eltwise_binary_no_bcast.cpp | 6 +++--- .../device/kernels/compute/eltwise_binary_scalar.cpp | 6 +++--- .../kernels/dataflow/reader_interleaved_col_bcast.cpp | 2 +- .../device/kernels/dataflow/reader_interleaved_no_bcast.cpp | 2 +- .../kernels/dataflow/reader_interleaved_row_bcast.cpp | 2 +- .../kernels/dataflow/reader_interleaved_scalar_bcast.cpp | 2 +- .../kernels/dataflow/writer_interleaved_col_bcast.cpp | 4 ++-- .../device/kernels/dataflow/writer_interleaved_no_bcast.cpp | 4 ++-- .../kernels/dataflow/writer_interleaved_row_bcast.cpp | 4 ++-- .../device/kernels/dataflow/writer_interleaved_scalar.cpp | 4 ++-- .../kernels/dataflow/writer_interleaved_scalar_bcast.cpp | 4 ++-- 13 files changed, 26 insertions(+), 26 deletions(-) diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/binary_ng_program_factory.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/binary_ng_program_factory.cpp index 6c710e3d193b..bce5cfdc8242 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/binary_ng_program_factory.cpp +++ 
b/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/binary_ng_program_factory.cpp @@ -302,14 +302,14 @@ BinaryNgDeviceOperation::ProgramFactory::cached_program_t BinaryNgDeviceOperatio // How many tiles to store per input CB (double buffer) constexpr uint32_t num_tiles_per_cb = 2; auto [a_cb, a_cb_handle] = - create_cb(tt::CB::c_in0, program, all_device_cores, a_single_tile_size, num_tiles_per_cb, a_data_format); + create_cb(tt::CBIndex::c_0, program, all_device_cores, a_single_tile_size, num_tiles_per_cb, a_data_format); auto [c_cb, c_cb_handle] = - create_cb(tt::CB::c_out0, program, all_device_cores, c_single_tile_size, num_tiles_per_cb, c_data_format); + create_cb(tt::CBIndex::c_2, program, all_device_cores, c_single_tile_size, num_tiles_per_cb, c_data_format); // If b is a scalar, we only need one tile in the CB uint32_t b_num_tiles_per_cb = b_buffer != nullptr ? num_tiles_per_cb : 1; auto [b_cb, b_cb_handle] = - create_cb(tt::CB::c_in1, program, all_device_cores, b_single_tile_size, b_num_tiles_per_cb, b_data_format); + create_cb(tt::CBIndex::c_1, program, all_device_cores, b_single_tile_size, b_num_tiles_per_cb, b_data_format); auto a_is_dram = static_cast(a_buffer->buffer_type() == tt_metal::BufferType::DRAM); bool b_is_dram = false; diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/kernels/compute/eltwise_binary.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/kernels/compute/eltwise_binary.cpp index fdfdd2f650f5..dff89bfd6134 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/kernels/compute/eltwise_binary.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/kernels/compute/eltwise_binary.cpp @@ -41,9 +41,9 @@ void MAIN { return; } - constexpr auto cb_in0 = tt::CB::c_in0; - constexpr auto cb_in1 = tt::CB::c_in1; - constexpr auto cb_out0 = tt::CB::c_out0; + constexpr auto cb_in0 = tt::CBIndex::c_0; + constexpr auto cb_in1 = tt::CBIndex::c_1; + constexpr auto cb_out0 = tt::CBIndex::c_2; #if BCAST_INPUT auto cb_bcast = 
cb_in1; diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/kernels/compute/eltwise_binary_no_bcast.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/kernels/compute/eltwise_binary_no_bcast.cpp index 6275e672c49a..1e3703fa4b80 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/kernels/compute/eltwise_binary_no_bcast.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/kernels/compute/eltwise_binary_no_bcast.cpp @@ -10,9 +10,9 @@ namespace NAMESPACE { void MAIN { uint32_t num_tiles = get_arg_val(0); - constexpr auto cb_in0 = tt::CB::c_in0; - constexpr auto cb_in1 = tt::CB::c_in1; - constexpr auto cb_out0 = tt::CB::c_out0; + constexpr auto cb_in0 = tt::CBIndex::c_0; + constexpr auto cb_in1 = tt::CBIndex::c_1; + constexpr auto cb_out0 = tt::CBIndex::c_2; binary_op_init_common(cb_in0, cb_in1, cb_out0); add_tiles_init(); diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/kernels/compute/eltwise_binary_scalar.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/kernels/compute/eltwise_binary_scalar.cpp index 4e57b6aba57c..2b377fb52cab 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/kernels/compute/eltwise_binary_scalar.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/kernels/compute/eltwise_binary_scalar.cpp @@ -10,9 +10,9 @@ namespace NAMESPACE { void MAIN { uint32_t num_tiles = get_arg_val(0); - constexpr auto cb_in0 = tt::CB::c_in0; - constexpr auto cb_in1 = tt::CB::c_in1; - constexpr auto cb_out0 = tt::CB::c_out0; + constexpr auto cb_in0 = tt::CBIndex::c_0; + constexpr auto cb_in1 = tt::CBIndex::c_1; + constexpr auto cb_out0 = tt::CBIndex::c_2; binary_op_init_common(cb_in0, cb_in1, cb_out0); add_tiles_init(); diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/kernels/dataflow/reader_interleaved_col_bcast.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/kernels/dataflow/reader_interleaved_col_bcast.cpp index aa98c9575d9e..cf0a1e4e13dd 100644 --- 
a/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/kernels/dataflow/reader_interleaved_col_bcast.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/kernels/dataflow/reader_interleaved_col_bcast.cpp @@ -21,7 +21,7 @@ void kernel_main() { constexpr bool src_is_dram = get_compile_time_arg_val(0) == 1; - constexpr auto cb_id_src = tt::CB::c_in0; + constexpr auto cb_id_src = tt::CBIndex::c_0; constexpr uint32_t onetile = 1; const uint32_t src_tile_bytes = get_tile_size(cb_id_src); diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/kernels/dataflow/reader_interleaved_no_bcast.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/kernels/dataflow/reader_interleaved_no_bcast.cpp index 36cbaaf3b0b5..2379206448ad 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/kernels/dataflow/reader_interleaved_no_bcast.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/kernels/dataflow/reader_interleaved_no_bcast.cpp @@ -18,7 +18,7 @@ void kernel_main() { constexpr bool src_is_dram = get_compile_time_arg_val(0) == 1; - constexpr auto cb_id_src = tt::CB::c_in0; + constexpr auto cb_id_src = tt::CBIndex::c_0; constexpr uint32_t onetile = 1; const uint32_t src_tile_bytes = get_tile_size(cb_id_src); diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/kernels/dataflow/reader_interleaved_row_bcast.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/kernels/dataflow/reader_interleaved_row_bcast.cpp index c5eadbe61d77..7278e6b3510c 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/kernels/dataflow/reader_interleaved_row_bcast.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/kernels/dataflow/reader_interleaved_row_bcast.cpp @@ -21,7 +21,7 @@ void kernel_main() { constexpr bool src_is_dram = get_compile_time_arg_val(0) == 1; - constexpr auto cb_id_src = tt::CB::c_in0; + constexpr auto cb_id_src = tt::CBIndex::c_0; constexpr uint32_t onetile = 1; const uint32_t src_tile_bytes = get_tile_size(cb_id_src); diff --git 
a/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/kernels/dataflow/reader_interleaved_scalar_bcast.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/kernels/dataflow/reader_interleaved_scalar_bcast.cpp index 6d23f3ac2f23..d1f0922d648a 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/kernels/dataflow/reader_interleaved_scalar_bcast.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/kernels/dataflow/reader_interleaved_scalar_bcast.cpp @@ -20,7 +20,7 @@ void kernel_main() { constexpr bool src_is_dram = get_compile_time_arg_val(0) == 1; - constexpr auto cb_id_src = tt::CB::c_in0; + constexpr auto cb_id_src = tt::CBIndex::c_0; constexpr uint32_t onetile = 1; const uint32_t src_tile_bytes = get_tile_size(cb_id_src); diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/kernels/dataflow/writer_interleaved_col_bcast.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/kernels/dataflow/writer_interleaved_col_bcast.cpp index 4c8d2337e17a..f17b23684ff2 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/kernels/dataflow/writer_interleaved_col_bcast.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/kernels/dataflow/writer_interleaved_col_bcast.cpp @@ -22,7 +22,7 @@ void kernel_main() { constexpr uint32_t onetile = 1; - constexpr auto cb_id_src = tt::CB::c_in1; + constexpr auto cb_id_src = tt::CBIndex::c_1; constexpr bool src_is_dram = get_compile_time_arg_val(0) == 1; const uint32_t src_tile_bytes = get_tile_size(cb_id_src); const DataFormat src_data_format = get_dataformat(cb_id_src); @@ -30,7 +30,7 @@ void kernel_main() { const InterleavedAddrGenFast src = { .bank_base_address = src_addr, .page_size = src_tile_bytes, .data_format = src_data_format}; - constexpr auto cb_id_dst = tt::CB::c_out0; + constexpr auto cb_id_dst = tt::CBIndex::c_2; constexpr bool dst_is_dram = get_compile_time_arg_val(1) == 1; const uint32_t dst_tile_bytes = get_tile_size(cb_id_dst); const DataFormat dst_data_format = 
get_dataformat(cb_id_dst); diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/kernels/dataflow/writer_interleaved_no_bcast.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/kernels/dataflow/writer_interleaved_no_bcast.cpp index 0ac36437c032..4ef2f809e8ef 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/kernels/dataflow/writer_interleaved_no_bcast.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/kernels/dataflow/writer_interleaved_no_bcast.cpp @@ -20,7 +20,7 @@ void kernel_main() { constexpr uint32_t onetile = 1; - constexpr auto cb_id_src = tt::CB::c_in1; + constexpr auto cb_id_src = tt::CBIndex::c_1; constexpr bool src_is_dram = get_compile_time_arg_val(0) == 1; const uint32_t src_tile_bytes = get_tile_size(cb_id_src); const DataFormat src_data_format = get_dataformat(cb_id_src); @@ -28,7 +28,7 @@ void kernel_main() { const InterleavedAddrGenFast src = { .bank_base_address = src_addr, .page_size = src_tile_bytes, .data_format = src_data_format}; - constexpr auto cb_id_dst = tt::CB::c_out0; + constexpr auto cb_id_dst = tt::CBIndex::c_2; constexpr bool dst_is_dram = get_compile_time_arg_val(1) == 1; const uint32_t dst_tile_bytes = get_tile_size(cb_id_dst); const DataFormat dst_data_format = get_dataformat(cb_id_dst); diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/kernels/dataflow/writer_interleaved_row_bcast.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/kernels/dataflow/writer_interleaved_row_bcast.cpp index f02dc62537b8..65ff8e60f69e 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/kernels/dataflow/writer_interleaved_row_bcast.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/kernels/dataflow/writer_interleaved_row_bcast.cpp @@ -22,7 +22,7 @@ void kernel_main() { constexpr uint32_t onetile = 1; - constexpr auto cb_id_src = tt::CB::c_in1; + constexpr auto cb_id_src = tt::CBIndex::c_1; constexpr bool src_is_dram = get_compile_time_arg_val(0) == 1; const uint32_t 
src_tile_bytes = get_tile_size(cb_id_src); const DataFormat src_data_format = get_dataformat(cb_id_src); @@ -30,7 +30,7 @@ void kernel_main() { const InterleavedAddrGenFast src = { .bank_base_address = src_addr, .page_size = src_tile_bytes, .data_format = src_data_format}; - constexpr auto cb_id_dst = tt::CB::c_out0; + constexpr auto cb_id_dst = tt::CBIndex::c_2; constexpr bool dst_is_dram = get_compile_time_arg_val(1) == 1; const uint32_t dst_tile_bytes = get_tile_size(cb_id_dst); const DataFormat dst_data_format = get_dataformat(cb_id_dst); diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/kernels/dataflow/writer_interleaved_scalar.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/kernels/dataflow/writer_interleaved_scalar.cpp index 4aa94dac0ab6..452d1dafaa73 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/kernels/dataflow/writer_interleaved_scalar.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/kernels/dataflow/writer_interleaved_scalar.cpp @@ -21,8 +21,8 @@ void kernel_main() { constexpr bool dst_is_dram = get_compile_time_arg_val(1) == 1; - constexpr auto cb_id_src = tt::CB::c_in1; - constexpr auto cb_id_dst = tt::CB::c_out0; + constexpr auto cb_id_src = tt::CBIndex::c_1; + constexpr auto cb_id_dst = tt::CBIndex::c_2; constexpr uint32_t onetile = 1; const uint32_t dst_tile_bytes = get_tile_size(cb_id_dst); diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/kernels/dataflow/writer_interleaved_scalar_bcast.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/kernels/dataflow/writer_interleaved_scalar_bcast.cpp index d0e866c7c50e..18915373d25e 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/kernels/dataflow/writer_interleaved_scalar_bcast.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary_ng/device/kernels/dataflow/writer_interleaved_scalar_bcast.cpp @@ -20,7 +20,7 @@ void kernel_main() { constexpr uint32_t onetile = 1; - constexpr auto cb_id_src = tt::CB::c_in1; + constexpr auto cb_id_src 
= tt::CBIndex::c_1; constexpr bool src_is_dram = get_compile_time_arg_val(0) == 1; const uint32_t src_tile_bytes = get_tile_size(cb_id_src); const DataFormat src_data_format = get_dataformat(cb_id_src); @@ -28,7 +28,7 @@ void kernel_main() { const InterleavedAddrGenFast src = { .bank_base_address = src_addr, .page_size = src_tile_bytes, .data_format = src_data_format}; - constexpr auto cb_id_dst = tt::CB::c_out0; + constexpr auto cb_id_dst = tt::CBIndex::c_2; constexpr bool dst_is_dram = get_compile_time_arg_val(1) == 1; const uint32_t dst_tile_bytes = get_tile_size(cb_id_dst); const DataFormat dst_data_format = get_dataformat(cb_id_dst); From ff50e72dc477ec40352e27e7124dee71f0631539 Mon Sep 17 00:00:00 2001 From: Oleg Milyutin Date: Tue, 10 Dec 2024 13:00:21 -0500 Subject: [PATCH 34/59] #14974: Remove numpy/ directory and the namespace in ttnn (#15852) ### Ticket #14974 ### Problem description Follow up from https://github.com/tenstorrent/tt-metal/pull/15847 - remove usages of `numpy` namespace. ### What's changed * Removed `ttnn::numpy` namespace and the corresponding directory. * Moved `.../operations/numpy/functions.hpp` in `.../operations/`. 
### Checklist - [X] [Post commit CI passes](https://github.com/tenstorrent/tt-metal/actions/runs/12249474071) - [X] New/Existing tests provide coverage for changes --- .../tt_eager/integration_tests/test_bert.cpp | 46 +++++++------------ tests/tt_eager/ops/test_average_pool.cpp | 4 +- tests/tt_eager/ops/test_bcast_op.cpp | 12 ++--- tests/tt_eager/ops/test_bmm_op.cpp | 4 +- tests/tt_eager/ops/test_eltwise_binary_op.cpp | 10 ++-- tests/tt_eager/ops/test_eltwise_unary_op.cpp | 26 +++++------ tests/tt_eager/ops/test_fold_op.cpp | 4 +- tests/tt_eager/ops/test_layernorm_op.cpp | 4 +- tests/tt_eager/ops/test_pad_op.cpp | 4 +- .../tt_eager/ops/test_sliding_window_ops.cpp | 6 +-- tests/tt_eager/ops/test_softmax_op.cpp | 4 +- tests/tt_eager/ops/test_tensor_utils.cpp | 4 +- tests/tt_eager/ops/test_tilize_op.cpp | 4 +- .../ops/test_tilize_op_channels_last.cpp | 4 +- .../tt_eager/ops/test_tilize_zero_padding.cpp | 4 +- tests/tt_eager/ops/test_transpose_op.cpp | 4 +- .../ops/test_transpose_wh_multi_core.cpp | 4 +- .../ops/test_transpose_wh_single_core.cpp | 4 +- tests/tt_eager/tensors/test_copy_and_move.cpp | 38 +++++++-------- .../tensors/test_host_device_loopback.cpp | 6 +-- tests/tt_eager/tensors/test_ranks.cpp | 16 +++---- .../tensors/test_raw_host_memory_pointer.cpp | 16 +++---- tests/tt_metal/test_utils/comparison.hpp | 4 +- .../gtests/tensor/test_create_tensor.cpp | 2 +- .../tensor/test_create_tensor_with_layout.cpp | 2 +- tests/ttnn/unit_tests/gtests/test_add.cpp | 3 +- .../gtests/test_multi_cq_multi_dev.cpp | 2 +- .../gtests/test_multiprod_queue.cpp | 2 +- tt_metal/common/bfloat16.hpp | 2 +- ttnn/cpp/ttnn/operations/creation.hpp | 2 +- .../reshape_on_device/reshape.cpp | 2 +- .../data_movement/reshape_view/reshape.cpp | 2 +- .../unary/device/unary_composite_op.cpp | 6 +-- .../eltwise/unary_backward/unary_backward.cpp | 2 +- .../experimental/reduction/argmax/argmax.cpp | 12 ++--- .../ttnn/operations/{numpy => }/functions.hpp | 6 +-- 
...ple_bilinear_program_factory_multicore.cpp | 2 +- .../reduction/prod/device/prod_op_all.cpp | 6 +-- .../ttnn/operations/reduction/prod/prod.cpp | 2 +- 39 files changed, 134 insertions(+), 153 deletions(-) rename ttnn/cpp/ttnn/operations/{numpy => }/functions.hpp (99%) diff --git a/tests/tt_eager/integration_tests/test_bert.cpp b/tests/tt_eager/integration_tests/test_bert.cpp index 81e51414ebe0..6140860666a6 100644 --- a/tests/tt_eager/integration_tests/test_bert.cpp +++ b/tests/tt_eager/integration_tests/test_bert.cpp @@ -10,7 +10,7 @@ #include "ttnn/operations/normalization/softmax/softmax.hpp" #include "tt_metal/common/constants.hpp" #include "tt_metal/host_api.hpp" -#include "ttnn/operations/numpy/functions.hpp" +#include "ttnn/operations/functions.hpp" #include "ttnn/operations/matmul/matmul.hpp" #include "ttnn/operations/normalization/layernorm/layernorm.hpp" #include "ttnn/operations/eltwise/binary/binary.hpp" @@ -228,7 +228,7 @@ void test_bert() { std::uint32_t intermediate_size = hidden_size * 4; auto attention_mask = - ttnn::numpy::random::uniform( + ttnn::random::uniform( bfloat16(-1.0f), bfloat16(1.0f), {batch_size, 1, TILE_HEIGHT, sequence_size}, Layout::TILE) .to(device, l1_memory_config); @@ -236,73 +236,61 @@ void test_bert() { for (auto encoder_index = 0; encoder_index < num_encoders; encoder_index++) { parameters.emplace( fmt::format("fused_qkv_weight_{}", encoder_index), - ttnn::numpy::random::uniform( - bfloat16(-1.0f), bfloat16(1.0f), {1, 1, hidden_size, hidden_size * 3}, Layout::TILE) + ttnn::random::uniform(bfloat16(-1.0f), bfloat16(1.0f), {1, 1, hidden_size, hidden_size * 3}, Layout::TILE) .to(device, dram_memory_config)); parameters.emplace( fmt::format("fused_qkv_bias_{}", encoder_index), - ttnn::numpy::random::uniform( - bfloat16(-1.0f), bfloat16(1.0f), {1, 1, TILE_HEIGHT, hidden_size * 3}, Layout::TILE) + ttnn::random::uniform(bfloat16(-1.0f), bfloat16(1.0f), {1, 1, TILE_HEIGHT, hidden_size * 3}, Layout::TILE) .to(device, 
dram_memory_config)); parameters.emplace( fmt::format("selfout_weight_{}", encoder_index), - ttnn::numpy::random::uniform( - bfloat16(-1.0f), bfloat16(1.0f), {1, 1, hidden_size, hidden_size}, Layout::TILE) + ttnn::random::uniform(bfloat16(-1.0f), bfloat16(1.0f), {1, 1, hidden_size, hidden_size}, Layout::TILE) .to(device, dram_memory_config)); parameters.emplace( fmt::format("selfout_bias_{}", encoder_index), - ttnn::numpy::random::uniform( - bfloat16(-1.0f), bfloat16(1.0f), {1, 1, TILE_HEIGHT, hidden_size}, Layout::TILE) + ttnn::random::uniform(bfloat16(-1.0f), bfloat16(1.0f), {1, 1, TILE_HEIGHT, hidden_size}, Layout::TILE) .to(device, dram_memory_config)); parameters.emplace( fmt::format("attention_layernorm_weight_{}", encoder_index), - ttnn::numpy::random::uniform( - bfloat16(-1.0f), bfloat16(1.0f), {1, 1, TILE_HEIGHT, TILE_WIDTH}, Layout::ROW_MAJOR) + ttnn::random::uniform(bfloat16(-1.0f), bfloat16(1.0f), {1, 1, TILE_HEIGHT, TILE_WIDTH}, Layout::ROW_MAJOR) .to(device, dram_memory_config)); parameters.emplace( fmt::format("attention_layernorm_bias_{}", encoder_index), - ttnn::numpy::random::uniform( - bfloat16(-1.0f), bfloat16(1.0f), {1, 1, TILE_HEIGHT, TILE_WIDTH}, Layout::ROW_MAJOR) + ttnn::random::uniform(bfloat16(-1.0f), bfloat16(1.0f), {1, 1, TILE_HEIGHT, TILE_WIDTH}, Layout::ROW_MAJOR) .to(device, dram_memory_config)); parameters.emplace( fmt::format("ff1_weight_{}", encoder_index), - ttnn::numpy::random::uniform( - bfloat16(-1.0f), bfloat16(1.0f), {1, 1, hidden_size, intermediate_size}, Layout::TILE) + ttnn::random::uniform(bfloat16(-1.0f), bfloat16(1.0f), {1, 1, hidden_size, intermediate_size}, Layout::TILE) .to(device, dram_memory_config)); parameters.emplace( fmt::format("ff1_bias_{}", encoder_index), - ttnn::numpy::random::uniform( - bfloat16(-1.0f), bfloat16(1.0f), {1, 1, TILE_HEIGHT, intermediate_size}, Layout::TILE) + ttnn::random::uniform(bfloat16(-1.0f), bfloat16(1.0f), {1, 1, TILE_HEIGHT, intermediate_size}, Layout::TILE) .to(device, 
dram_memory_config)); parameters.emplace( fmt::format("ff2_weight_{}", encoder_index), - ttnn::numpy::random::uniform( - bfloat16(-1.0f), bfloat16(1.0f), {1, 1, intermediate_size, hidden_size}, Layout::TILE) + ttnn::random::uniform(bfloat16(-1.0f), bfloat16(1.0f), {1, 1, intermediate_size, hidden_size}, Layout::TILE) .to(device, dram_memory_config)); parameters.emplace( fmt::format("ff2_bias_{}", encoder_index), - ttnn::numpy::random::uniform( - bfloat16(-1.0f), bfloat16(1.0f), {1, 1, TILE_HEIGHT, hidden_size}, Layout::TILE) + ttnn::random::uniform(bfloat16(-1.0f), bfloat16(1.0f), {1, 1, TILE_HEIGHT, hidden_size}, Layout::TILE) .to(device, dram_memory_config)); parameters.emplace( fmt::format("feedforward_layernorm_weight_{}", encoder_index), - ttnn::numpy::random::uniform( - bfloat16(-1.0f), bfloat16(1.0f), {1, 1, TILE_HEIGHT, TILE_WIDTH}, Layout::ROW_MAJOR) + ttnn::random::uniform(bfloat16(-1.0f), bfloat16(1.0f), {1, 1, TILE_HEIGHT, TILE_WIDTH}, Layout::ROW_MAJOR) .to(device, dram_memory_config)); parameters.emplace( fmt::format("feedforward_layernorm_bias_{}", encoder_index), - ttnn::numpy::random::uniform( - bfloat16(-1.0f), bfloat16(1.0f), {1, 1, TILE_HEIGHT, TILE_WIDTH}, Layout::ROW_MAJOR) + ttnn::random::uniform(bfloat16(-1.0f), bfloat16(1.0f), {1, 1, TILE_HEIGHT, TILE_WIDTH}, Layout::ROW_MAJOR) .to(device, dram_memory_config)); }; parameters.emplace( "qa_head_weight", - ttnn::numpy::random::uniform(bfloat16(-1.0f), bfloat16(1.0f), {1, 1, hidden_size, TILE_WIDTH}, Layout::TILE) + ttnn::random::uniform(bfloat16(-1.0f), bfloat16(1.0f), {1, 1, hidden_size, TILE_WIDTH}, Layout::TILE) .to(device, dram_memory_config)); parameters.emplace( "qa_head_bias", ttnn::reshape( - ttnn::numpy::random::uniform(bfloat16(-1.0f), bfloat16(1.0f), {1, 1, TILE_HEIGHT, TILE_WIDTH}, Layout::TILE) + ttnn::random::uniform(bfloat16(-1.0f), bfloat16(1.0f), {1, 1, TILE_HEIGHT, TILE_WIDTH}, Layout::TILE) .to(device, dram_memory_config), ttnn::Shape{tt::tt_metal::LegacyShape{{1, 1, 1, 
TILE_WIDTH}, {1, 1, TILE_HEIGHT, TILE_WIDTH}}})); @@ -310,7 +298,7 @@ void test_bert() { tt::log_debug(tt::LogTest, "run_bert started"); auto begin = std::chrono::steady_clock::now(); auto hidden_states = - ttnn::numpy::random::uniform( + ttnn::random::uniform( bfloat16(-1.0f), bfloat16(1.0f), {batch_size, 1, sequence_size, hidden_size}, Layout::TILE) .to(device, l1_memory_config); for (auto encoder_index = 0; encoder_index < num_encoders; encoder_index++) { diff --git a/tests/tt_eager/ops/test_average_pool.cpp b/tests/tt_eager/ops/test_average_pool.cpp index b30f22a898d2..76a24f50e04d 100644 --- a/tests/tt_eager/ops/test_average_pool.cpp +++ b/tests/tt_eager/ops/test_average_pool.cpp @@ -4,7 +4,7 @@ #include "ttnn/operations/pool/global_avg_pool/global_avg_pool.hpp" #include "ttnn/operations/experimental/auto_format/auto_format.hpp" -#include "ttnn/operations/numpy/functions.hpp" +#include "ttnn/operations/functions.hpp" #include "ttnn/tensor/tensor.hpp" #include "common/constants.hpp" @@ -17,7 +17,7 @@ using tt::tt_metal::Tensor; Tensor run_avg_pool_2d_resnet(tt::tt_metal::LegacyShape& tensor_shape, Device* device) { using ttnn::operations::experimental::auto_format::AutoFormat; - auto input_tensor = ttnn::numpy::random::random(tensor_shape, DataType::BFLOAT16); + auto input_tensor = ttnn::random::random(tensor_shape, DataType::BFLOAT16); auto padded_input_shape = AutoFormat::pad_to_tile_shape(tensor_shape, false, false); Tensor padded_input_tensor = input_tensor; if (!AutoFormat::check_input_tensor_format(input_tensor, padded_input_shape)) { diff --git a/tests/tt_eager/ops/test_bcast_op.cpp b/tests/tt_eager/ops/test_bcast_op.cpp index 05be3303c062..38675a937239 100644 --- a/tests/tt_eager/ops/test_bcast_op.cpp +++ b/tests/tt_eager/ops/test_bcast_op.cpp @@ -8,7 +8,7 @@ #include "ttnn/operations/data_movement/bcast/bcast.hpp" #include "common/constants.hpp" #include -#include +#include using namespace tt; using namespace tt_metal; @@ -49,7 +49,7 @@ int main(int 
argc, char** argv) { throw std::runtime_error("Unsupported Dim!"); } - Tensor a = ttnn::numpy::random::random(input_shape_a).to(Layout::TILE).to(device); + Tensor a = ttnn::random::random(input_shape_a).to(Layout::TILE).to(device); Tensor b = ttnn::zeros( ttnn::Shape({1, 1, TILE_HEIGHT, TILE_WIDTH}), DataType::BFLOAT16, Layout::TILE, *device); @@ -67,28 +67,28 @@ int main(int argc, char** argv) { } { - Tensor a = ttnn::numpy::random::random({1, 1, 32, 4544}).to(Layout::TILE).to(device); + Tensor a = ttnn::random::random({1, 1, 32, 4544}).to(Layout::TILE).to(device); Tensor b = ttnn::zeros(ttnn::Shape({1, 1, 32, 4544}), DataType::BFLOAT16, Layout::TILE, *device); Tensor c = ttnn::bcast(0, a, b, ttnn::BcastOpMath::MUL, ttnn::BcastOpDim::H); Tensor d = c.cpu(); } { - Tensor a = ttnn::numpy::random::random({1, 1, 32, 4544}).to(Layout::TILE).to(device); + Tensor a = ttnn::random::random({1, 1, 32, 4544}).to(Layout::TILE).to(device); Tensor b = ttnn::zeros(ttnn::Shape({1, 1, 32, 4544}), DataType::BFLOAT16, Layout::TILE, *device); Tensor c = ttnn::bcast(0, a, b, ttnn::BcastOpMath::ADD, ttnn::BcastOpDim::H); Tensor d = c.cpu(); } { - Tensor a = ttnn::numpy::random::random({1, 71, 32, 32}).to(Layout::TILE).to(device); + Tensor a = ttnn::random::random({1, 71, 32, 32}).to(Layout::TILE).to(device); Tensor b = ttnn::zeros(ttnn::Shape({1, 1, 32, 32}), DataType::BFLOAT16, Layout::TILE, *device); Tensor c = ttnn::bcast(0, a, b, ttnn::BcastOpMath::MUL, ttnn::BcastOpDim::HW); Tensor d = c.cpu(); } { - Tensor a = ttnn::numpy::random::random({1, 71, 32, 64}).to(Layout::TILE).to(device); + Tensor a = ttnn::random::random({1, 71, 32, 64}).to(Layout::TILE).to(device); Tensor b = ttnn::zeros(ttnn::Shape({1, 1, 32, 32}), DataType::BFLOAT16, Layout::TILE, *device); Tensor c = ttnn::bcast(0, a, b, ttnn::BcastOpMath::MUL, ttnn::BcastOpDim::HW); Tensor d = c.cpu(); diff --git a/tests/tt_eager/ops/test_bmm_op.cpp b/tests/tt_eager/ops/test_bmm_op.cpp index f769870b595f..a247e375718c 100644 --- 
a/tests/tt_eager/ops/test_bmm_op.cpp +++ b/tests/tt_eager/ops/test_bmm_op.cpp @@ -8,7 +8,7 @@ #include "ttnn/tensor/types.hpp" #include "ttnn/operations/matmul/device/matmul_op.hpp" #include "common/constants.hpp" -#include "ttnn/operations/numpy/functions.hpp" +#include "ttnn/operations/functions.hpp" using namespace tt; using namespace tt_metal; @@ -40,7 +40,7 @@ int main(int argc, char** argv) { ttnn::Shape shapeb1({1, 1, Kt * TILE_HEIGHT, Nt * TILE_WIDTH}); // Allocates a DRAM buffer on device populated with values specified by initialize - Tensor a = ttnn::numpy::random::random(shapea.value).to(Layout::TILE).to(device); + Tensor a = ttnn::random::random(shapea.value).to(Layout::TILE).to(device); Tensor b = ttnn::zeros(shapeb, DataType::BFLOAT16, Layout::TILE, *device); Tensor b1 = ttnn::zeros(shapeb1, DataType::BFLOAT16, Layout::TILE, *device); diff --git a/tests/tt_eager/ops/test_eltwise_binary_op.cpp b/tests/tt_eager/ops/test_eltwise_binary_op.cpp index 52a8955c859e..533ad2bed872 100644 --- a/tests/tt_eager/ops/test_eltwise_binary_op.cpp +++ b/tests/tt_eager/ops/test_eltwise_binary_op.cpp @@ -7,7 +7,7 @@ #include "ttnn/tensor/host_buffer/types.hpp" #include "ttnn/tensor/tensor.hpp" #include "ttnn/operations/eltwise/binary/binary.hpp" -#include "ttnn/operations/numpy/functions.hpp" +#include "ttnn/operations/functions.hpp" using tt::tt_metal::DataType; using tt::tt_metal::Device; @@ -37,8 +37,8 @@ Tensor host_function(const Tensor& input_tensor_a, const Tensor& input_tensor_b) template bool run_test( const tt::tt_metal::LegacyShape& shape, const DeviceFunction& device_function, Device* device, Args... 
args) { - auto input_tensor_a = ttnn::numpy::random::random(shape, DataType::BFLOAT16); - auto input_tensor_b = ttnn::numpy::random::random(shape, DataType::BFLOAT16); + auto input_tensor_a = ttnn::random::random(shape, DataType::BFLOAT16); + auto input_tensor_b = ttnn::random::random(shape, DataType::BFLOAT16); auto host_output = HostFunction(input_tensor_a, input_tensor_b); auto device_output = @@ -46,7 +46,7 @@ bool run_test( .cpu() .to(Layout::ROW_MAJOR); - return ttnn::numpy::allclose(host_output, device_output, args...); + return ttnn::allclose(host_output, device_output, args...); } int main() { @@ -114,7 +114,7 @@ int main() { // Allocate a tensor to show that the addresses aren't cached auto input_tensor = - ttnn::numpy::random::uniform(bfloat16(0.0f), bfloat16(0.0f), {1, 1, 32, 32}).to(Layout::TILE).to(device); + ttnn::random::uniform(bfloat16(0.0f), bfloat16(0.0f), {1, 1, 32, 32}).to(Layout::TILE).to(device); run_binary_ops(); diff --git a/tests/tt_eager/ops/test_eltwise_unary_op.cpp b/tests/tt_eager/ops/test_eltwise_unary_op.cpp index 05efa664dcf7..5ca90b2f8962 100644 --- a/tests/tt_eager/ops/test_eltwise_unary_op.cpp +++ b/tests/tt_eager/ops/test_eltwise_unary_op.cpp @@ -13,7 +13,7 @@ #include "ttnn/operations/data_movement/pad/pad.hpp" #include "ttnn/operation.hpp" #include "tt_metal/host_api.hpp" -#include "ttnn/operations/numpy/functions.hpp" +#include "ttnn/operations/functions.hpp" using tt::tt_metal::DataType; using tt::tt_metal::Device; @@ -58,7 +58,7 @@ Tensor host_function(const Tensor& input_tensor) { template bool run_test(Device* device, const tt::tt_metal::LegacyShape& shape, float low, float high, Args... 
args) { - auto input_tensor = ttnn::numpy::random::uniform(bfloat16(low), bfloat16(high), shape).to(Layout::TILE); + auto input_tensor = ttnn::random::uniform(bfloat16(low), bfloat16(high), shape).to(Layout::TILE); using ttnn::operations::unary::UnaryOpType; using ttnn::operations::unary::UnaryWithParam; @@ -66,35 +66,35 @@ bool run_test(Device* device, const tt::tt_metal::LegacyShape& shape, float low, if constexpr (unary_op_type == UnaryOpType::SQRT) { auto host_output = host_function<::detail::sqrt>(input_tensor); auto device_output = ttnn::sqrt(input_tensor.to(device)).cpu(); - return ttnn::numpy::allclose(host_output, device_output, args...); + return ttnn::allclose(host_output, device_output, args...); } else if constexpr (unary_op_type == UnaryOpType::EXP) { auto host_output = host_function<::detail::exp>(input_tensor); auto device_output = ttnn::exp(input_tensor.to(device)).cpu(); - return ttnn::numpy::allclose(host_output, device_output, args...); + return ttnn::allclose(host_output, device_output, args...); } else if constexpr (unary_op_type == UnaryOpType::RECIP) { auto host_output = host_function<::detail::recip>(input_tensor); auto device_output = ttnn::reciprocal(input_tensor.to(device)).cpu(); - return ttnn::numpy::allclose(host_output, device_output, args...); + return ttnn::allclose(host_output, device_output, args...); } else if constexpr (unary_op_type == UnaryOpType::GELU) { auto host_output = host_function<::detail::gelu>(input_tensor); auto device_output = ttnn::gelu(input_tensor.to(device)).cpu(); - return ttnn::numpy::allclose(host_output, device_output, args...); + return ttnn::allclose(host_output, device_output, args...); } else if constexpr (unary_op_type == UnaryOpType::RELU) { auto host_output = host_function<::detail::relu>(input_tensor); auto device_output = ttnn::relu(input_tensor.to(device)).cpu(); - return ttnn::numpy::allclose(host_output, device_output, args...); + return ttnn::allclose(host_output, device_output, args...); } 
else if constexpr (unary_op_type == UnaryOpType::SIGMOID) { auto host_output = host_function<::detail::sigmoid>(input_tensor); auto device_output = ttnn::sigmoid(input_tensor.to(device)).cpu(); - return ttnn::numpy::allclose(host_output, device_output, args...); + return ttnn::allclose(host_output, device_output, args...); } else if constexpr (unary_op_type == UnaryOpType::LOG) { auto host_output = host_function<::detail::log>(input_tensor); auto device_output = ttnn::log(input_tensor.to(device)).cpu(); - return ttnn::numpy::allclose(host_output, device_output, args...); + return ttnn::allclose(host_output, device_output, args...); } else if constexpr (unary_op_type == UnaryOpType::TANH) { auto host_output = host_function<::detail::tanh>(input_tensor); auto device_output = ttnn::tanh(input_tensor.to(device)).cpu(); - return ttnn::numpy::allclose(host_output, device_output, args...); + return ttnn::allclose(host_output, device_output, args...); } TT_ASSERT(false, "Unsupported function"); return false; @@ -111,7 +111,7 @@ void test_operation_infrastructure() { auto device = tt::tt_metal::CreateDevice(device_id); auto shape = tt::tt_metal::LegacyShape{1, 1, TILE_HEIGHT, TILE_WIDTH}; - auto input_tensor = ttnn::numpy::random::uniform(bfloat16(0), bfloat16(1), shape).to(Layout::TILE).to(device); + auto input_tensor = ttnn::random::uniform(bfloat16(0), bfloat16(1), shape).to(Layout::TILE).to(device); ttnn::operations::unary::operation_attributes_t op_args{ {UnaryWithParam{UnaryOpType::SQRT}}, @@ -139,7 +139,7 @@ void test_shape_padding() { tt::tt_metal::Array4D input_shape = {1, 1, 13, 18}; tt::tt_metal::Array4D padded_input_shape = {1, 1, TILE_HEIGHT, TILE_WIDTH}; - auto input_tensor = ttnn::numpy::random::uniform(bfloat16(0), bfloat16(1), input_shape); + auto input_tensor = ttnn::random::uniform(bfloat16(0), bfloat16(1), input_shape); auto padded_input_tensor = ttnn::pad(input_tensor, padded_input_shape, tt::tt_metal::Array4D({0, 0, 0, 0}), 0); @@ -251,7 +251,7 @@ void 
test_program_cache() { // Allocate a tensor to show that the addresses aren't cached auto input_tensor = - ttnn::numpy::random::uniform(bfloat16(0.0f), bfloat16(0.0f), {1, 1, 32, 32}).to(Layout::TILE).to(device); + ttnn::random::uniform(bfloat16(0.0f), bfloat16(0.0f), {1, 1, 32, 32}).to(Layout::TILE).to(device); // Program Cache Hit run_test(device, {1, 1, TILE_HEIGHT, TILE_WIDTH}, 0.0f, 1.0f, 1e-1f, 1e-5f); diff --git a/tests/tt_eager/ops/test_fold_op.cpp b/tests/tt_eager/ops/test_fold_op.cpp index 5baeb28532ed..b5061eed2f3c 100644 --- a/tests/tt_eager/ops/test_fold_op.cpp +++ b/tests/tt_eager/ops/test_fold_op.cpp @@ -5,7 +5,7 @@ #include #include #include -#include +#include #include "ttnn/tensor/tensor.hpp" #include "ttnn/operations/data_movement/fold/fold.hpp" @@ -16,7 +16,7 @@ using namespace tt::tt_metal; using namespace constants; void run_fold(Device* device, tt::tt_metal::LegacyShape shape) { - Tensor input_tensor = ttnn::numpy::random::random(shape).to(Layout::ROW_MAJOR).to(device); + Tensor input_tensor = ttnn::random::random(shape).to(Layout::ROW_MAJOR).to(device); uint32_t stride_h = 2; uint32_t stride_w = 2; uint8_t queue_id = 0; diff --git a/tests/tt_eager/ops/test_layernorm_op.cpp b/tests/tt_eager/ops/test_layernorm_op.cpp index c1cc3cda8e5b..563a5b101e7d 100644 --- a/tests/tt_eager/ops/test_layernorm_op.cpp +++ b/tests/tt_eager/ops/test_layernorm_op.cpp @@ -5,7 +5,7 @@ #include "tt_metal/host_api.hpp" #include "ttnn/tensor/tensor.hpp" #include "ttnn/operations/normalization/layernorm/layernorm.hpp" -#include +#include #include #include @@ -29,7 +29,7 @@ int main(int argc, char** argv) { int device_id = 0; tt_metal::Device* device = tt_metal::CreateDevice(device_id); tt::tt_metal::LegacyShape shape = {1, 1, TILE_HEIGHT, TILE_WIDTH}; - Tensor a = ttnn::numpy::random::random(shape).to(Layout::TILE).to(device); + Tensor a = ttnn::random::random(shape).to(Layout::TILE).to(device); Tensor c = ttnn::layer_norm(a, 1e-4f); Tensor d = c.cpu(); Tensor host_a 
= a.cpu(); // Move tensor a to host to validate diff --git a/tests/tt_eager/ops/test_pad_op.cpp b/tests/tt_eager/ops/test_pad_op.cpp index d15662298ad4..7be4bce13144 100644 --- a/tests/tt_eager/ops/test_pad_op.cpp +++ b/tests/tt_eager/ops/test_pad_op.cpp @@ -10,7 +10,7 @@ #include "ttnn/operation.hpp" #include "ttnn/operations/data_movement/pad/pad.hpp" #include "tt_metal/host_api.hpp" -#include "ttnn/operations/numpy/functions.hpp" +#include "ttnn/operations/functions.hpp" using tt::tt_metal::DataType; using tt::tt_metal::Device; @@ -28,7 +28,7 @@ void test_operation_infrastructure() { tt::tt_metal::Array4D input_shape = {1, 1, 18, 13}; tt::tt_metal::Array4D padded_shape = {1, 1, TILE_HEIGHT, TILE_WIDTH}; - auto input_tensor = ttnn::numpy::random::uniform(bfloat16(0), bfloat16(1), input_shape); + auto input_tensor = ttnn::random::uniform(bfloat16(0), bfloat16(1), input_shape); auto output_tensor = ttnn::pad(input_tensor, padded_shape, tt::tt_metal::Array4D({0, 0, 0, 0}), 0); TT_FATAL(output_tensor.get_padded_shape() == padded_shape, "Error"); diff --git a/tests/tt_eager/ops/test_sliding_window_ops.cpp b/tests/tt_eager/ops/test_sliding_window_ops.cpp index 7afbd3c7f891..6b975ede57ba 100644 --- a/tests/tt_eager/ops/test_sliding_window_ops.cpp +++ b/tests/tt_eager/ops/test_sliding_window_ops.cpp @@ -11,7 +11,7 @@ #include "ttnn/operations/sliding_window/reference_sliding_window.hpp" #include "ttnn/tensor/tensor.hpp" #include "tt_metal/host_api.hpp" -#include "ttnn/operations/numpy/functions.hpp" +#include "ttnn/operations/functions.hpp" #include "ttnn/tensor/types.hpp" using std::vector; @@ -382,9 +382,9 @@ int main() { tt::tt_metal::LegacyShape filter_tensor_shape = {config.window_hw.first, config.window_hw.second}; Tensor input_padded_tensor = - ttnn::numpy::random::random(input_tensor_shape, DataType::BFLOAT16).to(Layout::ROW_MAJOR).cpu(); + ttnn::random::random(input_tensor_shape, DataType::BFLOAT16).to(Layout::ROW_MAJOR).cpu(); Tensor filter_tensor = - 
ttnn::numpy::random::random(filter_tensor_shape, DataType::BFLOAT16).to(Layout::ROW_MAJOR).cpu(); + ttnn::random::random(filter_tensor_shape, DataType::BFLOAT16).to(Layout::ROW_MAJOR).cpu(); auto input_padded_tensor_buf = owned_buffer::get_as(input_padded_tensor); auto filter_tensor_buf = owned_buffer::get_as(filter_tensor); diff --git a/tests/tt_eager/ops/test_softmax_op.cpp b/tests/tt_eager/ops/test_softmax_op.cpp index 4a7ae198b063..7233bc397fc2 100644 --- a/tests/tt_eager/ops/test_softmax_op.cpp +++ b/tests/tt_eager/ops/test_softmax_op.cpp @@ -5,7 +5,7 @@ #include "tt_metal/host_api.hpp" #include "ttnn/tensor/tensor.hpp" #include "ttnn/operations/normalization/softmax/softmax.hpp" -#include "ttnn/operations/numpy/functions.hpp" +#include "ttnn/operations/functions.hpp" #include #include @@ -16,7 +16,7 @@ using namespace tt::tt_metal; using namespace constants; void run_softmax(Device* device, tt::tt_metal::LegacyShape shape) { - Tensor input_tensor = ttnn::numpy::random::random(shape).to(Layout::TILE).to(device); + Tensor input_tensor = ttnn::random::random(shape).to(Layout::TILE).to(device); Tensor device_output_tensor = ttnn::softmax_in_place(input_tensor); Tensor output_tensor = device_output_tensor.cpu(); } diff --git a/tests/tt_eager/ops/test_tensor_utils.cpp b/tests/tt_eager/ops/test_tensor_utils.cpp index 0b6c2e3d376a..9a9c1ae3de25 100644 --- a/tests/tt_eager/ops/test_tensor_utils.cpp +++ b/tests/tt_eager/ops/test_tensor_utils.cpp @@ -12,6 +12,7 @@ #include "ttnn/tensor/tensor.hpp" #include "ttnn/tensor/tensor.hpp" #include "ttnn/operations/creation.hpp" +#include "ttnn/operations/functions.hpp" #include "ttnn/tensor/types.hpp" #include "ttnn/tensor/tensor_utils.hpp" @@ -462,8 +463,7 @@ static void test_convert_conv_weight_tensor_to_tiled_layout_block_sharded() { static void test_convert_conv_bias_tensor_to_tiled_layout_block_sharded() { tt::log_info(tt::LogTest, "Running {}", __func__); for (auto i = 0; i < bias_tensor_shape.size(); i++) { - auto 
input_tensor = - ttnn::numpy::random::random(bias_tensor_shape[i], DataType::BFLOAT16).to(Layout::ROW_MAJOR).cpu(); + auto input_tensor = ttnn::random::random(bias_tensor_shape[i], DataType::BFLOAT16).to(Layout::ROW_MAJOR).cpu(); auto input_buffer = owned_buffer::get_as(input_tensor); auto output_tensor = convert_conv_bias_tensor_to_tiled_layout_block_sharded(input_tensor, shards[i], DataType::BFLOAT16); diff --git a/tests/tt_eager/ops/test_tilize_op.cpp b/tests/tt_eager/ops/test_tilize_op.cpp index c78d37487c88..49531d54a37f 100644 --- a/tests/tt_eager/ops/test_tilize_op.cpp +++ b/tests/tt_eager/ops/test_tilize_op.cpp @@ -5,7 +5,7 @@ #include #include #include -#include +#include #include "common/constants.hpp" #include "ttnn/tensor/host_buffer/functions.hpp" @@ -37,7 +37,7 @@ int main(int argc, char** argv) { //////////////////////////////////////////////////////////////////////////// tt::tt_metal::LegacyShape shape = {1, 64, 32, 64}; // Allocates a DRAM buffer on device populated with values specified by initialize - Tensor a = ttnn::numpy::random::random(shape).to(device); + Tensor a = ttnn::random::random(shape).to(device); Tensor b = ttnn::tilize(a); Tensor c = b.cpu(); diff --git a/tests/tt_eager/ops/test_tilize_op_channels_last.cpp b/tests/tt_eager/ops/test_tilize_op_channels_last.cpp index 47dba7466530..502bc2b696b1 100644 --- a/tests/tt_eager/ops/test_tilize_op_channels_last.cpp +++ b/tests/tt_eager/ops/test_tilize_op_channels_last.cpp @@ -12,7 +12,7 @@ #include "ttnn/tensor/tensor.hpp" #include "ttnn/operations/data_movement/tilize/tilize.hpp" #include "tt_metal/host_api.hpp" -#include "ttnn/operations/numpy/functions.hpp" +#include "ttnn/operations/functions.hpp" using namespace tt; using namespace tt_metal; @@ -37,7 +37,7 @@ int main(int argc, char** argv) { //////////////////////////////////////////////////////////////////////////// tt::tt_metal::LegacyShape shape = {1, 32, 32, 64}; // Allocates a DRAM buffer on device populated with values specified 
by initialize - Tensor a = ttnn::numpy::random::random(shape).to(device); + Tensor a = ttnn::random::random(shape).to(device); Tensor b = ttnn::tilize(a); Tensor c = b.cpu(); //////////////////////////////////////////////////////////////////////////// diff --git a/tests/tt_eager/ops/test_tilize_zero_padding.cpp b/tests/tt_eager/ops/test_tilize_zero_padding.cpp index 178948d3c189..580bd4102955 100644 --- a/tests/tt_eager/ops/test_tilize_zero_padding.cpp +++ b/tests/tt_eager/ops/test_tilize_zero_padding.cpp @@ -12,7 +12,7 @@ #include "ttnn/tensor/tensor.hpp" #include "ttnn/operations/data_movement/tilize_with_val_padding/tilize_with_val_padding.hpp" #include "tt_metal/host_api.hpp" -#include "ttnn/operations/numpy/functions.hpp" +#include "ttnn/operations/functions.hpp" using namespace tt; using namespace tt_metal; @@ -37,7 +37,7 @@ int main(int argc, char** argv) { //////////////////////////////////////////////////////////////////////////// tt::tt_metal::LegacyShape shape = {1, 32, 45, 64}; // Allocates a DRAM buffer on device populated with values specified by initialize - Tensor a = ttnn::numpy::random::random(shape).to(device); + Tensor a = ttnn::random::random(shape).to(device); Tensor b = ttnn::tilize_with_zero_padding(a); Tensor c = b.cpu(); //////////////////////////////////////////////////////////////////////////// diff --git a/tests/tt_eager/ops/test_transpose_op.cpp b/tests/tt_eager/ops/test_transpose_op.cpp index bdb4e64273ee..723950036572 100644 --- a/tests/tt_eager/ops/test_transpose_op.cpp +++ b/tests/tt_eager/ops/test_transpose_op.cpp @@ -5,7 +5,7 @@ #include "tt_metal/host_api.hpp" #include "ttnn/tensor/tensor.hpp" #include "ttnn/operations/data_movement/transpose/transpose.hpp" -#include +#include #include #include @@ -33,7 +33,7 @@ int main(int argc, char** argv) { //////////////////////////////////////////////////////////////////////////// tt::tt_metal::LegacyShape shape = {1, 1, TILE_HEIGHT, TILE_WIDTH}; // Allocates a DRAM buffer on device 
populated with values specified by initialize - Tensor a = ttnn::numpy::random::random(shape).to(Layout::TILE).to(device); + Tensor a = ttnn::random::random(shape).to(Layout::TILE).to(device); tt_metal::Tensor c = ttnn::transpose(a, -2, -1); diff --git a/tests/tt_eager/ops/test_transpose_wh_multi_core.cpp b/tests/tt_eager/ops/test_transpose_wh_multi_core.cpp index 3aba3aaef1e8..f4957b4b2038 100644 --- a/tests/tt_eager/ops/test_transpose_wh_multi_core.cpp +++ b/tests/tt_eager/ops/test_transpose_wh_multi_core.cpp @@ -5,7 +5,7 @@ #include #include #include -#include +#include #include "ttnn/tensor/host_buffer/functions.hpp" #include "ttnn/tensor/host_buffer/types.hpp" @@ -82,7 +82,7 @@ int main(int argc, char** argv) { //////////////////////////////////////////////////////////////////////////// tt::tt_metal::LegacyShape shape = {1, 1, 10 * TILE_HEIGHT, 12 * TILE_WIDTH}; // Allocates a DRAM buffer on device populated with values specified by initialize - Tensor a = ttnn::numpy::random::random(shape).to(Layout::TILE).to(device); + Tensor a = ttnn::random::random(shape).to(Layout::TILE).to(device); tt_metal::Tensor c = ttnn::transpose(a, -2, -1); diff --git a/tests/tt_eager/ops/test_transpose_wh_single_core.cpp b/tests/tt_eager/ops/test_transpose_wh_single_core.cpp index 3aba3aaef1e8..f4957b4b2038 100644 --- a/tests/tt_eager/ops/test_transpose_wh_single_core.cpp +++ b/tests/tt_eager/ops/test_transpose_wh_single_core.cpp @@ -5,7 +5,7 @@ #include #include #include -#include +#include #include "ttnn/tensor/host_buffer/functions.hpp" #include "ttnn/tensor/host_buffer/types.hpp" @@ -82,7 +82,7 @@ int main(int argc, char** argv) { //////////////////////////////////////////////////////////////////////////// tt::tt_metal::LegacyShape shape = {1, 1, 10 * TILE_HEIGHT, 12 * TILE_WIDTH}; // Allocates a DRAM buffer on device populated with values specified by initialize - Tensor a = ttnn::numpy::random::random(shape).to(Layout::TILE).to(device); + Tensor a = 
ttnn::random::random(shape).to(Layout::TILE).to(device); tt_metal::Tensor c = ttnn::transpose(a, -2, -1); diff --git a/tests/tt_eager/tensors/test_copy_and_move.cpp b/tests/tt_eager/tensors/test_copy_and_move.cpp index 656585f33519..5fb62254db1a 100644 --- a/tests/tt_eager/tensors/test_copy_and_move.cpp +++ b/tests/tt_eager/tensors/test_copy_and_move.cpp @@ -10,7 +10,7 @@ #include "ttnn/tensor/tensor.hpp" #include "ttnn/tensor/tensor_impl.hpp" #include "tt_metal/host_api.hpp" -#include "ttnn/operations/numpy/functions.hpp" +#include "ttnn/operations/functions.hpp" using namespace tt; using namespace tt_metal; @@ -21,14 +21,14 @@ bool test_tensor_copy_semantics(Device* device) { tt::tt_metal::LegacyShape single_tile_shape = {1, 1, TILE_HEIGHT, TILE_WIDTH}; // host tensor to host tensor copy constructor - Tensor host_a = ttnn::numpy::random::random(single_tile_shape).to(Layout::TILE); + Tensor host_a = ttnn::random::random(single_tile_shape).to(Layout::TILE); Tensor host_a_copy = host_a; auto host_a_data = owned_buffer::get_as(host_a); auto host_a_copy_data = owned_buffer::get_as(host_a_copy); pass &= host_a_data == host_a_copy_data; // dev tensor to dev tensor copy constructor - Tensor dev_a = ttnn::numpy::random::random(single_tile_shape).to(Layout::TILE).to(device); + Tensor dev_a = ttnn::random::random(single_tile_shape).to(Layout::TILE).to(device); Tensor dev_a_copy = dev_a; auto dev_a_on_host = dev_a.cpu(); auto dev_a_copy_on_host = dev_a_copy.cpu(); @@ -40,14 +40,14 @@ bool test_tensor_copy_semantics(Device* device) { Tensor host_c = ttnn::arange(/*start=*/0, /*stop=*/tt_metal::compute_volume(single_tile_shape), /*step=*/1) .reshape(single_tile_shape) .to(Layout::TILE); - Tensor host_c_copy = ttnn::numpy::random::random(single_tile_shape).to(Layout::TILE); + Tensor host_c_copy = ttnn::random::random(single_tile_shape).to(Layout::TILE); host_c_copy = host_c; auto host_c_data = owned_buffer::get_as(host_c); auto host_c_copy_data = 
owned_buffer::get_as(host_c_copy); pass &= host_c_data == host_c_copy_data; // host tensor updated with dev tensor copy assignment - Tensor host_d_copy = ttnn::numpy::random::random(single_tile_shape).to(Layout::TILE); + Tensor host_d_copy = ttnn::random::random(single_tile_shape).to(Layout::TILE); host_d_copy = dev_a; pass &= (host_d_copy.storage_type() == StorageType::DEVICE); auto host_d_copy_on_host = host_d_copy.cpu(); @@ -56,7 +56,7 @@ bool test_tensor_copy_semantics(Device* device) { // dev tensor updated with host tensor copy assignment Tensor host_e = ttnn::ones(single_tile_shape, DataType::BFLOAT16, Layout::TILE); - Tensor dev_e_copy = ttnn::numpy::random::random(single_tile_shape).to(Layout::TILE).to(device); + Tensor dev_e_copy = ttnn::random::random(single_tile_shape).to(Layout::TILE).to(device); dev_e_copy = host_e; pass &= (dev_e_copy.storage_type() == StorageType::OWNED); auto host_e_data = owned_buffer::get_as(host_e); @@ -81,7 +81,7 @@ bool test_tensor_move_semantics(Device* device) { bool pass = true; tt::tt_metal::LegacyShape single_tile_shape = {1, 1, TILE_HEIGHT, TILE_WIDTH}; - auto random_tensor = ttnn::numpy::random::uniform(bfloat16(-1.0f), bfloat16(1.0f), single_tile_shape); + auto random_tensor = ttnn::random::uniform(bfloat16(-1.0f), bfloat16(1.0f), single_tile_shape); auto bfloat_data = owned_buffer::get_as(random_tensor); // host tensor to host tensor move constructor @@ -100,7 +100,7 @@ bool test_tensor_move_semantics(Device* device) { pass &= dev_a_copy_data == bfloat_data; // host tensor updated with host tensor move assignment - auto random_tensor_three = ttnn::numpy::random::uniform(bfloat16(-1.0f), bfloat16(1.0f), single_tile_shape); + auto random_tensor_three = ttnn::random::uniform(bfloat16(-1.0f), bfloat16(1.0f), single_tile_shape); auto bfloat_data_three = owned_buffer::get_as(random_tensor_three); Tensor host_c = Tensor(OwnedStorage{bfloat_data_three}, single_tile_shape, DataType::BFLOAT16, Layout::TILE); Tensor host_c_copy 
= Tensor(dev_a_copy_on_host.get_storage(), single_tile_shape, DataType::BFLOAT16, Layout::TILE); @@ -117,7 +117,7 @@ bool test_tensor_move_semantics(Device* device) { pass &= host_d_copy_data == bfloat_data; // dev tensor updated with host tensor copy assignment - auto random_tensor_four = ttnn::numpy::random::uniform(bfloat16(-1.0f), bfloat16(1.0f), single_tile_shape); + auto random_tensor_four = ttnn::random::uniform(bfloat16(-1.0f), bfloat16(1.0f), single_tile_shape); auto bfloat_data_four = owned_buffer::get_as(random_tensor_four); Tensor host_e = Tensor(random_tensor_four.get_storage(), single_tile_shape, DataType::BFLOAT16, Layout::TILE); Tensor dev_e_copy = @@ -128,7 +128,7 @@ bool test_tensor_move_semantics(Device* device) { pass &= dev_e_copy_data == bfloat_data_four; // dev tensor updated with dev tensor copy assignment - auto random_tensor_five = ttnn::numpy::random::uniform(bfloat16(-1.0f), bfloat16(1.0f), single_tile_shape); + auto random_tensor_five = ttnn::random::uniform(bfloat16(-1.0f), bfloat16(1.0f), single_tile_shape); auto bfloat_data_five = owned_buffer::get_as(random_tensor_five); Tensor dev_b = Tensor(random_tensor_four.get_storage(), single_tile_shape, DataType::BFLOAT16, Layout::TILE).to(device); @@ -153,32 +153,32 @@ bool test_tensor_deallocate_semantics(Device* device) { MemoryConfig{.memory_layout = TensorMemoryLayout::INTERLEAVED, .buffer_type = BufferType::L1}; // dev tensor allocate, deallocate, reallocate same address DRAM - Tensor dev_a = ttnn::numpy::random::random(single_tile_shape).to(Layout::TILE).to(device, dram_mem_config); + Tensor dev_a = ttnn::random::random(single_tile_shape).to(Layout::TILE).to(device, dram_mem_config); uint32_t address_a = dev_a.buffer()->address(); dev_a.deallocate(); - Tensor dev_b = ttnn::numpy::random::random(single_tile_shape).to(Layout::TILE).to(device, dram_mem_config); + Tensor dev_b = ttnn::random::random(single_tile_shape).to(Layout::TILE).to(device, dram_mem_config); uint32_t address_b = 
dev_b.buffer()->address(); pass &= address_a == address_b; // dev tensor allocate, allocate, deallocate, reallocate same address DRAM - Tensor dev_c = ttnn::numpy::random::random(single_tile_shape).to(Layout::TILE).to(device, dram_mem_config); + Tensor dev_c = ttnn::random::random(single_tile_shape).to(Layout::TILE).to(device, dram_mem_config); dev_b.deallocate(); - Tensor dev_d = ttnn::numpy::random::random(single_tile_shape).to(Layout::TILE).to(device, dram_mem_config); + Tensor dev_d = ttnn::random::random(single_tile_shape).to(Layout::TILE).to(device, dram_mem_config); uint32_t address_d = dev_d.buffer()->address(); pass &= address_b == address_d; // dev tensor allocate, deallocate, reallocate same address L1 - Tensor dev_e = ttnn::numpy::random::random(single_tile_shape).to(Layout::TILE).to(device, l1_mem_config); + Tensor dev_e = ttnn::random::random(single_tile_shape).to(Layout::TILE).to(device, l1_mem_config); uint32_t address_e = dev_e.buffer()->address(); dev_e.deallocate(); - Tensor dev_f = ttnn::numpy::random::random(single_tile_shape).to(Layout::TILE).to(device, l1_mem_config); + Tensor dev_f = ttnn::random::random(single_tile_shape).to(Layout::TILE).to(device, l1_mem_config); uint32_t address_f = dev_f.buffer()->address(); pass &= address_e == address_f; // dev tensor allocate, allocate, deallocate, reallocate same address DRAM - Tensor dev_g = ttnn::numpy::random::random(single_tile_shape).to(Layout::TILE).to(device, l1_mem_config); + Tensor dev_g = ttnn::random::random(single_tile_shape).to(Layout::TILE).to(device, l1_mem_config); dev_f.deallocate(); - Tensor dev_h = ttnn::numpy::random::random(single_tile_shape).to(Layout::TILE).to(device, l1_mem_config); + Tensor dev_h = ttnn::random::random(single_tile_shape).to(Layout::TILE).to(device, l1_mem_config); uint32_t address_h = dev_h.buffer()->address(); pass &= address_f == address_h; @@ -195,7 +195,7 @@ bool test_tensor_deallocate_and_close_device(Device* device) { MemoryConfig{.memory_layout = 
TensorMemoryLayout::INTERLEAVED, .buffer_type = BufferType::L1}; // dev tensor allocate, deallocate, reallocate same address DRAM - Tensor dev_a = ttnn::numpy::random::random(single_tile_shape).to(Layout::TILE).to(device, dram_mem_config); + Tensor dev_a = ttnn::random::random(single_tile_shape).to(Layout::TILE).to(device, dram_mem_config); uint32_t address_a = dev_a.buffer()->address(); pass &= tt_metal::CloseDevice(device); dev_a.deallocate(); diff --git a/tests/tt_eager/tensors/test_host_device_loopback.cpp b/tests/tt_eager/tensors/test_host_device_loopback.cpp index a49871a4d393..f5680c980ca6 100644 --- a/tests/tt_eager/tensors/test_host_device_loopback.cpp +++ b/tests/tt_eager/tensors/test_host_device_loopback.cpp @@ -11,7 +11,7 @@ #include "ttnn/tensor/host_buffer/types.hpp" #include "ttnn/tensor/tensor.hpp" #include "tt_metal/host_api.hpp" -#include "ttnn/operations/numpy/functions.hpp" +#include "ttnn/operations/functions.hpp" using namespace tt; using namespace tt_metal; @@ -21,7 +21,7 @@ bool test_single_tile_single_dram_bank_loopback(Device* device) { bool pass = true; tt::tt_metal::LegacyShape single_tile_shape = {1, 1, TILE_HEIGHT, TILE_WIDTH}; - Tensor host_a = ttnn::numpy::random::random(single_tile_shape).to(Layout::TILE); + Tensor host_a = ttnn::random::random(single_tile_shape).to(Layout::TILE); Tensor device_a = host_a.to(device); Tensor loopbacked_a = device_a.cpu(); auto host_a_data = owned_buffer::get_as(host_a); @@ -35,7 +35,7 @@ bool test_multi_tile_multi_dram_bank_loopback(Device* device) { bool pass = true; tt::tt_metal::LegacyShape multi_tile_shape = {1, 1, 4 * TILE_HEIGHT, 3 * TILE_WIDTH}; - Tensor host_a = ttnn::numpy::random::random(multi_tile_shape).to(Layout::TILE); + Tensor host_a = ttnn::random::random(multi_tile_shape).to(Layout::TILE); Tensor device_a = host_a.to(device); Tensor loopbacked_a = device_a.cpu(); auto host_a_data = owned_buffer::get_as(host_a); diff --git a/tests/tt_eager/tensors/test_ranks.cpp 
b/tests/tt_eager/tensors/test_ranks.cpp index e01b9bb2261d..037a823de23b 100644 --- a/tests/tt_eager/tensors/test_ranks.cpp +++ b/tests/tt_eager/tensors/test_ranks.cpp @@ -13,7 +13,7 @@ #include "ttnn/tensor/tensor.hpp" #include "ttnn/tensor/tensor_impl.hpp" #include "tt_metal/host_api.hpp" -#include "ttnn/operations/numpy/functions.hpp" +#include "ttnn/operations/functions.hpp" using namespace tt; using namespace tt_metal; @@ -23,7 +23,7 @@ bool test_2d_tensor(Device* device) { bool pass = true; Shape shape = {30, 30}; - Tensor tensor = ttnn::numpy::random::random(shape); + Tensor tensor = ttnn::random::random(shape); tensor = tensor.pad_to_tile(0.0f); tensor = tensor.to(Layout::TILE); tensor = tensor.to(device); @@ -36,7 +36,7 @@ bool test_3d_tensor(Device* device) { bool pass = true; Shape shape = {3, 30, 30}; - Tensor tensor = ttnn::numpy::random::random(shape); + Tensor tensor = ttnn::random::random(shape); tensor = tensor.pad_to_tile(0.0f); tensor = tensor.to(Layout::TILE); tensor = tensor.to(device); @@ -49,7 +49,7 @@ bool test_4d_tensor(Device* device) { bool pass = true; Shape shape = {2, 3, 30, 30}; - Tensor tensor = ttnn::numpy::random::random(shape); + Tensor tensor = ttnn::random::random(shape); tensor = tensor.pad_to_tile(0.0f); tensor = tensor.to(Layout::TILE); tensor = tensor.to(device); @@ -62,7 +62,7 @@ bool test_5d_tensor(Device* device) { bool pass = true; Shape shape = {2, 2, 3, 30, 30}; - Tensor tensor = ttnn::numpy::random::random(shape); + Tensor tensor = ttnn::random::random(shape); tensor = tensor.pad_to_tile(0.0f); tensor = tensor.to(Layout::TILE); tensor = tensor.to(device); @@ -75,7 +75,7 @@ bool test_6d_tensor(Device* device) { bool pass = true; Shape shape = {2, 2, 2, 3, 30, 30}; - Tensor tensor = ttnn::numpy::random::random(shape); + Tensor tensor = ttnn::random::random(shape); tensor = tensor.pad_to_tile(0.0f); tensor = tensor.to(Layout::TILE); tensor = tensor.to(device); @@ -88,7 +88,7 @@ bool test_7d_tensor(Device* device) { bool 
pass = true; Shape shape = {2, 2, 2, 2, 3, 30, 30}; - Tensor tensor = ttnn::numpy::random::random(shape); + Tensor tensor = ttnn::random::random(shape); tensor = tensor.pad_to_tile(0.0f); tensor = tensor.to(Layout::TILE); tensor = tensor.to(device); @@ -101,7 +101,7 @@ bool test_8d_tensor(Device* device) { bool pass = true; Shape shape = {2, 2, 2, 2, 2, 3, 30, 30}; - Tensor tensor = ttnn::numpy::random::random(shape); + Tensor tensor = ttnn::random::random(shape); tensor = tensor.pad_to_tile(0.0f); tensor = tensor.to(Layout::TILE); tensor = tensor.to(device); diff --git a/tests/tt_eager/tensors/test_raw_host_memory_pointer.cpp b/tests/tt_eager/tensors/test_raw_host_memory_pointer.cpp index d29570a1e663..e3762d2ae29e 100644 --- a/tests/tt_eager/tensors/test_raw_host_memory_pointer.cpp +++ b/tests/tt_eager/tensors/test_raw_host_memory_pointer.cpp @@ -15,7 +15,7 @@ #include "ttnn/operations/eltwise/binary/binary.hpp" #include "ttnn/operations/eltwise/unary/unary.hpp" #include "tt_metal/host_api.hpp" -#include "ttnn/operations/numpy/functions.hpp" +#include "ttnn/operations/functions.hpp" /* @@ -42,22 +42,18 @@ */ -namespace numpy { - template -struct ndarray { +struct NDArray { tt::tt_metal::LegacyShape shape; void* data; - ndarray(tt::tt_metal::LegacyShape shape) : + NDArray(tt::tt_metal::LegacyShape shape) : shape(shape), data(malloc(tt::tt_metal::compute_volume(shape) * sizeof(DataType))) {} - ~ndarray() { free(data); } + ~NDArray() { free(data); } std::size_t size() const { return tt::tt_metal::compute_volume(shape); } }; -} // namespace numpy - void test_raw_host_memory_pointer() { using tt::tt_metal::BorrowedStorage; using tt::tt_metal::DataType; @@ -81,7 +77,7 @@ void test_raw_host_memory_pointer() { /* Borrow Data from Numpy Start */ // Create some - auto a_np_array = numpy::ndarray(shape); + auto a_np_array = NDArray(shape); void* a_np_array_data = a_np_array.data; auto on_creation_callback = [] {}; auto on_destruction_callback = [] {}; @@ -154,7 +150,7 @@ 
void test_raw_host_memory_pointer() { free(storage_of_alternative_tensor_for_printing); /* Alternative Way to Print End */ - auto d_np_array = numpy::ndarray(shape); + auto d_np_array = NDArray(shape); void* d_np_array_data = d_np_array.data; Tensor d_cpu = Tensor( BorrowedStorage{ diff --git a/tests/tt_metal/test_utils/comparison.hpp b/tests/tt_metal/test_utils/comparison.hpp index 8d785ae12ba4..c9a70212d7ea 100644 --- a/tests/tt_metal/test_utils/comparison.hpp +++ b/tests/tt_metal/test_utils/comparison.hpp @@ -20,8 +20,8 @@ namespace test_utils { //! to_packed() - get packed (into an integral type that is of the bitwidth specified by SIZEOF) //! Constructor(float in) - constructor with a float as the initializer //! Constructor(uint32_t in) - constructor with a uint32_t as the initializer -- only lower bits needed - -// this follows the implementation of numpy::is_close +// +// this follows the implementation of numpy's is_close template bool is_close(const ValueType a, const ValueType b, float rtol = 0.01f, float atol = 0.001f) { float af = 0.0f; diff --git a/tests/ttnn/unit_tests/gtests/tensor/test_create_tensor.cpp b/tests/ttnn/unit_tests/gtests/tensor/test_create_tensor.cpp index 9103b27b0982..e98fbf993bd2 100644 --- a/tests/ttnn/unit_tests/gtests/tensor/test_create_tensor.cpp +++ b/tests/ttnn/unit_tests/gtests/tensor/test_create_tensor.cpp @@ -9,7 +9,7 @@ #include "ttnn/device.hpp" #include "ttnn/operations/core/core.hpp" #include "ttnn/async_runtime.hpp" -#include "ttnn/operations/numpy/functions.hpp" +#include "ttnn/operations/functions.hpp" #include "tt_metal/common/logger.hpp" #include "common_tensor_test_utils.hpp" diff --git a/tests/ttnn/unit_tests/gtests/tensor/test_create_tensor_with_layout.cpp b/tests/ttnn/unit_tests/gtests/tensor/test_create_tensor_with_layout.cpp index 4e413a576a25..c7a3f077c73c 100644 --- a/tests/ttnn/unit_tests/gtests/tensor/test_create_tensor_with_layout.cpp +++ 
b/tests/ttnn/unit_tests/gtests/tensor/test_create_tensor_with_layout.cpp @@ -8,7 +8,7 @@ #include "ttnn/device.hpp" #include "ttnn/operations/core/core.hpp" #include "ttnn/async_runtime.hpp" -#include "ttnn/operations/numpy/functions.hpp" +#include "ttnn/operations/functions.hpp" #include "tt_metal/common/logger.hpp" #include "ttnn/tensor/tensor.hpp" diff --git a/tests/ttnn/unit_tests/gtests/test_add.cpp b/tests/ttnn/unit_tests/gtests/test_add.cpp index 579aa08e4bd2..84fda89c066b 100644 --- a/tests/ttnn/unit_tests/gtests/test_add.cpp +++ b/tests/ttnn/unit_tests/gtests/test_add.cpp @@ -36,8 +36,7 @@ TEST_P(Add1DTensorAndScalarFixture, AddsScalarCorrectly) { const auto expected_tensor = ttnn::operations::creation::full(shape, param.scalar, DataType::BFLOAT16, ttnn::TILE_LAYOUT, device); TT_FATAL( - ttnn::numpy::allclose<::bfloat16>(ttnn::from_device(expected_tensor), ttnn::from_device(output_tensor)), - "Error"); + ttnn::allclose<::bfloat16>(ttnn::from_device(expected_tensor), ttnn::from_device(output_tensor)), "Error"); } ttnn::close_device(device); } diff --git a/tests/ttnn/unit_tests/gtests/test_multi_cq_multi_dev.cpp b/tests/ttnn/unit_tests/gtests/test_multi_cq_multi_dev.cpp index 5da2cde8f93f..c15631eba3e7 100644 --- a/tests/ttnn/unit_tests/gtests/test_multi_cq_multi_dev.cpp +++ b/tests/ttnn/unit_tests/gtests/test_multi_cq_multi_dev.cpp @@ -9,7 +9,7 @@ #include "ttnn/operations/eltwise/unary/unary.hpp" #include "common/bfloat16.hpp" #include "ttnn/async_runtime.hpp" -#include "ttnn/operations/numpy/functions.hpp" +#include "ttnn/operations/functions.hpp" #include "tt_metal/impl/event/event.hpp" #include diff --git a/tests/ttnn/unit_tests/gtests/test_multiprod_queue.cpp b/tests/ttnn/unit_tests/gtests/test_multiprod_queue.cpp index 038401cd0ffc..6a4a1bc58abd 100644 --- a/tests/ttnn/unit_tests/gtests/test_multiprod_queue.cpp +++ b/tests/ttnn/unit_tests/gtests/test_multiprod_queue.cpp @@ -9,7 +9,7 @@ #include "ttnn/operations/eltwise/binary/binary.hpp" #include 
"common/bfloat16.hpp" #include "ttnn/async_runtime.hpp" -#include "ttnn/operations/numpy/functions.hpp" +#include "ttnn/operations/functions.hpp" #include "tt_metal/impl/event/event.hpp" #include #include diff --git a/tt_metal/common/bfloat16.hpp b/tt_metal/common/bfloat16.hpp index e81d5059e6bd..c03a58513cf7 100644 --- a/tt_metal/common/bfloat16.hpp +++ b/tt_metal/common/bfloat16.hpp @@ -340,7 +340,7 @@ inline bool equal_within_n_sig_figs(float a, float b, int n) { inline bool equal_within_absolute_tolerance(float a, float b, float tol) { return std::abs(a - b) < tol; } -// this follows the implementation of numpy::is_close +// this follows the implementation of numpy's is_close inline bool is_close(float a, float b, float rtol = 0.01f, float atol = 0.001f) { // the idea is near zero we want absolute tolerance since relative doesn't make sense // (consider 1e-6f and 1.1e-6f) diff --git a/ttnn/cpp/ttnn/operations/creation.hpp b/ttnn/cpp/ttnn/operations/creation.hpp index 3267e2dab295..e7f5a1198f45 100644 --- a/ttnn/cpp/ttnn/operations/creation.hpp +++ b/ttnn/cpp/ttnn/operations/creation.hpp @@ -13,7 +13,7 @@ #include "ttnn/decorators.hpp" #include "ttnn/distributed/types.hpp" #include "ttnn/operations/eltwise/unary/unary.hpp" -#include "ttnn/operations/numpy/functions.hpp" +#include "ttnn/operations/functions.hpp" #include "ttnn/any_device.hpp" #include "ttnn/tensor/tensor.hpp" #include "ttnn/tensor/tensor_utils.hpp" diff --git a/ttnn/cpp/ttnn/operations/data_movement/reshape_on_device/reshape.cpp b/ttnn/cpp/ttnn/operations/data_movement/reshape_on_device/reshape.cpp index 50b069c43c21..0910eb284cfa 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/reshape_on_device/reshape.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/reshape_on_device/reshape.cpp @@ -6,7 +6,7 @@ #include "ttnn/run_operation.hpp" #include "reshape.hpp" #include "tt_metal/common/constants.hpp" -#include +#include #include "ttnn/operations/experimental/auto_format/auto_format.hpp" #include 
"ttnn/tensor/tensor_utils.hpp" #include "device/reshape_op.hpp" diff --git a/ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape.cpp b/ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape.cpp index 904fdf88a1f1..eddb977d02b0 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape.cpp @@ -9,7 +9,7 @@ #include "reshape_common.hpp" #include "tt_metal/common/constants.hpp" #include -#include +#include #include "ttnn/operations/experimental/auto_format/auto_format.hpp" #include "ttnn/tensor/tensor_utils.hpp" #include "ttnn/cpp/ttnn/operations/data_movement/reshape_on_device/reshape.hpp" diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_composite_op.cpp b/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_composite_op.cpp index 7105ccf31cc0..a30bdc2cc5ee 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_composite_op.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_composite_op.cpp @@ -12,7 +12,7 @@ #include "tt_metal/common/bfloat16.hpp" #include "ttnn/operations/data_movement/reshape_on_device/reshape.hpp" #include "ttnn/operations/data_movement/bcast/bcast.hpp" -#include "ttnn/operations/numpy/functions.hpp" +#include "ttnn/operations/functions.hpp" #include "ttnn/operations/data_movement/slice/slice.hpp" #include "ttnn/operations/eltwise/unary/unary_composite.hpp" #include "ttnn/operations/eltwise/binary/binary_composite.hpp" @@ -721,7 +721,7 @@ Tensor _swiglu(const Tensor& input_a, int32_t dim, const std::optional& output_mem_config) { - Tensor index_l = numpy::index_tril<::bfloat16>( + Tensor index_l = ttnn::index_tril<::bfloat16>( input_a.get_legacy_shape(), diag, DataType::BFLOAT16, @@ -733,7 +733,7 @@ Tensor _tril(const Tensor& input_a, int32_t diag, const std::optional& output_mem_config) { - Tensor index_u = numpy::index_triu<::bfloat16>( + Tensor index_u = ttnn::index_triu<::bfloat16>( input_a.get_legacy_shape(), 
diag, DataType::BFLOAT16, diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary_backward/unary_backward.cpp b/ttnn/cpp/ttnn/operations/eltwise/unary_backward/unary_backward.cpp index a6a9e50998b4..f6f8db7c40c8 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/unary_backward/unary_backward.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/unary_backward/unary_backward.cpp @@ -1863,7 +1863,7 @@ std::vector ExecuteUnaryBackwardProd::invoke( if (all_dimensions == true) { Tensor temp = ttnn::multiply( prod_result, grad, std::nullopt, output_memory_config); // result is stored in the first position - Tensor fill_tensor = numpy::fill_first_val_into_tensor<::bfloat16>( + Tensor fill_tensor = ttnn::fill_first_val_into_tensor<::bfloat16>( temp, temp.get_dtype(), temp.get_layout(), temp.device(), output_memory_config); Tensor all_dimension_result = ttnn::multiply( ttnn::reciprocal(input, output_memory_config), fill_tensor, std::nullopt, output_memory_config); diff --git a/ttnn/cpp/ttnn/operations/experimental/reduction/argmax/argmax.cpp b/ttnn/cpp/ttnn/operations/experimental/reduction/argmax/argmax.cpp index ec0f2eae6106..205198c1d317 100644 --- a/ttnn/cpp/ttnn/operations/experimental/reduction/argmax/argmax.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/reduction/argmax/argmax.cpp @@ -21,7 +21,7 @@ Tensor create_mask(const Tensor& input_a, const std::optional& out return input_a; } float t_inf = -std::numeric_limits::infinity(); - Tensor masked_input = numpy::mask_padded_input<::bfloat16>(padded_shape, unpadded_shape, DataType::BFLOAT16); + Tensor masked_input = ttnn::mask_padded_input<::bfloat16>(padded_shape, unpadded_shape, DataType::BFLOAT16); masked_input = ttnn::where(masked_input, input_a, t_inf, output_mem_config.value()); return masked_input; } @@ -49,12 +49,12 @@ Tensor ArgmaxOperation::invoke( bool is_width = (dim == (input_shape.rank() - 1)); Tensor max_val = ttnn::max(input_a, (int)dim, true, output_memory_config); Tensor max_tensor = ttnn::zeros_like(input_a); - Tensor 
tindex = numpy::index_width<::bfloat16>( + Tensor tindex = ttnn::index_width<::bfloat16>( input_shape, DataType::BFLOAT16, Layout::TILE, input_a.device(), output_memory_config); if (is_width) { max_tensor = ttnn::add(max_tensor, max_val, std::nullopt, output_memory_config); } else { - tindex = numpy::index_height<::bfloat16>( + tindex = ttnn::index_height<::bfloat16>( input_shape, DataType::BFLOAT16, Layout::TILE, input_a.device(), output_memory_config); max_tensor = ttnn::add(max_tensor, max_val, std::nullopt, output_memory_config); } @@ -93,10 +93,10 @@ Tensor ArgmaxOperation::invoke( concat_out = ttnn::reshape(concat_out, input_a.get_shape()); Tensor cmp_results = ttnn::eq(input_a, concat_out, std::nullopt, output_memory_config); concat_out.deallocate(); - Tensor tindex = numpy::index_channel<::bfloat16>( + Tensor tindex = ttnn::index_channel<::bfloat16>( input_shape, DataType::BFLOAT16, Layout::TILE, input_a.device(), output_memory_config); if (!is_channel) { - tindex = numpy::index_batch<::bfloat16>( + tindex = ttnn::index_batch<::bfloat16>( input_shape, DataType::BFLOAT16, Layout::TILE, input_a.device(), output_memory_config); } tindex = tindex.to(input_a.device()); @@ -119,7 +119,7 @@ Tensor ArgmaxOperation::invoke( } // TODO: Fix the index generation code. 
With the fix the code will work for argmax that return entire // maximum value index - Tensor tindex = numpy::index_all<::bfloat16>( + Tensor tindex = ttnn::index_all<::bfloat16>( input_shape, DataType::BFLOAT16, Layout::TILE, input_a.device(), output_memory_config); Tensor max_val = ttnn::max(input_a, std::nullopt, true, output_memory_config); Tensor max_tensor = ttnn::zeros_like(input_a); diff --git a/ttnn/cpp/ttnn/operations/numpy/functions.hpp b/ttnn/cpp/ttnn/operations/functions.hpp similarity index 99% rename from ttnn/cpp/ttnn/operations/numpy/functions.hpp rename to ttnn/cpp/ttnn/operations/functions.hpp index 51a3668eed1a..ed0d598e9906 100644 --- a/ttnn/cpp/ttnn/operations/numpy/functions.hpp +++ b/ttnn/cpp/ttnn/operations/functions.hpp @@ -17,8 +17,6 @@ namespace ttnn { -namespace numpy { - using tt::tt_metal::DataType; using tt::tt_metal::Device; using tt::tt_metal::Layout; @@ -554,12 +552,12 @@ static bool allclose(const Tensor& tensor_a, const Tensor& tensor_b, Args... arg auto tensor_b_buffer = tt::tt_metal::owned_buffer::get_as(tensor_b); for (int index = 0; index < tensor_a_buffer.size(); index++) { - if (not detail::nearly_equal(tensor_a_buffer[index], tensor_b_buffer[index], args...)) { + using ::ttnn::detail::nearly_equal; + if (not nearly_equal(tensor_a_buffer[index], tensor_b_buffer[index], args...)) { return false; } } return true; } -} // namespace numpy } // namespace ttnn diff --git a/ttnn/cpp/ttnn/operations/pool/upsample/device/upsample_bilinear_program_factory_multicore.cpp b/ttnn/cpp/ttnn/operations/pool/upsample/device/upsample_bilinear_program_factory_multicore.cpp index 7c1903d8d635..1f07bc081f52 100644 --- a/ttnn/cpp/ttnn/operations/pool/upsample/device/upsample_bilinear_program_factory_multicore.cpp +++ b/ttnn/cpp/ttnn/operations/pool/upsample/device/upsample_bilinear_program_factory_multicore.cpp @@ -15,7 +15,7 @@ #include "ttnn/operations/reduction/generic/device/reduce_op.hpp" // for reduce_op_utils #include 
"tt_metal/tt_stl/reflection.hpp" -#include "ttnn/operations/numpy/functions.hpp" +#include "ttnn/operations/functions.hpp" #include "ttnn/operations/sliding_window/sliding_window.hpp" #include "ttnn/operations/sliding_window/halo/halo.hpp" diff --git a/ttnn/cpp/ttnn/operations/reduction/prod/device/prod_op_all.cpp b/ttnn/cpp/ttnn/operations/reduction/prod/device/prod_op_all.cpp index ca0e4288dd50..7603863c7d90 100644 --- a/ttnn/cpp/ttnn/operations/reduction/prod/device/prod_op_all.cpp +++ b/ttnn/cpp/ttnn/operations/reduction/prod/device/prod_op_all.cpp @@ -8,7 +8,7 @@ #include "prod_op_all.hpp" #include "ttnn/operations/eltwise/unary/unary.hpp" #include "tt_metal/common/constants.hpp" -#include +#include #include "tt_metal/host_api.hpp" #include "tt_metal/tools/profiler/op_profiler.hpp" @@ -45,11 +45,11 @@ Tensor prod_all(const Tensor& input, const MemoryConfig& output_mem_config) { operation::run(Prod_op{.output_mem_config = output_mem_config}, {input}).at(0), output_mem_config); auto arch_env = tt_ClusterDescriptor::detect_arch((chip_id_t)0); if (arch_env == tt::ARCH::WORMHOLE_B0) { - return ttnn::numpy::prod_result_computation_WH_B0( + return ttnn::prod_result_computation_WH_B0( result, result.get_dtype(), result.get_layout(), result.device(), output_mem_config); } // else --> GS Arch - return ttnn::numpy::prod_result_computation_GS( + return ttnn::prod_result_computation_GS( result, result.get_dtype(), result.get_layout(), result.device(), output_mem_config); } diff --git a/ttnn/cpp/ttnn/operations/reduction/prod/prod.cpp b/ttnn/cpp/ttnn/operations/reduction/prod/prod.cpp index 3593bf99eabc..ae3307c6dd76 100644 --- a/ttnn/cpp/ttnn/operations/reduction/prod/prod.cpp +++ b/ttnn/cpp/ttnn/operations/reduction/prod/prod.cpp @@ -9,7 +9,7 @@ #include "ttnn/cpp/ttnn/operations/creation.hpp" #include "ttnn/operations/data_movement/slice/slice.hpp" #include "ttnn/operations/data_movement/permute/permute.hpp" -#include "ttnn/operations/numpy/functions.hpp" +#include 
"ttnn/operations/functions.hpp" #include "ttnn/types.hpp" #include "ttnn/common/constants.hpp" From d679e662d8bfbed61d4e249b2d5095968187bf7a Mon Sep 17 00:00:00 2001 From: Andrew Fuller Date: Tue, 10 Dec 2024 14:42:51 -0500 Subject: [PATCH 35/59] #14393: Use python3 (#15867) ### Ticket Progress towards https://github.com/tenstorrent/tt-metal/issues/14393 ### Problem description Not all systems can be assumed to have `python`. Call out `python3` as that's what we need. ### What's changed env python -> env python3 ### Checklist - [x] Post commit CI passes https://github.com/tenstorrent/tt-metal/actions/runs/12260302459 - [ ] Blackhole Post commit (if applicable) - [ ] Model regression CI testing passes (if applicable) - [ ] Device performance regression CI testing passes (if applicable) - [ ] **(For models and ops writers)** Full [new models](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) tests passes - [ ] New/Existing tests provide coverage for changes --- models/perf/perf_report.py | 2 +- tests/scripts/run_cpp_unit_tests.sh | 8 ++++---- tests/scripts/run_performance.sh | 8 ++++---- tests/scripts/t3000/run_t3000_model_perf_tests.sh | 4 ++-- tests/scripts/tg/run_tg_model_perf_tests.sh | 4 ++-- tests/scripts/tgg/run_tgg_model_perf_tests.sh | 4 ++-- 6 files changed, 15 insertions(+), 15 deletions(-) diff --git a/models/perf/perf_report.py b/models/perf/perf_report.py index 379c27c0ce40..13a20e3db521 100755 --- a/models/perf/perf_report.py +++ b/models/perf/perf_report.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
# SPDX-License-Identifier: Apache-2.0 diff --git a/tests/scripts/run_cpp_unit_tests.sh b/tests/scripts/run_cpp_unit_tests.sh index 8179168ac794..ad74b3333aa4 100755 --- a/tests/scripts/run_cpp_unit_tests.sh +++ b/tests/scripts/run_cpp_unit_tests.sh @@ -22,16 +22,16 @@ rm -rf $kernel_path ./build/test/tt_metal/distributed/distributed_unit_tests --gtest_filter=MeshDeviceSuite.* if [[ ! -z "$TT_METAL_SLOW_DISPATCH_MODE" ]]; then - env python tests/scripts/run_tt_metal.py --dispatch-mode slow - env python tests/scripts/run_tt_eager.py --dispatch-mode slow + env python3 tests/scripts/run_tt_metal.py --dispatch-mode slow + env python3 tests/scripts/run_tt_eager.py --dispatch-mode slow else TT_METAL_GTEST_NUM_HW_CQS=2 ./build/test/tt_metal/unit_tests_dispatch --gtest_filter=MultiCommandQueue*Fixture.* # Enable this on BH after #14613 if [[ "$ARCH_NAME" == "wormhole_b0" ]]; then TT_METAL_GTEST_ETH_DISPATCH=1 ./build/test/tt_metal/unit_tests_dispatch fi - env python tests/scripts/run_tt_eager.py --dispatch-mode fast - env python tests/scripts/run_tt_metal.py --dispatch-mode fast + env python3 tests/scripts/run_tt_eager.py --dispatch-mode fast + env python3 tests/scripts/run_tt_metal.py --dispatch-mode fast fi # Tool tests use C++ unit tests so include them here. 
diff --git a/tests/scripts/run_performance.sh b/tests/scripts/run_performance.sh index 7c42512474db..514bf13510c7 100755 --- a/tests/scripts/run_performance.sh +++ b/tests/scripts/run_performance.sh @@ -44,7 +44,7 @@ run_perf_models_other() { env pytest -n auto models/demos/squeezebert/tests/test_performance.py -m $test_marker ## Merge all the generated reports - env python models/perf/merge_perf_results.py + env python3 models/perf/merge_perf_results.py } run_perf_models_llm_javelin() { @@ -64,7 +64,7 @@ run_perf_models_llm_javelin() { env pytest -n auto models/demos/wormhole/mamba/tests -m $test_marker fi ## Merge all the generated reports - env python models/perf/merge_perf_results.py + env python3 models/perf/merge_perf_results.py } run_perf_models_cnn_javelin() { @@ -76,7 +76,7 @@ run_perf_models_cnn_javelin() { env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/wormhole/stable_diffusion/tests -m $test_marker --timeout=480 ## Merge all the generated reports - env python models/perf/merge_perf_results.py + env python3 models/perf/merge_perf_results.py } run_device_perf_models() { @@ -128,7 +128,7 @@ run_device_perf_models() { fi ## Merge all the generated reports - env python models/perf/merge_device_perf_results.py + env python3 models/perf/merge_device_perf_results.py } run_device_perf_ops() { diff --git a/tests/scripts/t3000/run_t3000_model_perf_tests.sh b/tests/scripts/t3000/run_t3000_model_perf_tests.sh index eff50354e04e..0b51812929a0 100755 --- a/tests/scripts/t3000/run_t3000_model_perf_tests.sh +++ b/tests/scripts/t3000/run_t3000_model_perf_tests.sh @@ -144,7 +144,7 @@ run_t3000_llm_tests() { run_t3000_falcon40b_tests # Merge all the generated reports - env python models/perf/merge_perf_results.py + env python3 models/perf/merge_perf_results.py } run_t3000_cnn_tests() { @@ -152,7 +152,7 @@ run_t3000_cnn_tests() { run_t3000_resnet50_tests # Merge all the generated reports - env python models/perf/merge_perf_results.py + 
env python3 models/perf/merge_perf_results.py } run_t3000_ccl_tests() { diff --git a/tests/scripts/tg/run_tg_model_perf_tests.sh b/tests/scripts/tg/run_tg_model_perf_tests.sh index d86a7a966888..cf8f9f14c6e8 100755 --- a/tests/scripts/tg/run_tg_model_perf_tests.sh +++ b/tests/scripts/tg/run_tg_model_perf_tests.sh @@ -7,7 +7,7 @@ run_tg_llm_tests() { pytest -n auto models/demos/t3000/llama2_70b/tests/test_llama_perf_decode.py -m "model_perf_tg" --timeout=600 ; fail+=$? # Merge all the generated reports - env python models/perf/merge_perf_results.py; fail+=$? + env python3 models/perf/merge_perf_results.py; fail+=$? if [[ $fail -ne 0 ]]; then echo "LOG_METAL: run_tg_model_perf_tests failed" @@ -21,7 +21,7 @@ run_tg_cnn_tests() { env pytest -n auto models/demos/tg/resnet50/tests/test_perf_e2e_resnet50.py -m "model_perf_tg" ; fail+=$? # Merge all the generated reports - env python models/perf/merge_perf_results.py; fail+=$? + env python3 models/perf/merge_perf_results.py; fail+=$? if [[ $fail -ne 0 ]]; then echo "LOG_METAL: run_tg_model_perf_tests failed" diff --git a/tests/scripts/tgg/run_tgg_model_perf_tests.sh b/tests/scripts/tgg/run_tgg_model_perf_tests.sh index c75b5ea6cabc..56d43955f020 100755 --- a/tests/scripts/tgg/run_tgg_model_perf_tests.sh +++ b/tests/scripts/tgg/run_tgg_model_perf_tests.sh @@ -3,7 +3,7 @@ set -eo pipefail run_tgg_llm_tests() { # Merge all the generated reports - env python models/perf/merge_perf_results.py + env python3 models/perf/merge_perf_results.py } run_tgg_cnn_tests() { @@ -12,7 +12,7 @@ run_tgg_cnn_tests() { env pytest -n auto models/demos/tgg/resnet50/tests/test_perf_e2e_resnet50.py -m "model_perf_tgg" --timeout=900 ; fail+=$? 
# Merge all the generated reports - env python models/perf/merge_perf_results.py + env python3 models/perf/merge_perf_results.py } main() { From 2a8e596a692bb1a6427ee4179e200c45ec6da20b Mon Sep 17 00:00:00 2001 From: asaigal Date: Fri, 22 Nov 2024 15:40:33 -0500 Subject: [PATCH 36/59] #15809: Add SW Support for Coordinate Virtualization on WH - Only Logical and Virtual Coordinates are now exposed to users - Tensix and Ethernet can be virtualized on WH. Users can only access DRAM controller end-points through Logical Coordinates/ Bank IDs. Logical to Physical Translation for DRAM is done on device through the get_noc_addr_from_bank_id API - GS does not support Virtual Coordinates. BH will support this feature in future, but is currently blocked by Syseng. For these archs, Virtual and Physical coordinates are identical. Hence, on BH, users temporarily have access to Physical Coordinates - APIs returning Physical Coordinates have either been deprecated or migrated to return Virtual Coordinates instead - Modified Host/Device Runtime, TTNN Ops, Debug Tools and tests to support Virtual Coordinates - Unlocks offline compilation and Fast Dispatch Command Generation + Program Reuse and broadcasts (key features for TT-Mesh) --- .github/workflows/all-static-checks.yaml | 2 +- CONTRIBUTING.md | 36 +- docs/source/tt-metalium/tools/watcher.rst | 2 +- .../tt_metal/examples/dram_loopback.rst | 9 +- .../tt_metal/examples/eltwise_sfpu.rst | 3 +- ...al_Add_Two_Integers_in_a_Compute_Kernel.md | 39 +- .../add_2_integers_in_compute.md | 20 +- .../add_2_integers_in_riscv.md | 8 +- .../dram_loopback/dram_loopback.md | 10 +- .../eltwise_sfpu/eltwise_sfpu.md | 3 +- .../kernels/dataflow/reader_unary_8bank.cpp | 2 +- .../kernels/dataflow/reader_unary_push_4.cpp | 11 +- .../kernels/dataflow/writer_unary_8bank.cpp | 4 +- tests/tt_eager/ops/test_sfpu.cpp | 8 +- .../test_CircularBuffer_allocation.cpp | 6 +- tests/tt_metal/tt_metal/api/test_banked.cpp | 28 +- 
tests/tt_metal/tt_metal/api/test_direct.cpp | 24 +- tests/tt_metal/tt_metal/api/test_dram.cpp | 43 +- .../api/test_dram_to_l1_multicast.cpp | 4 +- .../tt_metal/api/test_global_semaphores.cpp | 5 +- tests/tt_metal/tt_metal/api/test_noc.cpp | 14 +- .../tt_metal/api/test_simple_l1_buffer.cpp | 20 +- .../tt_metal/api/test_soc_descriptor.cpp | 3 +- .../dprint/test_print_tensix_dest.cpp | 3 +- .../debug_tools/dprint/test_print_tiles.cpp | 7 +- .../debug_tools/watcher/test_assert.cpp | 27 +- .../debug_tools/watcher/test_noc_sanitize.cpp | 191 ++--- .../watcher/test_noc_sanitize_delays.cpp | 22 +- .../debug_tools/watcher/test_pause.cpp | 9 +- .../debug_tools/watcher/test_ringbuf.cpp | 20 +- .../debug_tools/watcher/test_waypoint.cpp | 5 +- .../device/test_device_cluster_api.cpp | 72 +- .../dispatch_program/test_EnqueueProgram.cpp | 17 +- .../dispatch_program/test_dispatch.cpp | 6 +- .../dispatch_trace/test_EnqueueTrace.cpp | 10 +- .../tt_metal/tt_metal/eth/test_basic_eth.cpp | 34 +- .../eth/test_buffer_movement_kernels.cpp | 15 +- .../integration/matmul/test_matmul_X_tile.cpp | 28 +- .../matmul/test_matmul_large_block.cpp | 19 +- .../matmul/test_matmul_multi_core_X_dram.cpp | 13 +- .../matmul/test_matmul_single_core.cpp | 13 +- .../test_autonomous_relay_streams.cpp | 16 +- .../integration/test_basic_pipeline.cpp | 16 +- .../tt_metal/integration/test_flatten.cpp | 31 +- .../integration/test_sfpu_compute.cpp | 8 +- .../tt_metal/tt_metal/llk/test_broadcast.cpp | 31 +- .../llk/test_copy_block_matmul_partials.cpp | 9 +- tests/tt_metal/tt_metal/llk/test_cumsum.cpp | 20 +- .../llk/test_dropout_sfpu_compute.cpp | 6 +- tests/tt_metal/tt_metal/llk/test_reconfig.cpp | 26 +- tests/tt_metal/tt_metal/llk/test_reduce.cpp | 16 +- .../tt_metal/llk/test_sfpu_compute.cpp | 8 +- .../llk/test_single_core_binary_compute.cpp | 16 +- .../llk/test_single_core_matmul_compute.cpp | 44 +- .../tt_metal/tt_metal/llk/test_transpose.cpp | 11 +- .../tt_metal/llk/test_untilize_tilize.cpp | 23 +- 
.../test_dram_read.cpp | 301 +------- .../test_dram_read_l1_write.cpp | 554 +++------------ .../perf_microbenchmark/dispatch/common.h | 62 +- .../dispatch/test_bw_and_latency.cpp | 14 +- .../dispatch/test_dispatcher.cpp | 15 +- .../dispatch/test_prefetcher.cpp | 46 +- .../test_ethernet_hop_latencies_no_edm.cpp | 12 +- ...ers_and_erisc_datamover_unidirectional.cpp | 6 +- .../routing/test_bi_tunnel.cpp | 86 +-- .../routing/test_tunnel_1cq.cpp | 172 ++--- .../routing/test_tunnel_2cq.cpp | 172 ++--- .../routing/test_uni_tunnel.cpp | 86 +-- .../routing/test_uni_tunnel_single_chip.cpp | 86 +-- tests/tt_metal/tt_metal/test_bcast.cpp | 25 +- tests/tt_metal/tt_metal/test_clean_init.cpp | 57 +- .../test_compile_sets_kernel_binaries.cpp | 2 - .../tt_metal/tt_metal/test_core_range_set.cpp | 11 +- tests/tt_metal/tt_metal/test_datacopy.cpp | 11 +- .../tt_metal/tt_metal/test_datacopy_bfp8b.cpp | 11 +- .../tt_metal/test_datacopy_output_in_l1.cpp | 11 +- tests/tt_metal/tt_metal/test_dataflow_cb.cpp | 13 +- .../test_dram_copy_sticks_multi_core.cpp | 10 +- .../test_dram_loopback_multi_core.cpp | 39 +- .../test_dram_loopback_multi_core_db.cpp | 51 +- .../test_dram_loopback_single_core.cpp | 15 +- .../test_dram_loopback_single_core_db.cpp | 21 +- .../tt_metal/test_dram_to_l1_multicast.cpp | 5 +- ...test_dram_to_l1_multicast_loopback_src.cpp | 4 +- .../tt_metal/tt_metal/test_eltwise_binary.cpp | 16 +- .../tt_metal/test_enqueue_program.cpp | 9 +- tests/tt_metal/tt_metal/test_flatten.cpp | 15 +- ...neric_binary_reader_matmul_large_block.cpp | 29 +- .../tt_metal/test_interleaved_layouts.cpp | 21 +- .../unit_tests/matmul/reader_binary.cpp | 15 +- .../matmul/reader_binary_blocked.cpp | 23 +- .../unit_tests/matmul/writer_unary.cpp | 7 +- .../dataflow/direct_reader_unary.cpp | 15 +- .../dataflow/direct_writer_unary.cpp | 15 +- .../test_kernels/dataflow/dram_copy.cpp | 16 +- .../test_kernels/dataflow/dram_copy_db.cpp | 28 +- .../dataflow/dram_copy_sticks.cpp | 15 +- 
.../dataflow/dram_copy_to_noc_coord.cpp | 51 ++ .../dataflow/dram_loader_sync.cpp | 21 +- .../dataflow/dram_loader_sync_db.cpp | 25 +- .../dataflow/dram_to_l1_multicast.cpp | 23 +- .../dram_to_l1_multicast_exclude_region.cpp | 37 +- .../dram_to_l1_multicast_include_src.cpp | 23 +- .../test_kernels/dataflow/flatten.cpp | 13 +- .../generic_binary_reader_blocked.cpp | 30 +- .../test_kernels/dataflow/l1_to_l1.cpp | 23 +- .../test_kernels/dataflow/reader_bcast_h.cpp | 24 +- .../dataflow/reader_bcast_h_8bank.cpp | 16 +- .../dataflow/reader_bcast_hw_8bank.cpp | 16 +- .../test_kernels/dataflow/reader_bcast_w.cpp | 19 +- .../dataflow/reader_bcast_w_8bank.cpp | 16 +- .../test_kernels/dataflow/reader_binary.cpp | 35 +- .../test_kernels/dataflow/reader_cb_test.cpp | 30 +- .../dataflow/reader_dual_8bank.cpp | 8 +- .../dataflow/reader_first_stage.cpp | 9 +- .../dataflow/reader_matmul_blocked.cpp | 26 +- .../dataflow/reader_matmul_small_block.cpp | 16 +- .../reader_matmul_with_bias_blocked.cpp | 42 +- .../dataflow/reader_unary_push_4.cpp | 11 +- .../dataflow/reader_unary_push_n.cpp | 17 +- .../dataflow/reader_unary_transpose_wh.cpp | 13 +- .../remote_read_remote_write_sync.cpp | 21 +- .../remote_read_remote_write_sync_db.cpp | 29 +- .../test_kernels/dataflow/transpose_hc.cpp | 17 +- .../dataflow/transpose_hc_8bank.cpp | 15 +- .../dram/direct_reader_dram_to_l1.cpp | 10 +- .../unit_tests/dram/direct_reader_unary.cpp | 11 +- .../dram/direct_writer_l1_to_dram.cpp | 11 +- .../unit_tests/dram/direct_writer_unary.cpp | 9 +- .../erisc/direct_dram_to_dram_receiver.cpp | 13 +- .../erisc/direct_dram_to_dram_sender.cpp | 13 +- .../erisc/direct_reader_dram_to_l1.cpp | 10 +- .../erisc/direct_writer_l1_to_dram.cpp | 10 +- .../test_kernels/dataflow/writer_binary.cpp | 22 +- .../test_kernels/dataflow/writer_cb_test.cpp | 30 +- .../dataflow/writer_last_stage.cpp | 7 +- .../dataflow/writer_unary_8bank.cpp | 4 +- .../dataflow/writer_unary_pop_n.cpp | 15 +- .../dataflow/writer_unary_transpose_wh.cpp 
| 10 +- .../dataflow/writer_unswizzle.cpp | 23 +- .../tt_metal/test_kernels/misc/print_tile.cpp | 8 +- .../test_kernels/misc/watcher_asserts.cpp | 7 +- .../tt_metal/test_l1_to_l1_multi_core.cpp | 18 +- .../tt_metal/test_matmul_large_block.cpp | 19 +- .../test_matmul_multi_core_single_dram.cpp | 13 +- .../tt_metal/test_matmul_multi_tile.cpp | 25 +- .../tt_metal/test_matmul_single_core.cpp | 13 +- .../test_matmul_single_core_small.cpp | 13 +- .../tt_metal/test_matmul_single_tile.cpp | 18 +- .../test_matmul_single_tile_bfp8b.cpp | 27 +- .../test_matmul_single_tile_output_in_l1.cpp | 26 +- .../tt_metal/test_multi_core_kernel.cpp | 51 +- .../tt_metal/test_multiple_programs.cpp | 18 +- tests/tt_metal/tt_metal/test_transpose_hc.cpp | 23 +- .../tt_metal/test_untilize_eltwise_binary.cpp | 20 +- tt_metal/common/CMakeLists.txt | 1 + tt_metal/common/core_assignment.cpp | 230 ++++++ tt_metal/common/core_assignment.hpp | 23 + tt_metal/common/core_descriptor.hpp | 9 - tt_metal/hw/firmware/src/brisc.cc | 14 +- tt_metal/hw/firmware/src/erisc.cc | 14 +- tt_metal/hw/firmware/src/idle_erisc.cc | 7 +- .../hw/inc/blackhole/noc/noc_parameters.h | 5 + .../hw/inc/blackhole/noc_nonblocking_api.h | 4 + tt_metal/hw/inc/dataflow_api.h | 14 + tt_metal/hw/inc/debug/sanitize_noc.h | 97 ++- tt_metal/hw/inc/dev_msgs.h | 12 +- .../hw/inc/grayskull/noc/noc_parameters.h | 5 + .../hw/inc/grayskull/noc_nonblocking_api.h | 3 + tt_metal/hw/inc/risc_common.h | 4 +- tt_metal/hw/inc/wormhole/noc/noc_parameters.h | 6 + .../hw/inc/wormhole/noc_nonblocking_api.h | 8 +- tt_metal/impl/allocator/allocator.cpp | 4 +- tt_metal/impl/allocator/allocator_types.hpp | 4 +- .../impl/allocator/l1_banking_allocator.cpp | 12 +- tt_metal/impl/buffers/buffer.cpp | 36 +- tt_metal/impl/buffers/buffer.hpp | 6 +- tt_metal/impl/debug/dprint_server.cpp | 12 +- tt_metal/impl/debug/noc_logging.cpp | 4 +- tt_metal/impl/debug/sanitize_noc_host.hpp | 34 +- tt_metal/impl/debug/watcher_device_reader.cpp | 18 +- 
tt_metal/impl/debug/watcher_server.cpp | 2 +- tt_metal/impl/device/device.cpp | 658 ++++++++++-------- tt_metal/impl/device/device.hpp | 63 +- tt_metal/impl/dispatch/command_queue.cpp | 55 +- tt_metal/impl/dispatch/command_queue.hpp | 2 +- .../impl/dispatch/command_queue_interface.hpp | 16 +- .../impl/dispatch/dispatch_core_manager.hpp | 4 +- .../impl/dispatch/kernels/cq_dispatch.cpp | 5 +- .../impl/dispatch/kernels/cq_prefetch.cpp | 3 +- tt_metal/impl/program/program.cpp | 23 +- .../impl/sub_device/sub_device_manager.cpp | 19 +- .../dataflow/reader_binary_diff_lengths.cpp | 18 +- tt_metal/kernels/dataflow/reader_unary.cpp | 9 +- tt_metal/kernels/dataflow/writer_unary.cpp | 10 +- tt_metal/kernels/dataflow/writer_unary_1.cpp | 30 + tt_metal/llrt/blackhole/bh_hal.cpp | 5 +- tt_metal/llrt/grayskull/gs_hal.cpp | 5 +- tt_metal/llrt/hal.hpp | 6 + tt_metal/llrt/llrt.cpp | 11 +- tt_metal/llrt/llrt.hpp | 14 +- tt_metal/llrt/tt_cluster.cpp | 191 ++++- tt_metal/llrt/tt_cluster.hpp | 35 +- tt_metal/llrt/wormhole/wh_hal.cpp | 5 +- .../add_2_integers_in_compute.cpp | 22 +- .../kernels/dataflow/reader_binary_1_tile.cpp | 10 +- .../kernels/dataflow/writer_1_tile.cpp | 5 +- .../add_2_integers_in_riscv.cpp | 30 +- .../kernels/reader_writer_add_in_riscv.cpp | 15 +- .../eltwise_binary/eltwise_binary.cpp | 35 +- .../eltwise_sfpu/eltwise_sfpu.cpp | 15 +- .../loopback/kernels/loopback_dram_copy.cpp | 16 +- .../loopback/loopback.cpp | 10 +- tt_metal/tools/profiler/profiler.cpp | 6 +- tt_metal/tools/profiler/tt_metal_profiler.cpp | 26 +- tt_metal/tt_metal.cpp | 72 +- .../device/host/barrier_full_worker_grid.cpp | 8 +- .../common/types/ccl_types_args_emitters.cpp | 4 +- .../conv2d_op_sharded_program_factory.cpp | 3 - ...onv2d_op_width_sharded_program_factory.cpp | 3 - .../dataflow/reshard_same_height_reader.cpp | 6 +- .../dataflow/reshard_same_height_writer.cpp | 6 +- .../dataflow/reshard_same_width_reader.cpp | 10 +- .../dataflow/reshard_same_width_writer.cpp | 10 +- 
.../device/reshard_program_factory.cpp | 54 +- ...use_mcast_dram_sharded_program_factory.cpp | 397 +---------- 226 files changed, 3010 insertions(+), 3869 deletions(-) create mode 100644 tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy_to_noc_coord.cpp create mode 100644 tt_metal/common/core_assignment.cpp create mode 100644 tt_metal/common/core_assignment.hpp create mode 100644 tt_metal/kernels/dataflow/writer_unary_1.cpp diff --git a/.github/workflows/all-static-checks.yaml b/.github/workflows/all-static-checks.yaml index a9f1fb939916..785fe2c6e573 100644 --- a/.github/workflows/all-static-checks.yaml +++ b/.github/workflows/all-static-checks.yaml @@ -58,7 +58,7 @@ jobs: steps: - uses: actions/checkout@v4 - name: Check kernel count in base metal is less than maximum - run: if (( $(find tt_metal/kernels/ -type f | wc -l) > 7 )); then exit 1; fi + run: if (( $(find tt_metal/kernels/ -type f | wc -l) > 8 )); then exit 1; fi check-doc: runs-on: ubuntu-latest steps: diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 946c69f3720b..e6b3a690fc2f 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -341,13 +341,13 @@ Breakpoint 1, tt::tt_metal::Device::Device (this=0x3c, device_id=21845, num_hw_c TT_METAL_WATCHER=10 ./your_program ... Always | WARNING | Watcher detected NOC error and stopped device: bad alignment in NOC transaction. 
- Always | WARNING | Device 0 worker core(x= 0,y= 0) phys(x= 1,y= 1): brisc using noc0 tried to access DRAM core w/ physical coords (x=0,y=11) DRAM[addr=0x00003820,len=102400], misaligned with local L1[addr=0x00064010] + Always | WARNING | Device 0 worker core(x= 0,y= 0) virtual(x= 1,y= 1): brisc using noc0 tried to access DRAM core w/ physical coords (x=0,y=11) DRAM[addr=0x00003820,len=102400], misaligned with local L1[addr=0x00064010] Always | INFO | Last waypoint: NARW, W, W, W, W Always | INFO | While running kernels: Always | INFO | brisc : tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy.cpp Always | INFO | ncrisc: blank Always | INFO | triscs: blank - Test | INFO | Reported error: Device 0 worker core(x= 0,y= 0) phys(x= 1,y= 1): brisc using noc0 tried to access DRAM core w/ physical coords (x=0,y=11) DRAM[addr=0x00003820,len=102400], misaligned with local L1[addr=0x00064010] + Test | INFO | Reported error: Device 0 worker core(x= 0,y= 0) virtual(x= 1,y= 1): brisc using noc0 tried to access DRAM core w/ physical coords (x=0,y=11) DRAM[addr=0x00003820,len=102400], misaligned with local L1[addr=0x00064010] Always | FATAL | Watcher detected NOC error and stopped device: bad alignment in NOC transaction. ``` - If no such error is reported, but the program is hanging, check the watcher log generated in `generated/watcher/watcher.log`. There is a legend at the top of the log showing how to interpret it, and a sample portion of a log is shown below: @@ -371,22 +371,22 @@ Legend: k_ids:|| (ID map to file at end of section) ... 
Dump #7 at 8.992s -Device 0 worker core(x= 0,y= 0) phys(x= 1,y= 1): GW, W, W, W, W rmsg:D0D|BNT smsg:DDDD k_ids:14|13|15 -Device 0 worker core(x= 1,y= 0) phys(x= 2,y= 1): GW, W, W, W, W rmsg:D0D|BNT smsg:DDDD k_ids:14|13|15 -Device 0 worker core(x= 2,y= 0) phys(x= 3,y= 1): GW, W, W, W, W rmsg:D0D|BNT smsg:DDDD k_ids:14|13|15 -Device 0 worker core(x= 3,y= 0) phys(x= 4,y= 1): GW, W, W, W, W rmsg:D0D|BNT smsg:DDDD k_ids:14|13|15 -Device 0 worker core(x= 4,y= 0) phys(x= 6,y= 1): GW, W, W, W, W rmsg:D0D|BNT smsg:DDDD k_ids:14|13|15 -Device 0 worker core(x= 5,y= 0) phys(x= 7,y= 1): GW, W, W, W, W rmsg:D0D|BNT smsg:DDDD k_ids:14|13|15 -Device 0 worker core(x= 6,y= 0) phys(x= 8,y= 1): GW, W, W, W, W rmsg:D0D|BNT smsg:DDDD k_ids:14|13|15 -Device 0 worker core(x= 7,y= 0) phys(x= 9,y= 1): GW, W, W, W, W rmsg:D0D|BNT smsg:DDDD k_ids:14|13|15 -Device 0 worker core(x= 0,y= 7) phys(x= 1,y=10): NTW,UAPW, W, W, W rmsg:H1G|bNt smsg:GDDD k_ids:0|2|0 -Device 0 worker core(x= 1,y= 7) phys(x= 2,y=10): NTW, HQW, W, W, W rmsg:H1G|bNt smsg:GDDD k_ids:0|1|0 -Device 0 worker core(x= 2,y= 7) phys(x= 3,y=10): NTW, HQW, W, W, W rmsg:H1G|bNt smsg:GDDD k_ids:0|3|0 -Device 0 worker core(x= 3,y= 7) phys(x= 4,y=10): NTW,UAPW, W, W, W rmsg:H1G|bNt smsg:GDDD k_ids:0|7|0 -Device 0 worker core(x= 4,y= 7) phys(x= 6,y=10): NABD, W, W, W, W rmsg:H0G|Bnt smsg:DDDD k_ids:4|0|0 -Device 0 worker core(x= 5,y= 7) phys(x= 7,y=10): NABD, W, W, W, W rmsg:H0G|Bnt smsg:DDDD k_ids:6|0|0 -Device 0 worker core(x= 6,y= 7) phys(x= 8,y=10): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0 -Device 0 worker core(x= 7,y= 7) phys(x= 9,y=10): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0 +Device 0 worker core(x= 0,y= 0) virtual(x= 1,y= 1): GW, W, W, W, W rmsg:D0D|BNT smsg:DDDD k_ids:14|13|15 +Device 0 worker core(x= 1,y= 0) virtual(x= 2,y= 1): GW, W, W, W, W rmsg:D0D|BNT smsg:DDDD k_ids:14|13|15 +Device 0 worker core(x= 2,y= 0) virtual(x= 3,y= 1): GW, W, W, W, W rmsg:D0D|BNT smsg:DDDD k_ids:14|13|15 +Device 0 worker 
core(x= 3,y= 0) virtual(x= 4,y= 1): GW, W, W, W, W rmsg:D0D|BNT smsg:DDDD k_ids:14|13|15 +Device 0 worker core(x= 4,y= 0) virtual(x= 6,y= 1): GW, W, W, W, W rmsg:D0D|BNT smsg:DDDD k_ids:14|13|15 +Device 0 worker core(x= 5,y= 0) virtual(x= 7,y= 1): GW, W, W, W, W rmsg:D0D|BNT smsg:DDDD k_ids:14|13|15 +Device 0 worker core(x= 6,y= 0) virtual(x= 8,y= 1): GW, W, W, W, W rmsg:D0D|BNT smsg:DDDD k_ids:14|13|15 +Device 0 worker core(x= 7,y= 0) virtual(x= 9,y= 1): GW, W, W, W, W rmsg:D0D|BNT smsg:DDDD k_ids:14|13|15 +Device 0 worker core(x= 0,y= 7) virtual(x= 1,y=10): NTW,UAPW, W, W, W rmsg:H1G|bNt smsg:GDDD k_ids:0|2|0 +Device 0 worker core(x= 1,y= 7) virtual(x= 2,y=10): NTW, HQW, W, W, W rmsg:H1G|bNt smsg:GDDD k_ids:0|1|0 +Device 0 worker core(x= 2,y= 7) virtual(x= 3,y=10): NTW, HQW, W, W, W rmsg:H1G|bNt smsg:GDDD k_ids:0|3|0 +Device 0 worker core(x= 3,y= 7) virtual(x= 4,y=10): NTW,UAPW, W, W, W rmsg:H1G|bNt smsg:GDDD k_ids:0|7|0 +Device 0 worker core(x= 4,y= 7) virtual(x= 6,y=10): NABD, W, W, W, W rmsg:H0G|Bnt smsg:DDDD k_ids:4|0|0 +Device 0 worker core(x= 5,y= 7) virtual(x= 7,y=10): NABD, W, W, W, W rmsg:H0G|Bnt smsg:DDDD k_ids:6|0|0 +Device 0 worker core(x= 6,y= 7) virtual(x= 8,y=10): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0 +Device 0 worker core(x= 7,y= 7) virtual(x= 9,y=10): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0 k_id[0]: blank k_id[1]: tt_metal/impl/dispatch/kernels/cq_prefetch.cpp k_id[2]: tt_metal/impl/dispatch/kernels/cq_dispatch.cpp diff --git a/docs/source/tt-metalium/tools/watcher.rst b/docs/source/tt-metalium/tools/watcher.rst index 9962f20b1f9d..3dd71f3c0fdb 100644 --- a/docs/source/tt-metalium/tools/watcher.rst +++ b/docs/source/tt-metalium/tools/watcher.rst @@ -217,7 +217,7 @@ per RISC in the log. If a stack overflow is detected, the core will hang and an .. 
code-block:: - Device 0 worker core(x= 0,y= 0) phys(x= 1,y= 1): GW, W, W, W, W rmsg:D1D|BNt smsg:DDDD k_ids:11|10|0 + Device 0 worker core(x= 0,y= 0) virtual(x= 1,y= 1): GW, W, W, W, W rmsg:D1D|BNt smsg:DDDD k_ids:11|10|0 brisc stack usage: 228/768, kernel using most stack: ttnn/cpp/ttnn/operations/normalization/groupnorm/device/kernels/dataflow/reader_mcast_sender_unary_sharded_gn_v2.cpp ncrisc stack usage: 192/768, kernel using most stack: ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/dataflow/reader_unary_sharded_blocks_interleaved_start_id.cpp trisc0 stack usage: 252/320, kernel using most stack: ttnn/cpp/ttnn/operations/normalization/groupnorm/device/kernels/compute/groupnorm_sharded_v2.cpp diff --git a/docs/source/tt-metalium/tt_metal/examples/dram_loopback.rst b/docs/source/tt-metalium/tt_metal/examples/dram_loopback.rst index e1c606211033..52f288a48e49 100644 --- a/docs/source/tt-metalium/tt_metal/examples/dram_loopback.rst +++ b/docs/source/tt-metalium/tt_metal/examples/dram_loopback.rst @@ -112,6 +112,9 @@ Let's make the input and output DRAM buffers. 
Buffer output_dram_buffer = CreateBuffer(dram_config); const uint32_t output_dram_buffer_addr = output_dram_buffer.address(); + const uint32_t input_bank_id = 0; + const uint32_t output_bank_id = 0; + Sending real data into DRAM --------------------------- @@ -134,11 +137,9 @@ Setting runtime arguments for the data movement kernel const std::vector runtime_args = { l1_buffer.address(), input_dram_buffer.address(), - static_cast(input_dram_buffer.noc_coordinates().x), - static_cast(input_dram_buffer.noc_coordinates().y), + input_bank_id, output_dram_buffer.address(), - static_cast(output_dram_buffer.noc_coordinates().x), - static_cast(output_dram_buffer.noc_coordinates().y), + output_bank_id, l1_buffer.size() }; diff --git a/docs/source/tt-metalium/tt_metal/examples/eltwise_sfpu.rst b/docs/source/tt-metalium/tt_metal/examples/eltwise_sfpu.rst index 8749e64a4420..4a4bba21b9c9 100644 --- a/docs/source/tt-metalium/tt_metal/examples/eltwise_sfpu.rst +++ b/docs/source/tt-metalium/tt_metal/examples/eltwise_sfpu.rst @@ -100,8 +100,7 @@ Extra runtime arguments for reader/writer core, { dst_dram_buffer.address(), - static_cast(dst_dram_buffer.noc_coordinates().x), - static_cast(dst_dram_buffer.noc_coordinates().y), + dst_bank_id, num_tiles } ); diff --git a/tech_reports/prog_examples/add_2_integers_in_compute/Tutorial_Add_Two_Integers_in_a_Compute_Kernel.md b/tech_reports/prog_examples/add_2_integers_in_compute/Tutorial_Add_Two_Integers_in_a_Compute_Kernel.md index f580ed3f39dd..df489cf3107a 100644 --- a/tech_reports/prog_examples/add_2_integers_in_compute/Tutorial_Add_Two_Integers_in_a_Compute_Kernel.md +++ b/tech_reports/prog_examples/add_2_integers_in_compute/Tutorial_Add_Two_Integers_in_a_Compute_Kernel.md @@ -27,27 +27,18 @@ tt_metal::InterleavedBufferConfig dram_config{ .page_size = single_tile_size, .buffer_type = tt_metal::BufferType::DRAM }; +uint32_t src0_bank_id = 0; +uint32_t src1_bank_id = 0; +uint32_t dst_bank_id = 0; ``` -5. 
Define the tile size to fit BFloat16 values: +5. Allocate memory for each buffer: ```std::shared_ptr src0_dram_buffer = CreateBuffer(dram_config); std::shared_ptr src1_dram_buffer = CreateBuffer(dram_config); std::shared_ptr dst_dram_buffer = CreateBuffer(dram_config); ``` -6.Allocate memory for each buffer: -```auto src0_dram_noc_coord = src0_dram_buffer->noc_coordinates(); -auto src1_dram_noc_coord = src1_dram_buffer->noc_coordinates(); -auto dst_dram_noc_coord = dst_dram_buffer->noc_coordinates(); -uint32_t src0_dram_noc_x = src0_dram_noc_coord.x; -uint32_t src0_dram_noc_y = src0_dram_noc_coord.y; -uint32_t src1_dram_noc_x = src1_dram_noc_coord.x; -uint32_t src1_dram_noc_y = src1_dram_noc_coord.y; -uint32_t dst_dram_noc_x = dst_dram_noc_coord.x; -uint32_t dst_dram_noc_y = dst_dram_noc_coord.y; -``` - -7. Specify NoC Coordinates: +6. Create circular buffers and assign them to the program: ```constexpr uint32_t src0_cb_index = CB::c_in0; constexpr uint32_t num_input_tiles = 1; CircularBufferConfig cb_src0_config = CircularBufferConfig(num_input_tiles * single_tile_size, {{src0_cb_index, tt::DataFormat::Float16_b}}).set_page_size(src0_cb_index, single_tile_size); @@ -63,7 +54,7 @@ CircularBufferConfig cb_output_config = CircularBufferConfig(num_output_tiles * CBHandle cb_output = tt_metal::CreateCircularBuffer(program, core, cb_output_config); ``` -8. Create a data movement kernel: +7. Create a data movement kernel: ```KernelHandle binary_reader_kernel_id = CreateKernel( program, "tt_metal/programming_examples/add_2_integers_in_compute/kernels/dataflow/reader_binary_1_tile.cpp", @@ -77,7 +68,7 @@ KernelHandle unary_writer_kernel_id = CreateKernel( DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); ``` -9. Create a compute kernel: +8. 
Create a compute kernel: ```vector compute_kernel_args = {}; KernelHandle eltwise_binary_kernel_id = CreateKernel( program, @@ -92,7 +83,7 @@ KernelHandle eltwise_binary_kernel_id = CreateKernel( ); ``` -10. Create two source vectors: +9. Create two source vectors: ```std::vector src0_vec; std::vector src1_vec; src0_vec = create_constant_vector_of_bfloat16(single_tile_size, 14.0f); @@ -102,8 +93,8 @@ EnqueueWriteBuffer(cq, src0_dram_buffer, src0_vec, false); EnqueueWriteBuffer(cq, src1_dram_buffer, src1_vec, false); ``` -11. Setup corresponding runtime arguments: -```SetRuntimeArgs(program, binary_reader_kernel_id, core, { src0_dram_buffer->address(), src1_dram_buffer->address(), src0_dram_noc_x, src0_dram_noc_y, src1_dram_noc_x, src1_dram_noc_y}); +10. Setup corresponding runtime arguments: +```SetRuntimeArgs(program, binary_reader_kernel_id, core, { src0_dram_buffer->address(), src1_dram_buffer->address(), src0_bank_id, src1_bank_id, dst_bank_id}); SetRuntimeArgs(program, eltwise_binary_kernel_id, core, {}); SetRuntimeArgs(program, unary_writer_kernel_id, core, {dst_dram_buffer->address(), dst_dram_noc_x, dst_dram_noc_y}); @@ -111,7 +102,7 @@ EnqueueProgram(cq, program, false); Finish(cq); ``` -12. Execute the Program: +11. Execute the Program: ```uint32_t ublock_size_bytes_0 = get_tile_size(cb_id_in0); uint32_t ublock_size_bytes_1 = get_tile_size(cb_id_in1); @@ -129,7 +120,7 @@ noc_async_read_barrier(); cb_push_back(cb_id_in1, 1); ``` -13. Unpack, compute, and pack the data: +12. Unpack, compute, and pack the data: ```binary_op_init_common(cb_in0, cb_in1, cb_out0); add_tiles_init(); @@ -153,8 +144,8 @@ cb_pop_front(cb_in1, 1); cb_push_back(cb_out0, 1); ``` -14. Write integer values to the DRAM: -```uint64_t dst_noc_addr = get_noc_addr(dst_dram_noc_x, dst_dram_noc_y, dst_addr); +13. 
Write integer values to the DRAM: +```uint64_t dst_noc_addr = get_noc_addr_from_bank_id(dst_bank_id, dst_dram); constexpr uint32_t cb_id_out0 = tt::CB::c_out0; uint32_t ublock_size_bytes = get_tile_size(cb_id_out0); @@ -166,6 +157,6 @@ noc_async_write_barrier(); cb_pop_front(cb_id_out0, 1); ``` -15. Close the device: +14. Close the device: ```CloseDevice(device); ``` diff --git a/tech_reports/prog_examples/add_2_integers_in_compute/add_2_integers_in_compute.md b/tech_reports/prog_examples/add_2_integers_in_compute/add_2_integers_in_compute.md index d92e7d5cd74e..0eb7ec005204 100644 --- a/tech_reports/prog_examples/add_2_integers_in_compute/add_2_integers_in_compute.md +++ b/tech_reports/prog_examples/add_2_integers_in_compute/add_2_integers_in_compute.md @@ -47,18 +47,12 @@ std::shared_ptr dst_dram_buffer = CreateBuffer(dram_config Next, we allocate memory for each buffer with the specified configuration for each of the input vectors and another buffer for the output vector. The source data will be sent to the corresponding DRAM buffers to be accessed by the cores, and the results of the computation will be sent to the DRAM to be read by the destination vector. ``` cpp -auto src0_dram_noc_coord = src0_dram_buffer->noc_coordinates(); -auto src1_dram_noc_coord = src1_dram_buffer->noc_coordinates(); -auto dst_dram_noc_coord = dst_dram_buffer->noc_coordinates(); -uint32_t src0_dram_noc_x = src0_dram_noc_coord.x; -uint32_t src0_dram_noc_y = src0_dram_noc_coord.y; -uint32_t src1_dram_noc_x = src1_dram_noc_coord.x; -uint32_t src1_dram_noc_y = src1_dram_noc_coord.y; -uint32_t dst_dram_noc_x = dst_dram_noc_coord.x; -uint32_t dst_dram_noc_y = dst_dram_noc_coord.y; +uint32_t src0_bank_id = 0; +uint32_t src1_bank_id = 0; +uint32_t dst_bank_id = 0; ``` -For this example, we will also specify the NoC coordinates to pass into the kernel functions as runtime arguments. We will use this to ensure that the kernels will access the data at the correct NoC addresses. 
+For this example, we will also specify the Buffer Bank IDs to pass into the kernel functions as runtime arguments. We will use this to ensure that the kernels will access the data from the correct DRAM Memory Banks corresponding to each buffer. ``` cpp constexpr uint32_t src0_cb_index = CBIndex::c_0; @@ -129,9 +123,9 @@ EnqueueWriteBuffer(cq, src1_dram_buffer, src1_vec, false); Next, we create two source vectors, each loaded with a constant value, before queueing the command to feed it to the corresponding DRAM buffers using `EnqueueWriteBuffer`. ``` cpp -SetRuntimeArgs(program, binary_reader_kernel_id, core, { src0_dram_buffer->address(), src1_dram_buffer->address(), src0_dram_noc_x, src0_dram_noc_y, src1_dram_noc_x, src1_dram_noc_y}); +SetRuntimeArgs(program, binary_reader_kernel_id, core, { src0_dram_buffer->address(), src1_dram_buffer->address(), src0_bank_id, src1_bank_id}); SetRuntimeArgs(program, eltwise_binary_kernel_id, core, {}); -SetRuntimeArgs(program, unary_writer_kernel_id, core, {dst_dram_buffer->address(), dst_dram_noc_x, dst_dram_noc_y}); +SetRuntimeArgs(program, unary_writer_kernel_id, core, {dst_dram_buffer->address(), dst_bank_id}); EnqueueProgram(cq, program, false); Finish(cq); @@ -192,7 +186,7 @@ In the compute kernel, a single tile is read from each of the circular buffers c ## Writer kernel function ``` cpp -uint64_t dst_noc_addr = get_noc_addr(dst_dram_noc_x, dst_dram_noc_y, dst_addr); +uint64_t dst_noc_addr = get_noc_addr_from_bank_id(dst_bank_id, dst_dram); constexpr uint32_t cb_id_out0 = tt::CBIndex::c_16; uint32_t ublock_size_bytes = get_tile_size(cb_id_out0); diff --git a/tech_reports/prog_examples/add_2_integers_in_riscv/add_2_integers_in_riscv.md b/tech_reports/prog_examples/add_2_integers_in_riscv/add_2_integers_in_riscv.md index 52e5e556b1f1..bac6a4a9d161 100644 --- a/tech_reports/prog_examples/add_2_integers_in_riscv/add_2_integers_in_riscv.md +++ b/tech_reports/prog_examples/add_2_integers_in_riscv/add_2_integers_in_riscv.md @@ 
-88,7 +88,7 @@ In this example, we are using data movement processors for basic computation. As ## Configure and execute program ``` cpp -SetRuntimeArgs(program, binary_reader_kernel_id, core, {src0_dram_buffer->address(), src1_dram_buffer->address(), dst_dram_buffer->address(),}); +SetRuntimeArgs(program, binary_reader_kernel_id, core, {src0_dram_buffer->address(), src1_dram_buffer->address(), dst_dram_buffer->address(), src0_bank_id, src1_bank_id, dst_bank_id}); EnqueueProgram(cq, program, false); Finish(cq); @@ -100,9 +100,9 @@ In order to execute the program, we need to load the runtime arguments for the k ``` cpp // NoC coords (x,y) depending on DRAM location on-chip -uint64_t src0_dram_noc_addr = get_noc_addr(src0_dram_noc_x, src0_dram_noc_y, src0_dram); -uint64_t src1_dram_noc_addr = get_noc_addr(src1_dram_noc_x, src1_dram_noc_y, src1_dram); -uint64_t dst_dram_noc_addr = get_noc_addr(dst_dram_noc_x, dst_dram_noc_y, dst_dram); +uint64_t src0_dram_noc_addr = get_noc_addr_from_bank_id(src0_bank_id, src0_dram); +uint64_t src1_dram_noc_addr = get_noc_addr_from_bank_id(src1_bank_id, src1_dram); +uint64_t dst_dram_noc_addr = get_noc_addr_from_bank_id(dst_bank_id, dst_dram); constexpr uint32_t cb_id_in0 = tt::CBIndex::c_0; // index=0 constexpr uint32_t cb_id_in1 = tt::CBIndex::c_1; // index=1 diff --git a/tech_reports/prog_examples/dram_loopback/dram_loopback.md b/tech_reports/prog_examples/dram_loopback/dram_loopback.md index 3a0486826db1..d7f64d73207f 100644 --- a/tech_reports/prog_examples/dram_loopback/dram_loopback.md +++ b/tech_reports/prog_examples/dram_loopback/dram_loopback.md @@ -110,11 +110,9 @@ We use a non-blocking call so we can continue setting up our program. 
const std::vector runtime_args = { l1_buffer.address(), input_dram_buffer.address(), - static_cast(input_dram_buffer.noc_coordinates().x), - static_cast(input_dram_buffer.noc_coordinates().y), + input_bank_id, output_dram_buffer.address(), - static_cast(output_dram_buffer.noc_coordinates().x), - static_cast(output_dram_buffer.noc_coordinates().y), + output_bank_id, l1_buffer.size() }; @@ -131,9 +129,9 @@ particular kernel, we have to provide: - Where the L1 buffer starts (memory address) - Where the input DRAM buffer starts (memory address) -- The location of the input DRAM buffer\'s channel on the NOC +- The Bank ID of the input DRAM buffer - Where the output DRAM buffer starts (memory address) -- The location of the output DRAM buffer\'s channel on the NOC +- The Bank ID of the output DRAM buffer - The size of the buffers ## Running the program diff --git a/tech_reports/prog_examples/eltwise_sfpu/eltwise_sfpu.md b/tech_reports/prog_examples/eltwise_sfpu/eltwise_sfpu.md index c7964729a86d..b8954ead4c04 100644 --- a/tech_reports/prog_examples/eltwise_sfpu/eltwise_sfpu.md +++ b/tech_reports/prog_examples/eltwise_sfpu/eltwise_sfpu.md @@ -77,8 +77,7 @@ SetRuntimeArgs( core, { dst_dram_buffer.address(), - static_cast(dst_dram_buffer.noc_coordinates().x), - static_cast(dst_dram_buffer.noc_coordinates().y), + dst_bank_id, num_tiles } ); diff --git a/tests/tt_eager/kernels/dataflow/reader_unary_8bank.cpp b/tests/tt_eager/kernels/dataflow/reader_unary_8bank.cpp index 872b02b215fe..673361837a71 100644 --- a/tests/tt_eager/kernels/dataflow/reader_unary_8bank.cpp +++ b/tests/tt_eager/kernels/dataflow/reader_unary_8bank.cpp @@ -33,7 +33,7 @@ void generate_bcast_scaler() { void kernel_main() { uint32_t src_addr = get_arg_val(0); uint32_t num_tiles = - get_arg_val(3); // same arg index as in reader_unary and in reader_unary_transpose_wh_8bank + get_arg_val(2); // same arg index as in reader_unary and in reader_unary_transpose_wh_8bank constexpr uint32_t cb_id_in0 = 0, cb_id_in1 
= 1; diff --git a/tests/tt_eager/kernels/dataflow/reader_unary_push_4.cpp b/tests/tt_eager/kernels/dataflow/reader_unary_push_4.cpp index b34105a5a562..fb6169c91f17 100644 --- a/tests/tt_eager/kernels/dataflow/reader_unary_push_4.cpp +++ b/tests/tt_eager/kernels/dataflow/reader_unary_push_4.cpp @@ -6,10 +6,9 @@ #include "dataflow_api.h" void kernel_main() { - uint32_t src_addr = get_arg_val(0); - uint32_t src_noc_x = get_arg_val(1); - uint32_t src_noc_y = get_arg_val(2); - uint32_t num_tiles = get_arg_val(3); + uint32_t src_addr = get_arg_val(0); + uint32_t bank_id = get_arg_val(1); + uint32_t num_tiles = get_arg_val(2); constexpr uint32_t cb_id_in0 = 0; @@ -18,8 +17,8 @@ void kernel_main() { uint32_t ublock_size_bytes = get_tile_size(cb_id_in0) * ublock_size_tiles; // read a ublock of tiles from src to CB, and then push the ublock to unpacker - for (uint32_t i = 0; i < num_tiles; i += ublock_size_tiles) { - uint64_t src_noc_addr = get_noc_addr(src_noc_x, src_noc_y, src_addr); + for (uint32_t i = 0; i(bank_id, src_addr); cb_reserve_back(cb_id_in0, ublock_size_tiles); uint32_t l1_write_addr = get_write_ptr(cb_id_in0); diff --git a/tests/tt_eager/kernels/dataflow/writer_unary_8bank.cpp b/tests/tt_eager/kernels/dataflow/writer_unary_8bank.cpp index 9dcf5c207991..1bc283a72db9 100644 --- a/tests/tt_eager/kernels/dataflow/writer_unary_8bank.cpp +++ b/tests/tt_eager/kernels/dataflow/writer_unary_8bank.cpp @@ -5,8 +5,8 @@ #include "dataflow_api.h" void kernel_main() { - uint32_t dst_addr = get_arg_val(0); - uint32_t num_tiles = get_arg_val(3); // Index 3 to match with regular writer_unary + uint32_t dst_addr = get_arg_val(0); + uint32_t num_tiles = get_arg_val(2); // Index 2 to match with regular writer_unary constexpr uint32_t cb_id_out0 = 16; constexpr uint32_t onetile = 1; diff --git a/tests/tt_eager/ops/test_sfpu.cpp b/tests/tt_eager/ops/test_sfpu.cpp index 46526fb313eb..1e9df15d4bee 100644 --- a/tests/tt_eager/ops/test_sfpu.cpp +++ b/tests/tt_eager/ops/test_sfpu.cpp 
@@ -94,9 +94,6 @@ bool run_sfpu_test(const string& sfpu_name, int tile_factor = 1, bool use_DRAM = auto dst_dram_buffer = CreateBuffer(buff_config); uint32_t dram_buffer_dst_addr = dst_dram_buffer->address(); - auto dram_src_noc_xy = src_dram_buffer->noc_coordinates(); - auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); - // input CB is larger than the output CB, to test the backpressure from the output CB all the way into the input // CB CB_out size = 1 forces the serialization of packer and writer kernel, generating backpressure to math // kernel, input CB and reader @@ -167,8 +164,7 @@ bool run_sfpu_test(const string& sfpu_name, int tile_factor = 1, bool use_DRAM = core, { dram_buffer_src_addr, - (std::uint32_t)dram_src_noc_xy.x, - (std::uint32_t)dram_src_noc_xy.y, + 0, num_tiles, 0, 0, @@ -181,7 +177,7 @@ bool run_sfpu_test(const string& sfpu_name, int tile_factor = 1, bool use_DRAM = program, unary_writer_kernel, core, - {dram_buffer_dst_addr, (std::uint32_t)dram_dst_noc_xy.x, (std::uint32_t)dram_dst_noc_xy.y, num_tiles}); + {dram_buffer_dst_addr, 0, num_tiles}); tt_metal::detail::LaunchProgram(device, program); diff --git a/tests/tt_metal/tt_metal/api/circular_buffer/test_CircularBuffer_allocation.cpp b/tests/tt_metal/tt_metal/api/circular_buffer/test_CircularBuffer_allocation.cpp index 8c3321858408..05dbe4c2295c 100644 --- a/tests/tt_metal/tt_metal/api/circular_buffer/test_CircularBuffer_allocation.cpp +++ b/tests/tt_metal/tt_metal/api/circular_buffer/test_CircularBuffer_allocation.cpp @@ -452,8 +452,7 @@ TEST_F(DeviceFixture, TensixTestDataCopyWithUpdatedCircularBufferConfig) { core, { (uint32_t)src_dram_buffer->address(), - (uint32_t)src_dram_buffer->noc_coordinates().x, - (uint32_t)src_dram_buffer->noc_coordinates().y, + 0, (uint32_t)num_tiles, }); SetRuntimeArgs( @@ -462,8 +461,7 @@ TEST_F(DeviceFixture, TensixTestDataCopyWithUpdatedCircularBufferConfig) { core, { (uint32_t)dst_dram_buffer->address(), - 
(uint32_t)dst_dram_buffer->noc_coordinates().x, - (uint32_t)dst_dram_buffer->noc_coordinates().y, + 0, (uint32_t)num_tiles, }); diff --git a/tests/tt_metal/tt_metal/api/test_banked.cpp b/tests/tt_metal/tt_metal/api/test_banked.cpp index 40de1459f03e..8f057e17dea7 100644 --- a/tests/tt_metal/tt_metal/api/test_banked.cpp +++ b/tests/tt_metal/tt_metal/api/test_banked.cpp @@ -82,17 +82,9 @@ bool reader_cb_writer(Device* device, const BankedConfig& cfg, const bool banked auto output_buffer = CreateBuffer(out_config); tt::log_debug( - tt::LogTest, - "Input buffer: [address: {} B, size: {} B] at noc coord {}", - input_buffer->address(), - input_buffer->size(), - input_buffer->noc_coordinates().str()); + tt::LogTest, "Input buffer: [address: {} B, size: {} B]", input_buffer->address(), input_buffer->size()); tt::log_debug( - tt::LogTest, - "Output buffer: [address: {} B, size: {} B] at noc coord {}", - output_buffer->address(), - output_buffer->size(), - output_buffer->noc_coordinates().str()); + tt::LogTest, "Output buffer: [address: {} B, size: {} B]", output_buffer->address(), output_buffer->size()); TT_FATAL(cfg.num_tiles * cfg.page_size_bytes == cfg.size_bytes, "Error"); constexpr uint32_t num_pages_cb = 1; @@ -103,6 +95,10 @@ bool reader_cb_writer(Device* device, const BankedConfig& cfg, const bool banked bool input_is_dram = cfg.input_buffer_type == BufferType::DRAM; bool output_is_dram = cfg.output_buffer_type == BufferType::DRAM; + std::map reader_defines = { + {"INTERFACE_WITH_L1", std::to_string((uint32_t)(not input_is_dram))}}; + std::map writer_defines = { + {"INTERFACE_WITH_L1", std::to_string((uint32_t)(not output_is_dram))}}; auto reader_kernel = CreateKernel( program, @@ -111,7 +107,8 @@ bool reader_cb_writer(Device* device, const BankedConfig& cfg, const bool banked DataMovementConfig{ .processor = DataMovementProcessor::RISCV_0, .noc = NOC::NOC_0, - .compile_args = {cb_id, uint32_t(input_buffer->page_size()), (uint32_t)input_is_dram}}); + .compile_args = 
{cb_id, uint32_t(input_buffer->page_size()), (uint32_t)input_is_dram}, + .defines = reader_defines}); auto writer_kernel = CreateKernel( program, writer_kernel_name, @@ -119,15 +116,15 @@ bool reader_cb_writer(Device* device, const BankedConfig& cfg, const bool banked DataMovementConfig{ .processor = DataMovementProcessor::RISCV_1, .noc = NOC::NOC_1, - .compile_args = {cb_id, uint32_t(output_buffer->page_size()), (uint32_t)output_is_dram}}); + .compile_args = {cb_id, uint32_t(output_buffer->page_size()), (uint32_t)output_is_dram}, + .defines = writer_defines}); if (banked_reader) { reader_runtime_args = {(uint32_t)input_buffer->address(), (uint32_t)cfg.num_tiles}; } else { reader_runtime_args = { (uint32_t)input_buffer->address(), - (uint32_t)input_buffer->noc_coordinates().x, - (uint32_t)input_buffer->noc_coordinates().y, + 0, (uint32_t)cfg.num_tiles, }; } @@ -136,8 +133,7 @@ bool reader_cb_writer(Device* device, const BankedConfig& cfg, const bool banked } else { writer_runtime_args = { (uint32_t)output_buffer->address(), - (uint32_t)output_buffer->noc_coordinates().x, - (uint32_t)output_buffer->noc_coordinates().y, + 0, (uint32_t)cfg.num_tiles, }; } diff --git a/tests/tt_metal/tt_metal/api/test_direct.cpp b/tests/tt_metal/tt_metal/api/test_direct.cpp index 9f7a3de8a866..91a299f5891f 100644 --- a/tests/tt_metal/tt_metal/api/test_direct.cpp +++ b/tests/tt_metal/tt_metal/api/test_direct.cpp @@ -39,7 +39,6 @@ bool reader_only( auto input_dram_buffer = CreateBuffer(dram_config); uint32_t dram_byte_address = input_dram_buffer->address(); - auto dram_noc_xy = input_dram_buffer->noc_coordinates(); // TODO (abhullar): Use L1 buffer after bug with L1 banking and writing to < 1 MB is fixed. 
// Try this after KM uplifts TLB setup // auto l1_buffer = @@ -65,8 +64,7 @@ bool reader_only( reader_core, { (uint32_t)dram_byte_address, - (uint32_t)dram_noc_xy.x, - (uint32_t)dram_noc_xy.y, + 0, (uint32_t)l1_byte_address, (uint32_t)byte_size, }); @@ -100,7 +98,6 @@ bool writer_only( auto output_dram_buffer = CreateBuffer(dram_config); uint32_t dram_byte_address = output_dram_buffer->address(); - auto dram_noc_xy = output_dram_buffer->noc_coordinates(); // TODO (abhullar): Use L1 buffer after bug with L1 banking and writing to < 1 MB is fixed. // Try this after KM uplifts TLB setup // auto l1_buffer = @@ -127,8 +124,7 @@ bool writer_only( writer_core, { (uint32_t)dram_byte_address, - (uint32_t)dram_noc_xy.x, - (uint32_t)dram_noc_xy.y, + 0, (uint32_t)l1_byte_address, (uint32_t)byte_size, }); @@ -169,10 +165,8 @@ bool reader_writer(tt_metal::Device* device, const ReaderWriterConfig& test_conf auto input_dram_buffer = CreateBuffer(dram_config); uint32_t input_dram_byte_address = input_dram_buffer->address(); - auto input_dram_noc_xy = input_dram_buffer->noc_coordinates(); auto output_dram_buffer = CreateBuffer(dram_config); uint32_t output_dram_byte_address = output_dram_buffer->address(); - auto output_dram_noc_xy = output_dram_buffer->noc_coordinates(); tt_metal::CircularBufferConfig l1_cb_config = tt_metal::CircularBufferConfig(byte_size, {{cb_index, test_config.l1_data_format}}) @@ -214,8 +208,7 @@ bool reader_writer(tt_metal::Device* device, const ReaderWriterConfig& test_conf test_config.core, { (uint32_t)input_dram_byte_address, - (uint32_t)input_dram_noc_xy.x, - (uint32_t)input_dram_noc_xy.y, + 0, (uint32_t)test_config.num_tiles, }); tt_metal::SetRuntimeArgs( @@ -224,8 +217,7 @@ bool reader_writer(tt_metal::Device* device, const ReaderWriterConfig& test_conf test_config.core, { (uint32_t)output_dram_byte_address, - (uint32_t)output_dram_noc_xy.x, - (uint32_t)output_dram_noc_xy.y, + 0, (uint32_t)test_config.num_tiles, }); @@ -262,10 +254,8 @@ bool 
reader_datacopy_writer(tt_metal::Device* device, const ReaderDatacopyWriter .device = device, .size = byte_size, .page_size = byte_size, .buffer_type = tt::tt_metal::BufferType::DRAM}; auto input_dram_buffer = tt_metal::CreateBuffer(dram_config); uint32_t input_dram_byte_address = input_dram_buffer->address(); - auto input_dram_noc_xy = input_dram_buffer->noc_coordinates(); auto output_dram_buffer = tt_metal::CreateBuffer(dram_config); uint32_t output_dram_byte_address = output_dram_buffer->address(); - auto output_dram_noc_xy = output_dram_buffer->noc_coordinates(); tt_metal::CircularBufferConfig l1_input_cb_config = tt_metal::CircularBufferConfig(byte_size, {{input0_cb_index, test_config.l1_input_data_format}}) @@ -321,8 +311,7 @@ bool reader_datacopy_writer(tt_metal::Device* device, const ReaderDatacopyWriter test_config.core, { (uint32_t)input_dram_byte_address, - (uint32_t)input_dram_noc_xy.x, - (uint32_t)input_dram_noc_xy.y, + 0, (uint32_t)test_config.num_tiles, }); tt_metal::SetRuntimeArgs( @@ -331,8 +320,7 @@ bool reader_datacopy_writer(tt_metal::Device* device, const ReaderDatacopyWriter test_config.core, { (uint32_t)output_dram_byte_address, - (uint32_t)output_dram_noc_xy.x, - (uint32_t)output_dram_noc_xy.y, + 0, (uint32_t)test_config.num_tiles, }); diff --git a/tests/tt_metal/tt_metal/api/test_dram.cpp b/tests/tt_metal/tt_metal/api/test_dram.cpp index 99bba0b1b91d..293a10a5cafd 100644 --- a/tests/tt_metal/tt_metal/api/test_dram.cpp +++ b/tests/tt_metal/tt_metal/api/test_dram.cpp @@ -51,9 +51,6 @@ bool dram_single_core_db(DispatchFixture* fixture, tt_metal::Device* device) { auto output_dram_buffer = CreateBuffer(dram_config); uint32_t output_dram_buffer_addr = output_dram_buffer->address(); - auto input_dram_noc_xy = input_dram_buffer->noc_coordinates(); - auto output_dram_noc_xy = output_dram_buffer->noc_coordinates(); - auto dram_copy_kernel = tt_metal::CreateKernel( program, "tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy_db.cpp", @@ -70,16 
+67,14 @@ bool dram_single_core_db(DispatchFixture* fixture, tt_metal::Device* device) { dram_copy_kernel, core, {input_dram_buffer_addr, - (std::uint32_t)input_dram_noc_xy.x, - (std::uint32_t)input_dram_noc_xy.y, - output_dram_buffer_addr, - (std::uint32_t)output_dram_noc_xy.x, - (std::uint32_t)output_dram_noc_xy.y, - dram_buffer_size_bytes, - num_tiles, - l1_buffer_addr, - total_l1_buffer_size_tiles, - total_l1_buffer_size_bytes}); + (std::uint32_t)0, + output_dram_buffer_addr, + (std::uint32_t)0, + dram_buffer_size_bytes, + num_tiles, + l1_buffer_addr, + total_l1_buffer_size_tiles, + total_l1_buffer_size_bytes}); fixture->RunProgram(device, program); @@ -105,25 +100,21 @@ bool dram_single_core( auto output_dram_buffer = tt_metal::CreateBuffer(dram_config); uint32_t output_dram_buffer_addr = output_dram_buffer->address(); - auto input_dram_noc_xy = input_dram_buffer->noc_coordinates(); - auto output_dram_noc_xy = output_dram_buffer->noc_coordinates(); log_debug(tt::LogVerif, "Creating kernel"); // Create the kernel auto dram_kernel = tt_metal::CreateKernel(program, cfg.kernel_file, cfg.core_range, cfg.data_movement_cfg); fixture->WriteBuffer(device, input_dram_buffer, src_vec); tt_metal::SetRuntimeArgs( - program, - dram_kernel, - cfg.core_range, - {cfg.l1_buffer_addr, - input_dram_buffer_addr, - (std::uint32_t)input_dram_noc_xy.x, - (std::uint32_t)input_dram_noc_xy.y, - output_dram_buffer_addr, - (std::uint32_t)output_dram_noc_xy.x, - (std::uint32_t)output_dram_noc_xy.y, - cfg.dram_buffer_size}); + program, + dram_kernel, + cfg.core_range, + {cfg.l1_buffer_addr, + input_dram_buffer_addr, + (std::uint32_t)0, + output_dram_buffer_addr, + (std::uint32_t)0, + cfg.dram_buffer_size}); fixture->RunProgram(device, program); diff --git a/tests/tt_metal/tt_metal/api/test_dram_to_l1_multicast.cpp b/tests/tt_metal/tt_metal/api/test_dram_to_l1_multicast.cpp index fbb2ca02bab4..efa2bdd81af5 100644 --- a/tests/tt_metal/tt_metal/api/test_dram_to_l1_multicast.cpp +++ 
b/tests/tt_metal/tt_metal/api/test_dram_to_l1_multicast.cpp @@ -48,7 +48,6 @@ bool dram_to_l1_multicast(DispatchFixture* fixture, tt_metal::Device* device, co auto dram_buffer = CreateBuffer(dram_config); uint32_t dram_buffer_addr = dram_buffer->address(); - auto dram_noc_xy = dram_buffer->noc_coordinates(); CoreCoord core_start = {0, 0}; CoreCoord grid_size = device->logical_grid_size(); @@ -65,8 +64,7 @@ bool dram_to_l1_multicast(DispatchFixture* fixture, tt_metal::Device* device, co } std::vector mcast_reader_args = { (std::uint32_t)dram_buffer_addr, - (std::uint32_t)dram_noc_xy.x, - (std::uint32_t)dram_noc_xy.y, + 0, (std::uint32_t)dram_buffer_size, (std::uint32_t)local_buffer_addr, (std::uint32_t)dest_buffer_addr, diff --git a/tests/tt_metal/tt_metal/api/test_global_semaphores.cpp b/tests/tt_metal/tt_metal/api/test_global_semaphores.cpp index 7417bdd13df1..13d38f4e2ef5 100644 --- a/tests/tt_metal/tt_metal/api/test_global_semaphores.cpp +++ b/tests/tt_metal/tt_metal/api/test_global_semaphores.cpp @@ -67,7 +67,10 @@ TEST_F(DispatchFixture, CreateMultipleGlobalSemaphoresOnSameCore) { const auto& cores_vec = cores_vecs[i]; for (const auto& core : cores_vec) { auto sem_vals = tt::llrt::read_hex_vec_from_core( - device->id(), device->worker_core_from_logical_core(core), address, sizeof(uint32_t)); + device->id(), + device->worker_core_from_logical_core(core), + address, + sizeof(uint32_t)); EXPECT_EQ(sem_vals[0], initial_value); } } diff --git a/tests/tt_metal/tt_metal/api/test_noc.cpp b/tests/tt_metal/tt_metal/api/test_noc.cpp index 8699430e54d3..5820e2710e0c 100644 --- a/tests/tt_metal/tt_metal/api/test_noc.cpp +++ b/tests/tt_metal/tt_metal/api/test_noc.cpp @@ -85,8 +85,8 @@ TEST(NOC, TensixSingleDeviceHarvestingPrints) { tt::log_info("Number of Harvested Rows={}", unharvested_logical_grid_size.y - logical_grid_size.y); } - tt::log_info("Logical -- Noc Coordinates Mapping"); - tt::log_info("[Logical <-> NOC0] Coordinates"); + tt::log_info("Logical -- Virtual 
Mapping"); + tt::log_info("[Logical <-> Virtual] Coordinates"); for (int r = 0; r < logical_grid_size.y; r++) { string output_row = ""; for (int c = 0; c < logical_grid_size.x; c++) { @@ -94,7 +94,7 @@ TEST(NOC, TensixSingleDeviceHarvestingPrints) { const auto noc_coord = device->worker_core_from_logical_core(logical_coord); output_row += "{L[x" + std::to_string(c); output_row += "-y" + std::to_string(r); - output_row += "]:N[x" + std::to_string(noc_coord.x); + output_row += "]:V[x" + std::to_string(noc_coord.x); output_row += "-y" + std::to_string(noc_coord.y); output_row += "]}, "; } @@ -108,6 +108,12 @@ TEST(NOC, TensixVerifyNocNodeIDs) { tt::tt_metal::Device* device; const unsigned int device_id = 0; device = tt::tt_metal::CreateDevice(device_id); + +#if COORDINATE_VIRTUALIZATION_ENABLED != 0 + uint32_t MY_NOC_ENCODING_REG = NOC_CFG(NOC_ID_LOGICAL); +#else + uint32_t MY_NOC_ENCODING_REG = NOC_NODE_ID; +#endif // Ping all the Noc Nodes auto logical_grid_size = device->logical_grid_size(); for (size_t y = 0; y < logical_grid_size.y; y++) { @@ -115,7 +121,7 @@ TEST(NOC, TensixVerifyNocNodeIDs) { auto worker_core = device->worker_core_from_logical_core(CoreCoord(x, y)); // Read register from specific node uint32_t node_id_regval; - node_id_regval = unit_tests::basic::test_noc::read_reg(device, CoreCoord(x, y), NOC_NODE_ID); + node_id_regval = unit_tests::basic::test_noc::read_reg(device, CoreCoord(x, y), MY_NOC_ENCODING_REG); ASSERT_NE( node_id_regval, unit_tests::basic::test_noc::init_value); // Need to make sure we read in valid reg // Check it matches software translated xy diff --git a/tests/tt_metal/tt_metal/api/test_simple_l1_buffer.cpp b/tests/tt_metal/tt_metal/api/test_simple_l1_buffer.cpp index ce128a890012..881efb23d1e4 100644 --- a/tests/tt_metal/tt_metal/api/test_simple_l1_buffer.cpp +++ b/tests/tt_metal/tt_metal/api/test_simple_l1_buffer.cpp @@ -62,23 +62,26 @@ bool SimpleTiledL1WriteCBRead( tt_metal::CircularBufferConfig(byte_size, {{cb_index, 
tt::DataFormat::Float16_b}}) .set_page_size(cb_index, page_size); auto l1_cb = tt_metal::CreateCircularBuffer(program, core, l1_cb_config); - + std::map defines = {{"INTERFACE_WITH_L1", "1"}}; + uint32_t bank_id = device->bank_ids_from_logical_core(BufferType::L1, core)[0]; auto reader_kernel = tt_metal::CreateKernel( program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_reader_unary.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/direct_reader_unary.cpp", core, tt_metal::DataMovementConfig{ .processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::NOC_0, - .compile_args = {cb_index}}); + .compile_args = {cb_index}, + .defines = defines}); auto writer_kernel = tt_metal::CreateKernel( program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_writer_unary.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/direct_writer_unary.cpp", core, tt_metal::DataMovementConfig{ .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::NOC_1, - .compile_args = {cb_index}}); + .compile_args = {cb_index}, + .defines = defines}); tt_metal::SetRuntimeArgs( program, @@ -86,8 +89,7 @@ bool SimpleTiledL1WriteCBRead( core, { (uint32_t)input_local_address, - (uint32_t)phys_core.x, - (uint32_t)phys_core.y, + bank_id, (uint32_t)num_tiles, }); tt_metal::SetRuntimeArgs( @@ -96,8 +98,7 @@ bool SimpleTiledL1WriteCBRead( core, { (uint32_t)output_local_address, - (uint32_t)phys_core.x, - (uint32_t)phys_core.y, + bank_id, (uint32_t)num_tiles, }); @@ -114,6 +115,7 @@ bool SimpleTiledL1WriteCBRead( } return pass; } + } // namespace tt::test::buffer::detail TEST_F(DeviceFixture, TestSimpleL1BufferReadOnlyLo) { diff --git a/tests/tt_metal/tt_metal/api/test_soc_descriptor.cpp b/tests/tt_metal/tt_metal/api/test_soc_descriptor.cpp index f24d3fee91f0..dcd236cd74a7 100644 --- a/tests/tt_metal/tt_metal/api/test_soc_descriptor.cpp +++ b/tests/tt_metal/tt_metal/api/test_soc_descriptor.cpp @@ -52,6 +52,7 @@ 
TEST(SOC, TensixValidateLogicalToPhysicalCoreCoordHostMapping) { for (int device_id = 0; device_id < num_devices; device_id++) { tt_metal::Device* device = tt_metal::CreateDevice(device_id); uint32_t harvested_rows_mask = tt::Cluster::instance().get_harvested_rows(device_id); + const metal_SocDescriptor& soc_desc = tt::Cluster::instance().get_soc_desc(device_id); log_info(LogTest, "Device {} harvesting mask {}", device_id, harvested_rows_mask); std::unordered_set harvested_rows = unit_tests::basic::soc_desc::get_harvested_rows(device_id); @@ -59,7 +60,7 @@ TEST(SOC, TensixValidateLogicalToPhysicalCoreCoordHostMapping) { for (int x = 0; x < logical_grid_size.x; x++) { for (int y = 0; y < logical_grid_size.y; y++) { CoreCoord logical_core_coord(x, y); - CoreCoord physical_core_coord = device->worker_core_from_logical_core(logical_core_coord); + CoreCoord physical_core_coord = soc_desc.get_physical_tensix_core_from_logical(logical_core_coord); ASSERT_TRUE(harvested_rows.find(physical_core_coord.y) == harvested_rows.end()); } } diff --git a/tests/tt_metal/tt_metal/debug_tools/dprint/test_print_tensix_dest.cpp b/tests/tt_metal/tt_metal/debug_tools/dprint/test_print_tensix_dest.cpp index 7d8ec61dd309..ad0ef0534e93 100644 --- a/tests/tt_metal/tt_metal/debug_tools/dprint/test_print_tensix_dest.cpp +++ b/tests/tt_metal/tt_metal/debug_tools/dprint/test_print_tensix_dest.cpp @@ -44,8 +44,7 @@ using DramBuffer = std::shared_ptr; static std::vector get_dram_kernel_runtime_arguments(const DramBuffer& dram_buffer, size_t num_tiles) { return { static_cast(dram_buffer->address()), - static_cast(dram_buffer->noc_coordinates().x), - static_cast(dram_buffer->noc_coordinates().y), + static_cast(0), static_cast(num_tiles), }; } diff --git a/tests/tt_metal/tt_metal/debug_tools/dprint/test_print_tiles.cpp b/tests/tt_metal/tt_metal/debug_tools/dprint/test_print_tiles.cpp index 33b858331570..e324a6f44bc3 100644 --- a/tests/tt_metal/tt_metal/debug_tools/dprint/test_print_tiles.cpp +++ 
b/tests/tt_metal/tt_metal/debug_tools/dprint/test_print_tiles.cpp @@ -120,7 +120,6 @@ static void RunTest(DPrintFixture* fixture, Device* device, tt::DataFormat data_ .device = device, .size = tile_size, .page_size = tile_size, .buffer_type = tt_metal::BufferType::DRAM}; auto src_dram_buffer = CreateBuffer(dram_config); uint32_t dram_buffer_src_addr = src_dram_buffer->address(); - auto dram_src_noc_xy = src_dram_buffer->noc_coordinates(); // Create kernels on device KernelHandle brisc_print_kernel_id = CreateKernel( @@ -143,11 +142,7 @@ static void RunTest(DPrintFixture* fixture, Device* device, tt::DataFormat data_ ); // BRISC kernel needs dram info via rtargs - tt_metal::SetRuntimeArgs( - program, - brisc_print_kernel_id, - core, - {dram_buffer_src_addr, (std::uint32_t)dram_src_noc_xy.x, (std::uint32_t)dram_src_noc_xy.y}); + tt_metal::SetRuntimeArgs(program, brisc_print_kernel_id, core, {dram_buffer_src_addr, (std::uint32_t)0}); // Create input tile std::vector u32_vec = GenerateInputTile(data_format); diff --git a/tests/tt_metal/tt_metal/debug_tools/watcher/test_assert.cpp b/tests/tt_metal/tt_metal/debug_tools/watcher/test_assert.cpp index 25fd8be5c26d..ddd80f0a95dd 100644 --- a/tests/tt_metal/tt_metal/debug_tools/watcher/test_assert.cpp +++ b/tests/tt_metal/tt_metal/debug_tools/watcher/test_assert.cpp @@ -155,18 +155,19 @@ static void RunTest(WatcherFixture *fixture, Device *device, riscv_id_t riscv_ty // We should be able to find the expected watcher error in the log as well, // expected error message depends on the risc we're running on. string kernel = "tests/tt_metal/tt_metal/test_kernels/misc/watcher_asserts.cpp"; - int line_num = 56; + int line_num = 57; string expected = fmt::format( - "Device {} {} core(x={:2},y={:2}) phys(x={:2},y={:2}): {} tripped an assert on line {}. Current kernel: {}.", + "Device {} {} core(x={:2},y={:2}) virtual(x={:2},y={:2}): {} tripped an assert on line {}. Current kernel: {}.", device->id(), (riscv_type == DebugErisc) ? 
"ethnet" : "worker", - logical_core.x, logical_core.y, - phys_core.x, phys_core.y, + logical_core.x, + logical_core.y, + phys_core.x, + phys_core.y, risc, line_num, - kernel - ); + kernel); expected += " Note that file name reporting is not yet implemented, and the reported line number for the assert may be from a different file."; log_info(LogTest, "Expected error: {}", expected); @@ -179,7 +180,7 @@ static void RunTest(WatcherFixture *fixture, Device *device, riscv_id_t riscv_ty } } -TEST_F(WatcherFixture, TensixTestWatcherAssertBrisc) { +TEST_F(WatcherFixture, TestWatcherAssertBrisc) { using namespace CMAKE_UNIQUE_NAMESPACE; if (this->slow_dispatch_) GTEST_SKIP(); @@ -191,7 +192,7 @@ TEST_F(WatcherFixture, TensixTestWatcherAssertBrisc) { ); } -TEST_F(WatcherFixture, TensixTestWatcherAssertNCrisc) { +TEST_F(WatcherFixture, TestWatcherAssertNCrisc) { using namespace CMAKE_UNIQUE_NAMESPACE; if (this->slow_dispatch_) GTEST_SKIP(); @@ -201,7 +202,7 @@ TEST_F(WatcherFixture, TensixTestWatcherAssertNCrisc) { ); } -TEST_F(WatcherFixture, TensixTestWatcherAssertTrisc0) { +TEST_F(WatcherFixture, TestWatcherAssertTrisc0) { using namespace CMAKE_UNIQUE_NAMESPACE; if (this->slow_dispatch_) GTEST_SKIP(); @@ -211,7 +212,7 @@ TEST_F(WatcherFixture, TensixTestWatcherAssertTrisc0) { ); } -TEST_F(WatcherFixture, TensixTestWatcherAssertTrisc1) { +TEST_F(WatcherFixture, TestWatcherAssertTrisc1) { using namespace CMAKE_UNIQUE_NAMESPACE; if (this->slow_dispatch_) GTEST_SKIP(); @@ -221,7 +222,7 @@ TEST_F(WatcherFixture, TensixTestWatcherAssertTrisc1) { ); } -TEST_F(WatcherFixture, TensixTestWatcherAssertTrisc2) { +TEST_F(WatcherFixture, TestWatcherAssertTrisc2) { using namespace CMAKE_UNIQUE_NAMESPACE; if (this->slow_dispatch_) GTEST_SKIP(); @@ -231,7 +232,7 @@ TEST_F(WatcherFixture, TensixTestWatcherAssertTrisc2) { ); } -TEST_F(WatcherFixture, ActiveEthTestWatcherAssertErisc) { +TEST_F(WatcherFixture, TestWatcherAssertErisc) { using namespace CMAKE_UNIQUE_NAMESPACE; if 
(this->slow_dispatch_) GTEST_SKIP(); @@ -241,7 +242,7 @@ TEST_F(WatcherFixture, ActiveEthTestWatcherAssertErisc) { ); } -TEST_F(WatcherFixture, IdleEthTestWatcherAssertIErisc) { +TEST_F(WatcherFixture, TestWatcherAssertIErisc) { using namespace CMAKE_UNIQUE_NAMESPACE; if (!this->IsSlowDispatch()) { log_info(tt::LogTest, "FD-on-idle-eth not supported."); diff --git a/tests/tt_metal/tt_metal/debug_tools/watcher/test_noc_sanitize.cpp b/tests/tt_metal/tt_metal/debug_tools/watcher/test_noc_sanitize.cpp index 8f656da7fd62..0ac4f6ce2670 100644 --- a/tests/tt_metal/tt_metal/debug_tools/watcher/test_noc_sanitize.cpp +++ b/tests/tt_metal/tt_metal/debug_tools/watcher/test_noc_sanitize.cpp @@ -21,8 +21,8 @@ using namespace tt::tt_metal; typedef enum sanitization_features { SanitizeAddress, - SanitizeAlignmentL1, - SanitizeAlignmentDRAM + SanitizeAlignmentL1Write, + SanitizeAlignmentL1Read } watcher_features_t; void RunTestOnCore(WatcherFixture* fixture, Device* device, CoreCoord &core, bool is_eth_core, watcher_features_t feature, bool use_ncrisc = false) { @@ -38,94 +38,83 @@ void RunTestOnCore(WatcherFixture* fixture, Device* device, CoreCoord &core, boo // Set up dram buffers uint32_t single_tile_size = 2 * 1024; uint32_t num_tiles = 50; - uint32_t dram_buffer_size = single_tile_size * num_tiles; + uint32_t l1_buffer_size = single_tile_size * num_tiles; uint32_t l1_buffer_addr = 400 * 1024; + tt_metal::InterleavedBufferConfig l1_config{ + .device = device, .size = l1_buffer_size, .page_size = l1_buffer_size, .buffer_type = tt_metal::BufferType::L1}; + auto input_l1_buffer = CreateBuffer(l1_config); + uint32_t input_l1_buffer_addr = input_l1_buffer->address(); - tt_metal::InterleavedBufferConfig dram_config{ - .device=device, - .size = dram_buffer_size, - .page_size = dram_buffer_size, - .buffer_type = tt_metal::BufferType::DRAM - }; - auto input_dram_buffer = CreateBuffer(dram_config); - uint32_t input_dram_buffer_addr = input_dram_buffer->address(); + auto output_l1_buffer 
= CreateBuffer(l1_config); + uint32_t output_l1_buffer_addr = output_l1_buffer->address(); - auto output_dram_buffer = CreateBuffer(dram_config); - uint32_t output_dram_buffer_addr = output_dram_buffer->address(); - - auto input_dram_noc_xy = input_dram_buffer->noc_coordinates(); - auto output_dram_noc_xy = output_dram_buffer->noc_coordinates(); - log_info("Input DRAM: {}", input_dram_noc_xy); - log_info("Output DRAM: {}", output_dram_noc_xy); + auto input_buf_noc_xy = device->worker_core_from_logical_core(input_l1_buffer->logical_core_from_bank_id(0)); + auto output_buf_noc_xy = device->worker_core_from_logical_core(output_l1_buffer->logical_core_from_bank_id(0)); + log_info("Input DRAM: {}", input_buf_noc_xy); + log_info("Output DRAM: {}", output_buf_noc_xy); // A DRAM copy kernel, we'll feed it incorrect inputs to test sanitization. KernelHandle dram_copy_kernel; if (is_eth_core) { std::map dram_copy_kernel_defines = { - {"SIGNAL_COMPLETION_TO_DISPATCHER", "1"}, - }; - dram_copy_kernel = tt_metal::CreateKernel( - program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy.cpp", - core, - tt_metal::EthernetConfig{ - .noc = tt_metal::NOC::NOC_0, - .defines=dram_copy_kernel_defines - } - ); + {"SIGNAL_COMPLETION_TO_DISPATCHER", "1"}, + }; + dram_copy_kernel = tt_metal::CreateKernel( + program, + "tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy_to_noc_coord.cpp", + core, + tt_metal::EthernetConfig{.noc = tt_metal::NOC::NOC_0, .defines = dram_copy_kernel_defines}); } else { - std::map dram_copy_kernel_defines = { - {"SIGNAL_COMPLETION_TO_DISPATCHER", "1"}, - }; - dram_copy_kernel = tt_metal::CreateKernel( - program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy.cpp", - core, - tt_metal::DataMovementConfig{ - .processor = (use_ncrisc) ? tt_metal::DataMovementProcessor::RISCV_1 : tt_metal::DataMovementProcessor::RISCV_0, - .noc = (use_ncrisc) ? 
tt_metal::NOC::RISCV_1_default : tt_metal::NOC::RISCV_0_default, - .defines=dram_copy_kernel_defines - } - ); + std::map dram_copy_kernel_defines = { + {"SIGNAL_COMPLETION_TO_DISPATCHER", "1"}, + }; + dram_copy_kernel = tt_metal::CreateKernel( + program, + "tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy_to_noc_coord.cpp", + core, + tt_metal::DataMovementConfig{ + .processor = + (use_ncrisc) ? tt_metal::DataMovementProcessor::RISCV_1 : tt_metal::DataMovementProcessor::RISCV_0, + .noc = (use_ncrisc) ? tt_metal::NOC::RISCV_1_default : tt_metal::NOC::RISCV_0_default, + .defines = dram_copy_kernel_defines}); } // Write to the input DRAM buffer std::vector input_vec = create_random_vector_of_bfloat16( - dram_buffer_size, 100, std::chrono::system_clock::now().time_since_epoch().count()); - tt_metal::detail::WriteToBuffer(input_dram_buffer, input_vec); + l1_buffer_size, 100, std::chrono::system_clock::now().time_since_epoch().count()); + tt_metal::detail::WriteToBuffer(input_l1_buffer, input_vec); // Write runtime args - update to a core that doesn't exist or an improperly aligned address, // depending on the flags passed in. switch(feature) { case SanitizeAddress: - output_dram_noc_xy.x = 16; - output_dram_noc_xy.y = 16; - break; - case SanitizeAlignmentL1: - l1_buffer_addr += 16; // This is illegal because reading DRAM->L1 needs DRAM alignment - // requirements (32 byte aligned). + output_buf_noc_xy.x = 16; + output_buf_noc_xy.y = 16; break; - case SanitizeAlignmentDRAM: - input_dram_buffer_addr++; + case SanitizeAlignmentL1Write: + output_l1_buffer_addr++; // This is illegal because reading DRAM->L1 needs DRAM alignment + // requirements (32 byte aligned). 
break; + case SanitizeAlignmentL1Read: input_l1_buffer_addr++; break; default: log_warning(LogTest, "Unrecognized feature to test ({}), skipping...", feature); GTEST_SKIP(); break; } + tt_metal::SetRuntimeArgs( program, dram_copy_kernel, core, {l1_buffer_addr, - input_dram_buffer_addr, - (std::uint32_t)input_dram_noc_xy.x, - (std::uint32_t)input_dram_noc_xy.y, - output_dram_buffer_addr, - (std::uint32_t)output_dram_noc_xy.x, - (std::uint32_t)output_dram_noc_xy.y, - dram_buffer_size}); + input_l1_buffer_addr, + input_buf_noc_xy.x, + input_buf_noc_xy.y, + output_l1_buffer_addr, + (std::uint32_t)output_buf_noc_xy.x, + (std::uint32_t)output_buf_noc_xy.y, + l1_buffer_size}); // Run the kernel, expect an exception here try { @@ -140,40 +129,64 @@ void RunTestOnCore(WatcherFixture* fixture, Device* device, CoreCoord &core, boo // We should be able to find the expected watcher error in the log as well. string expected; + int noc = (use_ncrisc) ? 1 : 0; + CoreCoord target_core = device->virtual_noc_coordinate(noc, input_buf_noc_xy); + string risc_name = (is_eth_core) ? "erisc" : "brisc"; + if (use_ncrisc) { + risc_name = "ncrisc"; + } switch(feature) { case SanitizeAddress: expected = fmt::format( - "Device {} {} core(x={:2},y={:2}) phys(x={:2},y={:2}): {} using noc0 tried to unicast write 102400 bytes from local L1[{:#08x}] to Unknown core w/ physical coords {} [addr=0x{:08x}] (NOC target address did not map to any known Tensix/Ethernet/DRAM/PCIE core).", + "Device {} {} core(x={:2},y={:2}) virtual(x={:2},y={:2}): {} using noc0 tried to unicast write 102400 " + "bytes from local L1[{:#08x}] to Unknown core w/ physical coords {} [addr=0x{:08x}] (NOC target " + "address did not map to any known Tensix/Ethernet/DRAM/PCIE core).", device->id(), (is_eth_core) ? "ethnet" : "worker", - core.x, core.y, phys_core.x, phys_core.y, - (is_eth_core) ? 
"erisc" : "brisc", l1_buffer_addr, output_dram_noc_xy.str(), - output_dram_buffer_addr - ); + core.x, + core.y, + phys_core.x, + phys_core.y, + (is_eth_core) ? "erisc" : "brisc", + l1_buffer_addr, + output_buf_noc_xy.str(), + output_l1_buffer_addr); break; - case SanitizeAlignmentL1: - case SanitizeAlignmentDRAM: - { - // NoC-1 has a different coordinate for the same DRAM - const metal_SocDescriptor& soc_d = tt::Cluster::instance().get_soc_desc(device->id()); - int noc = (use_ncrisc) ? 1 : 0; - CoreCoord target_phys_core = { - tt::tt_metal::hal.noc_coordinate(noc, soc_d.grid_size.x, input_dram_noc_xy.x), - tt::tt_metal::hal.noc_coordinate(noc, soc_d.grid_size.y, input_dram_noc_xy.y) - }; - string risc_name = (is_eth_core) ? "erisc" : "brisc"; - if (use_ncrisc) - risc_name = "ncrisc"; + case SanitizeAlignmentL1Write: { expected = fmt::format( - "Device {} {} core(x={:2},y={:2}) phys(x={:2},y={:2}): {} using noc{} tried to unicast read 102400 bytes to local L1[{:#08x}] from DRAM core w/ physical coords {} DRAM[addr=0x{:08x}] (invalid address alignment in NOC transaction).", + "Device {} {} core(x={:2},y={:2}) virtual(x={:2},y={:2}): {} using noc{} tried to unicast write 102400 " + "bytes from local L1[{:#08x}] to Tensix core w/ physical coords {} L1[addr=0x{:08x}] (invalid address " + "alignment in NOC transaction).", device->id(), (is_eth_core) ? 
"ethnet" : "worker", - core.x, core.y, phys_core.x, phys_core.y, - risc_name, noc, l1_buffer_addr, target_phys_core, - input_dram_buffer_addr - ); - } + core.x, + core.y, + phys_core.x, + phys_core.y, + risc_name, + noc, + l1_buffer_addr, + target_core, + output_l1_buffer_addr); break; + } + case SanitizeAlignmentL1Read: { + expected = fmt::format( + "Device {} {} core(x={:2},y={:2}) virtual(x={:2},y={:2}): {} using noc{} tried to unicast read 102400 " + "bytes to local L1[{:#08x}] from Tensix core w/ physical coords {} L1[addr=0x{:08x}] (invalid address " + "alignment in NOC transaction).", + device->id(), + (is_eth_core) ? "ethnet" : "worker", + core.x, + core.y, + phys_core.x, + phys_core.y, + risc_name, + noc, + l1_buffer_addr, + target_core, + input_l1_buffer_addr); + } break; default: log_warning(LogTest, "Unrecognized feature to test ({}), skipping...", feature); GTEST_SKIP(); @@ -244,37 +257,37 @@ TEST_F(WatcherFixture, TensixTestWatcherSanitize) { ); } -TEST_F(WatcherFixture, TensixTestWatcherSanitizeAlignmentL1) { +TEST_F(WatcherFixture, TensixTestWatcherSanitizeAlignmentL1Write) { if (this->slow_dispatch_) GTEST_SKIP(); this->RunTestOnDevice( [](WatcherFixture *fixture, Device *device){ CoreCoord core{0, 0}; - RunTestOnCore(fixture, device, core, false, SanitizeAlignmentL1); + RunTestOnCore(fixture, device, core, false, SanitizeAlignmentL1Write); }, this->devices_[0] ); } -TEST_F(WatcherFixture, TensixTestWatcherSanitizeAlignmentDRAM) { +TEST_F(WatcherFixture, TensixTestWatcherSanitizeAlignmentL1Read) { if (this->slow_dispatch_) GTEST_SKIP(); this->RunTestOnDevice( [](WatcherFixture *fixture, Device *device){ CoreCoord core{0, 0}; - RunTestOnCore(fixture, device, core, false, SanitizeAlignmentDRAM); + RunTestOnCore(fixture, device, core, false, SanitizeAlignmentL1Read); }, this->devices_[0] ); } -TEST_F(WatcherFixture, TensixTestWatcherSanitizeAlignmentDRAMNCrisc) { +TEST_F(WatcherFixture, TensixTestWatcherSanitizeAlignmentL1ReadNCrisc) { if 
(this->slow_dispatch_) GTEST_SKIP(); this->RunTestOnDevice( [](WatcherFixture *fixture, Device *device){ CoreCoord core{0, 0}; - RunTestOnCore(fixture, device, core, false, SanitizeAlignmentDRAM, true); + RunTestOnCore(fixture, device, core, false, SanitizeAlignmentL1Read, true); }, this->devices_[0] ); diff --git a/tests/tt_metal/tt_metal/debug_tools/watcher/test_noc_sanitize_delays.cpp b/tests/tt_metal/tt_metal/debug_tools/watcher/test_noc_sanitize_delays.cpp index 3ddd22c58cc2..67054c8c604e 100644 --- a/tests/tt_metal/tt_metal/debug_tools/watcher/test_noc_sanitize_delays.cpp +++ b/tests/tt_metal/tt_metal/debug_tools/watcher/test_noc_sanitize_delays.cpp @@ -52,10 +52,6 @@ void RunDelayTestOnCore(WatcherDelayFixture* fixture, Device* device, CoreCoord auto dst_dram_buffer = CreateBuffer(dram_config); uint32_t dram_buffer_dst_addr = dst_dram_buffer->address(); - auto dram_src0_noc_xy = src0_dram_buffer->noc_coordinates(); - auto dram_src1_noc_xy = src1_dram_buffer->noc_coordinates(); - auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); - uint32_t src0_cb_index = tt::CBIndex::c_0; uint32_t num_input_tiles = 2; tt_metal::CircularBufferConfig cb_src0_config = tt_metal::CircularBufferConfig(num_input_tiles * SINGLE_TILE_SIZE, {{src0_cb_index, tt::DataFormat::Float16_b}}) @@ -118,19 +114,9 @@ void RunDelayTestOnCore(WatcherDelayFixture* fixture, Device* device, CoreCoord EnqueueWriteBuffer(cq, std::ref(src1_dram_buffer), src1_vec, false); vector reader_args = { - dram_buffer_src0_addr, - (std::uint32_t)dram_src0_noc_xy.x, - (std::uint32_t)dram_src0_noc_xy.y, - NUM_TILES, - dram_buffer_src1_addr, - (std::uint32_t)dram_src1_noc_xy.x, - (std::uint32_t)dram_src1_noc_xy.y, - NUM_TILES, - 0}; - - vector writer_args = { - dram_buffer_dst_addr, (std::uint32_t)dram_dst_noc_xy.x, (std::uint32_t)dram_dst_noc_xy.y, NUM_TILES - }; + dram_buffer_src0_addr, (std::uint32_t)0, NUM_TILES, dram_buffer_src1_addr, (std::uint32_t)0, NUM_TILES, 0}; + + vector writer_args = 
{dram_buffer_dst_addr, (std::uint32_t)0, NUM_TILES}; SetRuntimeArgs(program, unary_writer_kernel, core, writer_args); SetRuntimeArgs(program, binary_reader_kernel, core, reader_args); @@ -143,7 +129,7 @@ void RunDelayTestOnCore(WatcherDelayFixture* fixture, Device* device, CoreCoord std::vector read_vec; CoreCoord worker_core = fixture->delayed_cores[CoreType::WORKER][0]; // Just check that the first delayed core has the feedback set - CoreCoord phys_core = device->physical_core_from_logical_core({0,0}, CoreType::WORKER); + CoreCoord phys_core = device->virtual_core_from_logical_core({0, 0}, CoreType::WORKER); read_vec = tt::llrt::read_hex_vec_from_core ( device->id(), phys_core, diff --git a/tests/tt_metal/tt_metal/debug_tools/watcher/test_pause.cpp b/tests/tt_metal/tt_metal/debug_tools/watcher/test_pause.cpp index fb70bc917003..46d0c1666388 100644 --- a/tests/tt_metal/tt_metal/debug_tools/watcher/test_pause.cpp +++ b/tests/tt_metal/tt_metal/debug_tools/watcher/test_pause.cpp @@ -65,7 +65,8 @@ static void RunTest(WatcherFixture* fixture, Device* device) { KernelHandle erisc_kid; std::set eth_core_ranges; for (const auto& core : device->get_active_ethernet_cores(true)) { - log_info(LogTest, "Running on eth core {}({})", core.str(), device->ethernet_core_from_logical_core(core).str()); + log_info( + LogTest, "Running on eth core {}({})", core.str(), device->ethernet_core_from_logical_core(core).str()); eth_core_ranges.insert(CoreRange(core, core)); } erisc_kid = CreateKernel( @@ -82,7 +83,11 @@ static void RunTest(WatcherFixture* fixture, Device* device) { KernelHandle ierisc_kid; std::set eth_core_ranges; for (const auto& core : device->get_inactive_ethernet_cores()) { - log_info(LogTest, "Running on inactive eth core {}({})", core.str(), device->ethernet_core_from_logical_core(core).str()); + log_info( + LogTest, + "Running on inactive eth core {}({})", + core.str(), + device->ethernet_core_from_logical_core(core).str()); eth_core_ranges.insert(CoreRange(core, 
core)); } ierisc_kid = CreateKernel( diff --git a/tests/tt_metal/tt_metal/debug_tools/watcher/test_ringbuf.cpp b/tests/tt_metal/tt_metal/debug_tools/watcher/test_ringbuf.cpp index 97ed9adef752..c2e63a7e384a 100644 --- a/tests/tt_metal/tt_metal/debug_tools/watcher/test_ringbuf.cpp +++ b/tests/tt_metal/tt_metal/debug_tools/watcher/test_ringbuf.cpp @@ -144,7 +144,7 @@ static void RunTest(WatcherFixture *fixture, Device *device, riscv_id_t riscv_ty } } -TEST_F(WatcherFixture, TensixTestWatcherRingBufferBrisc) { +TEST_F(WatcherFixture, TestWatcherRingBufferBrisc) { using namespace CMAKE_UNIQUE_NAMESPACE; for (Device* device : this->devices_) { this->RunTestOnDevice( @@ -153,7 +153,8 @@ TEST_F(WatcherFixture, TensixTestWatcherRingBufferBrisc) { ); } } -TEST_F(WatcherFixture, TensixTestWatcherRingBufferNCrisc) { + +TEST_F(WatcherFixture, TestWatcherRingBufferNCrisc) { using namespace CMAKE_UNIQUE_NAMESPACE; for (Device* device : this->devices_) { this->RunTestOnDevice( @@ -162,7 +163,8 @@ TEST_F(WatcherFixture, TensixTestWatcherRingBufferNCrisc) { ); } } -TEST_F(WatcherFixture, TensixTestWatcherRingBufferTrisc0) { + +TEST_F(WatcherFixture, TestWatcherRingBufferTrisc0) { using namespace CMAKE_UNIQUE_NAMESPACE; for (Device* device : this->devices_) { this->RunTestOnDevice( @@ -171,7 +173,8 @@ TEST_F(WatcherFixture, TensixTestWatcherRingBufferTrisc0) { ); } } -TEST_F(WatcherFixture, TensixTestWatcherRingBufferTrisc1) { + +TEST_F(WatcherFixture, TestWatcherRingBufferTrisc1) { using namespace CMAKE_UNIQUE_NAMESPACE; for (Device* device : this->devices_) { this->RunTestOnDevice( @@ -180,7 +183,8 @@ TEST_F(WatcherFixture, TensixTestWatcherRingBufferTrisc1) { ); } } -TEST_F(WatcherFixture, TensixTestWatcherRingBufferTrisc2) { + +TEST_F(WatcherFixture, TestWatcherRingBufferTrisc2) { using namespace CMAKE_UNIQUE_NAMESPACE; for (Device* device : this->devices_) { this->RunTestOnDevice( @@ -189,7 +193,8 @@ TEST_F(WatcherFixture, TensixTestWatcherRingBufferTrisc2) { ); } } 
-TEST_F(WatcherFixture, ActiveEthTestWatcherRingBufferErisc) { + +TEST_F(WatcherFixture, TestWatcherRingBufferErisc) { using namespace CMAKE_UNIQUE_NAMESPACE; for (Device* device : this->devices_) { this->RunTestOnDevice( @@ -198,7 +203,8 @@ TEST_F(WatcherFixture, ActiveEthTestWatcherRingBufferErisc) { ); } } -TEST_F(WatcherFixture, IdleEthTestWatcherRingBufferIErisc) { + +TEST_F(WatcherFixture, TestWatcherRingBufferIErisc) { using namespace CMAKE_UNIQUE_NAMESPACE; if (!this->IsSlowDispatch()) { log_info(tt::LogTest, "FD-on-idle-eth not supported."); diff --git a/tests/tt_metal/tt_metal/debug_tools/watcher/test_waypoint.cpp b/tests/tt_metal/tt_metal/debug_tools/watcher/test_waypoint.cpp index 524d06c4f3e5..f411c0da6a36 100644 --- a/tests/tt_metal/tt_metal/debug_tools/watcher/test_waypoint.cpp +++ b/tests/tt_metal/tt_metal/debug_tools/watcher/test_waypoint.cpp @@ -145,7 +145,8 @@ static void RunTest(WatcherFixture* fixture, Device* device) { k_id_s = ""; } expected = fmt::format( - "Device {} ethnet core(x={:2},y={:2}) phys(x={:2},y={:2}): {}, X, X, X, X rmsg:* h_id:0 " + "Device {} ethnet core(x={:2},y={:2}) virtual(x={:2},y={:2}): {}, X, X, X, X rmsg:* " + "h_id:0 " "k_id:{}", device->id(), logical_core.x, @@ -165,7 +166,7 @@ static void RunTest(WatcherFixture* fixture, Device* device) { k_id_s = ""; } expected = fmt::format( - "Device {} worker core(x={:2},y={:2}) phys(x={:2},y={:2}): {},{},{},{},{} rmsg:***|*** h_id:0 " + "Device {} worker core(x={:2},y={:2}) virtual(x={:2},y={:2}): {},{},{},{},{} rmsg:***|*** h_id:0 " "smsg:**** k_ids:{}", device->id(), logical_core.x, diff --git a/tests/tt_metal/tt_metal/device/test_device_cluster_api.cpp b/tests/tt_metal/tt_metal/device/test_device_cluster_api.cpp index 5968e1441386..0b9ba68dddca 100644 --- a/tests/tt_metal/tt_metal/device/test_device_cluster_api.cpp +++ b/tests/tt_metal/tt_metal/device/test_device_cluster_api.cpp @@ -45,7 +45,7 @@ TEST_F(N300DeviceFixture, EthValidateEthernetConnectivity) { } // Check 
conversion to noc coords - std::vector chip_0_eth_noc_coords_expected = {CoreCoord(9, 6), CoreCoord(1, 6)}; + std::vector chip_0_eth_noc_coords_expected = {CoreCoord(25, 17), CoreCoord(18, 17)}; std::vector chip_0_eth_logical_coords; std::copy( @@ -59,7 +59,7 @@ TEST_F(N300DeviceFixture, EthValidateEthernetConnectivity) { std::sort(chip_0_eth_noc_coords_returned.begin(), chip_0_eth_noc_coords_returned.end()); ASSERT_TRUE(chip_0_eth_noc_coords_returned == chip_0_eth_noc_coords_expected); - std::vector chip_1_eth_noc_coords_expected = {CoreCoord(9, 0), CoreCoord(1, 0)}; + std::vector chip_1_eth_noc_coords_expected = {CoreCoord(25, 16), CoreCoord(18, 16)}; std::vector chip_1_eth_logical_coords; std::copy( @@ -82,22 +82,22 @@ TEST_F(N300DeviceFixture, EthInvalidLogicalEthernetCore) { TEST_F(N300DeviceFixture, EthValidateAllEthernetCoreMapping) { static std::map expected_mapping_logical_to_physical = { - {CoreCoord(0, 0), CoreCoord(9, 0)}, - {CoreCoord(0, 1), CoreCoord(1, 0)}, - {CoreCoord(0, 2), CoreCoord(8, 0)}, - {CoreCoord(0, 3), CoreCoord(2, 0)}, - {CoreCoord(0, 4), CoreCoord(7, 0)}, - {CoreCoord(0, 5), CoreCoord(3, 0)}, - {CoreCoord(0, 6), CoreCoord(6, 0)}, - {CoreCoord(0, 7), CoreCoord(4, 0)}, - {CoreCoord(0, 8), CoreCoord(9, 6)}, - {CoreCoord(0, 9), CoreCoord(1, 6)}, - {CoreCoord(0, 10), CoreCoord(8, 6)}, - {CoreCoord(0, 11), CoreCoord(2, 6)}, - {CoreCoord(0, 12), CoreCoord(7, 6)}, - {CoreCoord(0, 13), CoreCoord(3, 6)}, - {CoreCoord(0, 14), CoreCoord(6, 6)}, - {CoreCoord(0, 15), CoreCoord(4, 6)}, + {CoreCoord(0, 0), CoreCoord(25, 16)}, + {CoreCoord(0, 1), CoreCoord(18, 16)}, + {CoreCoord(0, 2), CoreCoord(24, 16)}, + {CoreCoord(0, 3), CoreCoord(19, 16)}, + {CoreCoord(0, 4), CoreCoord(23, 16)}, + {CoreCoord(0, 5), CoreCoord(20, 16)}, + {CoreCoord(0, 6), CoreCoord(22, 16)}, + {CoreCoord(0, 7), CoreCoord(21, 16)}, + {CoreCoord(0, 8), CoreCoord(25, 17)}, + {CoreCoord(0, 9), CoreCoord(18, 17)}, + {CoreCoord(0, 10), CoreCoord(24, 17)}, + {CoreCoord(0, 11), 
CoreCoord(19, 17)}, + {CoreCoord(0, 12), CoreCoord(23, 17)}, + {CoreCoord(0, 13), CoreCoord(20, 17)}, + {CoreCoord(0, 14), CoreCoord(22, 17)}, + {CoreCoord(0, 15), CoreCoord(21, 17)}, }; const auto& device_0 = this->devices_.at(0); for (const auto& logical_core : device_0->ethernet_cores()) { @@ -109,31 +109,31 @@ TEST_F(N300DeviceFixture, EthValidateAllEthernetCoreMapping) { TEST_F(N300DeviceFixture, EthValidatePhysicalCoreConversion) { static std::map expected_mapping_logical_to_physical = { - {CoreCoord(0, 0), CoreCoord(9, 0)}, - {CoreCoord(0, 1), CoreCoord(1, 0)}, - {CoreCoord(0, 2), CoreCoord(8, 0)}, - {CoreCoord(0, 3), CoreCoord(2, 0)}, - {CoreCoord(0, 4), CoreCoord(7, 0)}, - {CoreCoord(0, 5), CoreCoord(3, 0)}, - {CoreCoord(0, 6), CoreCoord(6, 0)}, - {CoreCoord(0, 7), CoreCoord(4, 0)}, - {CoreCoord(0, 8), CoreCoord(9, 6)}, - {CoreCoord(0, 9), CoreCoord(1, 6)}, - {CoreCoord(0, 10), CoreCoord(8, 6)}, - {CoreCoord(0, 11), CoreCoord(2, 6)}, - {CoreCoord(0, 12), CoreCoord(7, 6)}, - {CoreCoord(0, 13), CoreCoord(3, 6)}, - {CoreCoord(0, 14), CoreCoord(6, 6)}, - {CoreCoord(0, 15), CoreCoord(4, 6)}, + {CoreCoord(0, 0), CoreCoord(25, 16)}, + {CoreCoord(0, 1), CoreCoord(18, 16)}, + {CoreCoord(0, 2), CoreCoord(24, 16)}, + {CoreCoord(0, 3), CoreCoord(19, 16)}, + {CoreCoord(0, 4), CoreCoord(23, 16)}, + {CoreCoord(0, 5), CoreCoord(20, 16)}, + {CoreCoord(0, 6), CoreCoord(22, 16)}, + {CoreCoord(0, 7), CoreCoord(21, 16)}, + {CoreCoord(0, 8), CoreCoord(25, 17)}, + {CoreCoord(0, 9), CoreCoord(18, 17)}, + {CoreCoord(0, 10), CoreCoord(24, 17)}, + {CoreCoord(0, 11), CoreCoord(19, 17)}, + {CoreCoord(0, 12), CoreCoord(23, 17)}, + {CoreCoord(0, 13), CoreCoord(20, 17)}, + {CoreCoord(0, 14), CoreCoord(22, 17)}, + {CoreCoord(0, 15), CoreCoord(21, 17)}, }; const auto& device_0 = this->devices_.at(0); for (const auto& logical_core : device_0->ethernet_cores()) { ASSERT_TRUE( - device_0->physical_core_from_logical_core(logical_core, CoreType::ETH) == + 
device_0->virtual_core_from_logical_core(logical_core, CoreType::ETH) == expected_mapping_logical_to_physical.at(logical_core)); } // Check an invalid core type - EXPECT_ANY_THROW(device_0->physical_core_from_logical_core(CoreCoord(0, 0), CoreType::PCIE)); + EXPECT_ANY_THROW(device_0->virtual_core_from_logical_core(CoreCoord(0, 0), CoreType::PCIE)); } TEST_F(N300DeviceFixture, ActiveEthValidateEthernetSockets) { diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_EnqueueProgram.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_EnqueueProgram.cpp index 5dd7eea0042f..81b4a647c72b 100644 --- a/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_EnqueueProgram.cpp +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_EnqueueProgram.cpp @@ -598,19 +598,10 @@ bool verify_rt_args( tt::Cluster::instance().l1_barrier(device->id()); auto noc_xy = riscv == tt::RISCV::ERISC ? device->ethernet_core_from_logical_core(logical_core) : device->worker_core_from_logical_core(logical_core); - std::vector args_readback = - tt::llrt::read_hex_vec_from_core(device->id(), noc_xy, addr, expected_rt_args.size() * sizeof(uint32_t)); - log_debug( - tt::LogTest, - "Verifying {} {} RT args for {} (Logical: {}) at addr: 0x{:x} w/ incr_val: {}", - expected_rt_args.size(), - label, - noc_xy, - logical_core.str(), - addr, - incr_val); - - for (int i = 0; i < expected_rt_args.size(); i++) { + std::vector args_readback = tt::llrt::read_hex_vec_from_core(device->id(), noc_xy, addr, expected_rt_args.size() * sizeof(uint32_t)); + log_debug(tt::LogTest, "Verifying {} {} RT args for {} (Logical: {}) at addr: 0x{:x} w/ incr_val: {}", expected_rt_args.size(), label, noc_xy, logical_core.str(), addr, incr_val); + + for(int i=0; iphysical_core_from_logical_core(eth_core, CoreType::ETH); + CoreCoord phys_eth_core = device->virtual_core_from_logical_core(eth_core, CoreType::ETH); uint32_t eth_sem_id = CreateSemaphore(program, eth_core, eth_sem_init_val, CoreType::ETH); auto 
eth_kernel = CreateKernel( program, @@ -102,7 +102,7 @@ TEST_F(DispatchFixture, EthTestBlank) { if (eth_cores.size() > 0) { CoreCoord eth_core = *eth_cores.begin(); - CoreCoord phys_eth_core = device->physical_core_from_logical_core(eth_core, CoreType::ETH); + CoreCoord phys_eth_core = device->virtual_core_from_logical_core(eth_core, CoreType::ETH); CreateKernel( program, "tt_metal/kernels/dataflow/blank.cpp", @@ -156,7 +156,7 @@ TEST_F(DispatchFixture, EthTestInitLocalMemory) { if (eth_cores.size() > 0) { CoreCoord eth_core = *eth_cores.begin(); - CoreCoord phys_eth_core = device->physical_core_from_logical_core(eth_core, CoreType::ETH); + CoreCoord phys_eth_core = device->virtual_core_from_logical_core(eth_core, CoreType::ETH); CreateKernel( program, "tests/tt_metal/tt_metal/test_kernels/misc/local_mem.cpp", diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_trace/test_EnqueueTrace.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_trace/test_EnqueueTrace.cpp index fcac08d6d488..a13fa760e2a9 100644 --- a/tests/tt_metal/tt_metal/dispatch/dispatch_trace/test_EnqueueTrace.cpp +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_trace/test_EnqueueTrace.cpp @@ -57,10 +57,16 @@ Program create_simple_unary_program(Buffer& input, Buffer& output) { std::shared_ptr reader_runtime_args = std::make_shared(); *writer_runtime_args = { - &output, (uint32_t)output.noc_coordinates().x, (uint32_t)output.noc_coordinates().y, output.num_pages()}; + &output, + (uint32_t)0, + output.num_pages() + }; *reader_runtime_args = { - &input, (uint32_t)input.noc_coordinates().x, (uint32_t)input.noc_coordinates().y, input.num_pages()}; + &input, + (uint32_t)0, + input.num_pages() + }; SetRuntimeArgs(device, detail::GetKernel(program, writer_kernel), worker, writer_runtime_args); SetRuntimeArgs(device, detail::GetKernel(program, reader_kernel), worker, reader_runtime_args); diff --git a/tests/tt_metal/tt_metal/eth/test_basic_eth.cpp b/tests/tt_metal/tt_metal/eth/test_basic_eth.cpp index 
48ac0685e61b..87abacc67598 100644 --- a/tests/tt_metal/tt_metal/eth/test_basic_eth.cpp +++ b/tests/tt_metal/tt_metal/eth/test_basic_eth.cpp @@ -57,14 +57,12 @@ bool reader_kernel_no_send( auto input_dram_buffer = CreateBuffer(dram_config); uint32_t dram_byte_address = input_dram_buffer->address(); - auto dram_noc_xy = input_dram_buffer->noc_coordinates(); auto eth_noc_xy = device->ethernet_core_from_logical_core(eth_reader_core); log_debug( tt::LogTest, - "Device {}: reading {} bytes from dram {} addr {} to ethernet core {} addr {}", + "Device {}: reading {} bytes from dram bank 0 addr {} to ethernet core {} addr {}", device->id(), byte_size, - dram_noc_xy.str(), dram_byte_address, eth_reader_core.str(), eth_l1_byte_address); @@ -92,8 +90,7 @@ bool reader_kernel_no_send( eth_reader_core, { (uint32_t)dram_byte_address, - (uint32_t)dram_noc_xy.x, - (uint32_t)dram_noc_xy.y, + 0, (uint32_t)byte_size, (uint32_t)eth_l1_byte_address, }); @@ -126,16 +123,14 @@ bool writer_kernel_no_receive( auto output_dram_buffer = CreateBuffer(dram_config); uint32_t dram_byte_address = output_dram_buffer->address(); - auto dram_noc_xy = output_dram_buffer->noc_coordinates(); auto eth_noc_xy = device->ethernet_core_from_logical_core(eth_writer_core); log_debug( tt::LogTest, - "Device {}: writing {} bytes from ethernet core {} addr {} to dram {} addr {}", + "Device {}: writing {} bytes from ethernet core {} addr {} to dram bank 0 addr {}", device->id(), byte_size, eth_writer_core.str(), eth_l1_byte_address, - dram_noc_xy.str(), dram_byte_address); auto eth_writer_kernel = tt_metal::CreateKernel( @@ -161,18 +156,18 @@ bool writer_kernel_no_receive( eth_writer_core, { (uint32_t)dram_byte_address, - (uint32_t)dram_noc_xy.x, - (uint32_t)dram_noc_xy.y, + 0, (uint32_t)byte_size, (uint32_t)eth_l1_byte_address, }); fixture->RunProgram(device, program); - auto readback_vec = llrt::read_hex_vec_from_core(device->id(), dram_noc_xy, dram_byte_address, byte_size); + std::vector readback_vec; + 
fixture->ReadBuffer(device, output_dram_buffer, readback_vec); pass &= (readback_vec == inputs); if (not pass) { - std::cout << "Mismatch at Core: " << dram_noc_xy.str() << std::endl; + std::cout << "Mismatch" << std::endl; } return pass; } @@ -195,26 +190,21 @@ bool noc_reader_and_writer_kernels( auto reader_dram_buffer = CreateBuffer(dram_config); auto writer_dram_buffer = CreateBuffer(dram_config); - auto reader_dram_noc_xy = reader_dram_buffer->noc_coordinates(); - auto writer_dram_noc_xy = writer_dram_buffer->noc_coordinates(); - log_debug( tt::LogTest, - "Device {}: reading {} bytes from dram {} addr {} to ethernet core {} addr {}", + "Device {}: reading {} bytes from dram bank 0 addr {} to ethernet core {} addr {}", device->id(), byte_size, - reader_dram_noc_xy.str(), reader_dram_buffer->address(), logical_eth_core.str(), eth_dst_l1_address); log_debug( tt::LogTest, - "Device {}: writing {} bytes from ethernet core {} addr {} to dram {} addr {}", + "Device {}: writing {} bytes from ethernet core {} addr {} to dram bank 0 addr {}", device->id(), byte_size, logical_eth_core.str(), eth_src_l1_address, - writer_dram_noc_xy.str(), writer_dram_buffer->address()); auto eth_noc_xy = device->ethernet_core_from_logical_core(logical_eth_core); @@ -231,8 +221,7 @@ bool noc_reader_and_writer_kernels( logical_eth_core, { (uint32_t)reader_dram_buffer->address(), - (uint32_t)reader_dram_noc_xy.x, - (uint32_t)reader_dram_noc_xy.y, + 0, (uint32_t)byte_size, (uint32_t)eth_dst_l1_address, }); @@ -249,8 +238,7 @@ bool noc_reader_and_writer_kernels( logical_eth_core, { (uint32_t)writer_dram_buffer->address(), - (uint32_t)writer_dram_noc_xy.x, - (uint32_t)writer_dram_noc_xy.y, + 0, (uint32_t)byte_size, (uint32_t)eth_src_l1_address, }); diff --git a/tests/tt_metal/tt_metal/eth/test_buffer_movement_kernels.cpp b/tests/tt_metal/tt_metal/eth/test_buffer_movement_kernels.cpp index cd1160969726..d2be295b74bd 100644 --- a/tests/tt_metal/tt_metal/eth/test_buffer_movement_kernels.cpp +++ 
b/tests/tt_metal/tt_metal/eth/test_buffer_movement_kernels.cpp @@ -62,22 +62,19 @@ bool chip_to_chip_dram_buffer_transfer( // Create source buffer on sender device auto input_dram_buffer = CreateBuffer(sender_dram_config); uint32_t input_dram_byte_address = input_dram_buffer->address(); - auto input_dram_noc_xy = input_dram_buffer->noc_coordinates(); // Create dest buffer on receiver device auto output_dram_buffer = CreateBuffer(receiver_dram_config); uint32_t output_dram_byte_address = output_dram_buffer->address(); - auto output_dram_noc_xy = output_dram_buffer->noc_coordinates(); log_info( tt::LogTest, - "Sending {} bytes from device {} dram {} addr {} to device {} dram {} addr {}, using eth core {} and {}", + "Sending {} bytes from device {} dram bank 0 addr {} to device {} dram bank 0 addr {}, using eth core {} and " + "{}", byte_size, sender_device->id(), - input_dram_noc_xy.str(), input_dram_byte_address, receiver_device->id(), - output_dram_noc_xy.str(), output_dram_byte_address, eth_sender_core.str(), eth_receiver_core.str()); @@ -113,8 +110,7 @@ bool chip_to_chip_dram_buffer_transfer( eth_sender_core, { (uint32_t)input_dram_byte_address, - (uint32_t)input_dram_noc_xy.x, - (uint32_t)input_dram_noc_xy.y, + 0, (uint32_t)remaining_bytes, (uint32_t)num_loops, (uint32_t)MAX_BUFFER, @@ -137,8 +133,7 @@ bool chip_to_chip_dram_buffer_transfer( eth_receiver_core, { (uint32_t)output_dram_byte_address, - (uint32_t)output_dram_noc_xy.x, - (uint32_t)output_dram_noc_xy.y, + 0, (uint32_t)remaining_bytes, (uint32_t)num_loops, (uint32_t)MAX_BUFFER, @@ -169,7 +164,7 @@ bool chip_to_chip_dram_buffer_transfer( fixture->ReadBuffer(receiver_device, output_dram_buffer, dest_dram_data); pass &= (dest_dram_data == inputs); if (not pass) { - std::cout << "Mismatch at Core: " << output_dram_noc_xy.str() << std::endl; + std::cout << "Mismatch" << std::endl; std::cout << dest_dram_data[0] << std::endl; } return pass; diff --git 
a/tests/tt_metal/tt_metal/integration/matmul/test_matmul_X_tile.cpp b/tests/tt_metal/tt_metal/integration/matmul/test_matmul_X_tile.cpp index 4b93d2a1132d..dc064fbce058 100644 --- a/tests/tt_metal/tt_metal/integration/matmul/test_matmul_X_tile.cpp +++ b/tests/tt_metal/tt_metal/integration/matmul/test_matmul_X_tile.cpp @@ -120,10 +120,6 @@ void matmul_tile( uint32_t num_input_tiles = 2 * M; - auto dram_src0_noc_xy = src0_dram_buffer->noc_coordinates(); - auto dram_src1_noc_xy = src1_dram_buffer->noc_coordinates(); - auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); - uint32_t src0_cb_index = 0; tt_metal::CircularBufferConfig cb_src0_config = tt_metal::CircularBufferConfig( @@ -200,11 +196,9 @@ void matmul_tile( reader_l1_args = { src0_dram_buffer->address(), - (std::uint32_t)dram_src0_noc_xy.x, - (std::uint32_t)dram_src0_noc_xy.y, + 0, src1_dram_buffer->address(), - (std::uint32_t)dram_src1_noc_xy.x, - (std::uint32_t)dram_src1_noc_xy.y, + 0, (std::uint32_t)K, (std::uint32_t)M, (std::uint32_t)N, @@ -222,11 +216,9 @@ void matmul_tile( reader_l1_args = { src0_dram_buffer->address(), - (std::uint32_t)dram_src0_noc_xy.x, - (std::uint32_t)dram_src0_noc_xy.y, + 0, src1_dram_buffer->address(), - (std::uint32_t)dram_src1_noc_xy.x, - (std::uint32_t)dram_src1_noc_xy.y, + 0, 1, 1, 1, @@ -274,13 +266,8 @@ void matmul_tile( vector bias(N * 512, 0); fixture->WriteBuffer(device, src2_dram_buffer, bias); - auto dram_src2_noc_xy = src2_dram_buffer->noc_coordinates(); vector bias_args = { - src2_dram_buffer->address(), - (std::uint32_t)dram_src2_noc_xy.x, - (std::uint32_t)dram_src2_noc_xy.y, - (std::uint32_t)N, - (std::uint32_t)(N * single_tile_size_bfp16b)}; + src2_dram_buffer->address(), 0, (std::uint32_t)N, (std::uint32_t)(N * single_tile_size_bfp16b)}; for (uint32_t arg : bias_args) { reader_l1_args.push_back(arg); @@ -293,10 +280,7 @@ void matmul_tile( program, unary_writer_kernel, core, - {dst_dram_buffer->address(), - (std::uint32_t)dram_dst_noc_xy.x, - 
(std::uint32_t)dram_dst_noc_xy.y, - num_tiles}); // this is M * N in the multi_tile case !! + {dst_dram_buffer->address(), 0, num_tiles}); // this is M * N in the multi_tile case !! fixture->RunProgram(device, program); diff --git a/tests/tt_metal/tt_metal/integration/matmul/test_matmul_large_block.cpp b/tests/tt_metal/tt_metal/integration/matmul/test_matmul_large_block.cpp index a9c32b95c3d0..7044d93b772b 100644 --- a/tests/tt_metal/tt_metal/integration/matmul/test_matmul_large_block.cpp +++ b/tests/tt_metal/tt_metal/integration/matmul/test_matmul_large_block.cpp @@ -228,17 +228,11 @@ bool matmul_large_block( auto src1_dram_buffer = CreateBuffer(weights_config); auto dst_dram_buffer = CreateBuffer(dst_config); - auto dram_src0_noc_xy = src0_dram_buffer->noc_coordinates(); - auto dram_src1_noc_xy = src1_dram_buffer->noc_coordinates(); - auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); - const std::array mm_reader_rt_args{ src0_dram_buffer->address(), - (std::uint32_t)dram_src0_noc_xy.x, - (std::uint32_t)dram_src0_noc_xy.y, + (std::uint32_t)0, src1_dram_buffer->address(), - (std::uint32_t)dram_src1_noc_xy.x, - (std::uint32_t)dram_src1_noc_xy.y, + (std::uint32_t)0, (std::uint32_t)(K / in0_block_w), // num_blocks M * in0_block_w, // input 0 block num tiles N * in0_block_w, // input 1 block num tiles @@ -249,17 +243,12 @@ bool matmul_large_block( string writer_kernel; if (output_rm) { writer_kernel = "tt_metal/kernels/dataflow/writer_unary.cpp"; - writer_rt_args = { - dst_dram_buffer->address(), - (std::uint32_t)dram_dst_noc_xy.x, - (std::uint32_t)dram_dst_noc_xy.y, - uint(M * N)}; + writer_rt_args = {dst_dram_buffer->address(), 0, uint(M * N)}; } else { writer_kernel = "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unswizzle.cpp"; writer_rt_args = { dst_dram_buffer->address(), - (std::uint32_t)dram_dst_noc_xy.x, - (std::uint32_t)dram_dst_noc_xy.y, + 0, (std::uint32_t)out_subblock_h, // num tiles per sub block m (std::uint32_t)out_subblock_w, // num 
tiles per sub block n (std::uint32_t)M / out_subblock_h, // num sub blocks m diff --git a/tests/tt_metal/tt_metal/integration/matmul/test_matmul_multi_core_X_dram.cpp b/tests/tt_metal/tt_metal/integration/matmul/test_matmul_multi_core_X_dram.cpp index acbc84f2f644..27cd8d1a0530 100644 --- a/tests/tt_metal/tt_metal/integration/matmul/test_matmul_multi_core_X_dram.cpp +++ b/tests/tt_metal/tt_metal/integration/matmul/test_matmul_multi_core_X_dram.cpp @@ -242,10 +242,6 @@ bool matmul_multi_core_single_dram(tt_metal::Device* device) { dram_buffer_dst_addr, dram_buffer_size_out); - auto dram_src0_noc_xy = device->dram_core_from_dram_channel(dram_src0_channel_id); - auto dram_src1_noc_xy = device->dram_core_from_dram_channel(dram_src1_channel_id); - auto dram_dst_noc_xy = device->dram_core_from_dram_channel(dram_dst_channel_id); - auto activations_tilized = test_utils::tilize(activation_slice, per_core_M * 32, K * 32); auto activations_tile_layout = convert_to_tile_layout(activations_tilized); auto activations = pack_bfloat16_vec_into_uint32_vec(activations_tile_layout); @@ -261,11 +257,9 @@ bool matmul_multi_core_single_dram(tt_metal::Device* device) { const std::array mm_reader_args = { (std::uint32_t)dram_buffer_src0_addr, - (std::uint32_t)dram_src0_noc_xy.x, - (std::uint32_t)dram_src0_noc_xy.y, + (std::uint32_t)0, (std::uint32_t)dram_buffer_src1_addr, - (std::uint32_t)dram_src1_noc_xy.x, - (std::uint32_t)dram_src1_noc_xy.y, + (std::uint32_t)0, (std::uint32_t)(K / in0_block_w), // num_blocks (std::uint32_t)per_core_M * in0_block_w, // input 0 block num tiles (std::uint32_t)per_core_N * in0_block_w, // input 1 block num tiles @@ -274,8 +268,7 @@ bool matmul_multi_core_single_dram(tt_metal::Device* device) { const std::array writer_args = { (std::uint32_t)dram_buffer_dst_addr, - (std::uint32_t)dram_dst_noc_xy.x, - (std::uint32_t)dram_dst_noc_xy.y, + (std::uint32_t)0, (std::uint32_t)out_subblock_h, // num tiles per sub block m (std::uint32_t)out_subblock_w, // num tiles 
per sub block n (std::uint32_t)per_core_M / out_subblock_h, // num sub blocks m diff --git a/tests/tt_metal/tt_metal/integration/matmul/test_matmul_single_core.cpp b/tests/tt_metal/tt_metal/integration/matmul/test_matmul_single_core.cpp index 95c7569ad4f4..775484b5cd2c 100644 --- a/tests/tt_metal/tt_metal/integration/matmul/test_matmul_single_core.cpp +++ b/tests/tt_metal/tt_metal/integration/matmul/test_matmul_single_core.cpp @@ -85,10 +85,6 @@ bool matmul_single_core( auto src1_dram_buffer = CreateBuffer(weights_config); auto dst_dram_buffer = CreateBuffer(dst_config); - auto dram_src0_noc_xy = src0_dram_buffer->noc_coordinates(); - auto dram_src1_noc_xy = src1_dram_buffer->noc_coordinates(); - auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); - uint32_t src0_cb_index = 0; uint32_t cb0_tiles = M * in0_block_w * 2; tt_metal::CircularBufferConfig cb_src0_config = @@ -118,11 +114,9 @@ bool matmul_single_core( std::vector mm_reader_rt_args{ src0_dram_buffer->address(), - (std::uint32_t)dram_src0_noc_xy.x, - (std::uint32_t)dram_src0_noc_xy.y, + 0, src1_dram_buffer->address(), - (std::uint32_t)dram_src1_noc_xy.x, - (std::uint32_t)dram_src1_noc_xy.y, + 0, (std::uint32_t)(K / in0_block_w), // num_blocks (std::uint32_t)(M * in0_block_w), // input 0 block num tiles (std::uint32_t)(N * in0_block_w), // input 1 block num tiles @@ -131,8 +125,7 @@ bool matmul_single_core( std::vector writer_rt_args{ dst_dram_buffer->address(), - (std::uint32_t)dram_dst_noc_xy.x, - (std::uint32_t)dram_dst_noc_xy.y, + 0, (std::uint32_t)out_subblock_h, // num tiles per sub block m (std::uint32_t)out_subblock_w, // num tiles per sub block n (std::uint32_t)M / out_subblock_h, // num sub blocks m diff --git a/tests/tt_metal/tt_metal/integration/test_autonomous_relay_streams.cpp b/tests/tt_metal/tt_metal/integration/test_autonomous_relay_streams.cpp index 4c4077c4adcb..e5d03095211b 100644 --- a/tests/tt_metal/tt_metal/integration/test_autonomous_relay_streams.cpp +++ 
b/tests/tt_metal/tt_metal/integration/test_autonomous_relay_streams.cpp @@ -244,23 +244,23 @@ void build_and_run_autonomous_stream_test( log_trace( tt::LogTest, "sender_core: x={}, y={}", - device->physical_core_from_logical_core(sender_core, CoreType::WORKER).x, - device->physical_core_from_logical_core(sender_core, CoreType::WORKER).y); + device->virtual_core_from_logical_core(sender_core, CoreType::WORKER).x, + device->virtual_core_from_logical_core(sender_core, CoreType::WORKER).y); log_trace( tt::LogTest, "first_relay_core: x={}, y={}", - device->physical_core_from_logical_core(first_relay_core, CoreType::WORKER).x, - device->physical_core_from_logical_core(first_relay_core, CoreType::WORKER).y); + device->virtual_core_from_logical_core(first_relay_core, CoreType::WORKER).x, + device->virtual_core_from_logical_core(first_relay_core, CoreType::WORKER).y); log_trace( tt::LogTest, "second_relay_core: x={}, y={}", - device->physical_core_from_logical_core(second_relay_core, CoreType::WORKER).x, - device->physical_core_from_logical_core(second_relay_core, CoreType::WORKER).y); + device->virtual_core_from_logical_core(second_relay_core, CoreType::WORKER).x, + device->virtual_core_from_logical_core(second_relay_core, CoreType::WORKER).y); log_trace( tt::LogTest, "receiver_core: x={}, y={}", - device->physical_core_from_logical_core(receiver_core, CoreType::WORKER).x, - device->physical_core_from_logical_core(receiver_core, CoreType::WORKER).y); + device->virtual_core_from_logical_core(receiver_core, CoreType::WORKER).x, + device->virtual_core_from_logical_core(receiver_core, CoreType::WORKER).y); // Input DRAM buffer creation uint32_t buffer_size_bytes = num_messages * page_size; diff --git a/tests/tt_metal/tt_metal/integration/test_basic_pipeline.cpp b/tests/tt_metal/tt_metal/integration/test_basic_pipeline.cpp index 2fc6c5133a22..f41e3b2b53cf 100644 --- a/tests/tt_metal/tt_metal/integration/test_basic_pipeline.cpp +++ 
b/tests/tt_metal/tt_metal/integration/test_basic_pipeline.cpp @@ -85,9 +85,7 @@ void create_and_run_row_pipeline(tt_metal::Device* device, const PipelineRowConf } uint32_t src_address; - CoreCoord src_noc_xy; uint32_t dst_address; - CoreCoord dst_noc_xy; tt_metal::BufferType buff_type = test_config.IO_data_in_dram ? tt_metal::BufferType::DRAM : tt_metal::BufferType::L1; @@ -98,9 +96,7 @@ void create_and_run_row_pipeline(tt_metal::Device* device, const PipelineRowConf auto dst_buffer = CreateBuffer(buff_config); src_address = src_buffer->address(); - src_noc_xy = src_buffer->noc_coordinates(); dst_address = dst_buffer->address(); - dst_noc_xy = dst_buffer->noc_coordinates(); // create kernels vector receiver_kernels; @@ -173,11 +169,7 @@ void create_and_run_row_pipeline(tt_metal::Device* device, const PipelineRowConf program, receiver_kernels.at(core_id), core, - {src_address, - (uint32_t)src_noc_xy.x, - (uint32_t)src_noc_xy.y, - (uint32_t)num_tiles, - (uint32_t)num_repetitions}); + {src_address, 0, (uint32_t)num_tiles, (uint32_t)num_repetitions}); } else { SetRuntimeArgs( program, @@ -196,11 +188,7 @@ void create_and_run_row_pipeline(tt_metal::Device* device, const PipelineRowConf program, sender_kernels.at(core_id), core, - {dst_address, - (uint32_t)dst_noc_xy.x, - (uint32_t)dst_noc_xy.y, - (uint32_t)num_tiles, - (uint32_t)num_repetitions}); + {dst_address, 0, (uint32_t)num_tiles, (uint32_t)num_repetitions}); } else { SetRuntimeArgs( program, diff --git a/tests/tt_metal/tt_metal/integration/test_flatten.cpp b/tests/tt_metal/tt_metal/integration/test_flatten.cpp index f36c0d396c1a..a8740c477c29 100644 --- a/tests/tt_metal/tt_metal/integration/test_flatten.cpp +++ b/tests/tt_metal/tt_metal/integration/test_flatten.cpp @@ -87,9 +87,6 @@ bool flatten(DispatchFixture* fixture, tt_metal::Device* device, uint32_t num_ti auto dst_dram_buffer = CreateBuffer(dram_config); uint32_t dram_buffer_dst_addr = dst_dram_buffer->address(); - auto dram_src_noc_xy = 
src_dram_buffer->noc_coordinates(); - auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); - // input CB is larger than the output CB, to test the backpressure from the output CB all the way into the input CB // CB_out size = 1 forces the serialization of packer and writer kernel, generating backpressure to math kernel, // input CB and reader @@ -146,21 +143,9 @@ bool flatten(DispatchFixture* fixture, tt_metal::Device* device, uint32_t num_ti fixture->WriteBuffer(device, src_dram_buffer, src_vec); tt_metal::SetRuntimeArgs( - program, - flatten_kernel, - core, - {dram_buffer_src_addr, - (std::uint32_t)dram_src_noc_xy.x, - (std::uint32_t)dram_src_noc_xy.y, - num_tiles_r, - num_tiles_c, - num_bytes_per_tensor_row}); + program, flatten_kernel, core, {dram_buffer_src_addr, 0, num_tiles_r, num_tiles_c, num_bytes_per_tensor_row}); - tt_metal::SetRuntimeArgs( - program, - unary_writer_kernel, - core, - {dram_buffer_dst_addr, (std::uint32_t)dram_dst_noc_xy.x, (std::uint32_t)dram_dst_noc_xy.y, num_tiles * 32}); + tt_metal::SetRuntimeArgs(program, unary_writer_kernel, core, {dram_buffer_dst_addr, 0, num_tiles * 32}); fixture->RunProgram(device, program); @@ -246,8 +231,6 @@ bool flatten_stress(Device* device, uint32_t num_tiles_r = 5, uint32_t num_tiles auto src_dram_buffer = CreateBuffer(dram_config); auto dst_dram_buffer = CreateBuffer(dram_config); - auto dram_src_noc_xy = src_dram_buffer->noc_coordinates(); - auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); // Create the source vector std::shared_ptr> src_vec = std::make_shared>(create_random_vector_of_bfloat16( @@ -258,14 +241,8 @@ bool flatten_stress(Device* device, uint32_t num_tiles_r = 5, uint32_t num_tiles std::shared_ptr writer_runtime_args = std::make_shared(); std::shared_ptr compute_runtime_args = std::make_shared(); *compute_runtime_args = { - src_dram_buffer.get(), - (std::uint32_t)dram_src_noc_xy.x, - (std::uint32_t)dram_src_noc_xy.y, - num_tiles_r, - num_tiles_c, - num_bytes_per_tensor_row}; 
- *writer_runtime_args = { - dst_dram_buffer.get(), (std::uint32_t)dram_dst_noc_xy.x, (std::uint32_t)dram_dst_noc_xy.y, num_tiles * 32}; + src_dram_buffer.get(), (uint32_t)0, num_tiles_r, num_tiles_c, num_bytes_per_tensor_row}; + *writer_runtime_args = {dst_dram_buffer.get(), (uint32_t)0, num_tiles * 32}; SetRuntimeArgs(device, detail::GetKernel(program, flatten_kernel), core, compute_runtime_args); diff --git a/tests/tt_metal/tt_metal/integration/test_sfpu_compute.cpp b/tests/tt_metal/tt_metal/integration/test_sfpu_compute.cpp index d6f793332848..c048face44e0 100644 --- a/tests/tt_metal/tt_metal/integration/test_sfpu_compute.cpp +++ b/tests/tt_metal/tt_metal/integration/test_sfpu_compute.cpp @@ -120,10 +120,8 @@ bool run_sfpu_all_same_buffer(CommandQueue& cq, const SfpuConfig& test_config) { .buffer_type = tt::tt_metal::BufferType::DRAM}; auto input_dram_buffer = CreateBuffer(dram_config); uint32_t input_dram_byte_address = input_dram_buffer->address(); - auto input_dram_noc_xy = input_dram_buffer->noc_coordinates(); auto output_dram_buffer = CreateBuffer(dram_config); uint32_t output_dram_byte_address = output_dram_buffer->address(); - auto output_dram_noc_xy = output_dram_buffer->noc_coordinates(); vector compute_kernel_args = { uint32_t(test_config.num_tiles), // per_core_block_cnt @@ -145,15 +143,13 @@ bool run_sfpu_all_same_buffer(CommandQueue& cq, const SfpuConfig& test_config) { // Same runtime args for every core vector reader_rt_args = { (uint32_t)input_dram_byte_address, - (uint32_t)input_dram_noc_xy.x, - (uint32_t)input_dram_noc_xy.y, + 0, (uint32_t)test_config.num_tiles, }; vector writer_rt_args = { (uint32_t)output_dram_byte_address, - (uint32_t)output_dram_noc_xy.x, - (uint32_t)output_dram_noc_xy.y, + 0, (uint32_t)test_config.num_tiles, }; diff --git a/tests/tt_metal/tt_metal/llk/test_broadcast.cpp b/tests/tt_metal/tt_metal/llk/test_broadcast.cpp index 19baa1821845..e1231125fccd 100644 --- a/tests/tt_metal/tt_metal/llk/test_broadcast.cpp +++ 
b/tests/tt_metal/tt_metal/llk/test_broadcast.cpp @@ -178,26 +178,20 @@ void run_single_core_broadcast(tt_metal::Device* device, const BroadcastConfig& auto src_a_dram_buffer = CreateBuffer(dram_config); uint32_t dram_buffer_src_a_addr = src_a_dram_buffer->address(); - auto src_a_dram_noc_xy = src_a_dram_buffer->noc_coordinates(); - tt_metal::CircularBufferConfig l1_src_a_cb_config = - tt_metal::CircularBufferConfig(single_tile_size, {{0, tt::DataFormat::Float16_b}}) - .set_page_size(0, single_tile_size); + tt_metal::CircularBufferConfig l1_src_a_cb_config = tt_metal::CircularBufferConfig(single_tile_size, {{0, tt::DataFormat::Float16_b}}) + .set_page_size(0, single_tile_size); auto l1_src_a_cb = tt_metal::CreateCircularBuffer(program, core, l1_src_a_cb_config); auto src_b_dram_buffer = CreateBuffer(dram_config); uint32_t dram_buffer_src_b_addr = src_b_dram_buffer->address(); - auto src_b_dram_noc_xy = src_b_dram_buffer->noc_coordinates(); - tt_metal::CircularBufferConfig l1_src_b_cb_config = - tt_metal::CircularBufferConfig(single_tile_size, {{1, tt::DataFormat::Float16_b}}) - .set_page_size(1, single_tile_size); + tt_metal::CircularBufferConfig l1_src_b_cb_config = tt_metal::CircularBufferConfig(single_tile_size, {{1, tt::DataFormat::Float16_b}}) + .set_page_size(1, single_tile_size); auto l1_src_b_cb = tt_metal::CreateCircularBuffer(program, core, l1_src_b_cb_config); auto dst_dram_buffer = CreateBuffer(dram_config); uint32_t dram_buffer_dst_addr = dst_dram_buffer->address(); - auto dst_dram_noc_xy = dst_dram_buffer->noc_coordinates(); - tt_metal::CircularBufferConfig l1_dst_cb_config = - tt_metal::CircularBufferConfig(single_tile_size, {{16, tt::DataFormat::Float16_b}}) - .set_page_size(16, single_tile_size); + tt_metal::CircularBufferConfig l1_dst_cb_config = tt_metal::CircularBufferConfig(single_tile_size, {{16, tt::DataFormat::Float16_b}}) + .set_page_size(16, single_tile_size); auto l1_dst_cb = tt_metal::CreateCircularBuffer(program, core, l1_dst_cb_config); 
std::map defines = { @@ -259,12 +253,10 @@ void run_single_core_broadcast(tt_metal::Device* device, const BroadcastConfig& core, { (uint32_t)dram_buffer_src_a_addr, - (uint32_t)src_a_dram_noc_xy.x, - (uint32_t)src_a_dram_noc_xy.y, + (uint32_t)0, // dram bank id (uint32_t)dram_buffer_src_b_addr, - (uint32_t)src_b_dram_noc_xy.x, - (uint32_t)src_b_dram_noc_xy.y, - (uint32_t)1, + (uint32_t)0, // dram bank id + (uint32_t)1, // num tiles }); tt_metal::SetRuntimeArgs( @@ -273,9 +265,8 @@ void run_single_core_broadcast(tt_metal::Device* device, const BroadcastConfig& core, { (uint32_t)dram_buffer_dst_addr, - (uint32_t)dst_dram_noc_xy.x, - (uint32_t)dst_dram_noc_xy.y, - (uint32_t)1, + (uint32_t)0, // dram bank id + (uint32_t)1, // num tiles }); std::vector input0 = generate_uniform_random_vector( diff --git a/tests/tt_metal/tt_metal/llk/test_copy_block_matmul_partials.cpp b/tests/tt_metal/tt_metal/llk/test_copy_block_matmul_partials.cpp index 9b0b1b0a8bcf..7d77364008cb 100644 --- a/tests/tt_metal/tt_metal/llk/test_copy_block_matmul_partials.cpp +++ b/tests/tt_metal/tt_metal/llk/test_copy_block_matmul_partials.cpp @@ -49,9 +49,6 @@ void run_single_core_copy_block_matmul_partials( auto dst_dram_buffer = CreateBuffer(dram_config); uint32_t dram_buffer_dst_addr = dst_dram_buffer->address(); - auto dram_src_noc_xy = src_dram_buffer_bf16->noc_coordinates(); - auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); - uint32_t src0_cb_index = test_config.src0_cb_index; uint32_t num_input_tiles = test_config.reader_ublock; @@ -134,8 +131,7 @@ void run_single_core_copy_block_matmul_partials( unary_reader_kernel, core, {dram_buffer_src_addr, - (std::uint32_t)dram_src_noc_xy.x, - (std::uint32_t)dram_src_noc_xy.y, + (uint32_t)0, // dram bank id num_tiles, src0_cb_index, test_config.reader_ublock, @@ -146,8 +142,7 @@ void run_single_core_copy_block_matmul_partials( unary_writer_kernel, core, {dram_buffer_dst_addr, - (std::uint32_t)dram_dst_noc_xy.x, - (std::uint32_t)dram_dst_noc_xy.y, 
+ (uint32_t)0, // dram bank id num_tiles, ouput_cb_index, test_config.writer_ublock, diff --git a/tests/tt_metal/tt_metal/llk/test_cumsum.cpp b/tests/tt_metal/tt_metal/llk/test_cumsum.cpp index 378b435554c7..5d87a9f79b3b 100644 --- a/tests/tt_metal/tt_metal/llk/test_cumsum.cpp +++ b/tests/tt_metal/tt_metal/llk/test_cumsum.cpp @@ -71,18 +71,14 @@ void run_single_core_cumsum(tt_metal::Device* device, const CumsumConfig& test_c auto src_dram_buffer = CreateBuffer(dram_config); uint32_t dram_buffer_src_addr = src_dram_buffer->address(); - auto src_dram_noc_xy = src_dram_buffer->noc_coordinates(); - tt_metal::CircularBufferConfig l1_src_cb_config = - tt_metal::CircularBufferConfig(dram_buffer_size, {{0, tt::DataFormat::Float16_b}}) - .set_page_size(0, single_tile_size); + tt_metal::CircularBufferConfig l1_src_cb_config = tt_metal::CircularBufferConfig(dram_buffer_size, {{0, tt::DataFormat::Float16_b}}) + .set_page_size(0, single_tile_size); auto l1_src_cb = tt_metal::CreateCircularBuffer(program, core, l1_src_cb_config); auto dst_dram_buffer = CreateBuffer(dram_config); uint32_t dram_buffer_dst_addr = dst_dram_buffer->address(); - auto dst_dram_noc_xy = dst_dram_buffer->noc_coordinates(); - tt_metal::CircularBufferConfig l1_dst_cb_config = - tt_metal::CircularBufferConfig(dram_buffer_size, {{16, tt::DataFormat::Float16_b}}) - .set_page_size(16, single_tile_size); + tt_metal::CircularBufferConfig l1_dst_cb_config = tt_metal::CircularBufferConfig(dram_buffer_size, {{16, tt::DataFormat::Float16_b}}) + .set_page_size(16, single_tile_size); auto l1_dst_cb = tt_metal::CreateCircularBuffer(program, core, l1_dst_cb_config); string reader_kernel_name, writer_kernel_name; @@ -126,9 +122,9 @@ void run_single_core_cumsum(tt_metal::Device* device, const CumsumConfig& test_c core, { (uint32_t)dram_buffer_src_addr, - (uint32_t)src_dram_noc_xy.x, - (uint32_t)src_dram_noc_xy.y, + (uint32_t)0, // dram bank id (uint32_t)test_config.N * test_config.Ht * test_config.Wt, // Used for non 
transposing kernel + (uint32_t)0, // Unused (uint32_t)test_config.N, // Used for transposing kernel (uint32_t)test_config.Ht, // Used for transposing kernel (uint32_t)test_config.Wt, // Used for transposing kernel @@ -141,9 +137,9 @@ void run_single_core_cumsum(tt_metal::Device* device, const CumsumConfig& test_c core, { (uint32_t)dram_buffer_dst_addr, - (uint32_t)dst_dram_noc_xy.x, - (uint32_t)dst_dram_noc_xy.y, + (uint32_t)0, // dram bank id (uint32_t)test_config.N * test_config.Ht * test_config.Wt, // Used for non transposing kernel + (uint32_t)0, // Unused (uint32_t)test_config.N, // Used for transposing kernel (uint32_t)test_config.Ht, // Used for transposing kernel (uint32_t)test_config.Wt, // Used for transposing kernel diff --git a/tests/tt_metal/tt_metal/llk/test_dropout_sfpu_compute.cpp b/tests/tt_metal/tt_metal/llk/test_dropout_sfpu_compute.cpp index 3f588d713035..2222461d4c30 100644 --- a/tests/tt_metal/tt_metal/llk/test_dropout_sfpu_compute.cpp +++ b/tests/tt_metal/tt_metal/llk/test_dropout_sfpu_compute.cpp @@ -162,8 +162,7 @@ bool test_dropout_standalone( core, { src0_dram_buffer->address(), - static_cast(src0_dram_buffer->noc_coordinates().x), - static_cast(src0_dram_buffer->noc_coordinates().y), + 0, // dram bank id num_tiles, }); @@ -172,8 +171,7 @@ bool test_dropout_standalone( unary_writer_kernel_id, core, {dst_dram_buffer->address(), - static_cast(dst_dram_buffer->noc_coordinates().x), - static_cast(dst_dram_buffer->noc_coordinates().y), + 0, // dram bank id num_tiles}); tt_metal::detail::LaunchProgram(device, program); diff --git a/tests/tt_metal/tt_metal/llk/test_reconfig.cpp b/tests/tt_metal/tt_metal/llk/test_reconfig.cpp index e15e209be5f7..a52064b1f674 100644 --- a/tests/tt_metal/tt_metal/llk/test_reconfig.cpp +++ b/tests/tt_metal/tt_metal/llk/test_reconfig.cpp @@ -91,27 +91,22 @@ bool single_core_reconfig(tt_metal::Device* device, const ReconfigConfig& test_c // This will be srcB in Bfp8_b auto input0_dram_buffer = 
CreateBuffer(dram_config_bfp8b); uint32_t input0_dram_byte_address = input0_dram_buffer->address(); - auto input0_dram_noc_xy = input0_dram_buffer->noc_coordinates(); // This will be srcA in Float16_b auto input1_dram_buffer = CreateBuffer(dram_config_bfp16b); uint32_t input1_dram_byte_address = input1_dram_buffer->address(); - auto input1_dram_noc_xy = input1_dram_buffer->noc_coordinates(); // This will be DEST in Float16_b auto input2_dram_buffer = CreateBuffer(dram_config_bfp16b); uint32_t input2_dram_byte_address = input2_dram_buffer->address(); - auto input2_dram_noc_xy = input2_dram_buffer->noc_coordinates(); // This will be Output0 in Float32 or Float16_b depending on fp32_dest_acc_en auto output0_dram_buffer = CreateBuffer(dram_config_out0); uint32_t output0_dram_byte_address = output0_dram_buffer->address(); - auto output0_dram_noc_xy = output0_dram_buffer->noc_coordinates(); // This will be Output1 in Bfp8_b auto output1_dram_buffer = CreateBuffer(dram_config_bfp8b); uint32_t output1_dram_byte_address = output1_dram_buffer->address(); - auto output1_dram_noc_xy = output1_dram_buffer->noc_coordinates(); tt_metal::CircularBufferConfig l1_input0_cb_config = tt_metal::CircularBufferConfig(dram_buffer_size_bfp8b, {{in0_id, tt::DataFormat::Bfp8_b}}) @@ -254,21 +249,24 @@ bool single_core_reconfig(tt_metal::Device* device, const ReconfigConfig& test_c tt_metal::detail::WriteToBuffer(input1_dram_buffer, src1_vec); tt_metal::detail::WriteToBuffer(input2_dram_buffer, src2_vec); + static constexpr uint32_t k_input0_dram_bank_id = 0; + static constexpr uint32_t k_input1_dram_bank_id = 0; + static constexpr uint32_t k_input2_dram_bank_id = 0; + static constexpr uint32_t k_output0_dram_bank_id = 0; + static constexpr uint32_t k_output1_dram_bank_id = 0; + tt_metal::SetRuntimeArgs( program, reader_kernel, core, { (uint32_t)input0_dram_byte_address, - (uint32_t)input0_dram_noc_xy.x, - (uint32_t)input0_dram_noc_xy.y, + k_input0_dram_bank_id, // dram bank id 
(uint32_t)input1_dram_byte_address, - (uint32_t)input1_dram_noc_xy.x, - (uint32_t)input1_dram_noc_xy.y, + k_input1_dram_bank_id, (uint32_t)test_config.num_tiles, (uint32_t)input2_dram_byte_address, - (uint32_t)input2_dram_noc_xy.x, - (uint32_t)input2_dram_noc_xy.y, + k_input2_dram_bank_id, }); tt_metal::SetRuntimeArgs( program, @@ -276,12 +274,10 @@ bool single_core_reconfig(tt_metal::Device* device, const ReconfigConfig& test_c core, { (uint32_t)output0_dram_byte_address, - (uint32_t)output0_dram_noc_xy.x, - (uint32_t)output0_dram_noc_xy.y, + k_output0_dram_bank_id, (uint32_t)out0_id, (uint32_t)output1_dram_byte_address, - (uint32_t)output1_dram_noc_xy.x, - (uint32_t)output1_dram_noc_xy.y, + k_output1_dram_bank_id, (uint32_t)out1_id, (uint32_t)test_config.num_tiles, (uint32_t)test_config.ublock_size_tiles, diff --git a/tests/tt_metal/tt_metal/llk/test_reduce.cpp b/tests/tt_metal/tt_metal/llk/test_reduce.cpp index 7d2d51556ad7..898827b074bb 100644 --- a/tests/tt_metal/tt_metal/llk/test_reduce.cpp +++ b/tests/tt_metal/tt_metal/llk/test_reduce.cpp @@ -145,10 +145,11 @@ void add_reader_writer_kernels( program, unary_writer_kernel, logical_core, - {dst_dram_buffer->address(), - (std::uint32_t)dst_dram_buffer->noc_coordinates().x, - (std::uint32_t)dst_dram_buffer->noc_coordinates().y, - num_tensor_tiles / Ht}); + { + dst_dram_buffer->address(), + (uint32_t)0, // dram bank id + num_tensor_tiles / Ht // num tiles + }); break; } @@ -176,8 +177,8 @@ void add_reader_writer_kernels( logical_core, { src_dram_buffer->address(), - (std::uint32_t)src_dram_buffer->noc_coordinates().x, - (std::uint32_t)src_dram_buffer->noc_coordinates().y, + (uint32_t)0, // dram bank id + (uint32_t)0, // unused num_tensor_tiles, NC, Ht, @@ -193,8 +194,7 @@ void add_reader_writer_kernels( unary_writer_kernel, logical_core, {dst_dram_buffer->address(), - (std::uint32_t)dst_dram_buffer->noc_coordinates().x, - (std::uint32_t)dst_dram_buffer->noc_coordinates().y, + (uint32_t)0, // dram bank id 
num_tiles}); break; diff --git a/tests/tt_metal/tt_metal/llk/test_sfpu_compute.cpp b/tests/tt_metal/tt_metal/llk/test_sfpu_compute.cpp index 0e2d99065b1d..ec74db1eeab7 100644 --- a/tests/tt_metal/tt_metal/llk/test_sfpu_compute.cpp +++ b/tests/tt_metal/tt_metal/llk/test_sfpu_compute.cpp @@ -124,10 +124,8 @@ bool run_sfpu_all_same_buffer(tt_metal::Device* device, const SfpuConfig& test_c auto input_dram_buffer = CreateBuffer(dram_config); uint32_t input_dram_byte_address = input_dram_buffer->address(); - auto input_dram_noc_xy = input_dram_buffer->noc_coordinates(); auto output_dram_buffer = CreateBuffer(dram_config); uint32_t output_dram_byte_address = output_dram_buffer->address(); - auto output_dram_noc_xy = output_dram_buffer->noc_coordinates(); vector compute_kernel_args = { uint32_t(test_config.num_tiles), // per_core_block_cnt @@ -149,15 +147,13 @@ bool run_sfpu_all_same_buffer(tt_metal::Device* device, const SfpuConfig& test_c // Same runtime args for every core vector reader_rt_args = { (uint32_t)input_dram_byte_address, - (uint32_t)input_dram_noc_xy.x, - (uint32_t)input_dram_noc_xy.y, + (uint32_t)0, (uint32_t)test_config.num_tiles, }; vector writer_rt_args = { (uint32_t)output_dram_byte_address, - (uint32_t)output_dram_noc_xy.x, - (uint32_t)output_dram_noc_xy.y, + (uint32_t)0, (uint32_t)test_config.num_tiles, }; diff --git a/tests/tt_metal/tt_metal/llk/test_single_core_binary_compute.cpp b/tests/tt_metal/tt_metal/llk/test_single_core_binary_compute.cpp index 19e2534d941f..39c9e4c91831 100644 --- a/tests/tt_metal/tt_metal/llk/test_single_core_binary_compute.cpp +++ b/tests/tt_metal/tt_metal/llk/test_single_core_binary_compute.cpp @@ -92,19 +92,15 @@ bool single_core_binary(tt_metal::Device* device, const SingleCoreBinaryConfig& .device = device, .size = byte_size, .page_size = byte_size, .buffer_type = tt::tt_metal::BufferType::DRAM}; auto input0_dram_buffer = CreateBuffer(dram_config); uint32_t input0_dram_byte_address = input0_dram_buffer->address(); - 
auto input0_dram_noc_xy = input0_dram_buffer->noc_coordinates(); auto input1_dram_buffer = CreateBuffer(dram_config); uint32_t input1_dram_byte_address = input1_dram_buffer->address(); - auto input1_dram_noc_xy = input1_dram_buffer->noc_coordinates(); auto input2_dram_buffer = CreateBuffer(dram_config); uint32_t input2_dram_byte_address = input2_dram_buffer->address(); - auto input2_dram_noc_xy = input2_dram_buffer->noc_coordinates(); auto output_dram_buffer = CreateBuffer(dram_config); uint32_t output_dram_byte_address = output_dram_buffer->address(); - auto output_dram_noc_xy = output_dram_buffer->noc_coordinates(); tt_metal::CircularBufferConfig l1_cb_config = tt_metal::CircularBufferConfig(byte_size, {{0, test_config.l1_input_data_format}}) @@ -244,15 +240,12 @@ bool single_core_binary(tt_metal::Device* device, const SingleCoreBinaryConfig& test_config.core, { (uint32_t)input0_dram_byte_address, - (uint32_t)input0_dram_noc_xy.x, - (uint32_t)input0_dram_noc_xy.y, + (uint32_t)0, // dram bank id (uint32_t)input1_dram_byte_address, - (uint32_t)input1_dram_noc_xy.x, - (uint32_t)input1_dram_noc_xy.y, + (uint32_t)0, // dram bank id (uint32_t)test_config.num_tiles, (uint32_t)input2_dram_byte_address, - (uint32_t)input2_dram_noc_xy.x, - (uint32_t)input2_dram_noc_xy.y, + (uint32_t)0, // dram bank id }); tt_metal::SetRuntimeArgs( program, @@ -260,8 +253,7 @@ bool single_core_binary(tt_metal::Device* device, const SingleCoreBinaryConfig& test_config.core, { (uint32_t)output_dram_byte_address, - (uint32_t)output_dram_noc_xy.x, - (uint32_t)output_dram_noc_xy.y, + (uint32_t)0, // dram bank id (uint32_t)test_config.num_tiles, }); diff --git a/tests/tt_metal/tt_metal/llk/test_single_core_matmul_compute.cpp b/tests/tt_metal/tt_metal/llk/test_single_core_matmul_compute.cpp index 8e624df6a67c..34ad0b088189 100644 --- a/tests/tt_metal/tt_metal/llk/test_single_core_matmul_compute.cpp +++ b/tests/tt_metal/tt_metal/llk/test_single_core_matmul_compute.cpp @@ -179,13 +179,10 @@ bool 
single_tile_matmul(tt_metal::Device* device) { tt_metal::Program program = tt_metal::CreateProgram(); auto input0_dram_buffer = CreateBuffer(dram_config); const uint32_t in0_dram_addr = input0_dram_buffer->address(); - auto input0_dram_noc_xy = input0_dram_buffer->noc_coordinates(); auto input1_dram_buffer = CreateBuffer(dram_config); const uint32_t in1_dram_addr = input1_dram_buffer->address(); - auto input1_dram_noc_xy = input1_dram_buffer->noc_coordinates(); auto output_dram_buffer = CreateBuffer(dram_config); const uint32_t out_dram_addr = output_dram_buffer->address(); - auto output_dram_noc_xy = output_dram_buffer->noc_coordinates(); tt_metal::CircularBufferConfig l1_input0_cb_config = tt_metal::CircularBufferConfig(byte_size, {{in0_cb_index, tt::DataFormat::Float16_b}}) @@ -256,11 +253,9 @@ bool single_tile_matmul(tt_metal::Device* device) { core, { (uint32_t)in0_dram_addr, - (uint32_t)input0_dram_noc_xy.x, - (uint32_t)input0_dram_noc_xy.y, + (uint32_t)0, // in_0 dram bank id (uint32_t)in1_dram_addr, - (uint32_t)input1_dram_noc_xy.x, - (uint32_t)input1_dram_noc_xy.y, + (uint32_t)0, (uint32_t)1, // num_tiles }); tt_metal::SetRuntimeArgs( @@ -269,9 +264,8 @@ bool single_tile_matmul(tt_metal::Device* device) { core, { (uint32_t)out_dram_addr, - (uint32_t)output_dram_noc_xy.x, - (uint32_t)output_dram_noc_xy.y, - (uint32_t)1, + (uint32_t)0, + (uint32_t)1, // num_tiles }); tt_metal::detail::LaunchProgram(device, program); @@ -323,12 +317,9 @@ bool single_block_matmul(tt_metal::Device* device, uint32_t M, uint32_t K, uint3 tt_metal::Program program = tt_metal::CreateProgram(); auto input0_dram_buffer = CreateBuffer(dram_config_0); const uint32_t in0_dram_addr = input0_dram_buffer->address(); - auto input0_dram_noc_xy = input0_dram_buffer->noc_coordinates(); auto input1_dram_buffer = CreateBuffer(dram_config_1); const uint32_t in1_dram_addr = input1_dram_buffer->address(); - auto input1_dram_noc_xy = input1_dram_buffer->noc_coordinates(); auto output_dram_buffer = 
CreateBuffer(dram_config_out); - auto output_dram_noc_xy = output_dram_buffer->noc_coordinates(); const uint32_t out_dram_addr = output_dram_buffer->address(); tt_metal::CircularBufferConfig l1_input0_cb_config = @@ -403,11 +394,9 @@ bool single_block_matmul(tt_metal::Device* device, uint32_t M, uint32_t K, uint3 core, { (uint32_t)in0_dram_addr, - (uint32_t)input0_dram_noc_xy.x, - (uint32_t)input0_dram_noc_xy.y, + (uint32_t)0, (uint32_t)in1_dram_addr, - (uint32_t)input1_dram_noc_xy.x, - (uint32_t)input1_dram_noc_xy.y, + (uint32_t)0, (uint32_t)1, // num_blocks (uint32_t)M * K, // in0_block_tile_cnt (uint32_t)K * N, // in1_block_tile_cnt @@ -420,8 +409,7 @@ bool single_block_matmul(tt_metal::Device* device, uint32_t M, uint32_t K, uint3 core, { (uint32_t)out_dram_addr, - (uint32_t)output_dram_noc_xy.x, - (uint32_t)output_dram_noc_xy.y, + (uint32_t)0, (uint32_t)M * N, }); @@ -483,18 +471,13 @@ bool blocked_matmul(tt_metal::Device* device, uint32_t M, uint32_t K, uint32_t N tt_metal::Program program = tt_metal::CreateProgram(); auto input0_dram_buffer = CreateBuffer(dram_config_0); const uint32_t in0_dram_addr = input0_dram_buffer->address(); - auto input0_dram_noc_xy = input0_dram_buffer->noc_coordinates(); auto input1_dram_buffer = CreateBuffer(dram_config_1); const uint32_t in1_dram_addr = input1_dram_buffer->address(); - auto input1_dram_noc_xy = input1_dram_buffer->noc_coordinates(); auto output_dram_buffer = CreateBuffer(dram_config_out); const uint32_t out_dram_addr = output_dram_buffer->address(); - auto output_dram_noc_xy = output_dram_buffer->noc_coordinates(); - - tt_metal::CircularBufferConfig l1_input0_cb_config = - tt_metal::CircularBufferConfig(in0_byte_size, {{in0_cb_index, tt::DataFormat::Float16_b}}) - .set_page_size(in0_cb_index, cb_page_size); + tt_metal::CircularBufferConfig l1_input0_cb_config = tt_metal::CircularBufferConfig(in0_byte_size, {{in0_cb_index, tt::DataFormat::Float16_b}}) + .set_page_size(in0_cb_index, cb_page_size); auto l1_input0_cb 
= tt_metal::CreateCircularBuffer(program, core, l1_input0_cb_config); tt_metal::CircularBufferConfig l1_input1_cb_config = @@ -580,11 +563,9 @@ bool blocked_matmul(tt_metal::Device* device, uint32_t M, uint32_t K, uint32_t N core, { (uint32_t)in0_dram_addr, - (uint32_t)input0_dram_noc_xy.x, - (uint32_t)input0_dram_noc_xy.y, + (uint32_t)0, (uint32_t)in1_dram_addr, - (uint32_t)input1_dram_noc_xy.x, - (uint32_t)input1_dram_noc_xy.y, + (uint32_t)0, (uint32_t)1, // num_blocks (uint32_t)M * K, // in0_block_tile_cnt (uint32_t)K * N, // in1_block_tile_cnt @@ -597,8 +578,7 @@ bool blocked_matmul(tt_metal::Device* device, uint32_t M, uint32_t K, uint32_t N core, { (uint32_t)out_dram_addr, - (uint32_t)output_dram_noc_xy.x, - (uint32_t)output_dram_noc_xy.y, + (uint32_t)0, (uint32_t)M * N, }); diff --git a/tests/tt_metal/tt_metal/llk/test_transpose.cpp b/tests/tt_metal/tt_metal/llk/test_transpose.cpp index f8124dd3df16..d018f1ba74cd 100644 --- a/tests/tt_metal/tt_metal/llk/test_transpose.cpp +++ b/tests/tt_metal/tt_metal/llk/test_transpose.cpp @@ -108,9 +108,6 @@ void run_single_core_transpose(tt_metal::Device* device, const TransposeConfig& std::shared_ptr dst_dram_buffer = CreateBuffer(dram_config); uint32_t dram_buffer_dst_addr = dst_dram_buffer->address(); - CoreCoord dram_src_noc_xy = src_dram_buffer->noc_coordinates(); - CoreCoord dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); - uint32_t src0_cb_index = 0; uint32_t num_buffer_tiles = 32; tt_metal::CircularBufferConfig cb_src0_config = @@ -161,8 +158,8 @@ void run_single_core_transpose(tt_metal::Device* device, const TransposeConfig& core, { dram_buffer_src_addr, - (std::uint32_t)dram_src_noc_xy.x, - (std::uint32_t)dram_src_noc_xy.y, + (uint32_t)0, // unused to maintain compat + (uint32_t)0, // unused to maintain compat num_tensor_tiles, NC, Ht, @@ -175,7 +172,9 @@ void run_single_core_transpose(tt_metal::Device* device, const TransposeConfig& program, unary_writer_kernel, core, - {dram_buffer_dst_addr, 
(std::uint32_t)dram_dst_noc_xy.x, (std::uint32_t)dram_dst_noc_xy.y, num_tensor_tiles}); + {dram_buffer_dst_addr, + (uint32_t)0, // unused to maintain compat + num_tensor_tiles}); auto seed = std::chrono::system_clock::now().time_since_epoch().count(); vector src_vec = create_random_vector_of_bfloat16(dram_buffer_size, 100.0f, 0x1234); diff --git a/tests/tt_metal/tt_metal/llk/test_untilize_tilize.cpp b/tests/tt_metal/tt_metal/llk/test_untilize_tilize.cpp index 85328167e4b1..9a25538f9579 100644 --- a/tests/tt_metal/tt_metal/llk/test_untilize_tilize.cpp +++ b/tests/tt_metal/tt_metal/llk/test_untilize_tilize.cpp @@ -87,9 +87,6 @@ void run_single_core_tilize_program(tt_metal::Device* device, const TestConfig& std::shared_ptr dst_dram_buffer = CreateBuffer(output_dram_config); uint32_t dram_buffer_dst_addr = dst_dram_buffer->address(); - CoreCoord dram_src0_noc_xy = src0_dram_buffer->noc_coordinates(); - CoreCoord dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); - uint32_t src0_cb_index = tt::CBIndex::c_0; uint32_t num_input_tiles = num_tiles; tt_metal::CircularBufferConfig cb_src0_config = @@ -105,7 +102,6 @@ void run_single_core_tilize_program(tt_metal::Device* device, const TestConfig& if (test_config.tilize_type.has_value() && test_config.tilize_type == TilizeType::UNPACK_A_B) { src1_dram_buffer = CreateBuffer(input_dram_config); dram_buffer_src1_addr = src1_dram_buffer->address(); - dram_src1_noc_xy = src1_dram_buffer->noc_coordinates(); uint32_t src1_cb_index = tt::CBIndex::c_1; uint32_t num_input_tiles = num_tiles; @@ -195,18 +191,17 @@ void run_single_core_tilize_program(tt_metal::Device* device, const TestConfig& std::vector src1_vec; - if (test_config.tilize_type.has_value() && test_config.tilize_type == TilizeType::UNPACK_A_B) { + if(test_config.tilize_type.has_value() && test_config.tilize_type == TilizeType::UNPACK_A_B) { + // tests/tt_metal/tt_metal/test_kernels/dataflow/reader_binary.cpp tt_metal::SetRuntimeArgs( program, reader_kernel, core, { 
dram_buffer_src0_addr, - (std::uint32_t)dram_src0_noc_xy.x, - (std::uint32_t)dram_src0_noc_xy.y, + (uint32_t)0, // dram bank id dram_buffer_src1_addr, - (std::uint32_t)dram_src1_noc_xy.x, - (std::uint32_t)dram_src1_noc_xy.y, + (uint32_t)0, // dram bank id (uint32_t)num_tiles, }); @@ -214,24 +209,20 @@ void run_single_core_tilize_program(tt_metal::Device* device, const TestConfig& tt_metal::detail::WriteToBuffer(src1_dram_buffer, src1_vec); } else { + // tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_push_n.cpp tt_metal::SetRuntimeArgs( program, reader_kernel, core, {dram_buffer_src0_addr, - (std::uint32_t)dram_src0_noc_xy.x, - (std::uint32_t)dram_src0_noc_xy.y, + (uint32_t)0, // dram bank id num_tiles, src0_cb_index, test_config.num_tiles_c, false}); } - tt_metal::SetRuntimeArgs( - program, - unary_writer_kernel, - core, - {dram_buffer_dst_addr, (std::uint32_t)dram_dst_noc_xy.x, (std::uint32_t)dram_dst_noc_xy.y, num_tiles}); + tt_metal::SetRuntimeArgs(program, unary_writer_kernel, core, {dram_buffer_dst_addr, (uint32_t)0, num_tiles}); tt_metal::detail::LaunchProgram(device, program); diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/test_dram_read.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/test_dram_read.cpp index 2e49aec6d0c8..16249c9f9f85 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/test_dram_read.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/test_dram_read.cpp @@ -229,293 +229,14 @@ uint32_t get_dram_bandwidth(tt::ARCH arch) { return dram_bandwidth_gb_per_sec; } -void get_dram_reader_core_coords_grayskull( - tt_metal::Device* device, CoreRangeSet& all_cores, std::vector& all_cores_ordered) { - // hardcoded for grayskull - uint32_t full_grid_size_y = 12; - - // get all the logical coord - auto compute_with_storage_grid_size = device->compute_with_storage_grid_size(); - uint32_t num_cores_x = 
compute_with_storage_grid_size.x; - uint32_t num_cores_y = compute_with_storage_grid_size.y; - - // get dram banks and coords - uint32_t num_banks = device->num_dram_channels(); - uint32_t max_bank_id = num_banks - 1; - std::vector dram_coord_phy; - for (int i = 0; i < num_banks; ++i) { - dram_coord_phy.push_back(device->dram_core_from_dram_channel(i)); - } - - // get worker logical coords - std::vector all_worker_cores_logical; - for (int i = 0; i < num_cores_x; ++i) { - for (int j = 0; j < num_cores_y; ++j) { - all_worker_cores_logical.push_back(CoreCoord(i, j)); - } - } - - // get y coords of the workers - std::vector all_worker_cores_y_physical; - uint32_t max_worker_y_physical = 0; - uint32_t min_worker_y_physical = 10000; - for (int i = 0; i < num_cores_y; ++i) { - auto core_phy = device->worker_core_from_logical_core(CoreCoord(0, i)); - all_worker_cores_y_physical.push_back(core_phy.y); - if (core_phy.y > max_worker_y_physical) { - max_worker_y_physical = core_phy.y; - } - if (core_phy.y < min_worker_y_physical) { - min_worker_y_physical = core_phy.y; - } - } - - // get the harvested rows, we treat dram and eth cores as harvested as well - std::vector harvested_rows; - for (int i = 0; i < full_grid_size_y; ++i) { - auto y = i; - - if (std::find(all_worker_cores_y_physical.begin(), all_worker_cores_y_physical.end(), y) == - all_worker_cores_y_physical.end()) { - harvested_rows.push_back(y); - } - } - - // get the ajacent cores of DRAM banks - std::vector adj_core_physical; - for (int i = 0; i < num_banks; ++i) { - auto dram_core = dram_coord_phy[i]; - uint32_t adj_core_x = dram_core.x; - uint32_t adj_core_y = dram_core.y + 1; - adj_core_physical.push_back(CoreCoord(adj_core_x, adj_core_y)); - } - - // move worker if they are in the harvested rows - for (auto& coord : adj_core_physical) { - auto y = coord.y; - - // if row is harvested, move core down by 1 - while (std::find(harvested_rows.begin(), harvested_rows.end(), y) != harvested_rows.end() and - y < 
(full_grid_size_y - 1)) { - y += 1; - } - - coord.y = y; - } - - // find the logical coord from physical coord - std::vector adj_core_logical_realloc; - for (int i = 0; i < adj_core_physical.size(); ++i) { - for (int j = 0; j < all_worker_cores_logical.size(); ++j) { - auto core = device->worker_core_from_logical_core(all_worker_cores_logical[j]); - if (adj_core_physical[i] == core) { - adj_core_logical_realloc.push_back(all_worker_cores_logical[j]); - } - } - } - - // create sets - std::set all_cores_set; - for (int i = 0; i < num_banks; ++i) { - all_cores_set.insert(CoreRange(adj_core_logical_realloc[i])); - } - all_cores = CoreRangeSet(all_cores_set); - all_cores_ordered = adj_core_logical_realloc; -} - -void get_dram_reader_core_coords( - tt_metal::Device* device, CoreRangeSet& all_cores, std::vector& all_cores_ordered) { - uint32_t full_grid_size_x = device->grid_size().x; - uint32_t full_grid_size_y = device->grid_size().y; - uint32_t x_step = 3; - - // get all the logical coord - auto compute_with_storage_grid_size = device->compute_with_storage_grid_size(); - uint32_t num_cores_x = compute_with_storage_grid_size.x; - uint32_t num_cores_y = compute_with_storage_grid_size.y; - - // get dram banks and coords - uint32_t num_banks = device->num_dram_channels(); - uint32_t max_bank_id = num_banks - 1; - std::vector dram_coord_phy; - dram_coord_phy.reserve(num_banks); - for (int i = 0; i < num_banks; ++i) { - dram_coord_phy.push_back(device->dram_core_from_dram_channel(i)); - } - - // get worker logical coords - std::vector all_worker_cores_logical; - all_worker_cores_logical.reserve(num_cores_x * num_cores_y); - for (int i = 0; i < num_cores_x; ++i) { - for (int j = 0; j < num_cores_y; ++j) { - all_worker_cores_logical.push_back(CoreCoord(i, j)); - } - } - - // get y coords of the workers - std::vector all_worker_cores_y_physical; - all_worker_cores_y_physical.reserve(num_cores_y); - uint32_t max_worker_y_physical = 0; - uint32_t min_worker_y_physical = 10000; - 
for (int i = 0; i < num_cores_y; ++i) { - auto core_phy = device->worker_core_from_logical_core(CoreCoord(0, i)); - all_worker_cores_y_physical.push_back(core_phy.y); - if (core_phy.y > max_worker_y_physical) { - max_worker_y_physical = core_phy.y; - } - if (core_phy.y < min_worker_y_physical) { - min_worker_y_physical = core_phy.y; - } - } - - // get the harvested rows, we treat dram and eth cores as harvested as well - std::vector harvested_rows; - for (int i = 0; i < full_grid_size_y; ++i) { - auto y = i; - - if (std::find(all_worker_cores_y_physical.begin(), all_worker_cores_y_physical.end(), y) == - all_worker_cores_y_physical.end()) { - harvested_rows.push_back(y); - } - } - - // get the ajacent cores of DRAM banks - std::vector adj_core_physical; - adj_core_physical.reserve(num_banks); - for (int i = 0; i < num_banks; ++i) { - auto dram_core = dram_coord_phy[i]; - uint32_t adj_core_x = dram_core.x + 1; - uint32_t adj_core_y = dram_core.y; - adj_core_physical.push_back(CoreCoord(adj_core_x, adj_core_y)); - } - - // split the adjacent coords into two groups, because DRAM banks has two cols - std::vector adj_core_physical_g1; - adj_core_physical_g1.reserve(num_banks); - std::vector adj_core_physical_y_g1; - adj_core_physical_y_g1.reserve(num_banks); - std::vector adj_core_physical_g2; - adj_core_physical_g2.reserve(num_banks); - std::vector adj_core_physical_y_g2; - adj_core_physical_y_g2.reserve(num_banks); - for (auto core : adj_core_physical) { - if (core.x == adj_core_physical.front().x) { - adj_core_physical_g1.push_back(core); - } else { - adj_core_physical_g2.push_back(core); - } - } - std::vector indices_g1(adj_core_physical_g1.size()); - std::vector indices_g2(adj_core_physical_g2.size()); - std::iota(indices_g1.begin(), indices_g1.end(), 0); - std::iota(indices_g2.begin(), indices_g2.end(), 0); - std::sort(indices_g1.begin(), indices_g1.end(), [&adj_core_physical_g1](int i1, int i2) { - return adj_core_physical_g1[i1].y < adj_core_physical_g1[i2].y; - 
}); - std::sort(indices_g2.begin(), indices_g2.end(), [&adj_core_physical_g2](int i1, int i2) { - return adj_core_physical_g2[i1].y < adj_core_physical_g2[i2].y; - }); - std::rotate(indices_g1.begin(), indices_g1.end() - 1, indices_g1.end()); - std::rotate(indices_g2.begin(), indices_g2.end() - 1, indices_g2.end()); - - std::vector indices_g1_realloc(adj_core_physical_g1.size()); - std::vector indices_g2_realloc(adj_core_physical_g2.size()); - for (int new_index = 0; new_index < indices_g1.size(); ++new_index) { - indices_g1_realloc[indices_g1[new_index]] = new_index; - } - for (int new_index = 0; new_index < indices_g2.size(); ++new_index) { - indices_g2_realloc[indices_g2[new_index]] = new_index; - } - - std::sort(adj_core_physical_g1.begin(), adj_core_physical_g1.end(), [](const CoreCoord& a, const CoreCoord& b) { - return a.y < b.y; - }); - std::sort(adj_core_physical_g2.begin(), adj_core_physical_g2.end(), [](const CoreCoord& a, const CoreCoord& b) { - return a.y < b.y; - }); - std::rotate(adj_core_physical_g1.begin(), adj_core_physical_g1.end() - 1, adj_core_physical_g1.end()); - std::rotate(adj_core_physical_g2.begin(), adj_core_physical_g2.end() - 1, adj_core_physical_g2.end()); - - for (auto core : adj_core_physical_g1) { - adj_core_physical_y_g1.push_back(core.y); - } - for (auto core : adj_core_physical_g2) { - adj_core_physical_y_g2.push_back(core.y); - } - - // move the workers, if they are on harvested rows - auto process_group = [&](std::vector& group, std::vector& group_y, uint32_t x_step) { - for (auto& coord : group) { - auto y = coord.y; - - if (std::find(harvested_rows.begin(), harvested_rows.end(), y) != harvested_rows.end() || - std::count(group_y.begin(), group_y.end(), y) >= 2) { - auto adjust_coord = [&](int start, int end, int step) { - bool found_new_row = false; - for (int j = start; step > 0 ? 
j <= end : j >= end; j += step) { - if (std::find(harvested_rows.begin(), harvested_rows.end(), j) == harvested_rows.end() && - std::count(group_y.begin(), group_y.end(), j) == 0) { - coord.y = j; - coord.x += x_step; - x_step--; - found_new_row = true; - break; - } - } - if (not found_new_row) { - for (int j = start; step > 0 ? j <= end : j >= end; j += step) { - if (std::find(harvested_rows.begin(), harvested_rows.end(), j) == harvested_rows.end()) { - coord.y = j; - coord.x += x_step; - x_step--; - found_new_row = true; - break; - } - } - } - }; - - if (y >= max_bank_id) { - adjust_coord(max_worker_y_physical, min_worker_y_physical, -1); - } else { - adjust_coord(min_worker_y_physical, max_worker_y_physical, 1); - } - } - } - }; - // move the workers, if they are on harvested rows - process_group(adj_core_physical_g1, adj_core_physical_y_g1, x_step); - process_group(adj_core_physical_g2, adj_core_physical_y_g2, x_step); - - // merge two group into one - std::vector adj_core_physical_realloc; - adj_core_physical_realloc.reserve(num_banks); - for (int i = 0; i < indices_g1_realloc.size(); ++i) { - adj_core_physical_realloc.push_back(adj_core_physical_g1[indices_g1_realloc[i]]); - } - for (int i = 0; i < indices_g2_realloc.size(); ++i) { - adj_core_physical_realloc.push_back(adj_core_physical_g2[indices_g2_realloc[i]]); - } - - // find the logical coord from physical coord - std::vector adj_core_logical_realloc; - adj_core_logical_realloc.reserve(num_banks); - for (int i = 0; i < adj_core_physical_realloc.size(); ++i) { - for (int j = 0; j < all_worker_cores_logical.size(); ++j) { - auto core = device->worker_core_from_logical_core(all_worker_cores_logical[j]); - if (adj_core_physical_realloc[i] == core) { - adj_core_logical_realloc.push_back(all_worker_cores_logical[j]); - } - } - } - - // create sets +void get_optimal_dram_bank_to_reader_assignment( + Device* device, std::vector& all_worker_cores_ordered, CoreRangeSet& all_worker_cores) { + 
all_worker_cores_ordered = device->get_optimal_dram_bank_to_logical_worker_assignment(); std::set all_cores_set; - for (int i = 0; i < num_banks; ++i) { - all_cores_set.insert(CoreRange(adj_core_logical_realloc[i])); + for (const auto& worker_core : all_worker_cores_ordered) { + all_cores_set.insert(CoreRange(worker_core)); } - all_cores = CoreRangeSet(all_cores_set); - all_cores_ordered = adj_core_logical_realloc; + all_worker_cores = CoreRangeSet(all_cores_set); } int main(int argc, char** argv) { @@ -640,18 +361,14 @@ int main(int argc, char** argv) { CoreRangeSet all_cores; std::vector all_cores_list; - if (device->arch() == tt::ARCH::GRAYSKULL) { - get_dram_reader_core_coords_grayskull(device, all_cores, all_cores_list); - } else { - get_dram_reader_core_coords(device, all_cores, all_cores_list); - } + get_optimal_dram_bank_to_reader_assignment(device, all_cores_list, all_cores); uint32_t num_tiles_per_core = num_tiles / num_cores; uint32_t num_tiles_cb = num_tiles_per_core / num_blocks; for (auto core : all_cores_list) { - auto phys_core = device->worker_core_from_logical_core(core); - log_info("logical core: {}, physical coer: {}", core, phys_core); + auto virtual_core = device->worker_core_from_logical_core(core); + log_info("logical core: {}, virtual core: {}", core, virtual_core); } log_info( diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/test_dram_read_l1_write.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/test_dram_read_l1_write.cpp index 03d8cce586b0..d5abe69e1a6c 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/test_dram_read_l1_write.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/test_dram_read_l1_write.cpp @@ -336,453 +336,59 @@ uint32_t get_dram_bandwidth(tt::ARCH arch) { return dram_bandwidth_gb_per_sec; } -void get_dram_reader_core_coords_blackhole( - tt_metal::Device* 
device, CoreRangeSet& all_cores, std::vector& all_cores_ordered) { - const metal_SocDescriptor& soc_d = tt::Cluster::instance().get_soc_desc(device->id()); - uint32_t full_grid_size_x = soc_d.grid_size.x; - - // get all the logical coord - auto compute_with_storage_grid_size = device->compute_with_storage_grid_size(); - uint32_t num_cores_x = compute_with_storage_grid_size.x; - uint32_t num_cores_y = compute_with_storage_grid_size.y; - - // get dram banks and coords - uint32_t num_banks = device->num_dram_channels(); - uint32_t max_bank_id = num_banks - 1; - std::vector dram_coord_phy; - for (int i = 0; i < num_banks; ++i) { - dram_coord_phy.push_back(device->dram_core_from_dram_channel(i)); - } - - // get worker logical coords - std::vector all_worker_cores_logical; - for (int i = 0; i < num_cores_x; ++i) { - for (int j = 0; j < num_cores_y; ++j) { - all_worker_cores_logical.push_back(CoreCoord(i, j)); - } - } - - // get x coords of the workers - std::vector all_worker_cores_x_physical; - for (int i = 0; i < num_cores_x; ++i) { - auto core_phy = device->worker_core_from_logical_core(CoreCoord(i, 0)); - all_worker_cores_x_physical.push_back(core_phy.x); - } - - // get the harvested rows, we treat dram and eth cores as harvested as well - std::vector harvested_cols; - for (int i = 0; i < full_grid_size_x; ++i) { - auto x = i; - - if (std::find(all_worker_cores_x_physical.begin(), all_worker_cores_x_physical.end(), x) == - all_worker_cores_x_physical.end()) { - harvested_cols.push_back(x); - } - } - - // get the ajacent cores of DRAM banks - std::vector adj_core_physical; - for (int i = 0; i < num_banks; ++i) { - auto dram_core = dram_coord_phy[i]; - uint32_t adj_core_x = dram_core.x + 1; - uint32_t adj_core_y = dram_core.y; - adj_core_physical.push_back(CoreCoord(adj_core_x, adj_core_y)); - } - - // move worker if they are in the harvested cols - for (auto& coord : adj_core_physical) { - auto x = coord.x; - - // if row is harvested, move core down by 1 - while 
(std::find(harvested_cols.begin(), harvested_cols.end(), x) != harvested_cols.end() and - x < (full_grid_size_x - 1)) { - x += 1; - } - - coord.x = x; - } - - // find the logical coord from physical coord - std::vector adj_core_logical_realloc; - for (int i = 0; i < adj_core_physical.size(); ++i) { - for (int j = 0; j < all_worker_cores_logical.size(); ++j) { - auto core = device->worker_core_from_logical_core(all_worker_cores_logical[j]); - if (adj_core_physical[i] == core) { - adj_core_logical_realloc.push_back(all_worker_cores_logical[j]); - } - } - } - - // create sets +void get_optimal_dram_bank_to_reader_assignment( + Device* device, std::vector& all_worker_cores_ordered, CoreRangeSet& all_worker_cores) { + all_worker_cores_ordered = device->get_optimal_dram_bank_to_logical_worker_assignment(); std::set all_cores_set; - for (int i = 0; i < num_banks; ++i) { - all_cores_set.insert(CoreRange(adj_core_logical_realloc[i])); + for (const auto& worker_core : all_worker_cores_ordered) { + all_cores_set.insert(CoreRange(worker_core)); } - all_cores = CoreRangeSet(all_cores_set); - all_cores_ordered = adj_core_logical_realloc; + all_worker_cores = CoreRangeSet(all_cores_set); } -void get_l1_writer_core_coords_blackhole( - tt_metal::Device* device, - std::vector& all_dram_reader_cores, - CoreRangeSet& all_cores, - std::vector& all_cores_ordered) { - const metal_SocDescriptor& soc_d = tt::Cluster::instance().get_soc_desc(device->id()); - uint32_t full_grid_size_x = soc_d.grid_size.x; - - // get all the logical coord - auto compute_with_storage_grid_size = device->compute_with_storage_grid_size(); - uint32_t num_cores_x = compute_with_storage_grid_size.x; - uint32_t num_cores_y = compute_with_storage_grid_size.y; - - // get worker logical coords - std::vector all_worker_cores_logical; - for (int i = 0; i < num_cores_x; ++i) { - for (int j = 0; j < num_cores_y; ++j) { - all_worker_cores_logical.push_back(CoreCoord(i, j)); - } - } - - // get x coords of the workers - 
std::vector all_worker_cores_x_physical; - for (int i = 0; i < num_cores_x; ++i) { - auto core_phy = device->worker_core_from_logical_core(CoreCoord(i, 0)); - all_worker_cores_x_physical.push_back(core_phy.x); - } - - // get the harvested rows, we treat dram and eth cores as harvested as well - std::vector harvested_cols; - for (int i = 0; i < full_grid_size_x; ++i) { - auto x = i; - - if (std::find(all_worker_cores_x_physical.begin(), all_worker_cores_x_physical.end(), x) == - all_worker_cores_x_physical.end()) { - harvested_cols.push_back(x); - } - } - - // get the ajacent cores of DRAM readers, for grayskull the l1 writers are below DRAM readers - std::vector adj_core_physical; +void get_l1_writer_core_coords_wormhole_b0( + std::vector& all_dram_reader_cores, CoreRangeSet& all_cores, std::vector& all_cores_ordered) { + // Place writers horizontally next to DRAM readers in logical space (no column harvesting for WH) for (int i = 0; i < all_dram_reader_cores.size(); ++i) { auto dram_reader_core = all_dram_reader_cores[i]; - auto dram_reader_core_phy = device->worker_core_from_logical_core(dram_reader_core); - uint32_t adj_core_x = dram_reader_core_phy.x + 1; - uint32_t adj_core_y = dram_reader_core_phy.y; - adj_core_physical.push_back(CoreCoord(adj_core_x, adj_core_y)); - uint32_t adj_core_x2 = dram_reader_core_phy.x + 2; - uint32_t adj_core_y2 = dram_reader_core_phy.y; - adj_core_physical.push_back(CoreCoord(adj_core_x2, adj_core_y2)); + all_cores_ordered.push_back(CoreCoord(dram_reader_core.x + 1, dram_reader_core.y)); + all_cores_ordered.push_back(CoreCoord(dram_reader_core.x + 2, dram_reader_core.y)); } - - // move worker if they are in the harvested rows - for (auto& coord : adj_core_physical) { - auto x = coord.x; - - // if row is harvested, move core down by 1 - while (std::find(harvested_cols.begin(), harvested_cols.end(), x) != harvested_cols.end() and - x < (full_grid_size_x - 1)) { - x += 1; - } - - coord.x = x; - } - - // find the logical coord from 
physical coord - std::vector adj_core_logical_realloc; - for (int i = 0; i < adj_core_physical.size(); ++i) { - for (int j = 0; j < all_worker_cores_logical.size(); ++j) { - auto core = device->worker_core_from_logical_core(all_worker_cores_logical[j]); - if (adj_core_physical[i] == core) { - adj_core_logical_realloc.push_back(all_worker_cores_logical[j]); - } - } - } - - // create sets std::set all_cores_set; - for (int i = 0; i < adj_core_logical_realloc.size(); ++i) { - all_cores_set.insert(CoreRange(adj_core_logical_realloc[i])); + for (int i = 0; i < all_cores_ordered.size(); ++i) { + all_cores_set.insert(CoreRange(all_cores_ordered[i])); } all_cores = CoreRangeSet(all_cores_set); - all_cores_ordered = adj_core_logical_realloc; } -void get_dram_reader_core_coords_grayskull( - tt_metal::Device* device, CoreRangeSet& all_cores, std::vector& all_cores_ordered) { - const metal_SocDescriptor& soc_d = tt::Cluster::instance().get_soc_desc(device->id()); - uint32_t full_grid_size_y = soc_d.grid_size.y; - - // get all the logical coord - auto compute_with_storage_grid_size = device->compute_with_storage_grid_size(); - uint32_t num_cores_x = compute_with_storage_grid_size.x; - uint32_t num_cores_y = compute_with_storage_grid_size.y; - - // get dram banks and coords - uint32_t num_banks = device->num_dram_channels(); - uint32_t max_bank_id = num_banks - 1; - std::vector dram_coord_phy; - for (int i = 0; i < num_banks; ++i) { - dram_coord_phy.push_back(device->dram_core_from_dram_channel(i)); - } - - // get worker logical coords - std::vector all_worker_cores_logical; - for (int i = 0; i < num_cores_x; ++i) { - for (int j = 0; j < num_cores_y; ++j) { - all_worker_cores_logical.push_back(CoreCoord(i, j)); - } - } - - // get y coords of the workers - std::vector all_worker_cores_y_physical; - for (int i = 0; i < num_cores_y; ++i) { - auto core_phy = device->worker_core_from_logical_core(CoreCoord(0, i)); - all_worker_cores_y_physical.push_back(core_phy.y); - } - - // get 
the harvested rows, we treat dram and eth cores as harvested as well - std::vector harvested_rows; - for (int i = 0; i < full_grid_size_y; ++i) { - auto y = i; - - if (std::find(all_worker_cores_y_physical.begin(), all_worker_cores_y_physical.end(), y) == - all_worker_cores_y_physical.end()) { - harvested_rows.push_back(y); - } - } - - // get the ajacent cores of DRAM banks - std::vector adj_core_physical; - for (int i = 0; i < num_banks; ++i) { - auto dram_core = dram_coord_phy[i]; - uint32_t adj_core_x = dram_core.x; - uint32_t adj_core_y = dram_core.y + 1; - adj_core_physical.push_back(CoreCoord(adj_core_x, adj_core_y)); - } - - // move worker if they are in the harvested rows - for (auto& coord : adj_core_physical) { - auto y = coord.y; - - // if row is harvested, move core down by 1 - while (std::find(harvested_rows.begin(), harvested_rows.end(), y) != harvested_rows.end() and - y < (full_grid_size_y - 1)) { - y += 1; - } - - coord.y = y; - } - - // find the logical coord from physical coord - std::vector adj_core_logical_realloc; - for (int i = 0; i < adj_core_physical.size(); ++i) { - for (int j = 0; j < all_worker_cores_logical.size(); ++j) { - auto core = device->worker_core_from_logical_core(all_worker_cores_logical[j]); - if (adj_core_physical[i] == core) { - adj_core_logical_realloc.push_back(all_worker_cores_logical[j]); - } - } - } - - // create sets - std::set all_cores_set; - for (int i = 0; i < num_banks; ++i) { - all_cores_set.insert(CoreRange(adj_core_logical_realloc[i])); - } - all_cores = CoreRangeSet(all_cores_set); - all_cores_ordered = adj_core_logical_realloc; -} - -void get_l1_writer_core_coords_grayskull( - tt_metal::Device* device, - std::vector& all_dram_reader_cores, - CoreRangeSet& all_cores, - std::vector& all_cores_ordered) { - const metal_SocDescriptor& soc_d = tt::Cluster::instance().get_soc_desc(device->id()); - uint32_t full_grid_size_y = soc_d.grid_size.y; - - // get all the logical coord - auto compute_with_storage_grid_size = 
device->compute_with_storage_grid_size(); - uint32_t num_cores_x = compute_with_storage_grid_size.x; - uint32_t num_cores_y = compute_with_storage_grid_size.y; - - // get worker logical coords - std::vector all_worker_cores_logical; - for (int i = 0; i < num_cores_x; ++i) { - for (int j = 0; j < num_cores_y; ++j) { - all_worker_cores_logical.push_back(CoreCoord(i, j)); - } - } - - // get y coords of the workers - std::vector all_worker_cores_y_physical; - for (int i = 0; i < num_cores_y; ++i) { - auto core_phy = device->worker_core_from_logical_core(CoreCoord(0, i)); - all_worker_cores_y_physical.push_back(core_phy.y); - } - - // get the harvested rows, we treat dram and eth cores as harvested as well - std::vector harvested_rows; - for (int i = 0; i < full_grid_size_y; ++i) { - auto y = i; - - if (std::find(all_worker_cores_y_physical.begin(), all_worker_cores_y_physical.end(), y) == - all_worker_cores_y_physical.end()) { - harvested_rows.push_back(y); - } - } - - // get the ajacent cores of DRAM readers, for grayskull the l1 writers are below DRAM readers - std::vector adj_core_physical; +void get_l1_writer_core_coords_blackhole( + std::vector& all_dram_reader_cores, CoreRangeSet& all_cores, std::vector& all_cores_ordered) { + // Place writers horizontally next to DRAM readers in logical space (column harvesting enabled for BH incrementing + // in logical space can lead to physical physical columns being skipped when placing writers next to readers) for (int i = 0; i < all_dram_reader_cores.size(); ++i) { auto dram_reader_core = all_dram_reader_cores[i]; - auto dram_reader_core_phy = device->worker_core_from_logical_core(dram_reader_core); - uint32_t adj_core_x = dram_reader_core_phy.x; - uint32_t adj_core_y = dram_reader_core_phy.y + 1; - adj_core_physical.push_back(CoreCoord(adj_core_x, adj_core_y)); - uint32_t adj_core_x2 = dram_reader_core_phy.x + 1; - uint32_t adj_core_y2 = dram_reader_core_phy.y + 1; - adj_core_physical.push_back(CoreCoord(adj_core_x2, 
adj_core_y2)); - } - - // move worker if they are in the harvested rows - for (auto& coord : adj_core_physical) { - auto y = coord.y; - - // if row is harvested, move core down by 1 - while (std::find(harvested_rows.begin(), harvested_rows.end(), y) != harvested_rows.end() and - y < (full_grid_size_y - 1)) { - y += 1; - } - - coord.y = y; - } - - // find the logical coord from physical coord - std::vector adj_core_logical_realloc; - for (int i = 0; i < adj_core_physical.size(); ++i) { - for (int j = 0; j < all_worker_cores_logical.size(); ++j) { - auto core = device->worker_core_from_logical_core(all_worker_cores_logical[j]); - if (adj_core_physical[i] == core) { - adj_core_logical_realloc.push_back(all_worker_cores_logical[j]); - } - } + all_cores_ordered.push_back(CoreCoord(dram_reader_core.x + 1, dram_reader_core.y)); + all_cores_ordered.push_back(CoreCoord(dram_reader_core.x + 2, dram_reader_core.y)); } - - // create sets std::set all_cores_set; - for (int i = 0; i < adj_core_logical_realloc.size(); ++i) { - all_cores_set.insert(CoreRange(adj_core_logical_realloc[i])); + for (int i = 0; i < all_cores_ordered.size(); ++i) { + all_cores_set.insert(CoreRange(all_cores_ordered[i])); } all_cores = CoreRangeSet(all_cores_set); - all_cores_ordered = adj_core_logical_realloc; } -void get_dram_reader_core_coords_wormhole_b0( - tt_metal::Device* device, CoreRangeSet& all_cores, std::vector& all_cores_ordered) { - // get all the logical coord - auto compute_with_storage_grid_size = device->compute_with_storage_grid_size(); - uint32_t num_cores_x = compute_with_storage_grid_size.x; - uint32_t num_cores_y = compute_with_storage_grid_size.y; - - // get dram banks and coords - uint32_t num_banks = device->num_dram_channels(); - uint32_t max_bank_id = num_banks - 1; - std::vector dram_coord_phy; - dram_coord_phy.reserve(num_banks); - for (int i = 0; i < num_banks; ++i) { - dram_coord_phy.push_back(device->dram_core_from_dram_channel(i)); - } - - // get worker logical coords - 
std::vector all_worker_cores_logical; - all_worker_cores_logical.reserve(num_cores_x * num_cores_y); - for (int i = 0; i < num_cores_x; ++i) { - for (int j = 0; j < num_cores_y; ++j) { - all_worker_cores_logical.push_back(CoreCoord(i, j)); - } - } - - // get the ajacent cores of DRAM banks - std::vector adj_core_physical; - adj_core_physical.reserve(num_banks); - for (int i = 0; i < num_banks; ++i) { - auto dram_core = dram_coord_phy[i]; - uint32_t adj_core_x = dram_core.x + 1; - uint32_t adj_core_y = dram_core.y; - adj_core_physical.push_back(CoreCoord(adj_core_x, adj_core_y)); - } - - // find the logical coord from physical coord - std::vector adj_core_logical; - adj_core_logical.reserve(num_banks); - for (int i = 0; i < adj_core_physical.size(); ++i) { - for (int j = 0; j < all_worker_cores_logical.size(); ++j) { - auto core = device->worker_core_from_logical_core(all_worker_cores_logical[j]); - if (adj_core_physical[i] == core) { - adj_core_logical.push_back(all_worker_cores_logical[j]); - } - } - } - - // create sets - std::set all_cores_set; - for (int i = 0; i < num_banks; ++i) { - all_cores_set.insert(CoreRange(adj_core_logical[i])); - } - all_cores = CoreRangeSet(all_cores_set); - all_cores_ordered = adj_core_logical; -} - -void get_l1_writer_core_coords_wormhole_b0( - tt_metal::Device* device, - std::vector& all_dram_reader_cores, - CoreRangeSet& all_cores, - std::vector& all_cores_ordered) { - // get all the logical coord - auto compute_with_storage_grid_size = device->compute_with_storage_grid_size(); - uint32_t num_cores_x = compute_with_storage_grid_size.x; - uint32_t num_cores_y = compute_with_storage_grid_size.y; - - // get worker logical coords - std::vector all_worker_cores_logical; - for (int i = 0; i < num_cores_x; ++i) { - for (int j = 0; j < num_cores_y; ++j) { - all_worker_cores_logical.push_back(CoreCoord(i, j)); - } - } - - // get the ajacent cores of DRAM readers, for wormhole the l1 writers are on the left or right DRAM readers - 
std::vector adj_core_physical; +void get_l1_writer_core_coords_grayskull( + std::vector& all_dram_reader_cores, CoreRangeSet& all_cores, std::vector& all_cores_ordered) { for (int i = 0; i < all_dram_reader_cores.size(); ++i) { auto dram_reader_core = all_dram_reader_cores[i]; - auto dram_reader_core_phy = device->worker_core_from_logical_core(dram_reader_core); - uint32_t adj_core_x1 = dram_reader_core_phy.x + 1; - uint32_t adj_core_y1 = dram_reader_core_phy.y; - adj_core_physical.push_back(CoreCoord(adj_core_x1, adj_core_y1)); - uint32_t adj_core_x2 = dram_reader_core_phy.x + 2; - uint32_t adj_core_y2 = dram_reader_core_phy.y; - adj_core_physical.push_back(CoreCoord(adj_core_x2, adj_core_y2)); - } - - // find the logical coord from physical coord - std::vector adj_core_logical_realloc; - for (int i = 0; i < adj_core_physical.size(); ++i) { - for (int j = 0; j < all_worker_cores_logical.size(); ++j) { - auto core = device->worker_core_from_logical_core(all_worker_cores_logical[j]); - if (adj_core_physical[i] == core) { - adj_core_logical_realloc.push_back(all_worker_cores_logical[j]); - } - } + all_cores_ordered.push_back(CoreCoord(dram_reader_core.x, dram_reader_core.y + 1)); + all_cores_ordered.push_back(CoreCoord(dram_reader_core.x + 1, dram_reader_core.y + 1)); } - - // create sets std::set all_cores_set; - for (int i = 0; i < adj_core_logical_realloc.size(); ++i) { - all_cores_set.insert(CoreRange(adj_core_logical_realloc[i])); + for (int i = 0; i < all_cores_ordered.size(); ++i) { + all_cores_set.insert(CoreRange(all_cores_ordered[i])); } all_cores = CoreRangeSet(all_cores_set); - all_cores_ordered = adj_core_logical_realloc; } int main(int argc, char** argv) { @@ -804,39 +410,38 @@ int main(int argc, char** argv) { log_info("start DRAM benchmark"); + // try { + //////////////////////////////////////////////////////////////////////////// + // Initial Runtime Args Parse + //////////////////////////////////////////////////////////////////////////// + 
std::vector input_args(argv, argv + argc); try { - //////////////////////////////////////////////////////////////////////////// - // Initial Runtime Args Parse - //////////////////////////////////////////////////////////////////////////// - std::vector input_args(argv, argv + argc); - try { - std::tie(k, input_args) = test_args::get_command_option_uint64_and_remaining_args(input_args, "--k", 8192); + std::tie(k, input_args) = test_args::get_command_option_uint64_and_remaining_args(input_args, "--k", 8192); - std::tie(n, input_args) = - test_args::get_command_option_uint64_and_remaining_args(input_args, "--n", 12 * 128); + std::tie(n, input_args) = test_args::get_command_option_uint64_and_remaining_args(input_args, "--n", 12 * 128); - std::tie(num_blocks, input_args) = - test_args::get_command_option_uint64_and_remaining_args(input_args, "--num-blocks", 8); + std::tie(num_blocks, input_args) = + test_args::get_command_option_uint64_and_remaining_args(input_args, "--num-blocks", 8); - std::tie(num_tests, input_args) = - test_args::get_command_option_uint32_and_remaining_args(input_args, "--num-tests", 1); + std::tie(num_tests, input_args) = + test_args::get_command_option_uint32_and_remaining_args(input_args, "--num-tests", 1); - std::tie(use_device_profiler, input_args) = - test_args::has_command_option_and_remaining_args(input_args, "--use-device-profiler"); + std::tie(use_device_profiler, input_args) = + test_args::has_command_option_and_remaining_args(input_args, "--use-device-profiler"); - std::tie(bypass_check, input_args) = - test_args::has_command_option_and_remaining_args(input_args, "--bypass-check"); + std::tie(bypass_check, input_args) = + test_args::has_command_option_and_remaining_args(input_args, "--bypass-check"); - std::tie(df, input_args) = - test_args::get_command_option_uint32_and_remaining_args(input_args, "--data-type", 2); + std::tie(df, input_args) = + test_args::get_command_option_uint32_and_remaining_args(input_args, "--data-type", 2); - 
std::tie(num_banks, input_args) = - test_args::get_command_option_uint32_and_remaining_args(input_args, "--num-banks", 12); + std::tie(num_banks, input_args) = + test_args::get_command_option_uint32_and_remaining_args(input_args, "--num-banks", 12); - std::tie(bank_start_id, input_args) = - test_args::get_command_option_uint32_and_remaining_args(input_args, "--bank-start-id", 0); + std::tie(bank_start_id, input_args) = + test_args::get_command_option_uint32_and_remaining_args(input_args, "--bank-start-id", 0); - test_args::validate_remaining_args(input_args); + test_args::validate_remaining_args(input_args); } catch (const std::exception& e) { log_error(tt::LogTest, "Command line arguments found exception", e.what()); TT_ASSERT(false); @@ -901,7 +506,7 @@ int main(int argc, char** argv) { tt_metal::DispatchCoreConfig{tt_metal::DispatchCoreType::WORKER, tt_metal::DispatchCoreAxis::ROW}; } else { dispatch_core_config = - tt_metal::DispatchCoreConfig{tt_metal::DispatchCoreType::WORKER, tt_metal::DispatchCoreAxis::COL}; + tt_metal::DispatchCoreConfig{tt_metal::DispatchCoreType::WORKER, tt_metal::DispatchCoreAxis::ROW}; } tt_metal::Device* device = tt_metal::CreateDevice(device_id, 1, 0, 0, dispatch_core_config); dram_bandwidth_spec = get_dram_bandwidth(device->arch()); @@ -921,18 +526,17 @@ int main(int argc, char** argv) { std::vector all_dram_reader_cores_ordered; CoreRangeSet all_l1_receiver_cores; std::vector all_l1_writer_cores_ordered; + get_optimal_dram_bank_to_reader_assignment(device, all_dram_reader_cores_ordered, all_dram_reader_cores); + if (device->arch() == tt::ARCH::BLACKHOLE) { - get_dram_reader_core_coords_blackhole(device, all_dram_reader_cores, all_dram_reader_cores_ordered); get_l1_writer_core_coords_blackhole( - device, all_dram_reader_cores_ordered, all_l1_receiver_cores, all_l1_writer_cores_ordered); + all_dram_reader_cores_ordered, all_l1_receiver_cores, all_l1_writer_cores_ordered); } else if (device->arch() == tt::ARCH::WORMHOLE_B0) { - 
get_dram_reader_core_coords_wormhole_b0(device, all_dram_reader_cores, all_dram_reader_cores_ordered); get_l1_writer_core_coords_wormhole_b0( - device, all_dram_reader_cores_ordered, all_l1_receiver_cores, all_l1_writer_cores_ordered); + all_dram_reader_cores_ordered, all_l1_receiver_cores, all_l1_writer_cores_ordered); } else { - get_dram_reader_core_coords_grayskull(device, all_dram_reader_cores, all_dram_reader_cores_ordered); get_l1_writer_core_coords_grayskull( - device, all_dram_reader_cores_ordered, all_l1_receiver_cores, all_l1_writer_cores_ordered); + all_dram_reader_cores_ordered, all_l1_receiver_cores, all_l1_writer_cores_ordered); } uint32_t num_tiles_per_core = num_tiles / num_cores; @@ -941,12 +545,12 @@ int main(int argc, char** argv) { log_info("all_dram_reader_cores"); for (auto core : all_dram_reader_cores_ordered) { auto phys_core = device->worker_core_from_logical_core(core); - log_info("logical core: {}, physical core: {}", core, phys_core); + log_info("logical core: {}, virtual core: {}", core, phys_core); } log_info("all_l1_writer_cores"); for (auto core : all_l1_writer_cores_ordered) { auto phys_core = device->worker_core_from_logical_core(core); - log_info("logical core: {}, physical core: {}", core, phys_core); + log_info("logical core: {}, virtual core: {}", core, phys_core); } log_info( @@ -1048,29 +652,29 @@ int main(int argc, char** argv) { } pass &= tt_metal::CloseDevice(device); - } catch (const std::exception& e) { - pass = false; - // Capture the exception error message - log_error(LogTest, "{}", e.what()); - // Capture system call errors that may have returned from driver/kernel - log_error(LogTest, "System error message: {}", std::strerror(errno)); - } - - // Determine if it passes performance goal - auto avg_dram_bandwidth = calculate_average(dram_bandwidth); - if (pass && bypass_check == false) { - // goal is 90% of peak DRAM bandwidth performance - double target_bandwidth = static_cast(dram_bandwidth_spec) * 0.9; - if 
(avg_dram_bandwidth < target_bandwidth) { - pass = false; - log_error( - LogTest, - "The DRAM bandwidth does not meet the criteria. " - "Current: {:.3f}GB/s, goal: {:.3f}GB/s", - avg_dram_bandwidth, - target_bandwidth); + // } catch (const std::exception& e) { + // pass = false; + // // Capture the exception error message + // log_error(LogTest, "{}", e.what()); + // // Capture system call errors that may have returned from driver/kernel + // log_error(LogTest, "System error message: {}", std::strerror(errno)); + // } + + // Determine if it passes performance goal + auto avg_dram_bandwidth = calculate_average(dram_bandwidth); + if (pass && bypass_check == false) { + // goal is 90% of peak DRAM bandwidth performance + double target_bandwidth = static_cast(dram_bandwidth_spec) * 0.9; + if (avg_dram_bandwidth < target_bandwidth) { + pass = false; + log_error( + LogTest, + "The DRAM bandwidth does not meet the criteria. " + "Current: {:.3f}GB/s, goal: {:.3f}GB/s", + avg_dram_bandwidth, + target_bandwidth); + } } - } if (pass) { log_info(LogTest, "Test Passed"); diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h index 5edfc3fa4b10..08661f7d616c 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h @@ -138,7 +138,7 @@ DeviceData::DeviceData( auto num_banks = device->num_banks(BufferType::DRAM); for (int bank_id = 0; bank_id < num_banks; bank_id++) { auto dram_channel = device->dram_channel_from_bank_id(bank_id); - CoreCoord phys_core = device->dram_core_from_dram_channel(dram_channel); + CoreCoord phys_core = device->logical_core_from_dram_channel(dram_channel); int32_t bank_offset = device->bank_offset(BufferType::DRAM, bank_id); this->all_data[phys_core][bank_id] = one_core_data_t(); this->all_data[phys_core][bank_id].logical_core = phys_core; @@ -187,7 +187,7 @@ void 
DeviceData::prepopulate_dram(Device* device, uint32_t size_words) { for (int bank_id = 0; bank_id < num_dram_banks; bank_id++) { auto offset = device->bank_offset(BufferType::DRAM, bank_id); auto dram_channel = device->dram_channel_from_bank_id(bank_id); - auto bank_core = device->dram_core_from_dram_channel(dram_channel); + auto bank_core = device->logical_core_from_dram_channel(dram_channel); one_core_data_t& data = this->all_data[bank_core][bank_id]; // Generate random or coherent data per bank of specific size. @@ -212,12 +212,7 @@ void DeviceData::prepopulate_dram(Device* device, uint32_t size_words) { } // Write to device once per bank (appropriate core and offset) - tt::Cluster::instance().write_core( - static_cast(&data.data[0]), - data.data.size() * sizeof(uint32_t), - tt_cxy_pair(device->id(), bank_core), - this->base_data_addr[static_cast(CoreType::DRAM)] + offset); - ; + tt::tt_metal::detail::WriteToDeviceDRAMChannel(device, bank_id, this->base_data_addr[static_cast(CoreType::DRAM)], data.data); this->base_result_data_addr[static_cast(CoreType::DRAM)] = this->base_data_addr[static_cast(CoreType::DRAM)] + data.data.size() * sizeof(uint32_t); @@ -386,8 +381,13 @@ inline bool DeviceData::validate_one_core( } // Read results from device and compare to expected for this core. 
- result_addr += bank_offset; - std::vector results = tt::llrt::read_hex_vec_from_core(device->id(), phys_core, result_addr, size_bytes); + std::vector results; + if (core_type == CoreType::DRAM) { + tt::tt_metal::detail::ReadFromDeviceDRAMChannel(device, bank_id, result_addr, size_bytes, results); + } else { + result_addr += bank_offset; + results = tt::llrt::read_hex_vec_from_core(device->id(), phys_core, result_addr, size_bytes); + } log_info( tt::LogTest, @@ -534,28 +534,22 @@ void configure_kernel_variant( NOC my_noc_index, NOC upstream_noc_index, NOC downstream_noc_index) { - const auto& grid_size = device->grid_size(); + auto my_virtual_noc_coords = device->virtual_noc_coordinate(my_noc_index, phys_my_core); + auto upstream_virtual_noc_coords = device->virtual_noc_coordinate(upstream_noc_index, phys_upstream_core); + auto downstream_virtual_noc_coords = device->virtual_noc_coordinate(downstream_noc_index, phys_downstream_core); std::map defines = { - {"MY_NOC_X", std::to_string(tt::tt_metal::hal.noc_coordinate(my_noc_index, grid_size.x, phys_my_core.x))}, - {"MY_NOC_Y", std::to_string(tt::tt_metal::hal.noc_coordinate(my_noc_index, grid_size.y, phys_my_core.y))}, + {"DISPATCH_KERNEL", "1"}, + {"MY_NOC_X", std::to_string(my_virtual_noc_coords.x)}, + {"MY_NOC_Y", std::to_string(my_virtual_noc_coords.y)}, {"UPSTREAM_NOC_INDEX", std::to_string(upstream_noc_index)}, - {"UPSTREAM_NOC_X", - std::to_string(tt::tt_metal::hal.noc_coordinate(upstream_noc_index, grid_size.x, phys_upstream_core.x))}, - {"UPSTREAM_NOC_Y", - std::to_string(tt::tt_metal::hal.noc_coordinate(upstream_noc_index, grid_size.y, phys_upstream_core.y))}, - {"DOWNSTREAM_NOC_X", - std::to_string(tt::tt_metal::hal.noc_coordinate(downstream_noc_index, grid_size.x, phys_downstream_core.x))}, - {"DOWNSTREAM_NOC_Y", - std::to_string(tt::tt_metal::hal.noc_coordinate(downstream_noc_index, grid_size.y, phys_downstream_core.y))}, - {"DOWNSTREAM_SLAVE_NOC_X", - 
std::to_string(tt::tt_metal::hal.noc_coordinate(downstream_noc_index, grid_size.x, 0xff))}, - {"DOWNSTREAM_SLAVE_NOC_Y", - std::to_string(tt::tt_metal::hal.noc_coordinate( - downstream_noc_index, - grid_size.y, - 0xff))}, // todo, add testing with dispatch_s once it processes more than go signals - {"FD_CORE_TYPE", std::to_string(0)}, // todo, support dispatch on eth + {"UPSTREAM_NOC_X", std::to_string(upstream_virtual_noc_coords.x)}, + {"UPSTREAM_NOC_Y", std::to_string(upstream_virtual_noc_coords.y)}, + {"DOWNSTREAM_NOC_X", std::to_string(downstream_virtual_noc_coords.x)}, + {"DOWNSTREAM_NOC_Y", std::to_string(downstream_virtual_noc_coords.y)}, + {"DOWNSTREAM_SLAVE_NOC_X", std::to_string(0xff)}, + {"DOWNSTREAM_SLAVE_NOC_Y", std::to_string(0xff)}, // todo, add dispatch_s testing + {"FD_CORE_TYPE", std::to_string(0)}, // todo, support dispatch on eth }; compile_args.push_back(is_dram_variant); compile_args.push_back(is_host_variant); @@ -663,7 +657,7 @@ inline void generate_random_paged_payload( if (is_dram) { auto dram_channel = device->dram_channel_from_bank_id(bank_id); - bank_core = device->dram_core_from_dram_channel(dram_channel); + bank_core = device->logical_core_from_dram_channel(dram_channel); } else { bank_core = device->logical_core_from_bank_id(bank_id); } @@ -913,8 +907,8 @@ inline void gen_dispatcher_multicast_write_cmd( CQDispatchCmd cmd; memset(&cmd, 0, sizeof(CQDispatchCmd)); - CoreCoord physical_start = device->physical_core_from_logical_core(worker_core_range.start_coord, CoreType::WORKER); - CoreCoord physical_end = device->physical_core_from_logical_core(worker_core_range.end_coord, CoreType::WORKER); + CoreCoord physical_start = device->worker_core_from_logical_core(worker_core_range.start_coord); + CoreCoord physical_end = device->worker_core_from_logical_core(worker_core_range.end_coord); const uint32_t bank_id = 0; // No interleaved pages here. 
cmd.base.cmd_id = CQ_DISPATCH_CMD_WRITE_LINEAR; @@ -1119,8 +1113,8 @@ inline bool gen_rnd_dispatcher_packed_write_large_cmd( device_data.relevel(range); CQDispatchWritePackedLargeSubCmd sub_cmd; - CoreCoord physical_start = device->physical_core_from_logical_core(range.start_coord, CoreType::WORKER); - CoreCoord physical_end = device->physical_core_from_logical_core(range.end_coord, CoreType::WORKER); + CoreCoord physical_start = device->worker_core_from_logical_core(range.start_coord); + CoreCoord physical_end = device->worker_core_from_logical_core(range.end_coord); sub_cmd.noc_xy_addr = NOC_MULTICAST_ENCODING(physical_start.x, physical_start.y, physical_end.x, physical_end.y); sub_cmd.addr = device_data.get_result_data_addr(range.start_coord); diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp index 030d3597b56d..2c1409ab608e 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp @@ -219,7 +219,7 @@ int main(int argc, char** argv) { } break; case 2: { src_mem = test_write ? 
"TO_L1" : "FROM_L1"; - CoreCoord w = device->physical_core_from_logical_core(src_worker_g, CoreType::WORKER); + CoreCoord w = device->worker_core_from_logical_core(src_worker_g); noc_addr_x = w.x; noc_addr_y = w.y; } break; @@ -233,7 +233,7 @@ int main(int argc, char** argv) { case 4: { src_mem = "FROM_L1_TO_HOST"; log_info(LogTest, "Host bw test overriding page_count to 1"); - CoreCoord w = device->physical_core_from_logical_core(src_worker_g, CoreType::WORKER); + CoreCoord w = device->worker_core_from_logical_core(src_worker_g); page_count_g = 1; noc_addr_x = w.x; noc_addr_y = w.y; @@ -241,7 +241,7 @@ int main(int argc, char** argv) { case 5: { src_mem = "FROM_HOST_TO_L1"; log_info(LogTest, "Host bw test overriding page_count to 1"); - CoreCoord w = device->physical_core_from_logical_core(src_worker_g, CoreType::WORKER); + CoreCoord w = device->worker_core_from_logical_core(src_worker_g); page_count_g = 1; noc_addr_x = w.x; noc_addr_y = w.y; @@ -249,10 +249,8 @@ int main(int argc, char** argv) { case 6: { src_mem = "FROM_L1_TO_MCAST"; issue_mcast = 1; - CoreCoord start = - device->physical_core_from_logical_core(mcast_src_workers_g.start_coord, CoreType::WORKER); - CoreCoord end = - device->physical_core_from_logical_core(mcast_src_workers_g.end_coord, CoreType::WORKER); + CoreCoord start = device->worker_core_from_logical_core(mcast_src_workers_g.start_coord); + CoreCoord end = device->worker_core_from_logical_core(mcast_src_workers_g.end_coord); noc_addr_x = start.x; noc_addr_y = start.y; mcast_noc_addr_end_x = end.x; @@ -299,7 +297,7 @@ int main(int argc, char** argv) { std::shared_ptr sync_event = std::make_shared(); - CoreCoord w = device->physical_core_from_logical_core(worker_g.start_coord, CoreType::WORKER); + CoreCoord w = device->worker_core_from_logical_core(worker_g.start_coord); log_info(LogTest, "Master core: {}", w.str()); string direction = test_write ? 
"Writing" : "Reading"; if (source_mem_g == 3) { diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_dispatcher.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_dispatcher.cpp index e0b477046414..d22dd0bb90ee 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_dispatcher.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_dispatcher.cpp @@ -390,21 +390,12 @@ void initialize_dram_banks(Device* device) { auto fill = std::vector(bank_size / sizeof(uint32_t), 0xBADDF00D); for (int bank_id = 0; bank_id < num_banks; bank_id++) { - auto offset = device->bank_offset(BufferType::DRAM, bank_id); - auto dram_channel = device->dram_channel_from_bank_id(bank_id); - auto bank_core = device->dram_core_from_dram_channel(dram_channel); log_info( tt::LogTest, - "Initializing DRAM {} bytes for bank_id: {} core: {} at addr: 0x{:x}", + "Initializing DRAM {} bytes for bank_id: {}", bank_size, - bank_id, - bank_core, - offset); - tt::Cluster::instance().write_core( - static_cast(fill.data()), - fill.size() * sizeof(uint32_t), - tt_cxy_pair(device->id(), bank_core), - offset); + bank_id); + tt::tt_metal::detail::WriteToDeviceDRAMChannel(device, bank_id, 0, fill); } } diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp index 5d482863c55f..1b6748e7768f 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp @@ -300,13 +300,12 @@ void add_prefetcher_paged_read_cmd( add_bare_prefetcher_cmd(cmds, cmd, true); } -void add_prefetcher_linear_read_cmd( - Device* device, - vector& cmds, - vector& sizes, - CoreCoord worker_core, - uint32_t addr, - uint32_t length) { +void add_prefetcher_linear_read_cmd(Device *device, + vector& cmds, + vector& sizes, + CoreCoord worker_core, + uint32_t addr, + uint32_t length) { 
CoreCoord phys_worker_core = device->worker_core_from_logical_core(worker_core); CQPrefetchCmd cmd; @@ -444,7 +443,7 @@ void add_paged_dram_data_to_device_data( for (uint32_t page_idx = start_page; page_idx < last_page; page_idx++) { uint32_t dram_bank_id = page_idx % num_dram_banks_g; auto dram_channel = device->dram_channel_from_bank_id(dram_bank_id); - CoreCoord bank_core = device->dram_core_from_dram_channel(dram_channel); + CoreCoord bank_core = device->logical_core_from_dram_channel(dram_channel); uint32_t bank_offset = base_addr_words + page_size_words * (page_idx / num_dram_banks_g); if (page_idx == last_page - 1) { @@ -500,7 +499,7 @@ void gen_dram_packed_read_cmd( for (uint32_t i = 0; i < length_words; i += page_size_words) { uint32_t dram_bank_id = page_idx % num_dram_banks_g; auto dram_channel = device->dram_channel_from_bank_id(dram_bank_id); - CoreCoord bank_core = device->dram_core_from_dram_channel(dram_channel); + CoreCoord bank_core = device->logical_core_from_dram_channel(dram_channel); uint32_t bank_offset = base_addr_words + page_size_words * (page_idx / num_dram_banks_g); uint32_t words = (page_size_words > length_words - i) ? 
length_words - i : page_size_words; @@ -1050,15 +1049,8 @@ void gen_prefetcher_exec_buf_cmd_and_write_to_dram( uint32_t index = 0; for (uint32_t page_id = 0; page_id < pages; page_id++) { uint32_t bank_id = page_id % num_dram_banks_g; - auto offset = device->bank_offset(BufferType::DRAM, bank_id); - auto dram_channel = device->dram_channel_from_bank_id(bank_id); - auto bank_core = device->dram_core_from_dram_channel(dram_channel); - - tt::Cluster::instance().write_core( - static_cast(&exec_buf_cmds[index / sizeof(uint32_t)]), - page_size, - tt_cxy_pair(device->id(), bank_core), - DRAM_EXEC_BUF_DEFAULT_BASE_ADDR + offset + (page_id / num_dram_banks_g) * page_size); + std::vector exec_buf_page(exec_buf_cmds.begin() + index / sizeof(uint32_t), exec_buf_cmds.begin() + (index + page_size) / sizeof(uint32_t)); + tt::tt_metal::detail::WriteToDeviceDRAMChannel(device, bank_id, DRAM_EXEC_BUF_DEFAULT_BASE_ADDR + (page_id / num_dram_banks_g) * page_size, exec_buf_page); index += page_size; } @@ -1629,22 +1621,8 @@ void initialize_dram_banks(Device* device) { auto fill = std::vector(bank_size / sizeof(uint32_t), 0xBADDF00D); for (int bank_id = 0; bank_id < num_banks; bank_id++) { - auto offset = device->bank_offset(BufferType::DRAM, bank_id); - auto dram_channel = device->dram_channel_from_bank_id(bank_id); - auto bank_core = device->dram_core_from_dram_channel(dram_channel); - - log_info( - tt::LogTest, - "Initializing DRAM {} bytes for bank_id: {} core: {} at addr: 0x{:x}", - bank_size, - bank_id, - bank_core.str(), - offset); - tt::Cluster::instance().write_core( - static_cast(fill.data()), - fill.size() * sizeof(uint32_t), - tt_cxy_pair(device->id(), bank_core), - offset); + log_info(tt::LogTest, "Initializing DRAM {} bytes for bank_id: {}", bank_size, bank_id); + tt::tt_metal::detail::WriteToDeviceDRAMChannel(device, bank_id, 0, fill); } } diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_hop_latencies_no_edm.cpp 
b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_hop_latencies_no_edm.cpp index 3123ea1736ae..476e9890797a 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_hop_latencies_no_edm.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_hop_latencies_no_edm.cpp @@ -224,8 +224,8 @@ void build_and_run_roundtrip_latency_test( sample_page_size, eth_sender_core, receiver_start_semaphore, - device->physical_core_from_logical_core(init_worker_core, CoreType::WORKER).x, - device->physical_core_from_logical_core(init_worker_core, CoreType::WORKER).y, + device->virtual_core_from_logical_core(init_worker_core, CoreType::WORKER).x, + device->virtual_core_from_logical_core(init_worker_core, CoreType::WORKER).y, worker_sem0); std::vector const& sender_eth_rt_args = get_eth_sender_rt_args( device, @@ -233,15 +233,15 @@ void build_and_run_roundtrip_latency_test( num_samples, max_concurrent_samples, sample_page_size, - device->physical_core_from_logical_core(init_worker_core, CoreType::WORKER).x, - device->physical_core_from_logical_core(init_worker_core, CoreType::WORKER).y, + device->virtual_core_from_logical_core(init_worker_core, CoreType::WORKER).x, + device->virtual_core_from_logical_core(init_worker_core, CoreType::WORKER).y, worker_sem1); std::vector worker_init_rt_args = { worker_sem0, worker_sem1, - static_cast(device->physical_core_from_logical_core(eth_receiver_core, CoreType::ETH).x), - static_cast(device->physical_core_from_logical_core(eth_receiver_core, CoreType::ETH).y), + static_cast(device->virtual_core_from_logical_core(eth_receiver_core, CoreType::ETH).x), + static_cast(device->virtual_core_from_logical_core(eth_receiver_core, CoreType::ETH).y), receiver_start_semaphore}; auto receiver_kernel = tt_metal::CreateKernel( diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_workers_and_erisc_datamover_unidirectional.cpp 
b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_workers_and_erisc_datamover_unidirectional.cpp index d8d4896badf9..ccce74010490 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_workers_and_erisc_datamover_unidirectional.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_workers_and_erisc_datamover_unidirectional.cpp @@ -253,7 +253,7 @@ bool RunWriteBWTest( for (uint32_t w = 0; w < chip0_num_workers_on_channel; w++) { // 10) worker_coord(s) auto worker_noc_coord = - sender_device->physical_core_from_logical_core(chip0_sender_worker_core, CoreType::WORKER); + sender_device->virtual_core_from_logical_core(chip0_sender_worker_core, CoreType::WORKER); chip0_edm_args.push_back( KernelXY{static_cast(worker_noc_coord.x), static_cast(worker_noc_coord.y)} .to_uint32()); @@ -400,7 +400,7 @@ bool RunWriteBWTest( for (uint32_t w = 0; w < chip1_num_workers_on_channel; w++) { // 10) worker_coord(s) auto worker_noc_coord = - receiver_device->physical_core_from_logical_core(chip1_sender_noc_xy, CoreType::WORKER); + receiver_device->virtual_core_from_logical_core(chip1_sender_noc_xy, CoreType::WORKER); chip1_edm_args.push_back( KernelXY{static_cast(worker_noc_coord.x), static_cast(worker_noc_coord.y)} .to_uint32()); @@ -448,7 +448,7 @@ bool RunWriteBWTest( for (uint32_t w = 0; w < chip1_num_workers_on_channel; w++) { // 10) worker_coord(s) auto worker_noc_coord = - receiver_device->physical_core_from_logical_core(chip1_receiver_worker_core, CoreType::WORKER); + receiver_device->virtual_core_from_logical_core(chip1_receiver_worker_core, CoreType::WORKER); chip1_edm_args.push_back( KernelXY{static_cast(worker_noc_coord.x), static_cast(worker_noc_coord.y)} .to_uint32()); diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_bi_tunnel.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_bi_tunnel.cpp index 87db470f9a6e..afbd7ae3038f 100644 --- 
a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_bi_tunnel.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_bi_tunnel.cpp @@ -221,28 +221,29 @@ int main(int argc, char** argv) { std::vector tx_phys_core; for (uint32_t i = 0; i < num_src_endpoints; i++) { - CoreCoord core = {tx_x + i, tx_y}; + CoreCoord core = {tx_x+i, tx_y}; tx_phys_core.push_back(device->worker_core_from_logical_core(core)); - std::vector compile_args = { - src_endpoint_start_id + i, // 0: src_endpoint_id - num_dest_endpoints, // 1: num_dest_endpoints - (tx_queue_start_addr >> 4), // 2: queue_start_addr_words - (tx_queue_size_bytes >> 4), // 3: queue_size_words - ((mux_queue_start_addr + i * mux_queue_size_bytes) >> 4), // 4: remote_rx_queue_start_addr_words - (mux_queue_size_bytes >> 4), // 5: remote_rx_queue_size_words - (uint32_t)mux_phys_core.x, // 6: remote_rx_x - (uint32_t)mux_phys_core.y, // 7: remote_rx_y - i, // 8: remote_rx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 9: tx_network_type - test_results_addr, // 10: test_results_addr - test_results_size, // 11: test_results_size - prng_seed, // 12: prng_seed - data_kb_per_tx, // 13: total_data_kb - max_packet_size_words, // 14: max_packet_size_words - src_endpoint_start_id, // 15: src_endpoint_start_id - dest_endpoint_start_id, // 16: dest_endpoint_start_id - timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles - }; + std::vector compile_args = + { + src_endpoint_start_id + i, // 0: src_endpoint_id + num_dest_endpoints, // 1: num_dest_endpoints + (tx_queue_start_addr >> 4), // 2: queue_start_addr_words + (tx_queue_size_bytes >> 4), // 3: queue_size_words + ((mux_queue_start_addr + i*mux_queue_size_bytes) >> 4), // 4: remote_rx_queue_start_addr_words + (mux_queue_size_bytes >> 4), // 5: remote_rx_queue_size_words + (uint32_t)mux_phys_core.x, // 6: remote_rx_x + (uint32_t)mux_phys_core.y, // 7: remote_rx_y + i, // 8: remote_rx_queue_id + (uint32_t)DispatchRemoteNetworkType::NOC0, // 9: 
tx_network_type + test_results_addr, // 10: test_results_addr + test_results_size, // 11: test_results_size + prng_seed, // 12: prng_seed + data_kb_per_tx, // 13: total_data_kb + max_packet_size_words, // 14: max_packet_size_words + src_endpoint_start_id, // 15: src_endpoint_start_id + dest_endpoint_start_id, // 16: dest_endpoint_start_id + timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles + }; log_info(LogTest, "run traffic_gen_tx at x={},y={}", core.x, core.y); auto kernel = tt_metal::CreateKernel( @@ -454,28 +455,29 @@ int main(int argc, char** argv) { std::vector rx_phys_core; for (uint32_t i = 0; i < num_dest_endpoints; i++) { - CoreCoord core = {rx_x + i, rx_y}; + CoreCoord core = {rx_x+i, rx_y}; rx_phys_core.push_back(device->worker_core_from_logical_core(core)); - std::vector compile_args = { - dest_endpoint_start_id + i, // 0: dest_endpoint_id - num_src_endpoints, // 1: num_src_endpoints - num_dest_endpoints, // 2: num_dest_endpoints - (rx_queue_start_addr >> 4), // 3: queue_start_addr_words - (rx_queue_size_bytes >> 4), // 4: queue_size_words - (uint32_t)demux_phys_core.x, // 5: remote_tx_x - (uint32_t)demux_phys_core.y, // 6: remote_tx_y - i + 1, // 7: remote_tx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 8: rx_rptr_update_network_type - test_results_addr, // 9: test_results_addr - test_results_size, // 10: test_results_size - prng_seed, // 11: prng_seed - 0, // 12: reserved - max_packet_size_words, // 13: max_packet_size_words - rx_disable_data_check, // 14: disable data check - src_endpoint_start_id, // 15: src_endpoint_start_id - dest_endpoint_start_id, // 16: dest_endpoint_start_id - timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles - }; + std::vector compile_args = + { + dest_endpoint_start_id + i, // 0: dest_endpoint_id + num_src_endpoints, // 1: num_src_endpoints + num_dest_endpoints, // 2: num_dest_endpoints + (rx_queue_start_addr >> 4), // 3: queue_start_addr_words + (rx_queue_size_bytes >> 4), // 4: queue_size_words + 
(uint32_t)demux_phys_core.x, // 5: remote_tx_x + (uint32_t)demux_phys_core.y, // 6: remote_tx_y + i + 1, // 7: remote_tx_queue_id + (uint32_t)DispatchRemoteNetworkType::NOC0, // 8: rx_rptr_update_network_type + test_results_addr, // 9: test_results_addr + test_results_size, // 10: test_results_size + prng_seed, // 11: prng_seed + 0, // 12: reserved + max_packet_size_words, // 13: max_packet_size_words + rx_disable_data_check, // 14: disable data check + src_endpoint_start_id, // 15: src_endpoint_start_id + dest_endpoint_start_id, // 16: dest_endpoint_start_id + timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles + }; log_info(LogTest, "run traffic_gen_rx at x={},y={}", core.x, core.y); auto kernel = tt_metal::CreateKernel( diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tunnel_1cq.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tunnel_1cq.cpp index 694919bf9a7a..547472f20f13 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tunnel_1cq.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tunnel_1cq.cpp @@ -216,28 +216,29 @@ int main(int argc, char** argv) { // tx on left chip std::vector l_tx_phys_core; for (uint32_t i = 0; i < num_src_endpoints; i++) { - CoreCoord core = {tx_x + i, tx_y}; + CoreCoord core = {tx_x+i, tx_y}; l_tx_phys_core.push_back(device->worker_core_from_logical_core(core)); - std::vector compile_args = { - src_endpoint_start_id + i, // 0: src_endpoint_id - num_dest_endpoints, // 1: num_dest_endpoints - (tx_queue_start_addr >> 4), // 2: queue_start_addr_words - (tx_queue_size_bytes >> 4), // 3: queue_size_words - ((mux_queue_start_addr + i * mux_queue_size_bytes) >> 4), // 4: remote_rx_queue_start_addr_words - (mux_queue_size_bytes >> 4), // 5: remote_rx_queue_size_words - (uint32_t)mux_phys_core.x, // 6: remote_rx_x - (uint32_t)mux_phys_core.y, // 7: remote_rx_y - i, // 8: remote_rx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 9: tx_network_type - 
test_results_addr, // 10: test_results_addr - test_results_size, // 11: test_results_size - prng_seed, // 12: prng_seed - data_kb_per_tx, // 13: total_data_kb - max_packet_size_words, // 14: max_packet_size_words - src_endpoint_start_id, // 15: src_endpoint_start_id - dest_endpoint_start_id, // 16: dest_endpoint_start_id - timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles - }; + std::vector compile_args = + { + src_endpoint_start_id + i, // 0: src_endpoint_id + num_dest_endpoints, // 1: num_dest_endpoints + (tx_queue_start_addr >> 4), // 2: queue_start_addr_words + (tx_queue_size_bytes >> 4), // 3: queue_size_words + ((mux_queue_start_addr + i*mux_queue_size_bytes) >> 4), // 4: remote_rx_queue_start_addr_words + (mux_queue_size_bytes >> 4), // 5: remote_rx_queue_size_words + (uint32_t)mux_phys_core.x, // 6: remote_rx_x + (uint32_t)mux_phys_core.y, // 7: remote_rx_y + i, // 8: remote_rx_queue_id + (uint32_t)DispatchRemoteNetworkType::NOC0, // 9: tx_network_type + test_results_addr, // 10: test_results_addr + test_results_size, // 11: test_results_size + prng_seed, // 12: prng_seed + data_kb_per_tx, // 13: total_data_kb + max_packet_size_words, // 14: max_packet_size_words + src_endpoint_start_id, // 15: src_endpoint_start_id + dest_endpoint_start_id, // 16: dest_endpoint_start_id + timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles + }; if (l_to_r) { log_info(LogTest, "run traffic_gen_tx at x={},y={}", core.x, core.y); auto kernel = tt_metal::CreateKernel( @@ -255,28 +256,29 @@ int main(int argc, char** argv) { // tx on right chip std::vector r_tx_phys_core; for (uint32_t i = 0; i < num_src_endpoints; i++) { - CoreCoord core = {tx_x + i, tx_y}; + CoreCoord core = {tx_x+i, tx_y}; r_tx_phys_core.push_back(device_r->worker_core_from_logical_core(core)); - std::vector compile_args = { - src_endpoint_start_id + i, // 0: src_endpoint_id - num_dest_endpoints, // 1: num_dest_endpoints - (tx_queue_start_addr >> 4), // 2: queue_start_addr_words - 
(tx_queue_size_bytes >> 4), // 3: queue_size_words - ((mux_queue_start_addr + i * mux_queue_size_bytes) >> 4), // 4: remote_rx_queue_start_addr_words - (mux_queue_size_bytes >> 4), // 5: remote_rx_queue_size_words - (uint32_t)mux_phys_core.x, // 6: remote_rx_x - (uint32_t)mux_phys_core.y, // 7: remote_rx_y - i, // 8: remote_rx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 9: tx_network_type - test_results_addr, // 10: test_results_addr - test_results_size, // 11: test_results_size - prng_seed, // 12: prng_seed - data_kb_per_tx, // 13: total_data_kb - max_packet_size_words, // 14: max_packet_size_words - src_endpoint_start_id, // 15: src_endpoint_start_id - dest_endpoint_start_id, // 16: dest_endpoint_start_id - timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles - }; + std::vector compile_args = + { + src_endpoint_start_id + i, // 0: src_endpoint_id + num_dest_endpoints, // 1: num_dest_endpoints + (tx_queue_start_addr >> 4), // 2: queue_start_addr_words + (tx_queue_size_bytes >> 4), // 3: queue_size_words + ((mux_queue_start_addr + i*mux_queue_size_bytes) >> 4), // 4: remote_rx_queue_start_addr_words + (mux_queue_size_bytes >> 4), // 5: remote_rx_queue_size_words + (uint32_t)mux_phys_core.x, // 6: remote_rx_x + (uint32_t)mux_phys_core.y, // 7: remote_rx_y + i, // 8: remote_rx_queue_id + (uint32_t)DispatchRemoteNetworkType::NOC0, // 9: tx_network_type + test_results_addr, // 10: test_results_addr + test_results_size, // 11: test_results_size + prng_seed, // 12: prng_seed + data_kb_per_tx, // 13: total_data_kb + max_packet_size_words, // 14: max_packet_size_words + src_endpoint_start_id, // 15: src_endpoint_start_id + dest_endpoint_start_id, // 16: dest_endpoint_start_id + timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles + }; if (r_to_l) { log_info(LogTest, "run traffic_gen_tx at x={},y={}", core.x, core.y); auto kernel = tt_metal::CreateKernel( @@ -441,28 +443,29 @@ int main(int argc, char** argv) { // Rx Right std::vector r_rx_phys_core; for 
(uint32_t i = 0; i < num_dest_endpoints; i++) { - CoreCoord core = {rx_x + i, rx_y}; + CoreCoord core = {rx_x+i, rx_y}; r_rx_phys_core.push_back(device_r->worker_core_from_logical_core(core)); - std::vector compile_args = { - dest_endpoint_start_id + i, // 0: dest_endpoint_id - num_src_endpoints, // 1: num_src_endpoints - num_dest_endpoints, // 2: num_dest_endpoints - (rx_queue_start_addr >> 4), // 3: queue_start_addr_words - (rx_queue_size_bytes >> 4), // 4: queue_size_words - (uint32_t)demux_phys_core.x, // 5: remote_tx_x - (uint32_t)demux_phys_core.y, // 6: remote_tx_y - i, // 7: remote_tx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 8: rx_rptr_update_network_type - test_results_addr, // 9: test_results_addr - test_results_size, // 10: test_results_size - prng_seed, // 11: prng_seed - 0, // 12: reserved - max_packet_size_words, // 13: max_packet_size_words - rx_disable_data_check, // 14: disable data check - src_endpoint_start_id, // 15: src_endpoint_start_id - dest_endpoint_start_id, // 16: dest_endpoint_start_id - timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles - }; + std::vector compile_args = + { + dest_endpoint_start_id + i, // 0: dest_endpoint_id + num_src_endpoints, // 1: num_src_endpoints + num_dest_endpoints, // 2: num_dest_endpoints + (rx_queue_start_addr >> 4), // 3: queue_start_addr_words + (rx_queue_size_bytes >> 4), // 4: queue_size_words + (uint32_t)demux_phys_core.x, // 5: remote_tx_x + (uint32_t)demux_phys_core.y, // 6: remote_tx_y + i, // 7: remote_tx_queue_id + (uint32_t)DispatchRemoteNetworkType::NOC0, // 8: rx_rptr_update_network_type + test_results_addr, // 9: test_results_addr + test_results_size, // 10: test_results_size + prng_seed, // 11: prng_seed + 0, // 12: reserved + max_packet_size_words, // 13: max_packet_size_words + rx_disable_data_check, // 14: disable data check + src_endpoint_start_id, // 15: src_endpoint_start_id + dest_endpoint_start_id, // 16: dest_endpoint_start_id + timeout_mcycles * 1000 * 1000 * 4, 
// 17: timeout_cycles + }; if (l_to_r) { log_info(LogTest, "run traffic_gen_rx at x={},y={}", core.x, core.y); auto kernel = tt_metal::CreateKernel( @@ -480,28 +483,29 @@ int main(int argc, char** argv) { // Rx Left std::vector l_rx_phys_core; for (uint32_t i = 0; i < num_dest_endpoints; i++) { - CoreCoord core = {rx_x + i, rx_y}; + CoreCoord core = {rx_x+i, rx_y}; l_rx_phys_core.push_back(device->worker_core_from_logical_core(core)); - std::vector compile_args = { - dest_endpoint_start_id + i, // 0: dest_endpoint_id - num_src_endpoints, // 1: num_src_endpoints - num_dest_endpoints, // 2: num_dest_endpoints - (rx_queue_start_addr >> 4), // 3: queue_start_addr_words - (rx_queue_size_bytes >> 4), // 4: queue_size_words - (uint32_t)demux_phys_core.x, // 5: remote_tx_x - (uint32_t)demux_phys_core.y, // 6: remote_tx_y - i, // 7: remote_tx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 8: rx_rptr_update_network_type - test_results_addr, // 9: test_results_addr - test_results_size, // 10: test_results_size - prng_seed, // 11: prng_seed - 0, // 12: reserved - max_packet_size_words, // 13: max_packet_size_words - rx_disable_data_check, // 14: disable data check - src_endpoint_start_id, // 15: src_endpoint_start_id - dest_endpoint_start_id, // 16: dest_endpoint_start_id - timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles - }; + std::vector compile_args = + { + dest_endpoint_start_id + i, // 0: dest_endpoint_id + num_src_endpoints, // 1: num_src_endpoints + num_dest_endpoints, // 2: num_dest_endpoints + (rx_queue_start_addr >> 4), // 3: queue_start_addr_words + (rx_queue_size_bytes >> 4), // 4: queue_size_words + (uint32_t)demux_phys_core.x, // 5: remote_tx_x + (uint32_t)demux_phys_core.y, // 6: remote_tx_y + i, // 7: remote_tx_queue_id + (uint32_t)DispatchRemoteNetworkType::NOC0, // 8: rx_rptr_update_network_type + test_results_addr, // 9: test_results_addr + test_results_size, // 10: test_results_size + prng_seed, // 11: prng_seed + 0, // 12: reserved + 
max_packet_size_words, // 13: max_packet_size_words + rx_disable_data_check, // 14: disable data check + src_endpoint_start_id, // 15: src_endpoint_start_id + dest_endpoint_start_id, // 16: dest_endpoint_start_id + timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles + }; if (r_to_l) { log_info(LogTest, "run traffic_gen_rx at x={},y={}", core.x, core.y); auto kernel = tt_metal::CreateKernel( diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tunnel_2cq.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tunnel_2cq.cpp index 6aa65a7b28a6..5562d50100d7 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tunnel_2cq.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tunnel_2cq.cpp @@ -216,28 +216,29 @@ int main(int argc, char** argv) { // tx on left chip std::vector l_tx_phys_core; for (uint32_t i = 0; i < num_src_endpoints; i++) { - CoreCoord core = {tx_x + i, tx_y}; + CoreCoord core = {tx_x+i, tx_y}; l_tx_phys_core.push_back(device->worker_core_from_logical_core(core)); - std::vector compile_args = { - src_endpoint_start_id + i, // 0: src_endpoint_id - num_dest_endpoints, // 1: num_dest_endpoints - (tx_queue_start_addr >> 4), // 2: queue_start_addr_words - (tx_queue_size_bytes >> 4), // 3: queue_size_words - ((mux_queue_start_addr + i * mux_queue_size_bytes) >> 4), // 4: remote_rx_queue_start_addr_words - (mux_queue_size_bytes >> 4), // 5: remote_rx_queue_size_words - (uint32_t)mux_phys_core.x, // 6: remote_rx_x - (uint32_t)mux_phys_core.y, // 7: remote_rx_y - i, // 8: remote_rx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 9: tx_network_type - test_results_addr, // 10: test_results_addr - test_results_size, // 11: test_results_size - prng_seed, // 12: prng_seed - data_kb_per_tx, // 13: total_data_kb - max_packet_size_words, // 14: max_packet_size_words - src_endpoint_start_id, // 15: src_endpoint_start_id - dest_endpoint_start_id, // 16: dest_endpoint_start_id - timeout_mcycles * 1000 * 
1000 * 4, // 17: timeout_cycles - }; + std::vector compile_args = + { + src_endpoint_start_id + i, // 0: src_endpoint_id + num_dest_endpoints, // 1: num_dest_endpoints + (tx_queue_start_addr >> 4), // 2: queue_start_addr_words + (tx_queue_size_bytes >> 4), // 3: queue_size_words + ((mux_queue_start_addr + i*mux_queue_size_bytes) >> 4), // 4: remote_rx_queue_start_addr_words + (mux_queue_size_bytes >> 4), // 5: remote_rx_queue_size_words + (uint32_t)mux_phys_core.x, // 6: remote_rx_x + (uint32_t)mux_phys_core.y, // 7: remote_rx_y + i, // 8: remote_rx_queue_id + (uint32_t)DispatchRemoteNetworkType::NOC0, // 9: tx_network_type + test_results_addr, // 10: test_results_addr + test_results_size, // 11: test_results_size + prng_seed, // 12: prng_seed + data_kb_per_tx, // 13: total_data_kb + max_packet_size_words, // 14: max_packet_size_words + src_endpoint_start_id, // 15: src_endpoint_start_id + dest_endpoint_start_id, // 16: dest_endpoint_start_id + timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles + }; if (l_to_r) { log_info(LogTest, "run traffic_gen_tx at x={},y={}", core.x, core.y); auto kernel = tt_metal::CreateKernel( @@ -255,28 +256,29 @@ int main(int argc, char** argv) { // tx on right chip std::vector r_tx_phys_core; for (uint32_t i = 0; i < num_src_endpoints; i++) { - CoreCoord core = {tx_x + i, tx_y}; + CoreCoord core = {tx_x+i, tx_y}; r_tx_phys_core.push_back(device_r->worker_core_from_logical_core(core)); - std::vector compile_args = { - src_endpoint_start_id + i, // 0: src_endpoint_id - num_dest_endpoints, // 1: num_dest_endpoints - (tx_queue_start_addr >> 4), // 2: queue_start_addr_words - (tx_queue_size_bytes >> 4), // 3: queue_size_words - ((mux_queue_start_addr + i * mux_queue_size_bytes) >> 4), // 4: remote_rx_queue_start_addr_words - (mux_queue_size_bytes >> 4), // 5: remote_rx_queue_size_words - (uint32_t)mux_phys_core.x, // 6: remote_rx_x - (uint32_t)mux_phys_core.y, // 7: remote_rx_y - i, // 8: remote_rx_queue_id - 
(uint32_t)DispatchRemoteNetworkType::NOC0, // 9: tx_network_type - test_results_addr, // 10: test_results_addr - test_results_size, // 11: test_results_size - prng_seed, // 12: prng_seed - data_kb_per_tx, // 13: total_data_kb - max_packet_size_words, // 14: max_packet_size_words - src_endpoint_start_id, // 15: src_endpoint_start_id - dest_endpoint_start_id, // 16: dest_endpoint_start_id - timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles - }; + std::vector compile_args = + { + src_endpoint_start_id + i, // 0: src_endpoint_id + num_dest_endpoints, // 1: num_dest_endpoints + (tx_queue_start_addr >> 4), // 2: queue_start_addr_words + (tx_queue_size_bytes >> 4), // 3: queue_size_words + ((mux_queue_start_addr + i*mux_queue_size_bytes) >> 4), // 4: remote_rx_queue_start_addr_words + (mux_queue_size_bytes >> 4), // 5: remote_rx_queue_size_words + (uint32_t)mux_phys_core.x, // 6: remote_rx_x + (uint32_t)mux_phys_core.y, // 7: remote_rx_y + i, // 8: remote_rx_queue_id + (uint32_t)DispatchRemoteNetworkType::NOC0, // 9: tx_network_type + test_results_addr, // 10: test_results_addr + test_results_size, // 11: test_results_size + prng_seed, // 12: prng_seed + data_kb_per_tx, // 13: total_data_kb + max_packet_size_words, // 14: max_packet_size_words + src_endpoint_start_id, // 15: src_endpoint_start_id + dest_endpoint_start_id, // 16: dest_endpoint_start_id + timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles + }; if (r_to_l) { log_info(LogTest, "run traffic_gen_tx at x={},y={}", core.x, core.y); auto kernel = tt_metal::CreateKernel( @@ -449,28 +451,29 @@ int main(int argc, char** argv) { // Rx Right std::vector r_rx_phys_core; for (uint32_t i = 0; i < num_dest_endpoints; i++) { - CoreCoord core = {rx_x + i, rx_y}; + CoreCoord core = {rx_x+i, rx_y}; r_rx_phys_core.push_back(device_r->worker_core_from_logical_core(core)); - std::vector compile_args = { - dest_endpoint_start_id + i, // 0: dest_endpoint_id - num_src_endpoints, // 1: num_src_endpoints - 
num_dest_endpoints, // 2: num_dest_endpoints - (rx_queue_start_addr >> 4), // 3: queue_start_addr_words - (rx_queue_size_bytes >> 4), // 4: queue_size_words - (uint32_t)demux_phys_core.x, // 5: remote_tx_x - (uint32_t)demux_phys_core.y, // 6: remote_tx_y - i, // 7: remote_tx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 8: rx_rptr_update_network_type - test_results_addr, // 9: test_results_addr - test_results_size, // 10: test_results_size - prng_seed, // 11: prng_seed - 0, // 12: reserved - max_packet_size_words, // 13: max_packet_size_words - rx_disable_data_check, // 14: disable data check - src_endpoint_start_id, // 15: src_endpoint_start_id - dest_endpoint_start_id, // 16: dest_endpoint_start_id - timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles - }; + std::vector compile_args = + { + dest_endpoint_start_id + i, // 0: dest_endpoint_id + num_src_endpoints, // 1: num_src_endpoints + num_dest_endpoints, // 2: num_dest_endpoints + (rx_queue_start_addr >> 4), // 3: queue_start_addr_words + (rx_queue_size_bytes >> 4), // 4: queue_size_words + (uint32_t)demux_phys_core.x, // 5: remote_tx_x + (uint32_t)demux_phys_core.y, // 6: remote_tx_y + i, // 7: remote_tx_queue_id + (uint32_t)DispatchRemoteNetworkType::NOC0, // 8: rx_rptr_update_network_type + test_results_addr, // 9: test_results_addr + test_results_size, // 10: test_results_size + prng_seed, // 11: prng_seed + 0, // 12: reserved + max_packet_size_words, // 13: max_packet_size_words + rx_disable_data_check, // 14: disable data check + src_endpoint_start_id, // 15: src_endpoint_start_id + dest_endpoint_start_id, // 16: dest_endpoint_start_id + timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles + }; if (l_to_r) { log_info(LogTest, "run traffic_gen_rx at x={},y={}", core.x, core.y); auto kernel = tt_metal::CreateKernel( @@ -488,28 +491,29 @@ int main(int argc, char** argv) { // Rx Left std::vector l_rx_phys_core; for (uint32_t i = 0; i < num_dest_endpoints; i++) { - CoreCoord core = {rx_x + 
i, rx_y}; + CoreCoord core = {rx_x+i, rx_y}; l_rx_phys_core.push_back(device->worker_core_from_logical_core(core)); - std::vector compile_args = { - dest_endpoint_start_id + i, // 0: dest_endpoint_id - num_src_endpoints, // 1: num_src_endpoints - num_dest_endpoints, // 2: num_dest_endpoints - (rx_queue_start_addr >> 4), // 3: queue_start_addr_words - (rx_queue_size_bytes >> 4), // 4: queue_size_words - (uint32_t)demux_phys_core.x, // 5: remote_tx_x - (uint32_t)demux_phys_core.y, // 6: remote_tx_y - i, // 7: remote_tx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 8: rx_rptr_update_network_type - test_results_addr, // 9: test_results_addr - test_results_size, // 10: test_results_size - prng_seed, // 11: prng_seed - 0, // 12: reserved - max_packet_size_words, // 13: max_packet_size_words - rx_disable_data_check, // 14: disable data check - src_endpoint_start_id, // 15: src_endpoint_start_id - dest_endpoint_start_id, // 16: dest_endpoint_start_id - timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles - }; + std::vector compile_args = + { + dest_endpoint_start_id + i, // 0: dest_endpoint_id + num_src_endpoints, // 1: num_src_endpoints + num_dest_endpoints, // 2: num_dest_endpoints + (rx_queue_start_addr >> 4), // 3: queue_start_addr_words + (rx_queue_size_bytes >> 4), // 4: queue_size_words + (uint32_t)demux_phys_core.x, // 5: remote_tx_x + (uint32_t)demux_phys_core.y, // 6: remote_tx_y + i, // 7: remote_tx_queue_id + (uint32_t)DispatchRemoteNetworkType::NOC0, // 8: rx_rptr_update_network_type + test_results_addr, // 9: test_results_addr + test_results_size, // 10: test_results_size + prng_seed, // 11: prng_seed + 0, // 12: reserved + max_packet_size_words, // 13: max_packet_size_words + rx_disable_data_check, // 14: disable data check + src_endpoint_start_id, // 15: src_endpoint_start_id + dest_endpoint_start_id, // 16: dest_endpoint_start_id + timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles + }; if (r_to_l) { log_info(LogTest, "run 
traffic_gen_rx at x={},y={}", core.x, core.y); auto kernel = tt_metal::CreateKernel( diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_uni_tunnel.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_uni_tunnel.cpp index 552e32a31df0..b48b418822a5 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_uni_tunnel.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_uni_tunnel.cpp @@ -213,28 +213,29 @@ int main(int argc, char** argv) { std::vector tx_phys_core; for (uint32_t i = 0; i < num_src_endpoints; i++) { - CoreCoord core = {tx_x + i, tx_y}; + CoreCoord core = {tx_x+i, tx_y}; tx_phys_core.push_back(device->worker_core_from_logical_core(core)); - std::vector compile_args = { - src_endpoint_start_id + i, // 0: src_endpoint_id - num_dest_endpoints, // 1: num_dest_endpoints - (tx_queue_start_addr >> 4), // 2: queue_start_addr_words - (tx_queue_size_bytes >> 4), // 3: queue_size_words - ((mux_queue_start_addr + i * mux_queue_size_bytes) >> 4), // 4: remote_rx_queue_start_addr_words - (mux_queue_size_bytes >> 4), // 5: remote_rx_queue_size_words - (uint32_t)mux_phys_core.x, // 6: remote_rx_x - (uint32_t)mux_phys_core.y, // 7: remote_rx_y - i, // 8: remote_rx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 9: tx_network_type - test_results_addr, // 10: test_results_addr - test_results_size, // 11: test_results_size - prng_seed, // 12: prng_seed - data_kb_per_tx, // 13: total_data_kb - max_packet_size_words, // 14: max_packet_size_words - src_endpoint_start_id, // 15: src_endpoint_start_id - dest_endpoint_start_id, // 16: dest_endpoint_start_id - timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles - }; + std::vector compile_args = + { + src_endpoint_start_id + i, // 0: src_endpoint_id + num_dest_endpoints, // 1: num_dest_endpoints + (tx_queue_start_addr >> 4), // 2: queue_start_addr_words + (tx_queue_size_bytes >> 4), // 3: queue_size_words + ((mux_queue_start_addr + i*mux_queue_size_bytes) >> 4), 
// 4: remote_rx_queue_start_addr_words + (mux_queue_size_bytes >> 4), // 5: remote_rx_queue_size_words + (uint32_t)mux_phys_core.x, // 6: remote_rx_x + (uint32_t)mux_phys_core.y, // 7: remote_rx_y + i, // 8: remote_rx_queue_id + (uint32_t)DispatchRemoteNetworkType::NOC0, // 9: tx_network_type + test_results_addr, // 10: test_results_addr + test_results_size, // 11: test_results_size + prng_seed, // 12: prng_seed + data_kb_per_tx, // 13: total_data_kb + max_packet_size_words, // 14: max_packet_size_words + src_endpoint_start_id, // 15: src_endpoint_start_id + dest_endpoint_start_id, // 16: dest_endpoint_start_id + timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles + }; log_info(LogTest, "run traffic_gen_tx at x={},y={}", core.x, core.y); auto kernel = tt_metal::CreateKernel( @@ -376,28 +377,29 @@ int main(int argc, char** argv) { std::vector rx_phys_core; for (uint32_t i = 0; i < num_dest_endpoints; i++) { - CoreCoord core = {rx_x + i, rx_y}; + CoreCoord core = {rx_x+i, rx_y}; rx_phys_core.push_back(device_r->worker_core_from_logical_core(core)); - std::vector compile_args = { - dest_endpoint_start_id + i, // 0: dest_endpoint_id - num_src_endpoints, // 1: num_src_endpoints - num_dest_endpoints, // 2: num_dest_endpoints - (rx_queue_start_addr >> 4), // 3: queue_start_addr_words - (rx_queue_size_bytes >> 4), // 4: queue_size_words - (uint32_t)demux_phys_core.x, // 5: remote_tx_x - (uint32_t)demux_phys_core.y, // 6: remote_tx_y - i + 1, // 7: remote_tx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 8: rx_rptr_update_network_type - test_results_addr, // 9: test_results_addr - test_results_size, // 10: test_results_size - prng_seed, // 11: prng_seed - 0, // 12: reserved - max_packet_size_words, // 13: max_packet_size_words - rx_disable_data_check, // 14: disable data check - src_endpoint_start_id, // 15: src_endpoint_start_id - dest_endpoint_start_id, // 16: dest_endpoint_start_id - timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles - }; + 
std::vector compile_args = + { + dest_endpoint_start_id + i, // 0: dest_endpoint_id + num_src_endpoints, // 1: num_src_endpoints + num_dest_endpoints, // 2: num_dest_endpoints + (rx_queue_start_addr >> 4), // 3: queue_start_addr_words + (rx_queue_size_bytes >> 4), // 4: queue_size_words + (uint32_t)demux_phys_core.x, // 5: remote_tx_x + (uint32_t)demux_phys_core.y, // 6: remote_tx_y + i + 1, // 7: remote_tx_queue_id + (uint32_t)DispatchRemoteNetworkType::NOC0, // 8: rx_rptr_update_network_type + test_results_addr, // 9: test_results_addr + test_results_size, // 10: test_results_size + prng_seed, // 11: prng_seed + 0, // 12: reserved + max_packet_size_words, // 13: max_packet_size_words + rx_disable_data_check, // 14: disable data check + src_endpoint_start_id, // 15: src_endpoint_start_id + dest_endpoint_start_id, // 16: dest_endpoint_start_id + timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles + }; log_info(LogTest, "run traffic_gen_rx at x={},y={}", core.x, core.y); auto kernel = tt_metal::CreateKernel( diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_uni_tunnel_single_chip.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_uni_tunnel_single_chip.cpp index 570b9ed24870..6c084f08576d 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_uni_tunnel_single_chip.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_uni_tunnel_single_chip.cpp @@ -207,28 +207,29 @@ int main(int argc, char** argv) { std::vector tx_phys_core; for (uint32_t i = 0; i < num_src_endpoints; i++) { - CoreCoord core = {tx_x + i, tx_y}; + CoreCoord core = {tx_x+i, tx_y}; tx_phys_core.push_back(device->worker_core_from_logical_core(core)); - std::vector compile_args = { - src_endpoint_start_id + i, // 0: src_endpoint_id - num_dest_endpoints, // 1: num_dest_endpoints - (tx_queue_start_addr >> 4), // 2: queue_start_addr_words - (tx_queue_size_bytes >> 4), // 3: queue_size_words - ((mux_queue_start_addr + i * mux_queue_size_bytes) 
>> 4), // 4: remote_rx_queue_start_addr_words - (mux_queue_size_bytes >> 4), // 5: remote_rx_queue_size_words - (uint32_t)mux_phys_core.x, // 6: remote_rx_x - (uint32_t)mux_phys_core.y, // 7: remote_rx_y - i, // 8: remote_rx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 9: tx_network_type - test_results_addr, // 10: test_results_addr - test_results_size, // 11: test_results_size - prng_seed, // 12: prng_seed - data_kb_per_tx, // 13: total_data_kb - max_packet_size_words, // 14: max_packet_size_words - src_endpoint_start_id, // 15: src_endpoint_start_id - dest_endpoint_start_id, // 16: dest_endpoint_start_id - timeout_mcycles * 1000 * 1000, // 17: timeout_cycles - }; + std::vector compile_args = + { + src_endpoint_start_id + i, // 0: src_endpoint_id + num_dest_endpoints, // 1: num_dest_endpoints + (tx_queue_start_addr >> 4), // 2: queue_start_addr_words + (tx_queue_size_bytes >> 4), // 3: queue_size_words + ((mux_queue_start_addr + i*mux_queue_size_bytes) >> 4), // 4: remote_rx_queue_start_addr_words + (mux_queue_size_bytes >> 4), // 5: remote_rx_queue_size_words + (uint32_t)mux_phys_core.x, // 6: remote_rx_x + (uint32_t)mux_phys_core.y, // 7: remote_rx_y + i, // 8: remote_rx_queue_id + (uint32_t)DispatchRemoteNetworkType::NOC0, // 9: tx_network_type + test_results_addr, // 10: test_results_addr + test_results_size, // 11: test_results_size + prng_seed, // 12: prng_seed + data_kb_per_tx, // 13: total_data_kb + max_packet_size_words, // 14: max_packet_size_words + src_endpoint_start_id, // 15: src_endpoint_start_id + dest_endpoint_start_id, // 16: dest_endpoint_start_id + timeout_mcycles * 1000 * 1000, // 17: timeout_cycles + }; log_info(LogTest, "run traffic_gen_tx at x={},y={}", core.x, core.y); auto kernel = tt_metal::CreateKernel( @@ -366,28 +367,29 @@ int main(int argc, char** argv) { std::vector rx_phys_core; for (uint32_t i = 0; i < num_dest_endpoints; i++) { - CoreCoord core = {rx_x + i, rx_y}; + CoreCoord core = {rx_x+i, rx_y}; 
rx_phys_core.push_back(device->worker_core_from_logical_core(core)); - std::vector compile_args = { - dest_endpoint_start_id + i, // 0: dest_endpoint_id - num_src_endpoints, // 1: num_src_endpoints - num_dest_endpoints, // 2: num_dest_endpoints - (rx_queue_start_addr >> 4), // 3: queue_start_addr_words - (rx_queue_size_bytes >> 4), // 4: queue_size_words - (uint32_t)demux_phys_core.x, // 5: remote_tx_x - (uint32_t)demux_phys_core.y, // 6: remote_tx_y - i, // 7: remote_tx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 8: rx_rptr_update_network_type - test_results_addr, // 9: test_results_addr - test_results_size, // 10: test_results_size - prng_seed, // 11: prng_seed - 0, // 12: reserved - max_packet_size_words, // 13: max_packet_size_words - rx_disable_data_check, // 14: disable data check - src_endpoint_start_id, // 15: src_endpoint_start_id - dest_endpoint_start_id, // 16: dest_endpoint_start_id - timeout_mcycles * 1000 * 1000, // 17: timeout_cycles - }; + std::vector compile_args = + { + dest_endpoint_start_id + i, // 0: dest_endpoint_id + num_src_endpoints, // 1: num_src_endpoints + num_dest_endpoints, // 2: num_dest_endpoints + (rx_queue_start_addr >> 4), // 3: queue_start_addr_words + (rx_queue_size_bytes >> 4), // 4: queue_size_words + (uint32_t)demux_phys_core.x, // 5: remote_tx_x + (uint32_t)demux_phys_core.y, // 6: remote_tx_y + i, // 7: remote_tx_queue_id + (uint32_t)DispatchRemoteNetworkType::NOC0, // 8: rx_rptr_update_network_type + test_results_addr, // 9: test_results_addr + test_results_size, // 10: test_results_size + prng_seed, // 11: prng_seed + 0, // 12: reserved + max_packet_size_words, // 13: max_packet_size_words + rx_disable_data_check, // 14: disable data check + src_endpoint_start_id, // 15: src_endpoint_start_id + dest_endpoint_start_id, // 16: dest_endpoint_start_id + timeout_mcycles * 1000 * 1000, // 17: timeout_cycles + }; log_info(LogTest, "run traffic_gen_rx at x={},y={}", core.x, core.y); auto kernel = 
tt_metal::CreateKernel( diff --git a/tests/tt_metal/tt_metal/test_bcast.cpp b/tests/tt_metal/tt_metal/test_bcast.cpp index 9796de5d1b4e..e0f9241b57f0 100644 --- a/tests/tt_metal/tt_metal/test_bcast.cpp +++ b/tests/tt_metal/tt_metal/test_bcast.cpp @@ -124,8 +124,6 @@ int main(int argc, char** argv) { uint32_t dram_buffer_src0_addr = src0_dram_buffer->address(); auto dst_dram_buffer = CreateBuffer(buff_config); uint32_t dram_buffer_dst_addr = dst_dram_buffer->address(); - auto dram_src0_noc_xy = src0_dram_buffer->noc_coordinates(); - auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); uint32_t src0_cb_index = 0; uint32_t num_buffer_tiles = 2; @@ -238,7 +236,6 @@ int main(int argc, char** argv) { auto src1_dram_buffer = CreateBuffer(src1_config); uint32_t dram_buffer_src1_addr = src1_dram_buffer->address(); - auto dram_src1_noc_xy = src1_dram_buffer->noc_coordinates(); tt_metal::detail::WriteToBuffer(src1_dram_buffer, bcast_tiled_u32); bool src0_is_dram = true; @@ -268,28 +265,20 @@ int main(int argc, char** argv) { program, binary_reader_kernel, core, - {dram_buffer_src0_addr, // 0 - (std::uint32_t)dram_src0_noc_xy.x, // 1 - (std::uint32_t)dram_src0_noc_xy.y, // 2 - num_tensor_tiles, // 3 - dram_buffer_src1_addr, // 4 - (std::uint32_t)dram_src1_noc_xy.x, // 5 - (std::uint32_t)dram_src1_noc_xy.y, // 6 + {dram_buffer_src0_addr, // 0 + (std::uint32_t)0, // 1 + num_tensor_tiles, // 2 + dram_buffer_src1_addr, // 3 + (std::uint32_t)0, // 4 num_bcast_tiles, NC * Ht * Wt, NC, Ht, Wt, - nc1}); // 7 8 9 10 11 12 + nc1}); // 5 6 7 8 9 10 tt_metal::SetRuntimeArgs( - program, - unary_writer_kernel, - core, - {dram_buffer_dst_addr, - (std::uint32_t)dram_dst_noc_xy.x, - (std::uint32_t)dram_dst_noc_xy.y, - num_tensor_tiles}); + program, unary_writer_kernel, core, {dram_buffer_dst_addr, (std::uint32_t)0, num_tensor_tiles}); std::map compute_defines = { {"BCAST_DIM", bdim_to_llkdim_define[bcast_dim]}, diff --git a/tests/tt_metal/tt_metal/test_clean_init.cpp 
b/tests/tt_metal/tt_metal/test_clean_init.cpp index 5f72467a2ef2..cd3118ec3e89 100644 --- a/tests/tt_metal/tt_metal/test_clean_init.cpp +++ b/tests/tt_metal/tt_metal/test_clean_init.cpp @@ -44,13 +44,13 @@ int main(int argc, char** argv) { for (int device_id = 0; device_id < num_devices; device_id++) { try { /* - * Silicon accelerator setup - */ - Device* device = devices[device_id]; + * Silicon accelerator setup + */ + Device *device = devices[device_id]; /* - * Setup program and command queue to execute along with its buffers and kernels to use - */ + * Setup program and command queue to execute along with its buffers and kernels to use + */ CommandQueue& cq = device->command_queue(); Program program = CreateProgram(); @@ -60,22 +60,25 @@ int main(int argc, char** argv) { program, "tt_metal/programming_examples/loopback/kernels/loopback_dram_copy.cpp", core, - DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); + DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default} + ); constexpr uint32_t single_tile_size = 2 * (32 * 32); constexpr uint32_t num_tiles = 50; constexpr uint32_t dram_buffer_size = single_tile_size * num_tiles; tt::tt_metal::InterleavedBufferConfig dram_config{ - .device = device, - .size = dram_buffer_size, - .page_size = dram_buffer_size, - .buffer_type = tt::tt_metal::BufferType::DRAM}; + .device= device, + .size = dram_buffer_size, + .page_size = dram_buffer_size, + .buffer_type = tt::tt_metal::BufferType::DRAM + }; tt::tt_metal::InterleavedBufferConfig l1_config{ - .device = device, - .size = dram_buffer_size, - .page_size = dram_buffer_size, - .buffer_type = tt::tt_metal::BufferType::L1}; + .device= device, + .size = dram_buffer_size, + .page_size = dram_buffer_size, + .buffer_type = tt::tt_metal::BufferType::L1 + }; auto l1_buffer = CreateBuffer(l1_config); @@ -86,8 +89,8 @@ int main(int argc, char** argv) { const uint32_t output_dram_buffer_addr = 
output_dram_buffer->address(); /* - * Create input data and runtime arguments, then execute - */ + * Create input data and runtime arguments, then execute + */ std::vector input_vec = create_random_vector_of_bfloat16( dram_buffer_size, 100, std::chrono::system_clock::now().time_since_epoch().count()); EnqueueWriteBuffer(cq, input_dram_buffer, input_vec, false); @@ -95,14 +98,18 @@ int main(int argc, char** argv) { const std::array runtime_args = { l1_buffer->address(), input_dram_buffer->address(), - static_cast(input_dram_buffer->noc_coordinates().x), - static_cast(input_dram_buffer->noc_coordinates().y), + 0, output_dram_buffer->address(), - static_cast(output_dram_buffer->noc_coordinates().x), - static_cast(output_dram_buffer->noc_coordinates().y), - l1_buffer->size()}; + 0, + l1_buffer->size() + }; - SetRuntimeArgs(program, dram_copy_kernel_id, core, runtime_args); + SetRuntimeArgs( + program, + dram_copy_kernel_id, + core, + runtime_args + ); EnqueueProgram(cq, program, false); tt::log_info("Started program"); @@ -110,14 +117,14 @@ int main(int argc, char** argv) { tt::log_info("Finished program"); /* - * Validation & Teardown - */ + * Validation & Teardown + */ std::vector result_vec; EnqueueReadBuffer(cq, output_dram_buffer, result_vec, true); pass &= input_vec == result_vec; - } catch (const std::exception& e) { + } catch (const std::exception &e) { tt::log_error(tt::LogTest, "Test failed with exception!"); tt::log_error(tt::LogTest, "{}", e.what()); diff --git a/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp b/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp index 5a543ff644d3..6a338ac73580 100644 --- a/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp +++ b/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp @@ -58,8 +58,6 @@ void construct_program(Program& program, Device* device, CoreCoord& core) { auto dst_dram_buffer = CreateBuffer(buff_config); uint32_t dram_buffer_dst_addr = 
dst_dram_buffer->address(); - auto dram_src_noc_xy = src_dram_buffer->noc_coordinates(); - auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); // input CB is larger than the output CB, to test the backpressure from the output CB all the way into the // input CB CB_out size = 1 forces the serialization of packer and writer kernel, generating backpressure to diff --git a/tests/tt_metal/tt_metal/test_core_range_set.cpp b/tests/tt_metal/tt_metal/test_core_range_set.cpp index 5ebac48ca2dd..0dc99f6c46b1 100644 --- a/tests/tt_metal/tt_metal/test_core_range_set.cpp +++ b/tests/tt_metal/tt_metal/test_core_range_set.cpp @@ -92,7 +92,6 @@ bool test_program_specified_with_core_range_set( .device = device, .size = buffer_size, .page_size = buffer_size, .buffer_type = tt_metal::BufferType::DRAM}; auto src_dram_buffer = CreateBuffer(dram_config); - auto dram_src_noc_xy = src_dram_buffer->noc_coordinates(); std::map> core_to_l1_buffer; for (auto core_range : core_range_set.ranges()) { @@ -139,7 +138,7 @@ bool test_program_specified_with_core_range_set( auto unary_writer_kernel = tt_metal::CreateKernel( program, - "tt_metal/kernels/dataflow/writer_unary.cpp", + "tt_metal/kernels/dataflow/writer_unary_1.cpp", core_range_set, tt_metal::DataMovementConfig{ .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); @@ -172,12 +171,14 @@ bool test_program_specified_with_core_range_set( // Reader kernel on all cores reads from same location in DRAM const std::array reader_rt_args = { - src_dram_buffer->address(), (std::uint32_t)dram_src_noc_xy.x, (std::uint32_t)dram_src_noc_xy.y, num_tiles}; - + src_dram_buffer->address(), uint(0), num_tiles}; for (const auto& [core, dst_l1_buffer] : core_to_l1_buffer) { tt_metal::SetRuntimeArgs(program, unary_reader_kernel, core, reader_rt_args); - auto l1_dst_noc_xy = dst_l1_buffer->noc_coordinates(); + auto bank_id = 0; + auto l1_dst_noc_xy = + 
device->virtual_core_from_logical_core(dst_l1_buffer->logical_core_from_bank_id(0), CoreType::WORKER); + tt_metal::SetRuntimeArgs( program, unary_writer_kernel, diff --git a/tests/tt_metal/tt_metal/test_datacopy.cpp b/tests/tt_metal/tt_metal/test_datacopy.cpp index 9caaced9a95b..92f31b304660 100644 --- a/tests/tt_metal/tt_metal/test_datacopy.cpp +++ b/tests/tt_metal/tt_metal/test_datacopy.cpp @@ -60,9 +60,6 @@ int main(int argc, char** argv) { auto dst_dram_buffer = CreateBuffer(dram_config); uint32_t dram_buffer_dst_addr = dst_dram_buffer->address(); - auto dram_src_noc_xy = src_dram_buffer->noc_coordinates(); - auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); - // input CB is larger than the output CB, to test the backpressure from the output CB all the way into the input // CB CB_out size = 1 forces the serialization of packer and writer kernel, generating backpressure to math // kernel, input CB and reader @@ -118,13 +115,17 @@ int main(int argc, char** argv) { program, unary_reader_kernel, core, - {dram_buffer_src_addr, (std::uint32_t)dram_src_noc_xy.x, (std::uint32_t)dram_src_noc_xy.y, num_tiles}); + {dram_buffer_src_addr, + 0, + num_tiles}); tt_metal::SetRuntimeArgs( program, unary_writer_kernel, core, - {dram_buffer_dst_addr, (std::uint32_t)dram_dst_noc_xy.x, (std::uint32_t)dram_dst_noc_xy.y, num_tiles}); + {dram_buffer_dst_addr, + 0, + num_tiles}); tt_metal::detail::LaunchProgram(device, program); diff --git a/tests/tt_metal/tt_metal/test_datacopy_bfp8b.cpp b/tests/tt_metal/tt_metal/test_datacopy_bfp8b.cpp index 329cbbf4d877..ad11169d38a1 100644 --- a/tests/tt_metal/tt_metal/test_datacopy_bfp8b.cpp +++ b/tests/tt_metal/tt_metal/test_datacopy_bfp8b.cpp @@ -53,9 +53,6 @@ int main(int argc, char** argv) { auto dst_dram_buffer = CreateBuffer(dram_config); uint32_t dram_buffer_dst_addr = dst_dram_buffer->address(); - auto dram_src_noc_xy = src_dram_buffer->noc_coordinates(); - auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); - uint32_t 
src0_cb_index = 0; uint32_t num_input_tiles = 1; tt_metal::CircularBufferConfig cb_src0_config = @@ -114,13 +111,17 @@ int main(int argc, char** argv) { program, unary_reader_kernel, core, - {dram_buffer_src_addr, (std::uint32_t)dram_src_noc_xy.x, (std::uint32_t)dram_src_noc_xy.y, num_tiles}); + {dram_buffer_src_addr, + 0, + num_tiles}); tt_metal::SetRuntimeArgs( program, unary_writer_kernel, core, - {dram_buffer_dst_addr, (std::uint32_t)dram_dst_noc_xy.x, (std::uint32_t)dram_dst_noc_xy.y, num_tiles}); + {dram_buffer_dst_addr, + 0, + num_tiles}); tt_metal::detail::LaunchProgram(device, program); diff --git a/tests/tt_metal/tt_metal/test_datacopy_output_in_l1.cpp b/tests/tt_metal/tt_metal/test_datacopy_output_in_l1.cpp index a352719772b3..7156cc15c030 100644 --- a/tests/tt_metal/tt_metal/test_datacopy_output_in_l1.cpp +++ b/tests/tt_metal/tt_metal/test_datacopy_output_in_l1.cpp @@ -60,8 +60,8 @@ int main(int argc, char** argv) { auto dst_l1_buffer = CreateBuffer(l1_config); - auto dram_src_noc_xy = src_dram_buffer->noc_coordinates(); - auto l1_dst_noc_xy = dst_l1_buffer->noc_coordinates(); + auto l1_dst_noc_xy = + device->virtual_core_from_logical_core(dst_l1_buffer->logical_core_from_bank_id(0), CoreType::WORKER); // input CB is larger than the output CB, to test the backpressure from the output CB all the way into the input // CB CB_out size = 1 forces the serialization of packer and writer kernel, generating backpressure to math @@ -91,7 +91,7 @@ int main(int argc, char** argv) { auto unary_writer_kernel = tt_metal::CreateKernel( program, - "tt_metal/kernels/dataflow/writer_unary.cpp", + "tt_metal/kernels/dataflow/writer_unary_1.cpp", core, tt_metal::DataMovementConfig{ .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); @@ -122,9 +122,8 @@ int main(int argc, char** argv) { unary_reader_kernel, core, {src_dram_buffer->address(), - (std::uint32_t)dram_src_noc_xy.x, - (std::uint32_t)dram_src_noc_xy.y, - num_tiles}); + 0, 
+ num_tiles}); tt_metal::SetRuntimeArgs( program, diff --git a/tests/tt_metal/tt_metal/test_dataflow_cb.cpp b/tests/tt_metal/tt_metal/test_dataflow_cb.cpp index 969d5ebe6599..277424bfaac9 100644 --- a/tests/tt_metal/tt_metal/test_dataflow_cb.cpp +++ b/tests/tt_metal/tt_metal/test_dataflow_cb.cpp @@ -51,9 +51,6 @@ int main(int argc, char** argv) { auto dst_dram_buffer = CreateBuffer(dram_config); uint32_t dram_buffer_dst_addr = dst_dram_buffer->address(); - auto dram_src_noc_xy = src_dram_buffer->noc_coordinates(); - auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); - int num_cbs = 1; // works at the moment assert(num_tiles % num_cbs == 0); int num_tiles_per_cb = num_tiles / num_cbs; @@ -116,18 +113,16 @@ int main(int argc, char** argv) { reader_cb_kernel, core, {dram_buffer_src_addr, - (uint32_t)dram_src_noc_xy.x, - (uint32_t)dram_src_noc_xy.y, - (uint32_t)num_tiles_per_cb}); + 0, + (uint32_t)num_tiles_per_cb}); tt_metal::SetRuntimeArgs( program, writer_cb_kernel, core, {dram_buffer_dst_addr, - (uint32_t)dram_dst_noc_xy.x, - (uint32_t)dram_dst_noc_xy.y, - (uint32_t)num_tiles_per_cb}); + 0, + (uint32_t)num_tiles_per_cb}); tt_metal::detail::LaunchProgram(device, program); diff --git a/tests/tt_metal/tt_metal/test_dram_copy_sticks_multi_core.cpp b/tests/tt_metal/tt_metal/test_dram_copy_sticks_multi_core.cpp index 6ffc8727174f..a4700ecbe0b6 100644 --- a/tests/tt_metal/tt_metal/test_dram_copy_sticks_multi_core.cpp +++ b/tests/tt_metal/tt_metal/test_dram_copy_sticks_multi_core.cpp @@ -64,7 +64,6 @@ int main(int argc, char** argv) { auto src_dram_buffer = CreateBuffer(dram_config); uint32_t dram_buffer_src_addr = src_dram_buffer->address(); - auto dram_src_noc_xy = src_dram_buffer->noc_coordinates(); assert(src_dram_buffer->size() % (num_cores_r * num_cores_c) == 0); uint32_t per_core_l1_size = src_dram_buffer->size() / (num_cores_r * num_cores_c); std::unordered_map core_to_l1_addr; @@ -108,11 +107,10 @@ int main(int argc, char** argv) { unary_reader_kernel, 
core, {core_to_l1_addr.at(core), - dram_buffer_src_addr + (core_index * stick_size), - (std::uint32_t)dram_src_noc_xy.x, - (std::uint32_t)dram_src_noc_xy.y, - (std::uint32_t)1, - (std::uint32_t)stick_size}); + dram_buffer_src_addr + (core_index * stick_size), + 0, + (std::uint32_t) 1, + (std::uint32_t) stick_size}); core_index++; } } diff --git a/tests/tt_metal/tt_metal/test_dram_loopback_multi_core.cpp b/tests/tt_metal/tt_metal/test_dram_loopback_multi_core.cpp index fb7f2eac3c83..53c268c30800 100644 --- a/tests/tt_metal/tt_metal/test_dram_loopback_multi_core.cpp +++ b/tests/tt_metal/tt_metal/test_dram_loopback_multi_core.cpp @@ -80,9 +80,6 @@ int main(int argc, char** argv) { auto output_dram_buffer = CreateBuffer(dram_config); uint32_t dram_buffer_dst_addr = output_dram_buffer->address(); - auto input_dram_noc_xy = input_dram_buffer->noc_coordinates(); - auto output_dram_noc_xy = output_dram_buffer->noc_coordinates(); - // Loader (producer kernel) running on BRISC on logical core {0, 0} auto producer_kernel = tt_metal::CreateKernel( program, @@ -113,31 +110,29 @@ int main(int argc, char** argv) { producer_kernel, loader_logical_core, {dram_buffer_src_addr, - (uint32_t)input_dram_noc_xy.x, - (uint32_t)input_dram_noc_xy.y, - loader_buffer_address, - (uint32_t)writer_worker_core.x, - (uint32_t)writer_worker_core.y, - stream_register_address, - num_output_tiles, - transient_buffer_size_tiles, - transient_buffer_size_bytes}); + 0, + loader_buffer_address, + (uint32_t)writer_worker_core.x, + (uint32_t)writer_worker_core.y, + stream_register_address, + num_output_tiles, + transient_buffer_size_tiles, + transient_buffer_size_bytes}); tt_metal::SetRuntimeArgs( program, consumer_kernel, writer_logical_core, {loader_buffer_address, - (uint32_t)loader_worker_core.x, - (uint32_t)loader_worker_core.y, - dram_buffer_dst_addr, - (uint32_t)output_dram_noc_xy.x, - (uint32_t)output_dram_noc_xy.y, - writer_buffer_address, - stream_register_address, - num_output_tiles, - 
transient_buffer_size_tiles, - transient_buffer_size_bytes}); + (uint32_t)loader_worker_core.x, + (uint32_t)loader_worker_core.y, + dram_buffer_dst_addr, + 0, + writer_buffer_address, + stream_register_address, + num_output_tiles, + transient_buffer_size_tiles, + transient_buffer_size_bytes}); tt_metal::detail::LaunchProgram(device, program); diff --git a/tests/tt_metal/tt_metal/test_dram_loopback_multi_core_db.cpp b/tests/tt_metal/tt_metal/test_dram_loopback_multi_core_db.cpp index a8a1b47ea5ac..a432cfb39d6c 100644 --- a/tests/tt_metal/tt_metal/test_dram_loopback_multi_core_db.cpp +++ b/tests/tt_metal/tt_metal/test_dram_loopback_multi_core_db.cpp @@ -100,9 +100,6 @@ int main(int argc, char** argv) { // auto output_dram_buffer = tt_metal::CreateDramBuffer(device, dram_channel_id, dram_buffer_size, // dram_buffer_dst_addr); - auto input_dram_noc_xy = input_dram_buffer->noc_coordinates(); - auto output_dram_noc_xy = output_dram_buffer->noc_coordinates(); - // Loader (producer kernel) running on BRISC on logical core {0, 0} auto producer_kernel = tt_metal::CreateKernel( program, @@ -133,36 +130,36 @@ int main(int argc, char** argv) { producer_kernel, loader_logical_core, {dram_buffer_src_addr, - (uint32_t)input_dram_noc_xy.x, - (uint32_t)input_dram_noc_xy.y, - loader_buffer_address1, - loader_buffer_address2, - (uint32_t)writer_worker_core.x, - (uint32_t)writer_worker_core.y, - stream_register_address1, - stream_register_address2, - num_output_tiles, - transient_buffer_size_tiles, - transient_buffer_size_bytes}); + 0, + loader_buffer_address1, + loader_buffer_address2, + (uint32_t)writer_worker_core.x, + (uint32_t)writer_worker_core.y, + stream_register_address1, + stream_register_address2, + num_output_tiles, + transient_buffer_size_tiles, + transient_buffer_size_bytes} + ); tt_metal::SetRuntimeArgs( program, consumer_kernel, writer_logical_core, {loader_buffer_address1, - loader_buffer_address2, - (uint32_t)loader_worker_core.x, - (uint32_t)loader_worker_core.y, - 
dram_buffer_dst_addr, - (uint32_t)output_dram_noc_xy.x, - (uint32_t)output_dram_noc_xy.y, - writer_buffer_address1, - writer_buffer_address2, - stream_register_address1, - stream_register_address2, - num_output_tiles, - transient_buffer_size_tiles, - transient_buffer_size_bytes}); + loader_buffer_address2, + (uint32_t)loader_worker_core.x, + (uint32_t)loader_worker_core.y, + dram_buffer_dst_addr, + 0, + writer_buffer_address1, + writer_buffer_address2, + stream_register_address1, + stream_register_address2, + num_output_tiles, + transient_buffer_size_tiles, + transient_buffer_size_bytes} + ); tt_metal::detail::LaunchProgram(device, program); diff --git a/tests/tt_metal/tt_metal/test_dram_loopback_single_core.cpp b/tests/tt_metal/tt_metal/test_dram_loopback_single_core.cpp index eee7394ae90d..f4dc47278ed2 100644 --- a/tests/tt_metal/tt_metal/test_dram_loopback_single_core.cpp +++ b/tests/tt_metal/tt_metal/test_dram_loopback_single_core.cpp @@ -54,9 +54,6 @@ int main(int argc, char** argv) { auto output_dram_buffer = CreateBuffer(dram_config); uint32_t output_dram_buffer_addr = output_dram_buffer->address(); - auto input_dram_noc_xy = input_dram_buffer->noc_coordinates(); - auto output_dram_noc_xy = output_dram_buffer->noc_coordinates(); - auto dram_copy_kernel = tt_metal::CreateKernel( program, "tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy.cpp", @@ -80,13 +77,11 @@ int main(int argc, char** argv) { dram_copy_kernel, core, {l1_buffer_addr, - input_dram_buffer_addr, - (std::uint32_t)input_dram_noc_xy.x, - (std::uint32_t)input_dram_noc_xy.y, - output_dram_buffer_addr, - (std::uint32_t)output_dram_noc_xy.x, - (std::uint32_t)output_dram_noc_xy.y, - dram_buffer_size}); + input_dram_buffer_addr, + 0, + output_dram_buffer_addr, + 0, + dram_buffer_size}); tt_metal::detail::LaunchProgram(device, program); diff --git a/tests/tt_metal/tt_metal/test_dram_loopback_single_core_db.cpp b/tests/tt_metal/tt_metal/test_dram_loopback_single_core_db.cpp index 
ccbe3b14287d..261c1611a1f2 100644 --- a/tests/tt_metal/tt_metal/test_dram_loopback_single_core_db.cpp +++ b/tests/tt_metal/tt_metal/test_dram_loopback_single_core_db.cpp @@ -57,9 +57,6 @@ int main(int argc, char** argv) { auto output_dram_buffer = CreateBuffer(dram_config); uint32_t output_dram_buffer_addr = output_dram_buffer->address(); - auto input_dram_noc_xy = input_dram_buffer->noc_coordinates(); - auto output_dram_noc_xy = output_dram_buffer->noc_coordinates(); - auto dram_copy_kernel = tt_metal::CreateKernel( program, "tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy_db.cpp", @@ -83,16 +80,14 @@ int main(int argc, char** argv) { dram_copy_kernel, core, {input_dram_buffer_addr, - (std::uint32_t)input_dram_noc_xy.x, - (std::uint32_t)input_dram_noc_xy.y, - output_dram_buffer_addr, - (std::uint32_t)output_dram_noc_xy.x, - (std::uint32_t)output_dram_noc_xy.y, - dram_buffer_size_bytes, - num_tiles, - l1_buffer_addr, - total_l1_buffer_size_tiles, - total_l1_buffer_size_bytes}); + 0, + output_dram_buffer_addr, + 0, + dram_buffer_size_bytes, + num_tiles, + l1_buffer_addr, + total_l1_buffer_size_tiles, + total_l1_buffer_size_bytes}); tt_metal::detail::LaunchProgram(device, program); diff --git a/tests/tt_metal/tt_metal/test_dram_to_l1_multicast.cpp b/tests/tt_metal/tt_metal/test_dram_to_l1_multicast.cpp index a7cfe0466e24..76df1018d741 100644 --- a/tests/tt_metal/tt_metal/test_dram_to_l1_multicast.cpp +++ b/tests/tt_metal/tt_metal/test_dram_to_l1_multicast.cpp @@ -55,8 +55,6 @@ int main(int argc, char** argv) { auto dram_buffer = CreateBuffer(dram_config); uint32_t dram_buffer_addr = dram_buffer->address(); - auto dram_noc_xy = dram_buffer->noc_coordinates(); - CoreCoord core_start = {0, 0}; CoreCoord grid_size = device->logical_grid_size(); CoreCoord core_end = {core_start.x + (grid_size.x - 1), core_start.y + (grid_size.y - 1)}; @@ -64,8 +62,7 @@ int main(int argc, char** argv) { auto core_end_physical = device->worker_core_from_logical_core(core_end); const 
std::array mcast_reader_args = { (std::uint32_t)dram_buffer_addr, - (std::uint32_t)dram_noc_xy.x, - (std::uint32_t)dram_noc_xy.y, + (std::uint32_t) 0, (std::uint32_t)dram_buffer_size, (std::uint32_t)local_buffer_addr, (std::uint32_t)dest_buffer_addr, diff --git a/tests/tt_metal/tt_metal/test_dram_to_l1_multicast_loopback_src.cpp b/tests/tt_metal/tt_metal/test_dram_to_l1_multicast_loopback_src.cpp index 28030b22d97c..767218752f9a 100644 --- a/tests/tt_metal/tt_metal/test_dram_to_l1_multicast_loopback_src.cpp +++ b/tests/tt_metal/tt_metal/test_dram_to_l1_multicast_loopback_src.cpp @@ -51,7 +51,6 @@ int main(int argc, char** argv) { .buffer_type = tt_metal::BufferType::DRAM}; auto dram_buffer = CreateBuffer(dram_config); uint32_t dram_buffer_addr = dram_buffer->address(); - auto dram_noc_xy = dram_buffer->noc_coordinates(); CoreCoord core_start = {0, 0}; CoreCoord grid_size = device->logical_grid_size(); @@ -60,8 +59,7 @@ int main(int argc, char** argv) { auto core_end_physical = device->worker_core_from_logical_core(core_end); const std::array mcast_reader_args = { (std::uint32_t)dram_buffer_addr, - (std::uint32_t)dram_noc_xy.x, - (std::uint32_t)dram_noc_xy.y, + 0, (std::uint32_t)dram_buffer_size, (std::uint32_t)local_buffer_addr, (std::uint32_t)dest_buffer_addr, diff --git a/tests/tt_metal/tt_metal/test_eltwise_binary.cpp b/tests/tt_metal/tt_metal/test_eltwise_binary.cpp index b6d0d8564a62..60cdf9df7e44 100644 --- a/tests/tt_metal/tt_metal/test_eltwise_binary.cpp +++ b/tests/tt_metal/tt_metal/test_eltwise_binary.cpp @@ -77,10 +77,6 @@ int main(int argc, char** argv) { uint32_t dram_buffer_dst_addr = dst_dram_buffer->address(); - auto dram_src0_noc_xy = src0_dram_buffer->noc_coordinates(); - auto dram_src1_noc_xy = src1_dram_buffer->noc_coordinates(); - auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); - uint32_t src0_cb_index = tt::CBIndex::c_0; uint32_t num_input_tiles = 2; tt_metal::CircularBufferConfig cb_src0_config = @@ -158,19 +154,17 @@ int main(int 
argc, char** argv) { EnqueueWriteBuffer(cq, std::ref(src1_dram_buffer), src1_vec, false); - const std::array reader_args = { + const std::array reader_args = { dram_buffer_src0_addr, - (std::uint32_t)dram_src0_noc_xy.x, - (std::uint32_t)dram_src0_noc_xy.y, + 0, num_tiles, dram_buffer_src1_addr, - (std::uint32_t)dram_src1_noc_xy.x, - (std::uint32_t)dram_src1_noc_xy.y, + 0, num_tiles, 0}; - const std::array writer_args = { - dram_buffer_dst_addr, (std::uint32_t)dram_dst_noc_xy.x, (std::uint32_t)dram_dst_noc_xy.y, num_tiles}; + const std::array writer_args = { + dram_buffer_dst_addr, 0, num_tiles}; SetRuntimeArgs(program, unary_writer_kernel, core, writer_args); SetRuntimeArgs(program, binary_reader_kernel, core, reader_args); diff --git a/tests/tt_metal/tt_metal/test_enqueue_program.cpp b/tests/tt_metal/tt_metal/test_enqueue_program.cpp index d31f07dee625..d685a0443028 100644 --- a/tests/tt_metal/tt_metal/test_enqueue_program.cpp +++ b/tests/tt_metal/tt_metal/test_enqueue_program.cpp @@ -35,9 +35,6 @@ tt_metal::Program generate_eltwise_unary_program(Device* device) { auto dst_dram_buffer = CreateBuffer(dram_config); uint32_t dram_buffer_dst_addr = dst_dram_buffer->address(); - auto dram_src0_noc_xy = src0_dram_buffer->noc_coordinates(); - auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); - uint32_t src0_cb_index = 0; uint32_t num_input_tiles = 2; tt_metal::CircularBufferConfig src_cb_config = @@ -99,9 +96,9 @@ void test_enqueue_program(std::functionriscv0_id.value(), worker_core, {out.address(), 0, 0, NUM_TILES}); - SetRuntimeArgs(program, kernel_group->riscv1_id.value(), worker_core, {buf.address(), 0, 0, NUM_TILES}); + const KernelGroup *kernel_group = program.kernels_on_core(worker_core, CoreType::WORKER); + SetRuntimeArgs(program, kernel_group->riscv0_id.value(), worker_core, {out.address(), 0, NUM_TILES}); + SetRuntimeArgs(program, kernel_group->riscv1_id.value(), worker_core, {buf.address(), 0, NUM_TILES}); EnqueueWriteBuffer(cq, std::ref(buf), inp, 
false); EnqueueProgram(cq, program, false); diff --git a/tests/tt_metal/tt_metal/test_flatten.cpp b/tests/tt_metal/tt_metal/test_flatten.cpp index 27353d8e45e5..508d5429efc1 100644 --- a/tests/tt_metal/tt_metal/test_flatten.cpp +++ b/tests/tt_metal/tt_metal/test_flatten.cpp @@ -104,8 +104,6 @@ int main(int argc, char** argv) { auto dst_dram_buffer = CreateBuffer(dram_config); uint32_t dram_buffer_dst_addr = dst_dram_buffer->address(); - auto dram_src_noc_xy = src_dram_buffer->noc_coordinates(); - auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); // input CB is larger than the output CB, to test the backpressure from the output CB all the way into the input // CB CB_out size = 1 forces the serialization of packer and writer kernel, generating backpressure to math @@ -168,17 +166,18 @@ int main(int argc, char** argv) { flatten_kernel, core, {dram_buffer_src_addr, - (std::uint32_t)dram_src_noc_xy.x, - (std::uint32_t)dram_src_noc_xy.y, - num_tiles_r, - num_tiles_c, - num_bytes_per_tensor_row}); + 0, + num_tiles_r, + num_tiles_c, + num_bytes_per_tensor_row}); tt_metal::SetRuntimeArgs( program, unary_writer_kernel, core, - {dram_buffer_dst_addr, (std::uint32_t)dram_dst_noc_xy.x, (std::uint32_t)dram_dst_noc_xy.y, num_tiles * 32}); + {dram_buffer_dst_addr, + 0, + num_tiles * 32}); tt_metal::detail::LaunchProgram(device, program); diff --git a/tests/tt_metal/tt_metal/test_generic_binary_reader_matmul_large_block.cpp b/tests/tt_metal/tt_metal/test_generic_binary_reader_matmul_large_block.cpp index f1beb8c21e88..b42e28099cfe 100644 --- a/tests/tt_metal/tt_metal/test_generic_binary_reader_matmul_large_block.cpp +++ b/tests/tt_metal/tt_metal/test_generic_binary_reader_matmul_large_block.cpp @@ -185,10 +185,6 @@ int main(int argc, char** argv) { auto src1_dram_buffer = CreateBuffer(weights_config); auto dst_dram_buffer = CreateBuffer(dst_config); - auto dram_src0_noc_xy = src0_dram_buffer->noc_coordinates(); - auto dram_src1_noc_xy = src1_dram_buffer->noc_coordinates(); 
- auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); - uint32_t src0_cb_index = 0; uint32_t cb0_tiles = M * in0_block_w * 2; tt_metal::CircularBufferConfig cb_src0_config = @@ -247,11 +243,9 @@ int main(int argc, char** argv) { const std::array generic_binary_reader_args{ src0_dram_buffer->address(), - (uint32_t)dram_src0_noc_xy.x, - (uint32_t)dram_src0_noc_xy.y, + (uint32_t) 0, src1_dram_buffer->address(), - (uint32_t)dram_src1_noc_xy.x, - (uint32_t)dram_src1_noc_xy.y, + (uint32_t) 0, (uint32_t)source_addresses.size(), (uint32_t)source_addresses_in_l1_addr, (uint32_t)num_blocks, @@ -270,17 +264,14 @@ int main(int argc, char** argv) { const std::array writer_rt_args{ dst_dram_buffer->address(), - (std::uint32_t)dram_dst_noc_xy.x, - (std::uint32_t)dram_dst_noc_xy.y, - (std::uint32_t)out_subblock_h, // num tiles per sub block m - (std::uint32_t)out_subblock_w, // num tiles per sub block n - (std::uint32_t)M / out_subblock_h, // num sub blocks m - (std::uint32_t)N / out_subblock_w, // num sub blocks n - (std::uint32_t)out_subblock_w * single_tile_size * - (N / out_subblock_w), // bytes offset to next row within sub-block - (std::uint32_t)out_subblock_h * out_subblock_w * single_tile_size * - (N / out_subblock_w), // bytes offset to next row of sub-blocks - (std::uint32_t)out_subblock_w * single_tile_size}; // bytes offset to next sub-block + (std::uint32_t) 0, + (std::uint32_t)out_subblock_h, // num tiles per sub block m + (std::uint32_t)out_subblock_w, // num tiles per sub block n + (std::uint32_t)M/out_subblock_h, // num sub blocks m + (std::uint32_t)N/out_subblock_w, // num sub blocks n + (std::uint32_t)out_subblock_w * single_tile_size * (N/out_subblock_w), // bytes offset to next row within sub-block + (std::uint32_t)out_subblock_h * out_subblock_w * single_tile_size * (N/out_subblock_w), // bytes offset to next row of sub-blocks + (std::uint32_t)out_subblock_w*single_tile_size}; // bytes offset to next sub-block auto unary_writer_kernel = 
tt_metal::CreateKernel( program, diff --git a/tests/tt_metal/tt_metal/test_interleaved_layouts.cpp b/tests/tt_metal/tt_metal/test_interleaved_layouts.cpp index 749ec5c2c19f..7d5a5e632d66 100644 --- a/tests/tt_metal/tt_metal/test_interleaved_layouts.cpp +++ b/tests/tt_metal/tt_metal/test_interleaved_layouts.cpp @@ -121,8 +121,6 @@ bool interleaved_stick_reader_single_bank_tilized_writer_datacopy_test(const tt: auto dst_dram_buffer = CreateBuffer(dst_config); uint32_t dram_buffer_dst_addr = dst_dram_buffer->address(); - auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); - // input CB is larger than the output CB, to test the backpressure from the output CB all the way into the input // CB CB_out size = 1 forces the serialization of packer and writer kernel, generating backpressure to math // kernel, input CB and reader @@ -186,9 +184,8 @@ bool interleaved_stick_reader_single_bank_tilized_writer_datacopy_test(const tt: unary_writer_kernel, core, {dram_buffer_dst_addr, - (uint32_t)dram_dst_noc_xy.x, - (uint32_t)dram_dst_noc_xy.y, - (uint32_t)num_output_tiles}); + (uint32_t) 0, + (uint32_t) num_output_tiles}); CoreCoord debug_core = {1, 1}; @@ -288,8 +285,6 @@ bool interleaved_tilized_reader_interleaved_stick_writer_datacopy_test(const tt: auto dst_dram_buffer = CreateBuffer(dram_config); uint32_t dram_buffer_dst_addr = dst_dram_buffer->address(); - auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); - // input CB is larger than the output CB, to test the backpressure from the output CB all the way into the input // CB CB_out size = 1 forces the serialization of packer and writer kernel, generating backpressure to math // kernel, input CB and reader @@ -475,7 +470,11 @@ bool test_interleaved_l1_datacopy(const tt::ARCH& arch) { if constexpr (dst_is_in_l1) { dst = CreateBuffer(l1_config); - tt_metal::SetRuntimeArgs(program, unary_writer_kernel, core, {dst->address(), 0, 0, num_pages}); + tt_metal::SetRuntimeArgs( + program, + unary_writer_kernel, + core, + 
{dst->address(), 0, num_pages}); tt_metal::detail::LaunchProgram(device, program); @@ -484,7 +483,11 @@ bool test_interleaved_l1_datacopy(const tt::ARCH& arch) { } else { dst = CreateBuffer(dram_config); - tt_metal::SetRuntimeArgs(program, unary_writer_kernel, core, {dst->address(), 0, 0, num_pages}); + tt_metal::SetRuntimeArgs( + program, + unary_writer_kernel, + core, + {dst->address(), 0, num_pages}); tt_metal::detail::LaunchProgram(device, program); diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/reader_binary.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/reader_binary.cpp index 60105fd134cc..d3d7774760b3 100644 --- a/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/reader_binary.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/reader_binary.cpp @@ -9,13 +9,12 @@ void kernel_main() { const uint32_t in0_cb = get_compile_time_arg_val(0); const uint32_t in1_cb = get_compile_time_arg_val(1); + uint32_t src0_addr = get_arg_val(0); - uint32_t src0_noc_x = get_arg_val(1); - uint32_t src0_noc_y = get_arg_val(2); - uint32_t src1_addr = get_arg_val(3); - uint32_t src1_noc_x = get_arg_val(4); - uint32_t src1_noc_y = get_arg_val(5); - uint32_t num_tiles = get_arg_val(6); + uint32_t src0_dram_bank_id = get_arg_val(1); + uint32_t src1_addr = get_arg_val(2); + uint32_t src1_dram_bank_id = get_arg_val(3); + uint32_t num_tiles = get_arg_val(4); // single-tile ublocks uint32_t ublock_size_bytes_0 = get_tile_size(in0_cb); @@ -27,8 +26,8 @@ void kernel_main() { // read ublocks from src0/src1 to CB0/CB1, then push ublocks to compute (unpacker) for (uint32_t i = 0; i < num_tiles; i += ublock_size_tiles) { - uint64_t src0_noc_addr = get_noc_addr(src0_noc_x, src0_noc_y, src0_addr); - uint64_t src1_noc_addr = get_noc_addr(src1_noc_x, src1_noc_y, src1_addr); + uint64_t src0_noc_addr = get_noc_addr_from_bank_id(src0_dram_bank_id, src0_addr); + uint64_t src1_noc_addr = 
get_noc_addr_from_bank_id(src1_dram_bank_id, src1_addr); cb_reserve_back(in0_cb, ublock_size_tiles); cb_reserve_back(in1_cb, ublock_size_tiles); diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/reader_binary_blocked.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/reader_binary_blocked.cpp index 4beec5f136e9..4987633d1b9b 100644 --- a/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/reader_binary_blocked.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/reader_binary_blocked.cpp @@ -9,24 +9,23 @@ void kernel_main() { const uint32_t in0_cb = get_compile_time_arg_val(0); const uint32_t in1_cb = get_compile_time_arg_val(1); + uint32_t src0_addr = get_arg_val(0); - uint32_t src0_noc_x = get_arg_val(1); - uint32_t src0_noc_y = get_arg_val(2); - uint32_t src1_addr = get_arg_val(3); - uint32_t src1_noc_x = get_arg_val(4); - uint32_t src1_noc_y = get_arg_val(5); - uint32_t num_blocks = get_arg_val(6); - uint32_t in0_block_tile_cnt = get_arg_val(7); - uint32_t in1_block_tile_cnt = get_arg_val(8); - uint32_t in0_block_size_bytes = get_arg_val(9); - uint32_t in1_block_size_bytes = get_arg_val(10); + uint32_t src0_dram_bank_id = get_arg_val(1); + uint32_t src1_addr = get_arg_val(2); + uint32_t src1_dram_bank_id = get_arg_val(3); + uint32_t num_blocks = get_arg_val(4); + uint32_t in0_block_tile_cnt = get_arg_val(5); + uint32_t in1_block_tile_cnt = get_arg_val(6); + uint32_t in0_block_size_bytes = get_arg_val(7); + uint32_t in1_block_size_bytes = get_arg_val(8); uint32_t l1_write_addr_in0; uint32_t l1_write_addr_in1; for (uint32_t i = 0; i < num_blocks; i++) { - uint64_t src0_noc_addr = get_noc_addr(src0_noc_x, src0_noc_y, src0_addr); - uint64_t src1_noc_addr = get_noc_addr(src1_noc_x, src1_noc_y, src1_addr); + uint64_t src0_noc_addr = get_noc_addr_from_bank_id(src0_dram_bank_id, src0_addr); + uint64_t src1_noc_addr = get_noc_addr_from_bank_id(src1_dram_bank_id, src1_addr); 
cb_reserve_back(in0_cb, in0_block_tile_cnt); cb_reserve_back(in1_cb, in1_block_tile_cnt); diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/writer_unary.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/writer_unary.cpp index 3caec0ae5675..9efe017fd3f0 100644 --- a/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/writer_unary.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/writer_unary.cpp @@ -7,16 +7,15 @@ void kernel_main() { const uint32_t out_cb = get_compile_time_arg_val(0); uint32_t dst_addr = get_arg_val(0); - uint32_t dst_noc_x = get_arg_val(1); - uint32_t dst_noc_y = get_arg_val(2); - uint32_t num_tiles = get_arg_val(3); + uint32_t dst_dram_bank_id_addr = get_arg_val(1); + uint32_t num_tiles = get_arg_val(2); // single-tile ublocks uint32_t ublock_size_bytes = get_tile_size(out_cb); uint32_t ublock_size_tiles = 1; for (uint32_t i = 0; i < num_tiles; i += ublock_size_tiles) { - uint64_t dst_noc_addr = get_noc_addr(dst_noc_x, dst_noc_y, dst_addr); + uint64_t dst_noc_addr = get_noc_addr_from_bank_id(dst_dram_bank_id_addr, dst_addr); cb_wait_front(out_cb, ublock_size_tiles); uint32_t l1_read_addr = get_read_ptr(out_cb); diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/direct_reader_unary.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/direct_reader_unary.cpp index df99b8def697..4d2c3c3c4f0e 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/direct_reader_unary.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/direct_reader_unary.cpp @@ -9,21 +9,24 @@ void kernel_main() { const uint32_t cb_id = get_compile_time_arg_val(0); uint32_t src_addr = get_arg_val(0); - uint32_t src_noc_x = get_arg_val(1); - uint32_t src_noc_y = get_arg_val(2); - uint32_t num_tiles = get_arg_val(3); - + uint32_t src_bank_id = get_arg_val(1); + uint32_t num_tiles = get_arg_val(2); +#if INTERFACE_WITH_L1 == 1 + constexpr bool read_from_dram = false; +#else + constexpr bool 
read_from_dram = true; +#endif // ublocks size defined in tiles constexpr uint32_t ublock_size_tiles = 1; uint32_t ublock_size_bytes = get_tile_size(cb_id) * ublock_size_tiles; // read a ublock of tiles from src to CB, and then push the ublock to unpacker for (uint32_t i = 0; i < num_tiles; i += ublock_size_tiles) { - uint64_t src_noc_addr = get_noc_addr(src_noc_x, src_noc_y, src_addr); + uint64_t src_buffer_noc_addr = get_noc_addr_from_bank_id(src_bank_id, src_addr); cb_reserve_back(cb_id, ublock_size_tiles); uint32_t l1_write_addr = get_write_ptr(cb_id); - noc_async_read(src_noc_addr, l1_write_addr, ublock_size_bytes); + noc_async_read(src_buffer_noc_addr, l1_write_addr, ublock_size_bytes); noc_async_read_barrier(); diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/direct_writer_unary.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/direct_writer_unary.cpp index 110fc2a85991..d89d10eae54a 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/direct_writer_unary.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/direct_writer_unary.cpp @@ -7,21 +7,26 @@ void kernel_main() { const uint32_t cb_id = get_compile_time_arg_val(0); uint32_t dst_addr = get_arg_val(0); - uint32_t dst_noc_x = get_arg_val(1); - uint32_t dst_noc_y = get_arg_val(2); - uint32_t num_tiles = get_arg_val(3); + uint32_t bank_id = get_arg_val(1); + uint32_t num_tiles = get_arg_val(2); + +#if INTERFACE_WITH_L1 == 1 + constexpr bool write_to_dram = false; +#else + constexpr bool write_to_dram = true; +#endif // single-tile ublocks uint32_t ublock_size_bytes = get_tile_size(cb_id); uint32_t ublock_size_tiles = 1; for (uint32_t i = 0; i < num_tiles; i += ublock_size_tiles) { - uint64_t dst_noc_addr = get_noc_addr(dst_noc_x, dst_noc_y, dst_addr); + uint64_t dst_buffer_noc_addr = get_noc_addr_from_bank_id(bank_id, dst_addr); cb_wait_front(cb_id, ublock_size_tiles); uint32_t l1_read_addr = get_read_ptr(cb_id); - noc_async_write(l1_read_addr, dst_noc_addr, ublock_size_bytes); + 
noc_async_write(l1_read_addr, dst_buffer_noc_addr, ublock_size_bytes); noc_async_write_barrier(); cb_pop_front(cb_id, ublock_size_tiles); diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy.cpp index e0f12a23715a..fff8cc55a870 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy.cpp @@ -12,15 +12,13 @@ void kernel_main() { std::uint32_t l1_buffer_addr = get_arg_val(0); - std::uint32_t dram_buffer_src_addr = get_arg_val(1); - std::uint32_t dram_src_noc_x = get_arg_val(2); - std::uint32_t dram_src_noc_y = get_arg_val(3); + std::uint32_t dram_buffer_src_addr = get_arg_val(1); + std::uint32_t dram_src_bank_id = get_arg_val(2); - std::uint32_t dram_buffer_dst_addr = get_arg_val(4); - std::uint32_t dram_dst_noc_x = get_arg_val(5); - std::uint32_t dram_dst_noc_y = get_arg_val(6); + std::uint32_t dram_buffer_dst_addr = get_arg_val(3); + std::uint32_t dram_dst_bank_id = get_arg_val(4); - std::uint32_t dram_buffer_size = get_arg_val(7); + std::uint32_t dram_buffer_size = get_arg_val(5); #if defined(SIGNAL_COMPLETION_TO_DISPATCHER) // We will assert later. This kernel will hang. 
@@ -40,12 +38,12 @@ void kernel_main() { #endif // DRAM NOC src address - std::uint64_t dram_buffer_src_noc_addr = get_noc_addr(dram_src_noc_x, dram_src_noc_y, dram_buffer_src_addr); + std::uint64_t dram_buffer_src_noc_addr = get_noc_addr_from_bank_id(dram_src_bank_id, dram_buffer_src_addr); noc_async_read(dram_buffer_src_noc_addr, l1_buffer_addr, dram_buffer_size); noc_async_read_barrier(); // DRAM NOC dst address - std::uint64_t dram_buffer_dst_noc_addr = get_noc_addr(dram_dst_noc_x, dram_dst_noc_y, dram_buffer_dst_addr); + std::uint64_t dram_buffer_dst_noc_addr = get_noc_addr_from_bank_id(dram_dst_bank_id, dram_buffer_dst_addr); noc_async_write(l1_buffer_addr, dram_buffer_dst_noc_addr, dram_buffer_size); noc_async_write_barrier(); } diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy_db.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy_db.cpp index b8a2756fbae0..4305e7de0eb8 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy_db.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy_db.cpp @@ -11,20 +11,18 @@ * APIs explicit flushes need to be used since the calls are non-blocking * */ void kernel_main() { - std::uint32_t dram_buffer_src_addr_base = get_arg_val(0); - std::uint32_t dram_src_noc_x = get_arg_val(1); - std::uint32_t dram_src_noc_y = get_arg_val(2); + std::uint32_t dram_buffer_src_addr_base = get_arg_val(0); + std::uint32_t dram_src_bank_id = get_arg_val(1); - std::uint32_t dram_buffer_dst_addr_base = get_arg_val(3); - std::uint32_t dram_dst_noc_x = get_arg_val(4); - std::uint32_t dram_dst_noc_y = get_arg_val(5); + std::uint32_t dram_buffer_dst_addr_base = get_arg_val(2); + std::uint32_t dram_dst_bank_id = get_arg_val(3); - std::uint32_t dram_buffer_size = get_arg_val(6); - std::uint32_t num_tiles = get_arg_val(7); + std::uint32_t dram_buffer_size = get_arg_val(4); + std::uint32_t num_tiles = get_arg_val(5); - std::uint32_t l1_buffer_addr = get_arg_val(8); - std::uint32_t l1_buffer_size_tiles = 
get_arg_val(9); - std::uint32_t l1_buffer_size_bytes = get_arg_val(10); + std::uint32_t l1_buffer_addr = get_arg_val(6); + std::uint32_t l1_buffer_size_tiles = get_arg_val(7); + std::uint32_t l1_buffer_size_bytes = get_arg_val(8); std::uint32_t rd_wr_l1_buffer_size_tiles = l1_buffer_size_tiles / 2; std::uint32_t rd_wr_l1_buffer_size_bytes = l1_buffer_size_bytes / 2; @@ -41,7 +39,7 @@ void kernel_main() { std::uint32_t l1_addr2 = l1_buffer_addr + rd_wr_l1_buffer_size_bytes; // DRAM NOC src address - dram_buffer_src_noc_addr = get_noc_addr(dram_src_noc_x, dram_src_noc_y, dram_buffer_src_addr); + dram_buffer_src_noc_addr = get_noc_addr_from_bank_id(dram_src_bank_id, dram_buffer_src_addr); // Copy data from DRAM into destination L1 buffer noc_async_read(dram_buffer_src_noc_addr, l1_addr1, rd_wr_l1_buffer_size_bytes); @@ -50,9 +48,9 @@ void kernel_main() { while (num_tiles_read < num_tiles) { // DRAM NOC src address - dram_buffer_src_noc_addr = get_noc_addr(dram_src_noc_x, dram_src_noc_y, dram_buffer_src_addr); + dram_buffer_src_noc_addr = get_noc_addr_from_bank_id(dram_src_bank_id, dram_buffer_src_addr); // DRAM NOC dst address - dram_buffer_dst_noc_addr = get_noc_addr(dram_dst_noc_x, dram_dst_noc_y, dram_buffer_dst_addr); + dram_buffer_dst_noc_addr = get_noc_addr_from_bank_id(dram_dst_bank_id, dram_buffer_dst_addr); noc_async_read(dram_buffer_src_noc_addr, l1_addr2, rd_wr_l1_buffer_size_bytes); dram_buffer_src_addr += rd_wr_l1_buffer_size_bytes; @@ -77,7 +75,7 @@ void kernel_main() { } // DRAM NOC dst address - dram_buffer_dst_noc_addr = get_noc_addr(dram_dst_noc_x, dram_dst_noc_y, dram_buffer_dst_addr); + dram_buffer_dst_noc_addr = get_noc_addr_from_bank_id(dram_dst_bank_id, dram_buffer_dst_addr); noc_async_write(l1_addr2, dram_buffer_dst_noc_addr, rd_wr_l1_buffer_size_bytes); // Wait for all the writes to complete (ie acked) noc_async_write_barrier(); diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy_sticks.cpp 
b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy_sticks.cpp index fec10af9f428..ce07d238d134 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy_sticks.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy_sticks.cpp @@ -12,16 +12,15 @@ void kernel_main() { std::uint32_t l1_buffer_addr = get_arg_val(0); - std::uint32_t dram_buffer_src_addr = get_arg_val(1); - std::uint32_t dram_src_noc_x = get_arg_val(2); - std::uint32_t dram_src_noc_y = get_arg_val(3); + std::uint32_t dram_buffer_src_addr = get_arg_val(1); + std::uint32_t bank_id = get_arg_val(2); - std::uint32_t num_sticks = get_arg_val(4); - std::uint32_t stick_size = get_arg_val(5); - for (uint32_t i = 0; i < 1; i++) { - for (uint32_t stick_id = 0; stick_id < num_sticks; stick_id++) { + std::uint32_t num_sticks = get_arg_val(3); + std::uint32_t stick_size = get_arg_val(4); + for(uint32_t i = 0; i < 1; i++) { + for(uint32_t stick_id = 0; stick_id < num_sticks; stick_id++) { // DRAM NOC src address - std::uint64_t dram_buffer_src_noc_addr = get_noc_addr(dram_src_noc_x, dram_src_noc_y, dram_buffer_src_addr); + std::uint64_t dram_buffer_src_noc_addr = get_noc_addr_from_bank_id(bank_id, dram_buffer_src_addr); noc_async_read(dram_buffer_src_noc_addr, l1_buffer_addr, stick_size); noc_async_read_barrier(); l1_buffer_addr += stick_size; diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy_to_noc_coord.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy_to_noc_coord.cpp new file mode 100644 index 000000000000..95c7566cbfbe --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy_to_noc_coord.cpp @@ -0,0 +1,51 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include + +/** + * NOC APIs are prefixed w/ "ncrisc" (legacy name) but there's nothing NCRISC specific, they can be used on BRISC or + * other RISCs Any two RISC processors cannot use the same CMD_BUF non_blocking APIs shouldn't be mixed with slow noc.h + * APIs explicit flushes need to be used since the calls are non-blocking + * */ +void kernel_main() { + std::uint32_t local_l1_buffer_addr = get_arg_val(0); + + std::uint32_t l1_buffer_src_addr = get_arg_val(1); + std::uint32_t l1_src_noc_x = get_arg_val(2); + std::uint32_t l1_src_noc_y = get_arg_val(3); + + std::uint32_t l1_buffer_dst_addr = get_arg_val(4); + std::uint32_t l1_dst_noc_x = get_arg_val(5); + std::uint32_t l1_dst_noc_y = get_arg_val(6); + + std::uint32_t l1_buffer_size = get_arg_val(7); + +#if defined(SIGNAL_COMPLETION_TO_DISPATCHER) + // We will assert later. This kernel will hang. + // Need to signal completion to dispatcher before hanging so that + // Dispatcher Kernel is able to finish. + // Device Close () requires fast dispatch kernels to finish. 
+#if defined(COMPILE_FOR_ERISC) + tt_l1_ptr mailboxes_t* const mailboxes = (tt_l1_ptr mailboxes_t*)(eth_l1_mem::address_map::ERISC_MEM_MAILBOX_BASE); +#else + tt_l1_ptr mailboxes_t* const mailboxes = (tt_l1_ptr mailboxes_t*)(MEM_MAILBOX_BASE); +#endif + uint64_t dispatch_addr = NOC_XY_ADDR( + NOC_X(mailboxes->go_message.master_x), + NOC_Y(mailboxes->go_message.master_y), + DISPATCH_MESSAGE_ADDR + mailboxes->go_message.dispatch_message_offset); + noc_fast_atomic_increment(noc_index, NCRISC_AT_CMD_BUF, dispatch_addr, NOC_UNICAST_WRITE_VC, 1, 31, false); +#endif + + // DRAM NOC src address + std::uint64_t l1_buffer_src_noc_addr = get_noc_addr(l1_src_noc_x, l1_src_noc_y, l1_buffer_src_addr); + noc_async_read(l1_buffer_src_noc_addr, local_l1_buffer_addr, l1_buffer_size); + noc_async_read_barrier(); + + // DRAM NOC dst address + std::uint64_t l1_buffer_dst_noc_addr = get_noc_addr(l1_dst_noc_x, l1_dst_noc_y, l1_buffer_dst_addr); + noc_async_write(local_l1_buffer_addr, l1_buffer_dst_noc_addr, l1_buffer_size); + noc_async_write_barrier(); +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_loader_sync.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_loader_sync.cpp index 02f7104aed63..ad3296e7fd4c 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_loader_sync.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_loader_sync.cpp @@ -12,16 +12,15 @@ constexpr static std::uint32_t VALID_VAL = 0x1234; constexpr static std::uint32_t INVALID_VAL = 0x4321; void kernel_main() { - std::uint32_t dram_buffer_src_addr_base = get_arg_val(0); - std::uint32_t dram_src_noc_x = get_arg_val(1); - std::uint32_t dram_src_noc_y = get_arg_val(2); - std::uint32_t local_buffer_addr = get_arg_val(3); - std::uint32_t consumer_core_noc_x = get_arg_val(4); - std::uint32_t consumer_core_noc_y = get_arg_val(5); - std::uint32_t stream_register_address = get_arg_val(6); - std::uint32_t num_tiles = get_arg_val(7); - std::uint32_t transient_buffer_size_tiles = 
get_arg_val(8); - std::uint32_t transient_buffer_size_bytes = get_arg_val(9); + std::uint32_t dram_buffer_src_addr_base = get_arg_val(0); + std::uint32_t bank_id = get_arg_val(1); + std::uint32_t local_buffer_addr = get_arg_val(2); + std::uint32_t consumer_core_noc_x = get_arg_val(3); + std::uint32_t consumer_core_noc_y = get_arg_val(4); + std::uint32_t stream_register_address = get_arg_val(5); + std::uint32_t num_tiles = get_arg_val(6); + std::uint32_t transient_buffer_size_tiles = get_arg_val(7); + std::uint32_t transient_buffer_size_bytes = get_arg_val(8); // Scratch address in L1, to write register value before we copy it to into local/remote registers volatile tt_l1_ptr uint32_t* constant_ptr = reinterpret_cast(CONSTANT_REGISTER_VALUE); @@ -35,7 +34,7 @@ void kernel_main() { std::uint32_t dram_buffer_src_addr = dram_buffer_src_addr_base; while (counter < num_tiles) { // DRAM NOC src address - std::uint64_t dram_buffer_src_noc_addr = get_noc_addr(dram_src_noc_x, dram_src_noc_y, dram_buffer_src_addr); + std::uint64_t dram_buffer_src_noc_addr = get_noc_addr_from_bank_id(bank_id, dram_buffer_src_addr); // Wait until sync register is INVALID_VAL (means its safe to corrupt destination buffer) wait_for_sync_register_value(stream_register_address, INVALID_VAL); // Copy data from dram into destination buffer diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_loader_sync_db.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_loader_sync_db.cpp index fe07a8d15a06..22804b2bcd67 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_loader_sync_db.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_loader_sync_db.cpp @@ -20,18 +20,17 @@ inline std::uint32_t ping_pong_address(std::uint32_t addr1, std::uint32_t addr2, } } void kernel_main() { - std::uint32_t dram_buffer_src_addr_base = get_arg_val(0); - std::uint32_t dram_src_noc_x = get_arg_val(1); - std::uint32_t dram_src_noc_y = get_arg_val(2); - std::uint32_t local_buffer_addr1 = 
get_arg_val(3); - std::uint32_t local_buffer_addr2 = get_arg_val(4); - std::uint32_t consumer_core_noc_x = get_arg_val(5); - std::uint32_t consumer_core_noc_y = get_arg_val(6); - std::uint32_t stream_register_address1 = get_arg_val(7); - std::uint32_t stream_register_address2 = get_arg_val(8); - std::uint32_t num_tiles = get_arg_val(9); - std::uint32_t transient_buffer_size_tiles = get_arg_val(10); - std::uint32_t transient_buffer_size_bytes = get_arg_val(11); + std::uint32_t dram_buffer_src_addr_base = get_arg_val(0); + std::uint32_t bank_id = get_arg_val(1); + std::uint32_t local_buffer_addr1 = get_arg_val(2); + std::uint32_t local_buffer_addr2 = get_arg_val(3); + std::uint32_t consumer_core_noc_x = get_arg_val(4); + std::uint32_t consumer_core_noc_y = get_arg_val(5); + std::uint32_t stream_register_address1 = get_arg_val(6); + std::uint32_t stream_register_address2 = get_arg_val(7); + std::uint32_t num_tiles = get_arg_val(8); + std::uint32_t transient_buffer_size_tiles = get_arg_val(9); + std::uint32_t transient_buffer_size_bytes = get_arg_val(10); // Scratch address in L1, to write register value before we copy it to into local/remote registers volatile tt_l1_ptr uint32_t* constant_ptr = reinterpret_cast(CONSTANT_REGISTER_VALUE); @@ -48,7 +47,7 @@ void kernel_main() { std::uint32_t local_buffer_address = ping_pong_address(local_buffer_addr1, local_buffer_addr2, counter); // DRAM NOC src address - dram_buffer_src_noc_addr = get_noc_addr(dram_src_noc_x, dram_src_noc_y, dram_buffer_src_addr); + dram_buffer_src_noc_addr = get_noc_addr_from_bank_id(bank_id, dram_buffer_src_addr); // Wait until sync register is INVALID_VAL (means its safe to corrupt destination buffer) wait_for_sync_register_value(reg_addr, INVALID_VAL); // Copy data from dram into destination buffer diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_to_l1_multicast.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_to_l1_multicast.cpp index 57ed1e1fa13d..60cb59310fc9 100644 --- 
a/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_to_l1_multicast.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_to_l1_multicast.cpp @@ -5,22 +5,21 @@ #include "dataflow_api.h" void kernel_main() { - uint32_t src_addr = get_arg_val(0); - uint32_t src_noc_x = get_arg_val(1); - uint32_t src_noc_y = get_arg_val(2); - uint32_t src_buffer_size = get_arg_val(3); + uint32_t src_addr = get_arg_val(0); + uint32_t bank_id = get_arg_val(1); + uint32_t src_buffer_size = get_arg_val(2); - uint32_t local_addr = get_arg_val(4); + uint32_t local_addr = get_arg_val(3); - uint32_t dst_addr = get_arg_val(5); - uint32_t dst_noc_x_start = get_arg_val(6); - uint32_t dst_noc_y_start = get_arg_val(7); - uint32_t dst_noc_x_end = get_arg_val(8); - uint32_t dst_noc_y_end = get_arg_val(9); - uint32_t num_dests = get_arg_val(10); + uint32_t dst_addr = get_arg_val(4); + uint32_t dst_noc_x_start = get_arg_val(5); + uint32_t dst_noc_y_start = get_arg_val(6); + uint32_t dst_noc_x_end = get_arg_val(7); + uint32_t dst_noc_y_end = get_arg_val(8); + uint32_t num_dests = get_arg_val(9); // Read src buffer into local L1 buffer - uint64_t src_buffer_noc_addr = get_noc_addr(src_noc_x, src_noc_y, src_addr); + uint64_t src_buffer_noc_addr = get_noc_addr_from_bank_id(bank_id, src_addr); noc_async_read(src_buffer_noc_addr, local_addr, src_buffer_size); noc_async_read_barrier(); diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_to_l1_multicast_exclude_region.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_to_l1_multicast_exclude_region.cpp index 376a6335d196..279977ce9737 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_to_l1_multicast_exclude_region.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_to_l1_multicast_exclude_region.cpp @@ -1,3 +1,4 @@ + // SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
// // SPDX-License-Identifier: Apache-2.0 @@ -5,26 +6,26 @@ #include "dataflow_api.h" void kernel_main() { - uint32_t src_addr = get_arg_val(0); - uint32_t src_noc_x = get_arg_val(1); - uint32_t src_noc_y = get_arg_val(2); - uint32_t src_buffer_size = get_arg_val(3); - - uint32_t local_addr = get_arg_val(4); - - uint32_t dst_addr = get_arg_val(5); - uint32_t dst_noc_x_start = get_arg_val(6); - uint32_t dst_noc_y_start = get_arg_val(7); - uint32_t dst_noc_x_end = get_arg_val(8); - uint32_t dst_noc_y_end = get_arg_val(9); - uint32_t num_dests = get_arg_val(10); - uint32_t exclude_start_x = get_arg_val(11); - uint32_t exclude_start_y = get_arg_val(12); - uint32_t exclude_dir_x = get_arg_val(13); - uint32_t exclude_dir_y = get_arg_val(14); + uint32_t src_addr = get_arg_val(0); + uint32_t bank_id = get_arg_val(1); + uint32_t src_buffer_size = get_arg_val(2); + + uint32_t local_addr = get_arg_val(3); + + uint32_t dst_addr = get_arg_val(4); + uint32_t dst_noc_x_start = get_arg_val(5); + uint32_t dst_noc_y_start = get_arg_val(6); + uint32_t dst_noc_x_end = get_arg_val(7); + uint32_t dst_noc_y_end = get_arg_val(8); + uint32_t num_dests = get_arg_val(9); + uint32_t exclude_start_x = get_arg_val(10); + uint32_t exclude_start_y = get_arg_val(11); + uint32_t exclude_dir_x = get_arg_val(12); + uint32_t exclude_dir_y = get_arg_val(13); + // Read src buffer into local L1 buffer - uint64_t src_buffer_noc_addr = get_noc_addr(src_noc_x, src_noc_y, src_addr); + uint64_t src_buffer_noc_addr = get_noc_addr_from_bank_id(bank_id, src_addr); noc_async_read(src_buffer_noc_addr, local_addr, src_buffer_size); noc_async_read_barrier(); diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_to_l1_multicast_include_src.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_to_l1_multicast_include_src.cpp index 48feaea150e3..a6cc85e70bb9 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_to_l1_multicast_include_src.cpp +++ 
b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_to_l1_multicast_include_src.cpp @@ -5,22 +5,21 @@ #include "dataflow_api.h" void kernel_main() { - uint32_t src_addr = get_arg_val(0); - uint32_t src_noc_x = get_arg_val(1); - uint32_t src_noc_y = get_arg_val(2); - uint32_t src_buffer_size = get_arg_val(3); + uint32_t src_addr = get_arg_val(0); + uint32_t bank_id = get_arg_val(1); + uint32_t src_buffer_size = get_arg_val(2); - uint32_t local_addr = get_arg_val(4); + uint32_t local_addr = get_arg_val(3); - uint32_t dst_addr = get_arg_val(5); - uint32_t dst_noc_x_start = get_arg_val(6); - uint32_t dst_noc_y_start = get_arg_val(7); - uint32_t dst_noc_x_end = get_arg_val(8); - uint32_t dst_noc_y_end = get_arg_val(9); - uint32_t num_dests = get_arg_val(10); + uint32_t dst_addr = get_arg_val(4); + uint32_t dst_noc_x_start = get_arg_val(5); + uint32_t dst_noc_y_start = get_arg_val(6); + uint32_t dst_noc_x_end = get_arg_val(7); + uint32_t dst_noc_y_end = get_arg_val(8); + uint32_t num_dests = get_arg_val(9); // Read src buffer into local L1 buffer - uint64_t src_buffer_noc_addr = get_noc_addr(src_noc_x, src_noc_y, src_addr); + uint64_t src_buffer_noc_addr = get_noc_addr_from_bank_id(bank_id, src_addr); noc_async_read(src_buffer_noc_addr, local_addr, src_buffer_size); noc_async_read_barrier(); diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/flatten.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/flatten.cpp index 58c40e74cdd3..38fe828aa94b 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/flatten.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/flatten.cpp @@ -7,14 +7,13 @@ void kernel_main() { // Kernel args - uint32_t src_addr = get_arg_val(0); - uint32_t src_noc_x = get_arg_val(1); - uint32_t src_noc_y = get_arg_val(2); - uint32_t num_tiles_r = get_arg_val(3); - uint32_t num_tiles_c = get_arg_val(4); + uint32_t src_addr = get_arg_val(0); + uint32_t src_bank_id = get_arg_val(1); + uint32_t num_tiles_r = get_arg_val(2); + uint32_t 
num_tiles_c = get_arg_val(3); // How many bytes along a row in the original tensor - uint32_t num_bytes_per_tensor_row = get_arg_val(5); + uint32_t num_bytes_per_tensor_row = get_arg_val(4); /* Constants @@ -43,7 +42,7 @@ void kernel_main() { uint32_t src_addr_ = src_addr + start_dram_addr_offset_for_tensor_row; for (uint32_t k = 0; k < num_tiles_c; k++) { cb_reserve_back(cb_id_in0, 1); - uint64_t src_noc_addr = get_noc_addr(src_noc_x, src_noc_y, src_addr_); + uint64_t src_noc_addr = get_noc_addr_from_bank_id(src_bank_id, src_addr_); // Read one row of data uint32_t l1_write_addr = get_write_ptr(cb_id_in0); diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/generic_binary_reader_blocked.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/generic_binary_reader_blocked.cpp index 3cfaf979f602..4a70af98b12a 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/generic_binary_reader_blocked.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/generic_binary_reader_blocked.cpp @@ -12,20 +12,18 @@ // addresses must be provided in the order in which tiles are generated. It expects src1 data to already be tilized and // it simply copies it to L1. 
void kernel_main() { - std::uint32_t dram_buffer_src0_addr = get_arg_val(0); - std::uint32_t dram_src0_noc_x = get_arg_val(1); - std::uint32_t dram_src0_noc_y = get_arg_val(2); - std::uint32_t dram_buffer_src1_addr = get_arg_val(3); - std::uint32_t dram_src1_noc_x = get_arg_val(4); - std::uint32_t dram_src1_noc_y = get_arg_val(5); - std::uint32_t address_map_size = get_arg_val(6); - std::uint32_t address_map_l1_addr = get_arg_val(7); - std::uint32_t num_blocks = get_arg_val(8); - std::uint32_t src0_num_reads_per_block = get_arg_val(9); - std::uint32_t src0_dram_read_size_bytes = get_arg_val(10); - std::uint32_t src1_num_bytes_per_block = get_arg_val(11); - std::uint32_t src0_num_tiles_per_block = get_arg_val(12); - std::uint32_t src1_num_tiles_per_block = get_arg_val(13); + std::uint32_t dram_buffer_src0_addr = get_arg_val(0); + std::uint32_t dram_src0_bank_id = get_arg_val(1); + std::uint32_t dram_buffer_src1_addr = get_arg_val(2); + std::uint32_t dram_src1_bank_id = get_arg_val(3); + std::uint32_t address_map_size = get_arg_val(4); + std::uint32_t address_map_l1_addr = get_arg_val(5); + std::uint32_t num_blocks = get_arg_val(6); + std::uint32_t src0_num_reads_per_block = get_arg_val(7); + std::uint32_t src0_dram_read_size_bytes = get_arg_val(8); + std::uint32_t src1_num_bytes_per_block = get_arg_val(9); + std::uint32_t src0_num_tiles_per_block = get_arg_val(10); + std::uint32_t src1_num_tiles_per_block = get_arg_val(11); constexpr uint32_t cb0_id = 0; constexpr uint32_t cb1_id = 1; @@ -40,7 +38,7 @@ void kernel_main() { cb_reserve_back(cb1_id, src1_num_tiles_per_block); uint32_t l1_write0_addr = get_write_ptr(cb0_id); uint32_t l1_write1_addr = get_write_ptr(cb1_id); - std::uint64_t dram_buffer_src1_noc_addr = get_noc_addr(dram_src1_noc_x, dram_src1_noc_y, dram_buffer_src1_addr); + std::uint64_t dram_buffer_src1_noc_addr = get_noc_addr_from_bank_id(dram_src1_bank_id, dram_buffer_src1_addr); // src1 is already tilized in DRAM. 
Read the whole block of tiles in a single DRAM read access. noc_async_read(dram_buffer_src1_noc_addr, l1_write1_addr, src1_num_bytes_per_block); // src0 is not tilized in DRAM. @@ -49,7 +47,7 @@ void kernel_main() { for (uint32_t i = 0; i < src0_num_reads_per_block; i++) { uint32_t src_addr = source_addresses[source_addresses_list_index]; std::uint64_t dram_buffer_src0_noc_addr = - get_noc_addr(dram_src0_noc_x, dram_src0_noc_y, dram_buffer_src0_addr + src_addr); + get_noc_addr_from_bank_id(dram_src0_bank_id, dram_buffer_src0_addr + src_addr); noc_async_read(dram_buffer_src0_noc_addr, l1_write0_addr, src0_dram_read_size_bytes); l1_write0_addr += src0_dram_read_size_bytes; source_addresses_list_index += 1; diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/l1_to_l1.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/l1_to_l1.cpp index 8722a7aef9de..ce0c8c6c81c9 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/l1_to_l1.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/l1_to_l1.cpp @@ -5,19 +5,18 @@ #include void kernel_main() { - std::uint32_t dram_buffer_src_addr = get_arg_val(0); - std::uint32_t dram_src_noc_x = get_arg_val(1); - std::uint32_t dram_src_noc_y = get_arg_val(2); - std::uint32_t l1_buffer_src_addr_base = get_arg_val(3); - std::uint32_t l1_buffer_dst_addr_base = get_arg_val(4); - std::uint32_t l1_dst_noc_x = get_arg_val(5); - std::uint32_t l1_dst_noc_y = get_arg_val(6); - std::uint32_t num_tiles = get_arg_val(7); - std::uint32_t single_tile_size_bytes = get_arg_val(8); - std::uint32_t total_tiles_size_bytes = get_arg_val(9); + std::uint32_t dram_buffer_src_addr = get_arg_val(0); + std::uint32_t src_bank_id = get_arg_val(1); + std::uint32_t l1_buffer_src_addr_base = get_arg_val(2); + std::uint32_t l1_buffer_dst_addr_base = get_arg_val(3); + std::uint32_t l1_dst_noc_x = get_arg_val(4); + std::uint32_t l1_dst_noc_y = get_arg_val(5); + std::uint32_t num_tiles = get_arg_val(6); + std::uint32_t single_tile_size_bytes = get_arg_val(7); + 
std::uint32_t total_tiles_size_bytes = get_arg_val(8); - // DRAM NOC src address - std::uint64_t dram_buffer_src_noc_addr = get_noc_addr(dram_src_noc_x, dram_src_noc_y, dram_buffer_src_addr); + // DRAM NOC src address + std::uint64_t dram_buffer_src_noc_addr = get_noc_addr_from_bank_id(src_bank_id, dram_buffer_src_addr); noc_async_read(dram_buffer_src_noc_addr, l1_buffer_src_addr_base, total_tiles_size_bytes); noc_async_read_barrier(); diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_h.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_h.cpp index f9b35831b77a..fbdea334c76e 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_h.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_h.cpp @@ -7,17 +7,15 @@ void kernel_main() { uint32_t src0_addr = get_arg_val(0); - uint32_t src0_noc_x = get_arg_val(1); - uint32_t src0_noc_y = get_arg_val(2); - uint32_t src0_num_tiles = get_arg_val(3); - uint32_t src1_addr = get_arg_val(4); - uint32_t src1_noc_x = get_arg_val(5); - uint32_t src1_noc_y = get_arg_val(6); - // skip arg 7 for compat with reader_diff_lengths - uint32_t NCHtWt = get_arg_val(8); - uint32_t NC = get_arg_val(9); - uint32_t Ht = get_arg_val(10); - uint32_t Wt = get_arg_val(11); + uint32_t src0_bank_id = get_arg_val(1); + uint32_t src0_num_tiles = get_arg_val(2); + uint32_t src1_addr = get_arg_val(3); + uint32_t src1_bank_id = get_arg_val(4); + // skip arg 5 for compat with reader_diff_lengths + uint32_t NCHtWt = get_arg_val(6); + uint32_t NC = get_arg_val(7); + uint32_t Ht = get_arg_val(8); + uint32_t Wt = get_arg_val(9); constexpr uint32_t cb_id_in0 = 0; constexpr uint32_t cb_id_in1 = 1; @@ -32,7 +30,7 @@ void kernel_main() { uint32_t num_tiles = src0_num_tiles; uint32_t i1 = 0; for (uint32_t i = 0; i < NCHtWt; i += onetile) { - uint64_t src0_noc_addr = get_noc_addr(src0_noc_x, src0_noc_y, src0_addr); + uint64_t src0_noc_addr = get_noc_addr_from_bank_id(src0_bank_id, src0_addr); 
cb_reserve_back(cb_id_in0, onetile); l1_write_addr_in0 = get_write_ptr(cb_id_in0); noc_async_read(src0_noc_addr, l1_write_addr_in0, tile_bytes); @@ -43,7 +41,7 @@ void kernel_main() { // for each W-tile of the first tensor we push one tile from the second arg tile list // but we loop the second list around cb_reserve_back(cb_id_in1, onetile); - uint64_t src1_noc_addr = get_noc_addr(src1_noc_x, src1_noc_y, src1_addr); + uint64_t src1_noc_addr = get_noc_addr_from_bank_id(src1_bank_id, src1_addr); l1_write_addr_in1 = get_write_ptr(cb_id_in1); noc_async_read(src1_noc_addr, l1_write_addr_in1, tile_bytes); noc_async_read_barrier(); diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_h_8bank.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_h_8bank.cpp index 1433de4f3226..95c84b7e2ce6 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_h_8bank.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_h_8bank.cpp @@ -6,15 +6,15 @@ #include "dataflow_api.h" void kernel_main() { - uint32_t src0_addr = get_arg_val(0); - uint32_t src0_num_tiles = get_arg_val(3); - uint32_t src1_addr = get_arg_val(4); + uint32_t src0_addr = get_arg_val(0); + uint32_t src0_num_tiles = get_arg_val(2); + uint32_t src1_addr = get_arg_val(3); // skip args 1,2,5,6,7 for compat with single bank readers and reader_diff_lengths - uint32_t NCHtWt = get_arg_val(8); - uint32_t NC = get_arg_val(9); - uint32_t Ht = get_arg_val(10); - uint32_t Wt = get_arg_val(11); - uint32_t nc1 = get_arg_val(12); // if 1 we expect the bcast tensor to have NC=1 + uint32_t NCHtWt = get_arg_val(6); + uint32_t NC = get_arg_val(7); + uint32_t Ht = get_arg_val(8); + uint32_t Wt = get_arg_val(9); + uint32_t nc1 = get_arg_val(10); // if 1 we expect the bcast tensor to have NC=1 constexpr bool src0_is_dram = get_compile_time_arg_val(0) == 1; constexpr bool src1_is_dram = get_compile_time_arg_val(1) == 1; diff --git 
a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_hw_8bank.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_hw_8bank.cpp index c99e2f8be698..e1396e60662d 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_hw_8bank.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_hw_8bank.cpp @@ -6,15 +6,15 @@ #include "dataflow_api.h" void kernel_main() { - uint32_t src0_addr = get_arg_val(0); - uint32_t src0_num_tiles = get_arg_val(3); - uint32_t src1_addr = get_arg_val(4); + uint32_t src0_addr = get_arg_val(0); + uint32_t src0_num_tiles = get_arg_val(2); + uint32_t src1_addr = get_arg_val(3); // skip args 1,2,5,6,7 for compat with single bank readers and reader_diff_lengths - uint32_t NCHtWt = get_arg_val(8); - uint32_t NC = get_arg_val(9); - uint32_t Ht = get_arg_val(10); - uint32_t Wt = get_arg_val(11); - uint32_t nc1 = get_arg_val(12); // if 1 we expect the bcast tensor to have NC=1 and wrap around in NC + uint32_t NCHtWt = get_arg_val(6); + uint32_t NC = get_arg_val(7); + uint32_t Ht = get_arg_val(8); + uint32_t Wt = get_arg_val(9); + uint32_t nc1 = get_arg_val(10); // if 1 we expect the bcast tensor to have NC=1 and wrap around in NC constexpr bool src0_is_dram = get_compile_time_arg_val(0) == 1; constexpr bool src1_is_dram = get_compile_time_arg_val(1) == 1; diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_w.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_w.cpp index c438c2051809..c1af622df03c 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_w.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_w.cpp @@ -7,17 +7,16 @@ void kernel_main() { uint32_t src0_addr = get_arg_val(0); - uint32_t src0_noc_x = get_arg_val(1); - uint32_t src0_noc_y = get_arg_val(2); - uint32_t src0_num_tiles = get_arg_val(3); - uint32_t src1_addr = get_arg_val(4); - uint32_t src1_noc_x = get_arg_val(5); - uint32_t src1_noc_y = get_arg_val(6); + uint32_t 
src0_bank_id = get_arg_val(1); + uint32_t src0_num_tiles = get_arg_val(2); + uint32_t src1_addr = get_arg_val(3); + uint32_t src1_bank_id = get_arg_val(4); + uint32_t src1_noc_y = get_arg_val(5); // skip arg 7 for compat with reader_diff_lengths - uint32_t NCHtWt = get_arg_val(8); - uint32_t NC = get_arg_val(9); - uint32_t Ht = get_arg_val(10); - uint32_t Wt = get_arg_val(11); + uint32_t NCHtWt = get_arg_val(6); + uint32_t NC = get_arg_val(7); + uint32_t Ht = get_arg_val(8); + uint32_t Wt = get_arg_val(9); constexpr uint32_t cb_id_in0 = 0; constexpr uint32_t cb_id_in1 = 1; diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_w_8bank.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_w_8bank.cpp index 38cff5068ae1..fb4b2e89e743 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_w_8bank.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_w_8bank.cpp @@ -6,15 +6,15 @@ #include "dataflow_api.h" void kernel_main() { - uint32_t src0_addr = get_arg_val(0); - uint32_t src0_num_tiles = get_arg_val(3); - uint32_t src1_addr = get_arg_val(4); + uint32_t src0_addr = get_arg_val(0); + uint32_t src0_num_tiles = get_arg_val(2); + uint32_t src1_addr = get_arg_val(3); // skip args 1,2,5,6,7 for compat with single-bank readers and reader_diff_lengths - uint32_t NCHtWt = get_arg_val(8); - uint32_t NC = get_arg_val(9); - uint32_t Ht = get_arg_val(10); - uint32_t Wt = get_arg_val(11); - uint32_t nc1 = get_arg_val(12); // if 1 we expect the bcast tensor to have NC=1 + uint32_t NCHtWt = get_arg_val(6); + uint32_t NC = get_arg_val(7); + uint32_t Ht = get_arg_val(8); + uint32_t Wt = get_arg_val(9); + uint32_t nc1 = get_arg_val(10); // if 1 we expect the bcast tensor to have NC=1 constexpr bool src0_is_dram = get_compile_time_arg_val(0) == 1; constexpr bool src1_is_dram = get_compile_time_arg_val(1) == 1; diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_binary.cpp 
b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_binary.cpp index f2c631499a97..34de8015346f 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_binary.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_binary.cpp @@ -7,12 +7,10 @@ void kernel_main() { uint32_t src0_addr = get_arg_val(0); - uint32_t src0_noc_x = get_arg_val(1); - uint32_t src0_noc_y = get_arg_val(2); - uint32_t src1_addr = get_arg_val(3); - uint32_t src1_noc_x = get_arg_val(4); - uint32_t src1_noc_y = get_arg_val(5); - uint32_t num_tiles = get_arg_val(6); + uint32_t src0_bank_id = get_arg_val(1); + uint32_t src1_addr = get_arg_val(2); + uint32_t src1_bank_id = get_arg_val(3); + uint32_t num_tiles = get_arg_val(4); constexpr uint32_t cb_id_in0 = 0; constexpr uint32_t cb_id_in1 = 1; @@ -26,9 +24,9 @@ void kernel_main() { uint32_t l1_write_addr_in1; // read ublocks from src0/src1 to CB0/CB1, then push ublocks to compute (unpacker) - for (uint32_t i = 0; i < num_tiles; i += ublock_size_tiles) { - uint64_t src0_noc_addr = get_noc_addr(src0_noc_x, src0_noc_y, src0_addr); - uint64_t src1_noc_addr = get_noc_addr(src1_noc_x, src1_noc_y, src1_addr); + for (uint32_t i=0; i(src0_bank_id, src0_addr); + uint64_t src1_noc_addr = get_noc_addr_from_bank_id(src1_bank_id, src1_addr); cb_reserve_back(cb_id_in0, ublock_size_tiles); cb_reserve_back(cb_id_in1, ublock_size_tiles); @@ -47,19 +45,20 @@ void kernel_main() { src1_addr += ublock_size_bytes_1; } -// This input populates dest with values before binary operation -// executes, this is used to test eltwise binary with dest re-use -// and eltwise binary with dest accumulation -#if defined(DST_ACCUM_MODE) || defined(ELTWISE_DEST_REUSE_TYPE) - uint32_t src2_addr = get_arg_val(7); - uint32_t src2_noc_x = get_arg_val(8); - uint32_t src2_noc_y = get_arg_val(9); + + // This input populates dest with values before binary operation + // executes, this is used to test eltwise binary with dest re-use + // and eltwise binary with dest 
accumulation + #if defined(DST_ACCUM_MODE) || defined(ELTWISE_DEST_REUSE_TYPE) + uint32_t src2_addr = get_arg_val(5); + uint32_t src2_bank_id = get_arg_val(6); + constexpr uint32_t cb_id_in2 = 2; uint32_t ublock_size_bytes_2 = get_tile_size(cb_id_in2); uint32_t l1_write_addr_in2; - for (uint32_t i = 0; i < num_tiles; i += ublock_size_tiles) { - uint64_t src2_noc_addr = get_noc_addr(src2_noc_x, src2_noc_y, src2_addr); + for (uint32_t i=0; i(src2_bank_id, src2_addr); cb_reserve_back(cb_id_in2, ublock_size_tiles); diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_cb_test.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_cb_test.cpp index 5094e3e2ce48..28b68964bbd3 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_cb_test.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_cb_test.cpp @@ -5,18 +5,13 @@ #include #include "dataflow_api.h" -inline __attribute__((always_inline)) void read_and_push_to_cb( - const uint32_t cb_id, - uint32_t num_tiles_per_cb, - uint32_t ublock_size_tiles, - uint32_t ublock_size_bytes, - uint32_t dram_src_noc_x, - uint32_t dram_src_noc_y, - uint32_t& dram_buffer_src_addr) { +inline __attribute__((always_inline)) +void read_and_push_to_cb(const uint32_t cb_id, uint32_t num_tiles_per_cb, uint32_t ublock_size_tiles, uint32_t ublock_size_bytes, + uint32_t bank_id, uint32_t& dram_buffer_src_addr) { // read a ublock of tiles at the time from DRAM to L1 buffer, and push a ublock at the time to unpacker for (uint32_t i = 0; i < num_tiles_per_cb; i += ublock_size_tiles) { // DRAM NOC src address - std::uint64_t dram_buffer_src_noc_addr = get_noc_addr(dram_src_noc_x, dram_src_noc_y, dram_buffer_src_addr); + std::uint64_t dram_buffer_src_noc_addr = get_noc_addr_from_bank_id(bank_id, dram_buffer_src_addr); cb_reserve_back(cb_id, ublock_size_tiles); uint32_t l1_write_addr = get_write_ptr(cb_id); @@ -29,21 +24,14 @@ inline __attribute__((always_inline)) void read_and_push_to_cb( } void kernel_main() { - 
std::uint32_t dram_buffer_src_addr = get_arg_val(0); - std::uint32_t dram_src_noc_x = get_arg_val(1); - std::uint32_t dram_src_noc_y = get_arg_val(2); - std::uint32_t num_tiles_per_cb = get_arg_val(3); + std::uint32_t dram_buffer_src_addr = get_arg_val(0); + std::uint32_t bank_id = get_arg_val(1); + std::uint32_t num_tiles_per_cb = get_arg_val(2); constexpr uint32_t cb_id = get_compile_time_arg_val(0); constexpr uint32_t ublock_size_tiles = get_compile_time_arg_val(1); uint32_t ublock_size_bytes = get_tile_size(cb_id) * ublock_size_tiles; - read_and_push_to_cb( - cb_id, - num_tiles_per_cb, - ublock_size_tiles, - ublock_size_bytes, - dram_src_noc_x, - dram_src_noc_y, - dram_buffer_src_addr); + read_and_push_to_cb(cb_id, num_tiles_per_cb, ublock_size_tiles, ublock_size_bytes, + bank_id, dram_buffer_src_addr); } diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_dual_8bank.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_dual_8bank.cpp index bd6ae074e7f1..a2db955dc177 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_dual_8bank.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_dual_8bank.cpp @@ -7,10 +7,10 @@ void kernel_main() { // same arg indices as in reader_binary_diff_lenghts for compat - uint32_t src0_addr = get_arg_val(0); - uint32_t src0_num_tiles = get_arg_val(3); - uint32_t src1_addr = get_arg_val(4); - uint32_t src1_num_tiles = get_arg_val(7); + uint32_t src0_addr = get_arg_val(0); + uint32_t src0_num_tiles = get_arg_val(2); + uint32_t src1_addr = get_arg_val(3); + uint32_t src1_num_tiles = get_arg_val(5); constexpr uint32_t cb_id_in0 = 0; constexpr uint32_t cb_id_in1 = 1; diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_first_stage.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_first_stage.cpp index ff484517aa0c..b316eb012524 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_first_stage.cpp +++ 
b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_first_stage.cpp @@ -8,10 +8,9 @@ void kernel_main() { std::uint32_t buffer_src_addr = get_arg_val(0); - std::uint32_t src_noc_x = get_arg_val(1); - std::uint32_t src_noc_y = get_arg_val(2); - std::uint32_t num_tiles = get_arg_val(3); - std::uint32_t num_repetitions = get_arg_val(4); + std::uint32_t src_bank_id = get_arg_val(1); + std::uint32_t num_tiles = get_arg_val(2); + std::uint32_t num_repetitions = get_arg_val(3); constexpr uint32_t cb_id = get_compile_time_arg_val(0); constexpr uint32_t block_size_tiles = get_compile_time_arg_val(1); @@ -20,7 +19,7 @@ void kernel_main() { for (uint32_t j = 0; j < num_repetitions; j++) { uint32_t src_addr = buffer_src_addr; for (uint32_t i = 0; i < num_tiles; i += block_size_tiles) { - std::uint64_t buffer_src_noc_addr = get_noc_addr(src_noc_x, src_noc_y, src_addr); + std::uint64_t buffer_src_noc_addr = get_noc_addr_from_bank_id(src_bank_id, src_addr); cb_reserve_back(cb_id, block_size_tiles); if (j == 0) { diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_blocked.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_blocked.cpp index 3511bd19d3b9..ae86ef804e82 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_blocked.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_blocked.cpp @@ -6,17 +6,15 @@ #include "dataflow_api.h" void kernel_main() { - uint32_t src0_addr = get_arg_val(0); - uint32_t src0_noc_x = get_arg_val(1); - uint32_t src0_noc_y = get_arg_val(2); - uint32_t src1_addr = get_arg_val(3); - uint32_t src1_noc_x = get_arg_val(4); - uint32_t src1_noc_y = get_arg_val(5); - uint32_t num_blocks = get_arg_val(6); - uint32_t in0_block_tile_cnt = get_arg_val(7); - uint32_t in1_block_tile_cnt = get_arg_val(8); - uint32_t in0_block_size_bytes = get_arg_val(9); - uint32_t in1_block_size_bytes = get_arg_val(10); + uint32_t src0_addr = get_arg_val(0); + uint32_t src0_bank_id = get_arg_val(1); + 
uint32_t src1_addr = get_arg_val(2); + uint32_t src1_bank_id = get_arg_val(3); + uint32_t num_blocks = get_arg_val(4); + uint32_t in0_block_tile_cnt = get_arg_val(5); + uint32_t in1_block_tile_cnt = get_arg_val(6); + uint32_t in0_block_size_bytes = get_arg_val(7); + uint32_t in1_block_size_bytes = get_arg_val(8); constexpr uint32_t cb_id_in0 = 0; constexpr uint32_t cb_id_in1 = 1; @@ -24,9 +22,9 @@ void kernel_main() { uint32_t l1_write_addr_in0; uint32_t l1_write_addr_in1; - for (uint32_t i = 0; i < num_blocks; i++) { - uint64_t src0_noc_addr = get_noc_addr(src0_noc_x, src0_noc_y, src0_addr); - uint64_t src1_noc_addr = get_noc_addr(src1_noc_x, src1_noc_y, src1_addr); + for(uint32_t i = 0; i < num_blocks; i++) { + uint64_t src0_noc_addr = get_noc_addr_from_bank_id(src0_bank_id, src0_addr); + uint64_t src1_noc_addr = get_noc_addr_from_bank_id(src1_bank_id, src1_addr); cb_reserve_back(cb_id_in0, in0_block_tile_cnt); cb_reserve_back(cb_id_in1, in1_block_tile_cnt); diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_small_block.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_small_block.cpp index 82e8b919bd3f..81e12d58c519 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_small_block.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_small_block.cpp @@ -7,14 +7,12 @@ void kernel_main() { std::uint32_t dram_buffer_src0_addr = get_arg_val(0); - std::uint32_t dram_src0_noc_x = get_arg_val(1); - std::uint32_t dram_src0_noc_y = get_arg_val(2); + std::uint32_t dram_src0_bank_id = get_arg_val(1); - std::uint32_t dram_buffer_src1_addr = get_arg_val(3); - std::uint32_t dram_src1_noc_x = get_arg_val(4); - std::uint32_t dram_src1_noc_y = get_arg_val(5); + std::uint32_t dram_buffer_src1_addr = get_arg_val(2); + std::uint32_t dram_src1_bank_id = get_arg_val(3); - std::uint32_t num_tiles = get_arg_val(6); + std::uint32_t num_tiles = get_arg_val(4); // single-tile chunks uint32_t chunk_size_bytes_0 = 
get_tile_size(0); @@ -27,8 +25,10 @@ void kernel_main() { // read a chunk of tiles at the time from DRAM to L1 buffer, and push a chunk at the time to unpacker for (uint32_t i = 0; i < num_tiles; i += chunk_size_tiles) { // DRAM NOC src address - std::uint64_t dram_buffer_src0_noc_addr = get_noc_addr(dram_src0_noc_x, dram_src0_noc_y, dram_buffer_src0_addr); - std::uint64_t dram_buffer_src1_noc_addr = get_noc_addr(dram_src1_noc_x, dram_src1_noc_y, dram_buffer_src1_addr); + std::uint64_t dram_buffer_src0_noc_addr = + get_noc_addr_from_bank_id(dram_src0_bank_id, dram_buffer_src0_addr); + std::uint64_t dram_buffer_src1_noc_addr = + get_noc_addr_from_bank_id(dram_src1_bank_id, dram_buffer_src1_addr); cb_reserve_back(0, chunk_size_tiles); cb_reserve_back(1, chunk_size_tiles); diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_with_bias_blocked.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_with_bias_blocked.cpp index 56ed2b2823cd..9c4693d04507 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_with_bias_blocked.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_with_bias_blocked.cpp @@ -6,32 +6,28 @@ #include "dataflow_api.h" void kernel_main() { - uint32_t src0_addr = get_arg_val(0); - uint32_t src0_noc_x = get_arg_val(1); - uint32_t src0_noc_y = get_arg_val(2); - uint32_t src1_addr = get_arg_val(3); - uint32_t src1_noc_x = get_arg_val(4); - uint32_t src1_noc_y = get_arg_val(5); - uint32_t num_blocks = get_arg_val(6); + uint32_t src0_addr = get_arg_val(0); + uint32_t src0_dram_bank_id = get_arg_val(1); + uint32_t src1_addr = get_arg_val(2); + uint32_t src1_dram_bank_id = get_arg_val(3); + uint32_t num_blocks = get_arg_val(4); - uint32_t in0_block_tile_cnt = get_arg_val(7); - uint32_t in1_block_tile_cnt = get_arg_val(8); - uint32_t in0_block_size_bytes = get_arg_val(9); - uint32_t in1_block_size_bytes = get_arg_val(10); + uint32_t in0_block_tile_cnt = get_arg_val(5); + uint32_t 
in1_block_tile_cnt = get_arg_val(6); + uint32_t in0_block_size_bytes = get_arg_val(7); + uint32_t in1_block_size_bytes = get_arg_val(8); - uint32_t with_bias = get_arg_val(11); + uint32_t with_bias = get_arg_val(9); uint32_t src2_addr; - uint32_t src2_noc_x; - uint32_t src2_noc_y; + uint32_t src2_dram_bank_id; uint32_t in2_block_tile_cnt; uint32_t in2_block_size_bytes; if (with_bias) { - src2_addr = get_arg_val(12); - src2_noc_x = get_arg_val(13); - src2_noc_y = get_arg_val(14); - in2_block_tile_cnt = get_arg_val(15); - in2_block_size_bytes = get_arg_val(16); + src2_addr = get_arg_val(10); + src2_dram_bank_id = get_arg_val(11); + in2_block_tile_cnt = get_arg_val(12); + in2_block_size_bytes = get_arg_val(13); } constexpr uint32_t cb_id_in0 = 0; @@ -42,9 +38,9 @@ void kernel_main() { uint32_t l1_write_addr_in1; uint32_t l1_write_addr_in2; - for (uint32_t i = 0; i < num_blocks; i++) { - uint64_t src0_noc_addr = get_noc_addr(src0_noc_x, src0_noc_y, src0_addr); - uint64_t src1_noc_addr = get_noc_addr(src1_noc_x, src1_noc_y, src1_addr); + for(uint32_t i = 0; i < num_blocks; i++) { + uint64_t src0_noc_addr = get_noc_addr_from_bank_id(src0_dram_bank_id, src0_addr); + uint64_t src1_noc_addr = get_noc_addr_from_bank_id(src1_dram_bank_id, src1_addr); cb_reserve_back(cb_id_in0, in0_block_tile_cnt); cb_reserve_back(cb_id_in1, in1_block_tile_cnt); @@ -65,7 +61,7 @@ void kernel_main() { } if (with_bias) { - uint64_t src2_noc_addr = get_noc_addr(src2_noc_x, src2_noc_y, src2_addr); + uint64_t src2_noc_addr = get_noc_addr_from_bank_id(src2_dram_bank_id, src2_addr); l1_write_addr_in2 = get_write_ptr(cb_id_in2); cb_reserve_back(cb_id_in2, in2_block_tile_cnt); noc_async_read(src2_noc_addr, l1_write_addr_in2, in2_block_size_bytes); diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_push_4.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_push_4.cpp index 6e5f737b34d0..df96a7dd048c 100644 --- 
a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_push_4.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_push_4.cpp @@ -6,10 +6,9 @@ #include "dataflow_api.h" void kernel_main() { - uint32_t src_addr = get_arg_val(0); - uint32_t src_noc_x = get_arg_val(1); - uint32_t src_noc_y = get_arg_val(2); - uint32_t num_tiles = get_arg_val(3); + uint32_t src_addr = get_arg_val(0); + uint32_t bank_id = get_arg_val(1); + uint32_t num_tiles = get_arg_val(2); constexpr uint32_t cb_id_in0 = tt::CBIndex::c_0; @@ -19,12 +18,12 @@ void kernel_main() { // read a ublock of tiles from src to CB, and then push the ublock to unpacker for (uint32_t i = 0; i < num_tiles; i += ublock_size_tiles) { - uint64_t src_noc_addr = get_noc_addr(src_noc_x, src_noc_y, src_addr); + uint64_t src_buffer_noc_addr = get_noc_addr_from_bank_id(bank_id, src_addr); cb_reserve_back(cb_id_in0, ublock_size_tiles); uint32_t l1_write_addr = get_write_ptr(cb_id_in0); - noc_async_read(src_noc_addr, l1_write_addr, ublock_size_bytes); + noc_async_read(src_buffer_noc_addr, l1_write_addr, ublock_size_bytes); noc_async_read_barrier(); diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_push_n.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_push_n.cpp index c191ef7a83c5..707db8698543 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_push_n.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_push_n.cpp @@ -5,18 +5,17 @@ #include "dataflow_api.h" void kernel_main() { - uint32_t src_addr = get_arg_val(0); - uint32_t src_noc_x = get_arg_val(1); - uint32_t src_noc_y = get_arg_val(2); - uint32_t num_tiles = get_arg_val(3); - uint32_t cb_id_in0 = get_arg_val(4); - uint32_t ublock_size_tiles = get_arg_val(5); - bool reader_only = get_arg_val(6); + uint32_t src_addr = get_arg_val(0); + uint32_t src_dram_bank_id = get_arg_val(1); + uint32_t num_tiles = get_arg_val(2); + uint32_t cb_id_in0 = get_arg_val(3); + uint32_t 
ublock_size_tiles = get_arg_val(4); + bool reader_only = get_arg_val(5); uint32_t ublock_size_bytes = get_tile_size(cb_id_in0) * ublock_size_tiles; - for (uint32_t i = 0; i < num_tiles; i += ublock_size_tiles) { - uint64_t src_noc_addr = get_noc_addr(src_noc_x, src_noc_y, src_addr); + for (uint32_t i = 0; i(src_dram_bank_id, src_addr); if (reader_only == false) { cb_reserve_back(cb_id_in0, ublock_size_tiles); } diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_transpose_wh.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_transpose_wh.cpp index a7f37fda7d0c..b866fd166327 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_transpose_wh.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_transpose_wh.cpp @@ -6,9 +6,10 @@ #include "dataflow_api.h" void kernel_main() { - uint32_t src_addr = get_arg_val(0); - uint32_t src_noc_x = get_arg_val(1); - uint32_t src_noc_y = get_arg_val(2); + uint32_t src_addr = get_arg_val(0); + uint32_t src_dram_bank_id = get_arg_val(1); + // uint32_t unused = get_arg_val(2); + // uint32_t unused = get_arg_val(3); // skip 3 for compat with reader_unary_8bank, reader_unary uint32_t N = get_arg_val(4); uint32_t Ht = get_arg_val(5); @@ -27,9 +28,9 @@ void kernel_main() { // this reader will read a NHW tensor in NWH order for (uint32_t n = 0; n < N; n++) { src_addr = src_addrN; - for (uint32_t w = 0; w < Wt; w++) { - for (uint32_t h = 0; h < Ht; h++) { - uint64_t src_noc_addr = get_noc_addr(src_noc_x, src_noc_y, src_addr); + for (uint32_t w = 0; w(src_dram_bank_id, src_addr); cb_reserve_back(cb_id_in0, onetile); uint32_t l1_write_addr = get_write_ptr(cb_id_in0); noc_async_read(src_noc_addr, l1_write_addr, tile_bytes); diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/remote_read_remote_write_sync.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/remote_read_remote_write_sync.cpp index 59efcbcb26e0..89391089a7ea 100644 --- 
a/tests/tt_metal/tt_metal/test_kernels/dataflow/remote_read_remote_write_sync.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/remote_read_remote_write_sync.cpp @@ -12,17 +12,16 @@ constexpr static std::uint32_t VALID_VAL = 0x1234; constexpr static std::uint32_t INVALID_VAL = 0x4321; void kernel_main() { - std::uint32_t buffer_src_addr = get_arg_val(0); - std::uint32_t src_noc_x = get_arg_val(1); - std::uint32_t src_noc_y = get_arg_val(2); - std::uint32_t buffer_dst_addr = get_arg_val(3); - std::uint32_t dst_noc_x = get_arg_val(4); - std::uint32_t dst_noc_y = get_arg_val(5); - std::uint32_t l1_buffer_address = get_arg_val(6); - std::uint32_t stream_register_address = get_arg_val(7); - std::uint32_t num_tiles = get_arg_val(8); - std::uint32_t transient_buffer_size_tiles = get_arg_val(9); - std::uint32_t transient_buffer_size_bytes = get_arg_val(10); + std::uint32_t buffer_src_addr = get_arg_val(0); + std::uint32_t src_noc_x = get_arg_val(1); + std::uint32_t src_noc_y = get_arg_val(2); + std::uint32_t buffer_dst_addr = get_arg_val(3); + std::uint32_t bank_id = get_arg_val(4); + std::uint32_t l1_buffer_address = get_arg_val(5); + std::uint32_t stream_register_address = get_arg_val(6); + std::uint32_t num_tiles = get_arg_val(7); + std::uint32_t transient_buffer_size_tiles = get_arg_val(8); + std::uint32_t transient_buffer_size_bytes = get_arg_val(9); // Scratch address in L1, two write register value before we copy it to into local/remote registers volatile tt_l1_ptr uint32_t* constant_ptr = reinterpret_cast(CONSTANT_REGISTER_VALUE); diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/remote_read_remote_write_sync_db.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/remote_read_remote_write_sync_db.cpp index 5400fc6a2254..32dfa63f0436 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/remote_read_remote_write_sync_db.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/remote_read_remote_write_sync_db.cpp @@ -21,20 +21,19 @@ inline 
std::uint32_t ping_pong_address(std::uint32_t addr1, std::uint32_t addr2, } void kernel_main() { - std::uint32_t buffer_src_addr1 = get_arg_val(0); - std::uint32_t buffer_src_addr2 = get_arg_val(1); - std::uint32_t src_noc_x = get_arg_val(2); - std::uint32_t src_noc_y = get_arg_val(3); - std::uint32_t buffer_dst_addr = get_arg_val(4); - std::uint32_t dst_noc_x = get_arg_val(5); - std::uint32_t dst_noc_y = get_arg_val(6); - std::uint32_t local_buffer_addr1 = get_arg_val(7); - std::uint32_t local_buffer_addr2 = get_arg_val(8); - std::uint32_t stream_register_address1 = get_arg_val(9); - std::uint32_t stream_register_address2 = get_arg_val(10); - std::uint32_t num_tiles = get_arg_val(11); - std::uint32_t transient_buffer_size_tiles = get_arg_val(12); - std::uint32_t transient_buffer_size_bytes = get_arg_val(13); + std::uint32_t buffer_src_addr1 = get_arg_val(0); + std::uint32_t buffer_src_addr2 = get_arg_val(1); + std::uint32_t src_noc_x = get_arg_val(2); + std::uint32_t src_noc_y = get_arg_val(3); + std::uint32_t buffer_dst_addr = get_arg_val(4); + std::uint32_t bank_id = get_arg_val(5); + std::uint32_t local_buffer_addr1 = get_arg_val(6); + std::uint32_t local_buffer_addr2 = get_arg_val(7); + std::uint32_t stream_register_address1 = get_arg_val(8); + std::uint32_t stream_register_address2 = get_arg_val(9); + std::uint32_t num_tiles = get_arg_val(10); + std::uint32_t transient_buffer_size_tiles = get_arg_val(11); + std::uint32_t transient_buffer_size_bytes = get_arg_val(12); // Scratch address in L1, two write register value before we copy it to into local/remote registers volatile tt_l1_ptr uint32_t* constant_ptr = reinterpret_cast(CONSTANT_REGISTER_VALUE); @@ -57,7 +56,7 @@ void kernel_main() { noc_async_read_barrier(); // DRAM NOC dst address - dst_noc_addr = get_noc_addr(dst_noc_x, dst_noc_y, dst_buffer_addr); + dst_noc_addr = get_noc_addr_from_bank_id(bank_id, dst_buffer_addr); noc_async_write(local_buffer_address, dst_noc_addr, transient_buffer_size_bytes); 
dst_buffer_addr += transient_buffer_size_bytes; diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/transpose_hc.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/transpose_hc.cpp index 92af4e921780..160f086849c6 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/transpose_hc.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/transpose_hc.cpp @@ -14,14 +14,13 @@ inline uint32_t TADDR(uint32_t ti) { return ti << 11; } void kernel_main() { uint32_t src_addr = get_arg_val(0); - uint32_t src_noc_x = get_arg_val(1); - uint32_t src_noc_y = get_arg_val(2); - uint32_t W = get_arg_val(3); - uint32_t H = get_arg_val(4); - uint32_t C = get_arg_val(5); - uint32_t HW = get_arg_val(6); - uint32_t N = get_arg_val(7); - uint32_t CHW = get_arg_val(8); + uint32_t src_bank_id = get_arg_val(1); + uint32_t W = get_arg_val(2); + uint32_t H = get_arg_val(3); + uint32_t C = get_arg_val(4); + uint32_t HW = get_arg_val(5); + uint32_t N = get_arg_val(6); + uint32_t CHW = get_arg_val(7); auto WT = (W >> 5); // number of tiles in W auto HT = (H >> 5); // number of tiles in H @@ -36,7 +35,7 @@ void kernel_main() { // The basic idea here is to iterate over output tiles (that will be over CT,WT) and H // this will generate a linearly incremented output address in the inner loop // we then reverse map this linear dest address to src address - uint64_t batch_addr = get_noc_addr(src_noc_x, src_noc_y, src_addr); + uint64_t batch_addr = get_noc_addr_from_bank_id(src_bank_id, src_addr); for (uint32_t n = 0; n < N; n++) { uint32_t htWT = 0; for (uint32_t h = 0; h < H; h++) { diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/transpose_hc_8bank.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/transpose_hc_8bank.cpp index a5a5d5602459..2a459afdaea1 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/transpose_hc_8bank.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/transpose_hc_8bank.cpp @@ -14,14 +14,13 @@ inline uint32_t TADDR(uint32_t ti) { return ti << 
11; } void kernel_main() { uint32_t src0_addr = get_arg_val(0); - uint32_t src_noc_x = get_arg_val(1); - uint32_t src_noc_y = get_arg_val(2); - uint32_t W = get_arg_val(3); - uint32_t H = get_arg_val(4); - uint32_t C = get_arg_val(5); - uint32_t HW = get_arg_val(6); - uint32_t N = get_arg_val(7); - uint32_t CHW = get_arg_val(8); + uint32_t src_bank_id = get_arg_val(1); + uint32_t W = get_arg_val(2); + uint32_t H = get_arg_val(3); + uint32_t C = get_arg_val(4); + uint32_t HW = get_arg_val(5); + uint32_t N = get_arg_val(6); + uint32_t CHW = get_arg_val(7); auto WT = (W >> 5); // number of tiles in W auto HT = (H >> 5); // number of tiles in H diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_reader_dram_to_l1.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_reader_dram_to_l1.cpp index a279088cd9d2..5c2b0ee98767 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_reader_dram_to_l1.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_reader_dram_to_l1.cpp @@ -3,6 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 #include +#include "dataflow_api.h" /** * NOC APIs are prefixed w/ "ncrisc" (legacy name) but there's nothing NCRISC specific, they can be used on BRISC or @@ -11,15 +12,14 @@ * */ void kernel_main() { std::uint32_t dram_buffer_src_addr_base = get_arg_val(0); - std::uint32_t dram_src_noc_x = get_arg_val(1); - std::uint32_t dram_src_noc_y = get_arg_val(2); + std::uint32_t src_dram_bank_id = get_arg_val(1); - std::uint32_t l1_buffer_dst_addr_base = get_arg_val(3); - std::uint32_t dram_buffer_size = get_arg_val(4); + std::uint32_t l1_buffer_dst_addr_base = get_arg_val(2); + std::uint32_t dram_buffer_size = get_arg_val(3); std::uint32_t dram_buffer_src_addr = dram_buffer_src_addr_base; // DRAM NOC src address - std::uint64_t dram_buffer_src_noc_addr = get_noc_addr(dram_src_noc_x, dram_src_noc_y, dram_buffer_src_addr); + std::uint64_t 
dram_buffer_src_noc_addr = get_noc_addr_from_bank_id(src_dram_bank_id, dram_buffer_src_addr); noc_async_read(dram_buffer_src_noc_addr, l1_buffer_dst_addr_base, dram_buffer_size); noc_async_read_barrier(); diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_reader_unary.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_reader_unary.cpp index df99b8def697..dc562bb8080b 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_reader_unary.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_reader_unary.cpp @@ -8,18 +8,17 @@ void kernel_main() { const uint32_t cb_id = get_compile_time_arg_val(0); - uint32_t src_addr = get_arg_val(0); - uint32_t src_noc_x = get_arg_val(1); - uint32_t src_noc_y = get_arg_val(2); - uint32_t num_tiles = get_arg_val(3); + uint32_t src_addr = get_arg_val(0); + uint32_t src_bank_id = get_arg_val(1); + uint32_t num_tiles = get_arg_val(2); // ublocks size defined in tiles constexpr uint32_t ublock_size_tiles = 1; uint32_t ublock_size_bytes = get_tile_size(cb_id) * ublock_size_tiles; // read a ublock of tiles from src to CB, and then push the ublock to unpacker - for (uint32_t i = 0; i < num_tiles; i += ublock_size_tiles) { - uint64_t src_noc_addr = get_noc_addr(src_noc_x, src_noc_y, src_addr); + for (uint32_t i = 0; i(src_bank_id, src_addr); cb_reserve_back(cb_id, ublock_size_tiles); uint32_t l1_write_addr = get_write_ptr(cb_id); diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_writer_l1_to_dram.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_writer_l1_to_dram.cpp index 79b373a241cd..e7e156f701a3 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_writer_l1_to_dram.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_writer_l1_to_dram.cpp @@ -3,6 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 #include +#include 
"dataflow_api.h" /** * NOC APIs are prefixed w/ "ncrisc" (legacy name) but there's nothing NCRISC specific, they can be used on NCRISC or @@ -11,16 +12,14 @@ * */ void kernel_main() { std::uint32_t dram_buffer_dst_addr_base = get_arg_val(0); - std::uint32_t dram_dst_noc_x = get_arg_val(1); - std::uint32_t dram_dst_noc_y = get_arg_val(2); - - std::uint32_t l1_buffer_src_addr_base = get_arg_val(3); - std::uint32_t dram_buffer_size = get_arg_val(4); + std::uint32_t dram_bank_id = get_arg_val(1); + std::uint32_t l1_buffer_src_addr_base = get_arg_val(2); + std::uint32_t dram_buffer_size = get_arg_val(3); std::uint32_t dram_buffer_dst_addr = dram_buffer_dst_addr_base; // DRAM NOC dst address - std::uint64_t dram_buffer_dst_noc_addr = get_noc_addr(dram_dst_noc_x, dram_dst_noc_y, dram_buffer_dst_addr); + std::uint64_t dram_buffer_dst_noc_addr = get_noc_addr_from_bank_id(dram_bank_id, dram_buffer_dst_addr); noc_async_write(l1_buffer_src_addr_base, dram_buffer_dst_noc_addr, dram_buffer_size); noc_async_write_barrier(); diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_writer_unary.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_writer_unary.cpp index b535b7bd4bf9..ec8faac45c7c 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_writer_unary.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_writer_unary.cpp @@ -6,17 +6,16 @@ void kernel_main() { const uint32_t cb_id = get_compile_time_arg_val(0); - uint32_t dst_addr = get_arg_val(0); - uint32_t dst_noc_x = get_arg_val(1); - uint32_t dst_noc_y = get_arg_val(2); - uint32_t num_tiles = get_arg_val(3); + uint32_t dst_addr = get_arg_val(0); + uint32_t dst_bank_id = get_arg_val(1); + uint32_t num_tiles = get_arg_val(2); // single-tile ublocks uint32_t ublock_size_bytes = get_tile_size(cb_id); uint32_t ublock_size_tiles = 1; for (uint32_t i = 0; i < num_tiles; i += ublock_size_tiles) { - uint64_t dst_noc_addr = 
get_noc_addr(dst_noc_x, dst_noc_y, dst_addr); + uint64_t dst_noc_addr = get_noc_addr_from_bank_id(dst_bank_id, dst_addr); cb_wait_front(cb_id, ublock_size_tiles); uint32_t l1_read_addr = get_read_ptr(cb_id); diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/direct_dram_to_dram_receiver.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/direct_dram_to_dram_receiver.cpp index c021e07b2de5..775cb2e30e9d 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/direct_dram_to_dram_receiver.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/direct_dram_to_dram_receiver.cpp @@ -6,17 +6,16 @@ void kernel_main() { std::uint32_t dram_buffer_dst_addr = get_arg_val(0); - std::uint32_t dram_dst_noc_x = get_arg_val(1); - std::uint32_t dram_dst_noc_y = get_arg_val(2); - std::uint32_t remaining_bytes = get_arg_val(3); - std::uint32_t num_loops = get_arg_val(4); - std::uint32_t num_bytes = get_arg_val(5); + std::uint32_t dram_bank_id = get_arg_val(1); + std::uint32_t remaining_bytes = get_arg_val(2); + std::uint32_t num_loops = get_arg_val(3); + std::uint32_t num_bytes = get_arg_val(4); // DRAM NOC dst address for (uint32_t i = 0; i < num_loops; i++) { eth_wait_for_bytes(num_bytes); - std::uint64_t dram_buffer_dst_noc_addr = get_noc_addr(dram_dst_noc_x, dram_dst_noc_y, dram_buffer_dst_addr); + std::uint64_t dram_buffer_dst_noc_addr = get_noc_addr_from_bank_id(dram_bank_id, dram_buffer_dst_addr); noc_async_write(eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE, dram_buffer_dst_noc_addr, num_bytes); noc_async_write_barrier(); @@ -25,7 +24,7 @@ void kernel_main() { } eth_wait_for_bytes(remaining_bytes); - std::uint64_t dram_buffer_dst_noc_addr = get_noc_addr(dram_dst_noc_x, dram_dst_noc_y, dram_buffer_dst_addr); + std::uint64_t dram_buffer_dst_noc_addr = get_noc_addr_from_bank_id(dram_bank_id, dram_buffer_dst_addr); noc_async_write(eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE, 
dram_buffer_dst_noc_addr, remaining_bytes); noc_async_write_barrier(); diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/direct_dram_to_dram_sender.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/direct_dram_to_dram_sender.cpp index d35795495357..c4acbf759349 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/direct_dram_to_dram_sender.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/direct_dram_to_dram_sender.cpp @@ -6,15 +6,14 @@ void kernel_main() { std::uint32_t dram_buffer_src_addr = get_arg_val(0); - std::uint32_t dram_src_noc_x = get_arg_val(1); - std::uint32_t dram_src_noc_y = get_arg_val(2); - std::uint32_t remaining_bytes = get_arg_val(3); - std::uint32_t num_loops = get_arg_val(4); - std::uint32_t num_bytes = get_arg_val(5); + std::uint32_t dram_bank_id = get_arg_val(1); + std::uint32_t remaining_bytes = get_arg_val(2); + std::uint32_t num_loops = get_arg_val(3); + std::uint32_t num_bytes = get_arg_val(4); // DRAM NOC src address for (uint32_t i = 0; i < num_loops; i++) { - std::uint64_t dram_buffer_src_noc_addr = get_noc_addr(dram_src_noc_x, dram_src_noc_y, dram_buffer_src_addr); + std::uint64_t dram_buffer_src_noc_addr = get_noc_addr_from_bank_id(dram_bank_id, dram_buffer_src_addr); noc_async_read(dram_buffer_src_noc_addr, eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE, num_bytes); noc_async_read_barrier(); @@ -26,7 +25,7 @@ void kernel_main() { dram_buffer_src_addr += num_bytes; } - std::uint64_t dram_buffer_src_noc_addr = get_noc_addr(dram_src_noc_x, dram_src_noc_y, dram_buffer_src_addr); + std::uint64_t dram_buffer_src_noc_addr = get_noc_addr_from_bank_id(dram_bank_id, dram_buffer_src_addr); noc_async_read(dram_buffer_src_noc_addr, eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE, remaining_bytes); noc_async_read_barrier(); diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/direct_reader_dram_to_l1.cpp 
b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/direct_reader_dram_to_l1.cpp index 9b744d4051cd..e626f76f96af 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/direct_reader_dram_to_l1.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/direct_reader_dram_to_l1.cpp @@ -3,6 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 #include +#include "dataflow_api.h" /** * NOC APIs are prefixed w/ "ncrisc" (legacy name) but there's nothing NCRISC specific, they can be used on BRISC or @@ -12,14 +13,13 @@ void kernel_main() { std::uint32_t dram_buffer_src_addr_base = get_arg_val(0); - std::uint32_t dram_src_noc_x = get_arg_val(1); - std::uint32_t dram_src_noc_y = get_arg_val(2); - std::uint32_t dram_buffer_size = get_arg_val(3); - std::uint32_t local_eth_l1_addr_base = get_arg_val(4); + std::uint32_t dram_bank_id = get_arg_val(1); + std::uint32_t dram_buffer_size = get_arg_val(2); + std::uint32_t local_eth_l1_addr_base = get_arg_val(3); std::uint32_t dram_buffer_src_addr = dram_buffer_src_addr_base; // DRAM NOC src address - std::uint64_t dram_buffer_src_noc_addr = get_noc_addr(dram_src_noc_x, dram_src_noc_y, dram_buffer_src_addr); + std::uint64_t dram_buffer_src_noc_addr = get_noc_addr_from_bank_id(dram_bank_id, dram_buffer_src_addr); noc_async_read(dram_buffer_src_noc_addr, local_eth_l1_addr_base, dram_buffer_size); noc_async_read_barrier(); diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/direct_writer_l1_to_dram.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/direct_writer_l1_to_dram.cpp index 8ac1f1502e93..cf2c0b3087bc 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/direct_writer_l1_to_dram.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/direct_writer_l1_to_dram.cpp @@ -3,6 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 #include +#include "dataflow_api.h" /** * NOC APIs are prefixed w/ "ncrisc" (legacy name) but 
there's nothing NCRISC specific, they can be used on NCRISC or @@ -11,15 +12,14 @@ * */ void kernel_main() { std::uint32_t dram_buffer_dst_addr_base = get_arg_val(0); - std::uint32_t dram_dst_noc_x = get_arg_val(1); - std::uint32_t dram_dst_noc_y = get_arg_val(2); - std::uint32_t dram_buffer_size = get_arg_val(3); - std::uint32_t local_eth_l1_addr_base = get_arg_val(4); + std::uint32_t dram_bank_id = get_arg_val(1); + std::uint32_t dram_buffer_size = get_arg_val(2); + std::uint32_t local_eth_l1_addr_base = get_arg_val(3); std::uint32_t dram_buffer_dst_addr = dram_buffer_dst_addr_base; // DRAM NOC dst address - std::uint64_t dram_buffer_dst_noc_addr = get_noc_addr(dram_dst_noc_x, dram_dst_noc_y, dram_buffer_dst_addr); + std::uint64_t dram_buffer_dst_noc_addr = get_noc_addr_from_bank_id(dram_bank_id, dram_buffer_dst_addr); noc_async_write(local_eth_l1_addr_base, dram_buffer_dst_noc_addr, dram_buffer_size); noc_async_write_barrier(); diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_binary.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_binary.cpp index f788e0a7141d..d427c9490a14 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_binary.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_binary.cpp @@ -5,23 +5,21 @@ #include "dataflow_api.h" void kernel_main() { - uint32_t dst0_addr = get_arg_val(0); - uint32_t dst0_noc_x = get_arg_val(1); - uint32_t dst0_noc_y = get_arg_val(2); - uint32_t cb_id_out0 = get_arg_val(3); - uint32_t dst1_addr = get_arg_val(4); - uint32_t dst1_noc_x = get_arg_val(5); - uint32_t dst1_noc_y = get_arg_val(6); - uint32_t cb_id_out1 = get_arg_val(7); - uint32_t num_tiles = get_arg_val(8); - uint32_t ublock_size_tiles = get_arg_val(9); + uint32_t dst0_addr = get_arg_val(0); + uint32_t dst0_dram_bank_id = get_arg_val(1); + uint32_t cb_id_out0 = get_arg_val(2); + uint32_t dst1_addr = get_arg_val(3); + uint32_t dst1_dram_bank_id = get_arg_val(4); + uint32_t cb_id_out1 = get_arg_val(5); + uint32_t 
num_tiles = get_arg_val(6); + uint32_t ublock_size_tiles = get_arg_val(7); uint32_t ublock0_size_bytes = get_tile_size(cb_id_out0) * ublock_size_tiles; uint32_t ublock1_size_bytes = get_tile_size(cb_id_out1) * ublock_size_tiles; for (uint32_t i = 0; i < num_tiles; i += ublock_size_tiles) { - uint64_t dst0_noc_addr = get_noc_addr(dst0_noc_x, dst0_noc_y, dst0_addr); - uint64_t dst1_noc_addr = get_noc_addr(dst1_noc_x, dst1_noc_y, dst1_addr); + uint64_t dst0_noc_addr = get_noc_addr_from_bank_id(dst0_dram_bank_id, dst0_addr); + uint64_t dst1_noc_addr = get_noc_addr_from_bank_id(dst1_dram_bank_id, dst1_addr); cb_wait_front(cb_id_out0, ublock_size_tiles); cb_wait_front(cb_id_out1, ublock_size_tiles); diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_cb_test.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_cb_test.cpp index ccbac8d1439b..76c0fd42dcd9 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_cb_test.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_cb_test.cpp @@ -4,17 +4,12 @@ #include "dataflow_api.h" -inline __attribute__((always_inline)) void pop_from_cb_and_write( - const uint32_t cb_id, - uint32_t num_tiles_per_cb, - uint32_t ublock_size_tiles, - uint32_t ublock_size_bytes, - uint32_t dram_dst_noc_x, - uint32_t dram_dst_noc_y, - uint32_t& dram_buffer_dst_addr) { +inline __attribute__((always_inline)) +void pop_from_cb_and_write(const uint32_t cb_id, uint32_t num_tiles_per_cb, uint32_t ublock_size_tiles, uint32_t ublock_size_bytes, + uint32_t bank_id, uint32_t& dram_buffer_dst_addr) { for (uint32_t i = 0; i < num_tiles_per_cb; i += ublock_size_tiles) { // DRAM NOC dst address - std::uint64_t dram_buffer_dst_noc_addr = get_noc_addr(dram_dst_noc_x, dram_dst_noc_y, dram_buffer_dst_addr); + std::uint64_t dram_buffer_dst_noc_addr = get_noc_addr_from_bank_id(bank_id, dram_buffer_dst_addr); cb_wait_front(cb_id, ublock_size_tiles); uint32_t l1_read_addr = get_read_ptr(cb_id); @@ -27,21 +22,14 @@ inline 
__attribute__((always_inline)) void pop_from_cb_and_write( } void kernel_main() { - std::uint32_t dram_buffer_dst_addr = get_arg_val(0); - std::uint32_t dram_dst_noc_x = get_arg_val(1); - std::uint32_t dram_dst_noc_y = get_arg_val(2); - std::uint32_t num_tiles_per_cb = get_arg_val(3); + std::uint32_t dram_buffer_dst_addr = get_arg_val(0); + std::uint32_t bank_id = get_arg_val(1); + std::uint32_t num_tiles_per_cb = get_arg_val(2); constexpr uint32_t cb_id = get_compile_time_arg_val(0); constexpr uint32_t ublock_size_tiles = get_compile_time_arg_val(1); uint32_t ublock_size_bytes = get_tile_size(cb_id) * ublock_size_tiles; - pop_from_cb_and_write( - cb_id, - num_tiles_per_cb, - ublock_size_tiles, - ublock_size_bytes, - dram_dst_noc_x, - dram_dst_noc_y, - dram_buffer_dst_addr); + pop_from_cb_and_write(cb_id, num_tiles_per_cb, ublock_size_tiles, ublock_size_bytes, + bank_id, dram_buffer_dst_addr); } diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_last_stage.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_last_stage.cpp index 260382f223f9..5e5f7583d7e7 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_last_stage.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_last_stage.cpp @@ -12,6 +12,11 @@ void kernel_main() { std::uint32_t num_tiles = get_arg_val(3); std::uint32_t num_repetitions = get_arg_val(4); + std::uint32_t buffer_dst_addr = get_arg_val(0); + std::uint32_t dst_bank_id = get_arg_val(1); + std::uint32_t num_tiles = get_arg_val(2); + std::uint32_t num_repetitions = get_arg_val(3); + constexpr uint32_t cb_id = get_compile_time_arg_val(0); constexpr uint32_t block_size_tiles = get_compile_time_arg_val(1); @@ -20,7 +25,7 @@ void kernel_main() { for (uint32_t j = 0; j < num_repetitions; j++) { uint32_t dst_addr = buffer_dst_addr; for (uint32_t i = 0; i < num_tiles; i += block_size_tiles) { - std::uint64_t buffer_dst_noc_addr = get_noc_addr(dst_noc_x, dst_noc_y, dst_addr); + std::uint64_t buffer_dst_noc_addr 
= get_noc_addr_from_bank_id(dst_bank_id, dst_addr); cb_wait_front(cb_id, block_size_tiles); diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary_8bank.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary_8bank.cpp index 4b8041fb5329..ffd7832242ca 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary_8bank.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary_8bank.cpp @@ -5,8 +5,8 @@ #include "dataflow_api.h" void kernel_main() { - uint32_t dst_addr = get_arg_val(0); - uint32_t num_tiles = get_arg_val(3); // Index 3 to match with regular writer_unary + uint32_t dst_addr = get_arg_val(0); + uint32_t num_tiles = get_arg_val(2); // Index 2 to match with regular writer_unary constexpr uint32_t cb_id_out0 = 16; constexpr uint32_t onetile = 1; diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary_pop_n.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary_pop_n.cpp index 23a0016c3118..67a502304573 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary_pop_n.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary_pop_n.cpp @@ -5,18 +5,17 @@ #include "dataflow_api.h" void kernel_main() { - uint32_t dst_addr = get_arg_val(0); - uint32_t dst_noc_x = get_arg_val(1); - uint32_t dst_noc_y = get_arg_val(2); - uint32_t num_tiles = get_arg_val(3); - uint32_t cb_id_out0 = get_arg_val(4); - uint32_t ublock_size_tiles = get_arg_val(5); - bool writer_only = get_arg_val(6); + uint32_t dst_addr = get_arg_val(0); + uint32_t dst_dram_bank_id = get_arg_val(1); + uint32_t num_tiles = get_arg_val(2); + uint32_t cb_id_out0 = get_arg_val(3); + uint32_t ublock_size_tiles = get_arg_val(4); + bool writer_only = get_arg_val(5); uint32_t ublock_size_bytes = get_tile_size(cb_id_out0) * ublock_size_tiles; for (uint32_t i = 0; i < num_tiles; i += ublock_size_tiles) { - uint64_t dst_noc_addr = get_noc_addr(dst_noc_x, dst_noc_y, dst_addr); + uint64_t dst_noc_addr = 
get_noc_addr_from_bank_id(dst_dram_bank_id, dst_addr); if (writer_only == false) { cb_wait_front(cb_id_out0, ublock_size_tiles); } diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary_transpose_wh.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary_transpose_wh.cpp index ebd23e453e32..b0c9cfcb1460 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary_transpose_wh.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary_transpose_wh.cpp @@ -6,8 +6,8 @@ void kernel_main() { uint32_t dst_addr = get_arg_val(0); - uint32_t dst_noc_x = get_arg_val(1); - uint32_t dst_noc_y = get_arg_val(2); + uint32_t dst_dram_bank_id = get_arg_val(1); + // uint32_t unused = get_arg_val(2); // uint32_t num_tiles = get_arg_val(3); uint32_t N = get_arg_val(4); uint32_t Ht = get_arg_val(5); @@ -26,9 +26,9 @@ void kernel_main() { // this writer will write a NWH tensor in NHW order for (uint32_t n = 0; n < N; n++) { dst_addr = dst_addrN; - for (uint32_t w = 0; w < Wt; w++) { - for (uint32_t h = 0; h < Ht; h++) { - uint64_t dst_noc_addr = get_noc_addr(dst_noc_x, dst_noc_y, dst_addr); + for (uint32_t w = 0; w(dst_dram_bank_id, dst_addr); cb_wait_front(cb_id_out0, ublock_size_tiles); uint32_t l1_read_addr = get_read_ptr(cb_id_out0); noc_async_write(l1_read_addr, dst_noc_addr, ublock_size_bytes); diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unswizzle.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unswizzle.cpp index 1bdbe5b2de4f..77cbac915db8 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unswizzle.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unswizzle.cpp @@ -5,16 +5,15 @@ #include "dataflow_api.h" void kernel_main() { - uint32_t dst_addr = get_arg_val(0); - uint32_t dst_noc_x = get_arg_val(1); - uint32_t dst_noc_y = get_arg_val(2); - uint32_t inner_r = get_arg_val(3); - uint32_t inner_c = get_arg_val(4); - uint32_t num_sub_blocks_m = get_arg_val(5); - uint32_t 
num_sub_blocks_n = get_arg_val(6); - uint32_t stride_r = get_arg_val(7); - uint32_t stride_subblock_r = get_arg_val(8); - uint32_t stride_subblock_c = get_arg_val(9); + uint32_t dst_addr = get_arg_val(0); + uint32_t dst_bank_id = get_arg_val(1); + uint32_t inner_r = get_arg_val(2); + uint32_t inner_c = get_arg_val(3); + uint32_t num_sub_blocks_m = get_arg_val(4); + uint32_t num_sub_blocks_n = get_arg_val(5); + uint32_t stride_r = get_arg_val(6); + uint32_t stride_subblock_r = get_arg_val(7); + uint32_t stride_subblock_c = get_arg_val(8); constexpr uint32_t cb_id_out0 = 16; @@ -29,8 +28,8 @@ void kernel_main() { uint32_t dram_address_r = dram_address_block_beginning; for (uint32_t r = 0; r < inner_r; r++) { uint32_t dram_address_c = dram_address_r; - for (uint32_t c = 0; c < inner_c; c++) { - uint64_t dst_noc_addr = get_noc_addr(dst_noc_x, dst_noc_y, dram_address_c); + for(uint32_t c = 0; c < inner_c; c++) { + uint64_t dst_noc_addr = get_noc_addr_from_bank_id(dst_bank_id, dram_address_c); cb_wait_front(cb_id_out0, ublock_size_tiles); uint32_t l1_read_addr = get_read_ptr(cb_id_out0); diff --git a/tests/tt_metal/tt_metal/test_kernels/misc/print_tile.cpp b/tests/tt_metal/tt_metal/test_kernels/misc/print_tile.cpp index e8fb86620973..800f924f6e43 100644 --- a/tests/tt_metal/tt_metal/test_kernels/misc/print_tile.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/misc/print_tile.cpp @@ -33,11 +33,9 @@ void MAIN { // Read out the tile we want to print using BRISC, put it in c_in0 constexpr uint32_t cb_id = tt::CBIndex::c_0; #if defined(COMPILE_FOR_BRISC) - uint32_t src_addr = get_arg_val(0); - uint32_t src_noc_x = get_arg_val(1); - uint32_t src_noc_y = get_arg_val(2); - - uint64_t src_noc_addr = get_noc_addr(src_noc_x, src_noc_y, src_addr); + uint32_t src_addr = get_arg_val(0); + uint32_t src_bank_id = get_arg_val(1); + uint64_t src_noc_addr = get_noc_addr_from_bank_id(src_bank_id, src_addr); cb_reserve_back(cb_id, 1); noc_async_read(src_noc_addr, get_write_ptr(cb_id), 
get_tile_size(cb_id)); noc_async_read_barrier(); diff --git a/tests/tt_metal/tt_metal/test_kernels/misc/watcher_asserts.cpp b/tests/tt_metal/tt_metal/test_kernels/misc/watcher_asserts.cpp index 13406c2423b1..753fdfbe4961 100644 --- a/tests/tt_metal/tt_metal/test_kernels/misc/watcher_asserts.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/misc/watcher_asserts.cpp @@ -39,9 +39,10 @@ void MAIN { #else tt_l1_ptr mailboxes_t* const mailboxes = (tt_l1_ptr mailboxes_t*)(MEM_MAILBOX_BASE); #endif - uint64_t dispatch_addr = - NOC_XY_ADDR(NOC_X(mailboxes->go_message.master_x), - NOC_Y(mailboxes->go_message.master_y), DISPATCH_MESSAGE_ADDR + mailboxes->go_message.dispatch_message_offset); + uint64_t dispatch_addr = NOC_XY_ADDR( + NOC_X(mailboxes->go_message.master_x), + NOC_Y(mailboxes->go_message.master_y), + DISPATCH_MESSAGE_ADDR + mailboxes->go_message.dispatch_message_offset); noc_fast_atomic_increment(noc_index, NCRISC_AT_CMD_BUF, dispatch_addr, NOC_UNICAST_WRITE_VC, 1, 31 /*wrap*/, false /*linked*/); } #else diff --git a/tests/tt_metal/tt_metal/test_l1_to_l1_multi_core.cpp b/tests/tt_metal/tt_metal/test_l1_to_l1_multi_core.cpp index 7031c71c949e..e034b925d25d 100644 --- a/tests/tt_metal/tt_metal/test_l1_to_l1_multi_core.cpp +++ b/tests/tt_metal/tt_metal/test_l1_to_l1_multi_core.cpp @@ -69,7 +69,6 @@ int main(int argc, char** argv) { .buffer_type = tt_metal::BufferType::DRAM}; auto src_dram_buffer = CreateBuffer(dram_config); uint32_t dram_buffer_src_addr = src_dram_buffer->address(); - auto dram_src_noc_xy = src_dram_buffer->noc_coordinates(); tt_metal::detail::WriteToBuffer(src_dram_buffer, src_vec); auto l1_to_l1_kernel = tt_metal::CreateKernel( @@ -84,15 +83,14 @@ int main(int argc, char** argv) { l1_to_l1_kernel, core, {dram_buffer_src_addr, - (std::uint32_t)dram_src_noc_xy.x, - (std::uint32_t)dram_src_noc_xy.y, - l1_buffer_addr, - l1_buffer_addr, - (uint32_t)dst_soc_core.x, - (uint32_t)dst_soc_core.y, - num_tiles, - tile_size_bytes, - total_tiles_size_bytes}); + 0, 
+ l1_buffer_addr, + l1_buffer_addr, + (uint32_t)dst_soc_core.x, + (uint32_t)dst_soc_core.y, + num_tiles, + tile_size_bytes, + total_tiles_size_bytes}); } } diff --git a/tests/tt_metal/tt_metal/test_matmul_large_block.cpp b/tests/tt_metal/tt_metal/test_matmul_large_block.cpp index b7f4dc8a7608..3d870b8fa378 100644 --- a/tests/tt_metal/tt_metal/test_matmul_large_block.cpp +++ b/tests/tt_metal/tt_metal/test_matmul_large_block.cpp @@ -279,17 +279,11 @@ bool test_matmul_large_block(tt_metal::Device* device, bool activations_rm, bool auto src1_dram_buffer = CreateBuffer(weights_config); auto dst_dram_buffer = CreateBuffer(dst_config); - auto dram_src0_noc_xy = src0_dram_buffer->noc_coordinates(); - auto dram_src1_noc_xy = src1_dram_buffer->noc_coordinates(); - auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); - const std::array mm_reader_rt_args{ src0_dram_buffer->address(), - (std::uint32_t)dram_src0_noc_xy.x, - (std::uint32_t)dram_src0_noc_xy.y, + (uint32_t)0, src1_dram_buffer->address(), - (std::uint32_t)dram_src1_noc_xy.x, - (std::uint32_t)dram_src1_noc_xy.y, + (uint32_t)0, (std::uint32_t)(K / in0_block_w), // num_blocks M * in0_block_w, // input 0 block num tiles N * in0_block_w, // input 1 block num tiles @@ -300,17 +294,12 @@ bool test_matmul_large_block(tt_metal::Device* device, bool activations_rm, bool string writer_kernel; if (output_rm) { writer_kernel = "tt_metal/kernels/dataflow/writer_unary.cpp"; - writer_rt_args = { - dst_dram_buffer->address(), - (std::uint32_t)dram_dst_noc_xy.x, - (std::uint32_t)dram_dst_noc_xy.y, - uint(M * N)}; + writer_rt_args = {dst_dram_buffer->address(), (uint32_t)0, uint(M * N)}; } else { writer_kernel = "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unswizzle.cpp"; writer_rt_args = { dst_dram_buffer->address(), - (std::uint32_t)dram_dst_noc_xy.x, - (std::uint32_t)dram_dst_noc_xy.y, + (uint32_t)0, (std::uint32_t)out_subblock_h, // num tiles per sub block m (std::uint32_t)out_subblock_w, // num tiles per sub block 
n (std::uint32_t)M / out_subblock_h, // num sub blocks m diff --git a/tests/tt_metal/tt_metal/test_matmul_multi_core_single_dram.cpp b/tests/tt_metal/tt_metal/test_matmul_multi_core_single_dram.cpp index e26fd37047a9..63a1f3cc9721 100644 --- a/tests/tt_metal/tt_metal/test_matmul_multi_core_single_dram.cpp +++ b/tests/tt_metal/tt_metal/test_matmul_multi_core_single_dram.cpp @@ -383,10 +383,6 @@ int main(int argc, char** argv) { TT_FATAL(dram_buffer_src1_addr + dram_buffer_size_weights < 1024 * 1024 * 1024, "Error"); TT_FATAL(dram_buffer_dst_addr + dram_buffer_size_out < 1024 * 1024 * 1024, "Error"); - auto dram_src0_noc_xy = device->dram_core_from_dram_channel(dram_src0_channel_id); - auto dram_src1_noc_xy = device->dram_core_from_dram_channel(dram_src1_channel_id); - auto dram_dst_noc_xy = device->dram_core_from_dram_channel(dram_dst_channel_id); - auto activations_tilized = tilize(activation_slice, per_core_M * 32, K * 32); auto activations_tile_layout = convert_to_tile_layout(activations_tilized); auto activations = pack_bfloat16_vec_into_uint32_vec(activations_tile_layout); @@ -402,11 +398,9 @@ int main(int argc, char** argv) { const std::array mm_reader_args = { (std::uint32_t)dram_buffer_src0_addr, - (std::uint32_t)dram_src0_noc_xy.x, - (std::uint32_t)dram_src0_noc_xy.y, + (std::uint32_t)0, (std::uint32_t)dram_buffer_src1_addr, - (std::uint32_t)dram_src1_noc_xy.x, - (std::uint32_t)dram_src1_noc_xy.y, + (std::uint32_t)0, (std::uint32_t)(K / in0_block_w), // num_blocks (std::uint32_t)per_core_M * in0_block_w, // input 0 block num tiles (std::uint32_t)per_core_N * in0_block_w, // input 1 block num tiles @@ -415,8 +409,7 @@ int main(int argc, char** argv) { const std::array writer_args = { (std::uint32_t)dram_buffer_dst_addr, - (std::uint32_t)dram_dst_noc_xy.x, - (std::uint32_t)dram_dst_noc_xy.y, + (std::uint32_t)0, (std::uint32_t)out_subblock_h, // num tiles per sub block m (std::uint32_t)out_subblock_w, // num tiles per sub block n (std::uint32_t)per_core_M / 
out_subblock_h, // num sub blocks m diff --git a/tests/tt_metal/tt_metal/test_matmul_multi_tile.cpp b/tests/tt_metal/tt_metal/test_matmul_multi_tile.cpp index 4cf23e123cf5..5d849d447ede 100644 --- a/tests/tt_metal/tt_metal/test_matmul_multi_tile.cpp +++ b/tests/tt_metal/tt_metal/test_matmul_multi_tile.cpp @@ -147,11 +147,6 @@ bool run_matmul(const tt::ARCH& arch, const bool with_bias) { auto dst_dram_buffer = CreateBuffer(dst_config); - auto dram_src0_noc_xy = src0_dram_buffer->noc_coordinates(); - auto dram_src1_noc_xy = src1_dram_buffer->noc_coordinates(); - - auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); - uint32_t src0_cb_index = 0; uint32_t cb0_tiles = M * 2; tt_metal::CircularBufferConfig cb_src0_config = @@ -257,11 +252,9 @@ bool run_matmul(const tt::ARCH& arch, const bool with_bias) { vector reader_l1_args = { src0_dram_buffer->address(), - (std::uint32_t)dram_src0_noc_xy.x, - (std::uint32_t)dram_src0_noc_xy.y, + (uint32_t)0, src1_dram_buffer->address(), - (std::uint32_t)dram_src1_noc_xy.x, - (std::uint32_t)dram_src1_noc_xy.y, + (uint32_t)0, K, M, N, @@ -270,13 +263,7 @@ bool run_matmul(const tt::ARCH& arch, const bool with_bias) { with_bias}; if (with_bias) { - auto dram_src2_noc_xy = src2_dram_buffer->noc_coordinates(); - vector bias_args = { - src2_dram_buffer->address(), - (std::uint32_t)dram_src2_noc_xy.x, - (std::uint32_t)dram_src2_noc_xy.y, - N, - N * single_tile_size}; + vector bias_args = {src2_dram_buffer->address(), (uint32_t)0, N, N * single_tile_size}; for (uint32_t arg : bias_args) { reader_l1_args.push_back(arg); @@ -285,11 +272,7 @@ bool run_matmul(const tt::ARCH& arch, const bool with_bias) { tt_metal::SetRuntimeArgs(program, mm_reader_kernel, core, reader_l1_args); - tt_metal::SetRuntimeArgs( - program, - unary_writer_kernel, - core, - {dst_dram_buffer->address(), (std::uint32_t)dram_dst_noc_xy.x, (std::uint32_t)dram_dst_noc_xy.y, M * N}); + tt_metal::SetRuntimeArgs(program, unary_writer_kernel, core, 
{dst_dram_buffer->address(), (uint32_t)0, M * N}); tt_metal::detail::LaunchProgram(device, program); diff --git a/tests/tt_metal/tt_metal/test_matmul_single_core.cpp b/tests/tt_metal/tt_metal/test_matmul_single_core.cpp index 32f4b2fdffed..f45d6a460bbe 100644 --- a/tests/tt_metal/tt_metal/test_matmul_single_core.cpp +++ b/tests/tt_metal/tt_metal/test_matmul_single_core.cpp @@ -217,10 +217,6 @@ int main(int argc, char** argv) { auto src1_dram_buffer = CreateBuffer(weights_config); auto dst_dram_buffer = CreateBuffer(dst_config); - auto dram_src0_noc_xy = src0_dram_buffer->noc_coordinates(); - auto dram_src1_noc_xy = src1_dram_buffer->noc_coordinates(); - auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); - uint32_t src0_cb_index = 0; uint32_t cb0_tiles = M * in0_block_w * 2; tt_metal::CircularBufferConfig cb_src0_config = @@ -249,11 +245,9 @@ int main(int argc, char** argv) { const std::array mm_reader_rt_args{ src0_dram_buffer->address(), - (std::uint32_t)dram_src0_noc_xy.x, - (std::uint32_t)dram_src0_noc_xy.y, + (uint32_t)0, src1_dram_buffer->address(), - (std::uint32_t)dram_src1_noc_xy.x, - (std::uint32_t)dram_src1_noc_xy.y, + (uint32_t)0, (std::uint32_t)(K / in0_block_w), // num_blocks M * in0_block_w, // input 0 block num tiles N * in0_block_w, // input 1 block num tiles @@ -262,8 +256,7 @@ int main(int argc, char** argv) { const std::array writer_rt_args{ dst_dram_buffer->address(), - (std::uint32_t)dram_dst_noc_xy.x, - (std::uint32_t)dram_dst_noc_xy.y, + (uint32_t)0, (std::uint32_t)out_subblock_h, // num tiles per sub block m (std::uint32_t)out_subblock_w, // num tiles per sub block n (std::uint32_t)M / out_subblock_h, // num sub blocks m diff --git a/tests/tt_metal/tt_metal/test_matmul_single_core_small.cpp b/tests/tt_metal/tt_metal/test_matmul_single_core_small.cpp index 78bdbc88de09..8264184a457b 100644 --- a/tests/tt_metal/tt_metal/test_matmul_single_core_small.cpp +++ b/tests/tt_metal/tt_metal/test_matmul_single_core_small.cpp @@ -218,10 +218,6 
@@ int main(int argc, char** argv) { auto src1_dram_buffer = CreateBuffer(weights_config); auto dst_dram_buffer = CreateBuffer(dst_config); - auto dram_src0_noc_xy = src0_dram_buffer->noc_coordinates(); - auto dram_src1_noc_xy = src1_dram_buffer->noc_coordinates(); - auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); - uint32_t src0_cb_index = 0; uint32_t cb0_tiles = M * in0_block_w * 2; tt_metal::CircularBufferConfig cb_src0_config = @@ -251,11 +247,9 @@ int main(int argc, char** argv) { const std::array mm_reader_rt_args{ src0_dram_buffer->address(), - (std::uint32_t)dram_src0_noc_xy.x, - (std::uint32_t)dram_src0_noc_xy.y, + (uint32_t)0, src1_dram_buffer->address(), - (std::uint32_t)dram_src1_noc_xy.x, - (std::uint32_t)dram_src1_noc_xy.y, + (uint32_t)0, (std::uint32_t)(K / in0_block_w), // num_blocks M * in0_block_w, // input 0 block num tiles N * in0_block_w, // input 1 block num tiles @@ -264,8 +258,7 @@ int main(int argc, char** argv) { const std::array writer_rt_args{ dst_dram_buffer->address(), - (std::uint32_t)dram_dst_noc_xy.x, - (std::uint32_t)dram_dst_noc_xy.y, + (uint32_t)0, (std::uint32_t)out_subblock_h, // num tiles per sub block m (std::uint32_t)out_subblock_w, // num tiles per sub block n (std::uint32_t)M / out_subblock_h, // num sub blocks m diff --git a/tests/tt_metal/tt_metal/test_matmul_single_tile.cpp b/tests/tt_metal/tt_metal/test_matmul_single_tile.cpp index de499d3528c9..42f5511eef98 100644 --- a/tests/tt_metal/tt_metal/test_matmul_single_tile.cpp +++ b/tests/tt_metal/tt_metal/test_matmul_single_tile.cpp @@ -52,10 +52,6 @@ int main(int argc, char** argv) { auto src1_dram_buffer = CreateBuffer(dram_config); auto dst_dram_buffer = CreateBuffer(dram_config); - auto dram_src0_noc_xy = src0_dram_buffer->noc_coordinates(); - auto dram_src1_noc_xy = src1_dram_buffer->noc_coordinates(); - auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); - uint32_t src0_cb_index = 0; uint32_t num_input_tiles = 2; tt_metal::CircularBufferConfig 
cb_src0_config = @@ -136,11 +132,9 @@ int main(int argc, char** argv) { mm_reader_kernel, core, {src0_dram_buffer->address(), - (std::uint32_t)dram_src0_noc_xy.x, - (std::uint32_t)dram_src0_noc_xy.y, + (uint32_t)0, src1_dram_buffer->address(), - (std::uint32_t)dram_src1_noc_xy.x, - (std::uint32_t)dram_src1_noc_xy.y, + (uint32_t)0, 1, 1, 1, @@ -148,13 +142,7 @@ int main(int argc, char** argv) { 1 * single_tile_size}); tt_metal::SetRuntimeArgs( - program, - unary_writer_kernel, - core, - {dst_dram_buffer->address(), - (std::uint32_t)dram_dst_noc_xy.x, - (std::uint32_t)dram_dst_noc_xy.y, - num_tiles}); + program, unary_writer_kernel, core, {dst_dram_buffer->address(), (uint32_t)0, num_tiles}); tt_metal::detail::LaunchProgram(device, program); diff --git a/tests/tt_metal/tt_metal/test_matmul_single_tile_bfp8b.cpp b/tests/tt_metal/tt_metal/test_matmul_single_tile_bfp8b.cpp index 8d508209c3ec..7966c969bc25 100644 --- a/tests/tt_metal/tt_metal/test_matmul_single_tile_bfp8b.cpp +++ b/tests/tt_metal/tt_metal/test_matmul_single_tile_bfp8b.cpp @@ -52,10 +52,6 @@ int main(int argc, char** argv) { auto src1_dram_buffer = CreateBuffer(dram_config); auto dst_dram_buffer = CreateBuffer(dram_config); - auto dram_src0_noc_xy = src0_dram_buffer->noc_coordinates(); - auto dram_src1_noc_xy = src1_dram_buffer->noc_coordinates(); - auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); - uint32_t src0_cb_index = 0; uint32_t num_input_tiles = 1; tt_metal::CircularBufferConfig cb_src0_config = @@ -137,25 +133,22 @@ int main(int argc, char** argv) { mm_reader_kernel, core, {src0_dram_buffer->address(), - (std::uint32_t)dram_src0_noc_xy.x, - (std::uint32_t)dram_src0_noc_xy.y, - src1_dram_buffer->address(), - (std::uint32_t)dram_src1_noc_xy.x, - (std::uint32_t)dram_src1_noc_xy.y, - 1, - 1, - 1, - 1 * single_tile_size, - 1 * single_tile_size}); + 0, + src1_dram_buffer->address(), + 0, + 1, + 1, + 1, + 1 * single_tile_size, + 1 * single_tile_size}); tt_metal::SetRuntimeArgs( program, 
unary_writer_kernel, core, {dst_dram_buffer->address(), - (std::uint32_t)dram_dst_noc_xy.x, - (std::uint32_t)dram_dst_noc_xy.y, - num_tiles}); + 0, + num_tiles}); tt_metal::detail::LaunchProgram(device, program); diff --git a/tests/tt_metal/tt_metal/test_matmul_single_tile_output_in_l1.cpp b/tests/tt_metal/tt_metal/test_matmul_single_tile_output_in_l1.cpp index baf73e2b4bac..1a8a77bd6dec 100644 --- a/tests/tt_metal/tt_metal/test_matmul_single_tile_output_in_l1.cpp +++ b/tests/tt_metal/tt_metal/test_matmul_single_tile_output_in_l1.cpp @@ -58,9 +58,9 @@ int main(int argc, char** argv) { auto src1_dram_buffer = CreateBuffer(dram_config); auto dst_l1_buffer = CreateBuffer(l1_config); - auto dram_src0_noc_xy = src0_dram_buffer->noc_coordinates(); - auto dram_src1_noc_xy = src1_dram_buffer->noc_coordinates(); - auto l1_dst_noc_xy = dst_l1_buffer->noc_coordinates(); + auto l1_dst_noc_xy = + device->virtual_core_from_logical_core(dst_l1_buffer->logical_core_from_bank_id(0), CoreType::WORKER); + ; uint32_t src0_cb_index = 0; uint32_t num_input_tiles = 1; @@ -94,7 +94,7 @@ int main(int argc, char** argv) { auto unary_writer_kernel = tt_metal::CreateKernel( program, - "tt_metal/kernels/dataflow/writer_unary.cpp", + "tt_metal/kernels/dataflow/writer_unary_1.cpp", core, tt_metal::DataMovementConfig{ .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); @@ -143,16 +143,14 @@ int main(int argc, char** argv) { mm_reader_kernel, core, {src0_dram_buffer->address(), - (std::uint32_t)dram_src0_noc_xy.x, - (std::uint32_t)dram_src0_noc_xy.y, - src1_dram_buffer->address(), - (std::uint32_t)dram_src1_noc_xy.x, - (std::uint32_t)dram_src1_noc_xy.y, - 1, - 1, - 1, - 1 * single_tile_size, - 1 * single_tile_size}); + 0, + src1_dram_buffer->address(), + 0, + 1, + 1, + 1, + 1 * single_tile_size, + 1 * single_tile_size}); tt_metal::SetRuntimeArgs( program, diff --git a/tests/tt_metal/tt_metal/test_multi_core_kernel.cpp 
b/tests/tt_metal/tt_metal/test_multi_core_kernel.cpp index 194e2d9a4bb4..f3a6cd35a308 100644 --- a/tests/tt_metal/tt_metal/test_multi_core_kernel.cpp +++ b/tests/tt_metal/tt_metal/test_multi_core_kernel.cpp @@ -89,11 +89,7 @@ void compile_and_configure_program( tt_metal::detail::WriteToBuffer(src_dram_buffer, src_vec); } -void set_rt_args( - tt_metal::Program& program, - tt_metal::KernelHandle kernel, - const CoreRange& core_range, - const std::array& rt_args) { +void set_rt_args(tt_metal::Program &program, tt_metal::KernelHandle kernel, const CoreRange &core_range, const std::array &rt_args) { for (auto x = core_range.start_coord.x; x <= core_range.end_coord.x; x++) { for (auto y = core_range.start_coord.y; y <= core_range.end_coord.y; y++) { CoreCoord core = CoreCoord(x, y); @@ -109,22 +105,19 @@ void write_same_runtime_args_to_device( tt_metal::KernelHandle writer_kernel_id, const CoreRange& core_range, int32_t num_tiles, - tt_metal::Buffer& src_dram_buffer, - tt_metal::Buffer& dst_dram_buffer) { - auto dram_src_noc_xy = src_dram_buffer.noc_coordinates(); - auto dram_dst_noc_xy = dst_dram_buffer.noc_coordinates(); + tt_metal::Buffer &src_dram_buffer, + tt_metal::Buffer &dst_dram_buffer) +{ const std::array unary_reader_args{ - (std::uint32_t)src_dram_buffer.address(), - (std::uint32_t)dram_src_noc_xy.x, - (std::uint32_t)dram_src_noc_xy.y, - (std::uint32_t)num_tiles}; + (std::uint32_t)src_dram_buffer.address(), + (std::uint32_t) 0, + (std::uint32_t)num_tiles}; const std::array unary_writer_args{ - (std::uint32_t)dst_dram_buffer.address(), - (std::uint32_t)dram_dst_noc_xy.x, - (std::uint32_t)dram_dst_noc_xy.y, - (std::uint32_t)num_tiles}; + (std::uint32_t)dst_dram_buffer.address(), + (std::uint32_t) 0, + (std::uint32_t)num_tiles}; set_rt_args(program, reader_kernel_id, core_range, unary_reader_args); set_rt_args(program, writer_kernel_id, core_range, unary_writer_args); @@ -138,37 +131,31 @@ void write_unique_writer_runtime_args_to_device( const CoreRange& 
core_range, const CoreRangeSet& core_blocks, int32_t num_tiles, - tt_metal::Buffer& src_dram_buffer, - tt_metal::Buffer& dst_dram_buffer_1, - tt_metal::Buffer& dst_dram_buffer_2, - tt_metal::Buffer& dst_dram_buffer_3) { - auto dram_src_noc_xy = src_dram_buffer.noc_coordinates(); - // All dst buffers use the same DRAM channel - auto dram_dst_noc_xy = dst_dram_buffer_1.noc_coordinates(); + tt_metal::Buffer &src_dram_buffer, + tt_metal::Buffer &dst_dram_buffer_1, + tt_metal::Buffer &dst_dram_buffer_2, + tt_metal::Buffer &dst_dram_buffer_3 +) { // Same readers args because all kernels read from same src const std::array unary_reader_args{ (std::uint32_t)src_dram_buffer.address(), - (std::uint32_t)dram_src_noc_xy.x, - (std::uint32_t)dram_src_noc_xy.y, + (std::uint32_t) 0, (std::uint32_t)num_tiles}; const std::array unary_writer_args_1{ dst_dram_buffer_1.address(), - (std::uint32_t)dram_dst_noc_xy.x, - (std::uint32_t)dram_dst_noc_xy.y, + (std::uint32_t) 0, (std::uint32_t)num_tiles}; const std::array unary_writer_args_2{ dst_dram_buffer_2.address(), - (std::uint32_t)dram_dst_noc_xy.x, - (std::uint32_t)dram_dst_noc_xy.y, + (std::uint32_t) 0, (std::uint32_t)num_tiles}; const std::array unary_writer_args_3{ dst_dram_buffer_3.address(), - (std::uint32_t)dram_dst_noc_xy.x, - (std::uint32_t)dram_dst_noc_xy.y, + (std::uint32_t) 0, (std::uint32_t)num_tiles}; set_rt_args(program, reader_kernel_id, core_range, unary_reader_args); diff --git a/tests/tt_metal/tt_metal/test_multiple_programs.cpp b/tests/tt_metal/tt_metal/test_multiple_programs.cpp index c92fb35bc7b3..e73e7f423ed4 100644 --- a/tests/tt_metal/tt_metal/test_multiple_programs.cpp +++ b/tests/tt_metal/tt_metal/test_multiple_programs.cpp @@ -165,27 +165,13 @@ void write_program_runtime_args_to_device( tt_metal::Buffer& src0_dram_buffer, tt_metal::Buffer& src1_dram_buffer, tt_metal::Buffer& dst_dram_buffer) { - auto dram_src0_noc_xy = src0_dram_buffer.noc_coordinates(); - auto dram_src1_noc_xy = 
src1_dram_buffer.noc_coordinates(); - auto dram_dst_noc_xy = dst_dram_buffer.noc_coordinates(); - tt_metal::SetRuntimeArgs( program, reader_kernel_id, core, - {src0_dram_buffer.address(), - (std::uint32_t)dram_src0_noc_xy.x, - (std::uint32_t)dram_src0_noc_xy.y, - src1_dram_buffer.address(), - (std::uint32_t)dram_src1_noc_xy.x, - (std::uint32_t)dram_src1_noc_xy.y, - num_tiles}); + {src0_dram_buffer.address(), (uint32_t)0, src1_dram_buffer.address(), (uint32_t)0, num_tiles}); - tt_metal::SetRuntimeArgs( - program, - writer_kernel_id, - core, - {dst_dram_buffer.address(), (std::uint32_t)dram_dst_noc_xy.x, (std::uint32_t)dram_dst_noc_xy.y, num_tiles}); + tt_metal::SetRuntimeArgs(program, writer_kernel_id, core, {dst_dram_buffer.address(), (uint32_t)0, num_tiles}); } ////////////////////////////////////////////////////////////////////////////////////////// // 1. First program runs eltwise binary on logical core {0, 0} diff --git a/tests/tt_metal/tt_metal/test_transpose_hc.cpp b/tests/tt_metal/tt_metal/test_transpose_hc.cpp index fc242dd9379a..825bf5a8e568 100644 --- a/tests/tt_metal/tt_metal/test_transpose_hc.cpp +++ b/tests/tt_metal/tt_metal/test_transpose_hc.cpp @@ -68,10 +68,8 @@ int main(int argc, char** argv) { auto src0_dram_buffer = CreateBuffer(dram_config); uint32_t dram_buffer_src0_addr = src0_dram_buffer->address(); - auto dram_src0_noc_xy = src0_dram_buffer->noc_coordinates(); auto dst_dram_buffer = CreateBuffer(dram_config); uint32_t dram_buffer_dst_addr = dst_dram_buffer->address(); - auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); uint32_t src0_cb_index = 0; uint32_t num_buffer_tiles = 2; @@ -130,27 +128,10 @@ int main(int argc, char** argv) { tt_metal::detail::WriteToBuffer(src0_dram_buffer, src0_vec); tt_metal::SetRuntimeArgs( - program, - reader_kernel, - core, - {dram_buffer_src0_addr, - (std::uint32_t)dram_src0_noc_xy.x, - (std::uint32_t)dram_src0_noc_xy.y, - W, - H, - C, - HW, - N, - CHW}); + program, reader_kernel, core, 
{dram_buffer_src0_addr, (uint32_t)0, W, H, C, HW, N, CHW}); tt_metal::SetRuntimeArgs( - program, - unary_writer_kernel, - core, - {dram_buffer_dst_addr, - (std::uint32_t)dram_dst_noc_xy.x, - (std::uint32_t)dram_dst_noc_xy.y, - num_tensor_tiles}); + program, unary_writer_kernel, core, {dram_buffer_dst_addr, (uint32_t)0, num_tensor_tiles}); tt_metal::detail::LaunchProgram(device, program); diff --git a/tests/tt_metal/tt_metal/test_untilize_eltwise_binary.cpp b/tests/tt_metal/tt_metal/test_untilize_eltwise_binary.cpp index 890315de6fec..91ea7efc41bb 100644 --- a/tests/tt_metal/tt_metal/test_untilize_eltwise_binary.cpp +++ b/tests/tt_metal/tt_metal/test_untilize_eltwise_binary.cpp @@ -122,10 +122,6 @@ int main(int argc, char** argv) { auto dst_dram_buffer = CreateBuffer(dram_config); uint32_t dram_buffer_dst_addr = dst_dram_buffer->address(); - auto dram_src0_noc_xy = src0_dram_buffer->noc_coordinates(); - auto dram_src1_noc_xy = src1_dram_buffer->noc_coordinates(); - auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); - uint32_t src0_cb_index = 0; uint32_t num_input_tiles = num_tiles_c; tt_metal::CircularBufferConfig cb_src0_config = @@ -200,21 +196,9 @@ int main(int argc, char** argv) { program, binary_reader_kernel, core, - {dram_buffer_src0_addr, - (std::uint32_t)dram_src0_noc_xy.x, - (std::uint32_t)dram_src0_noc_xy.y, - num_tiles, - dram_buffer_src1_addr, - (std::uint32_t)dram_src1_noc_xy.x, - (std::uint32_t)dram_src1_noc_xy.y, - num_tiles, - 0}); + {dram_buffer_src0_addr, (uint32_t)0, num_tiles, dram_buffer_src1_addr, (uint32_t)0, num_tiles, 0}); - tt_metal::SetRuntimeArgs( - program, - unary_writer_kernel, - core, - {dram_buffer_dst_addr, (std::uint32_t)dram_dst_noc_xy.x, (std::uint32_t)dram_dst_noc_xy.y, num_tiles}); + tt_metal::SetRuntimeArgs(program, unary_writer_kernel, core, {dram_buffer_dst_addr, (uint32_t)0, num_tiles}); tt_metal::detail::LaunchProgram(device, program); diff --git a/tt_metal/common/CMakeLists.txt b/tt_metal/common/CMakeLists.txt 
index f7be3f8b1be8..bf3bd0068ab7 100644 --- a/tt_metal/common/CMakeLists.txt +++ b/tt_metal/common/CMakeLists.txt @@ -1,4 +1,5 @@ set(COMMON_SRCS + ${CMAKE_CURRENT_SOURCE_DIR}/core_assignment.cpp ${CMAKE_CURRENT_SOURCE_DIR}/core_coord.cpp ${CMAKE_CURRENT_SOURCE_DIR}/core_descriptor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/metal_soc_descriptor.cpp diff --git a/tt_metal/common/core_assignment.cpp b/tt_metal/common/core_assignment.cpp new file mode 100644 index 000000000000..6131b31c9d87 --- /dev/null +++ b/tt_metal/common/core_assignment.cpp @@ -0,0 +1,230 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include "core_assignment.hpp" + +namespace tt { +namespace tt_metal { + +void reassign_dram_interface_cores_for_grayskull( + const std::vector& non_worker_rows, + std::vector& dram_interface_workers, + uint32_t full_grid_size_y) { + // Reassign optimally placed DRAM Interface worker cores based on harvesting for GS + for (auto& coord : dram_interface_workers) { + // if row is harvested, move core down by 1 + while (std::find(non_worker_rows.begin(), non_worker_rows.end(), coord.y) != non_worker_rows.end() and + coord.y < (full_grid_size_y - 1)) { + coord.y += 1; + } + } +} + +std::vector reassign_dram_interface_cores_for_wormhole( + const std::vector& non_worker_rows, + const std::vector& dram_interface_workers, + uint32_t num_dram_banks, + uint32_t max_worker_y_physical, + uint32_t min_worker_y_physical) { + // Reassign optimally placed DRAM Interface worker cores based on harvesting for WH + std::vector dram_interface_workers_g1; + std::vector dram_interface_workers_g2; + std::vector dram_interface_worker_y_coords_g1; + std::vector dram_interface_worker_y_coords_g2; + + dram_interface_workers_g1.reserve(num_dram_banks); + dram_interface_worker_y_coords_g1.reserve(num_dram_banks); + dram_interface_workers_g2.reserve(num_dram_banks); + dram_interface_worker_y_coords_g2.reserve(num_dram_banks); + + // Separate Workers into 
2 groups based on which DRAM column they are meant to interface with + for (const auto& core : dram_interface_workers) { + if (core.x == dram_interface_workers.front().x) { + dram_interface_workers_g1.push_back(core); + } else { + dram_interface_workers_g2.push_back(core); + } + } + + // Track the indices of the workers inside each group + std::vector indices_g1(dram_interface_workers_g1.size()); + std::vector indices_g2(dram_interface_workers_g2.size()); + std::iota(indices_g1.begin(), indices_g1.end(), 0); + std::iota(indices_g2.begin(), indices_g2.end(), 0); + + // Sort workers and associated group indices based on y coord + std::sort(indices_g1.begin(), indices_g1.end(), [&dram_interface_workers_g1](int i1, int i2) { + return dram_interface_workers_g1[i1].y < dram_interface_workers_g1[i2].y; + }); + std::sort(indices_g2.begin(), indices_g2.end(), [&dram_interface_workers_g2](int i1, int i2) { + return dram_interface_workers_g2[i1].y < dram_interface_workers_g2[i2].y; + }); + std::sort( + dram_interface_workers_g1.begin(), dram_interface_workers_g1.end(), [](const CoreCoord& a, const CoreCoord& b) { + return a.y < b.y; + }); + std::sort( + dram_interface_workers_g2.begin(), dram_interface_workers_g2.end(), [](const CoreCoord& a, const CoreCoord& b) { + return a.y < b.y; + }); + // Place the bottom-most worker and associated index at the start of the group + std::rotate( + dram_interface_workers_g1.begin(), dram_interface_workers_g1.end() - 1, dram_interface_workers_g1.end()); + std::rotate( + dram_interface_workers_g2.begin(), dram_interface_workers_g2.end() - 1, dram_interface_workers_g2.end()); + std::rotate(indices_g1.begin(), indices_g1.end() - 1, indices_g1.end()); + std::rotate(indices_g2.begin(), indices_g2.end() - 1, indices_g2.end()); + + // Track the shuffled indices + std::vector indices_g1_realloc(dram_interface_workers_g1.size()); + std::vector indices_g2_realloc(dram_interface_workers_g2.size()); + for (int new_index = 0; new_index < 
indices_g1.size(); ++new_index) { + indices_g1_realloc[indices_g1[new_index]] = new_index; + } + for (int new_index = 0; new_index < indices_g2.size(); ++new_index) { + indices_g2_realloc[indices_g2[new_index]] = new_index; + } + // Extract worker y coordinates per group + for (auto core : dram_interface_workers_g1) { + dram_interface_worker_y_coords_g1.push_back(core.y); + } + for (auto core : dram_interface_workers_g2) { + dram_interface_worker_y_coords_g2.push_back(core.y); + } + uint32_t x_step = 3; + // Helper function to shift harvested workers + auto shift_group_based_on_harvesting = [&](std::vector& group, + std::vector& group_y, + uint32_t x_step) { + for (auto& coord : group) { + auto y = coord.y; + + if (std::find(non_worker_rows.begin(), non_worker_rows.end(), y) != non_worker_rows.end() || + std::count(group_y.begin(), group_y.end(), y) >= 2) { + auto shift_coord_based_on_harvesting = [&](int start, int end, int step) { + bool found_new_row = false; + for (int j = start; step > 0 ? j <= end : j >= end; j += step) { + if (std::find(non_worker_rows.begin(), non_worker_rows.end(), j) == non_worker_rows.end() && + std::count(group_y.begin(), group_y.end(), j) == 0) { + coord.y = j; + coord.x += x_step; + x_step--; + found_new_row = true; + break; + } + } + if (not found_new_row) { + for (int j = start; step > 0 ? 
j <= end : j >= end; j += step) { + if (std::find(non_worker_rows.begin(), non_worker_rows.end(), j) == non_worker_rows.end()) { + coord.y = j; + coord.x += x_step; + x_step--; + found_new_row = true; + break; + } + } + } + }; + + if (y >= num_dram_banks - 1) { + shift_coord_based_on_harvesting(max_worker_y_physical, min_worker_y_physical, -1); + } else { + shift_coord_based_on_harvesting(min_worker_y_physical, max_worker_y_physical, 1); + } + } + } + }; + // Shift harvested workers + shift_group_based_on_harvesting(dram_interface_workers_g1, dram_interface_worker_y_coords_g1, x_step); + shift_group_based_on_harvesting(dram_interface_workers_g2, dram_interface_worker_y_coords_g2, x_step); + + // Merge both groups based on original indices (maintain ordering by dram bank_id here) + std::vector shifted_dram_interface_workers; + shifted_dram_interface_workers.reserve(num_dram_banks); + for (int i = 0; i < indices_g1_realloc.size(); ++i) { + shifted_dram_interface_workers.push_back(dram_interface_workers_g1[indices_g1_realloc[i]]); + } + for (int i = 0; i < indices_g2_realloc.size(); ++i) { + shifted_dram_interface_workers.push_back(dram_interface_workers_g2[indices_g2_realloc[i]]); + } + return shifted_dram_interface_workers; +} + +void reassign_dram_interface_cores_for_blackhole( + const std::vector& harvested_cols, + std::vector& dram_interface_workers, + uint32_t full_grid_size_x) { + for (auto& coord : dram_interface_workers) { + // if col is harvested, move core right by 1 + while (std::find(harvested_cols.begin(), harvested_cols.end(), coord.x) != harvested_cols.end() and + coord.x < (full_grid_size_x - 1)) { + coord.x += 1; + } + } +} + +std::vector get_optimal_dram_to_physical_worker_assignment( + ARCH arch, + const std::vector& dram_phy_coords, + uint32_t full_grid_size_x, + uint32_t full_grid_size_y, + std::vector worker_phy_x, + std::vector worker_phy_y) { + // Reassign optimally placed DRAM Interface worker cores based on harvesting for BH + std::vector 
non_worker_rows; + std::vector non_worker_cols; + uint32_t max_worker_y_physical = 0; + uint32_t min_worker_y_physical = std::numeric_limits::max(); + // For GS and WH, rows are harvested. Track them here. + if (arch == ARCH::GRAYSKULL or arch == ARCH::WORMHOLE_B0) { + for (int y_coord = 0; y_coord < full_grid_size_y; ++y_coord) { + if (std::find(worker_phy_y.begin(), worker_phy_y.end(), y_coord) == worker_phy_y.end()) { + non_worker_rows.push_back(y_coord); + } + if (y_coord > max_worker_y_physical) { + max_worker_y_physical = y_coord; + } + if (y_coord < min_worker_y_physical) { + min_worker_y_physical = y_coord; + } + } + } + std::vector dram_interface_workers; + uint32_t num_dram_banks = dram_phy_coords.size(); + // Get the optimal dram -> worker configuration here. + // For GS, worker cores are placed below the DRAM Controller. + // For WH, worker cores are placed to the right of the DRAM Controller. + for (int i = 0; i < num_dram_banks; ++i) { + auto dram_core = dram_phy_coords[i]; + if (arch == ARCH::GRAYSKULL) { + dram_interface_workers.push_back(CoreCoord(dram_core.x, dram_core.y + 1)); + } else if (arch == ARCH::WORMHOLE_B0 or arch == ARCH::BLACKHOLE) { + dram_interface_workers.push_back(CoreCoord(dram_core.x + 1, dram_core.y)); + } + } + + if (arch == ARCH::GRAYSKULL) { + // Reassign worker cores based on harvesting for GS. + reassign_dram_interface_cores_for_grayskull(non_worker_rows, dram_interface_workers, full_grid_size_y); + return dram_interface_workers; + } else if (arch == ARCH::WORMHOLE_B0) { + // Reassign worker cores based on harvesting for WH. + return reassign_dram_interface_cores_for_wormhole( + non_worker_rows, dram_interface_workers, num_dram_banks, max_worker_y_physical, min_worker_y_physical); + } else if (arch == ARCH::BLACKHOLE) { + // Reassign worker cores based on harvesting for BH. + // Need to account for column harvesting here. 
+ for (int x_coord = 0; x_coord < full_grid_size_x; ++x_coord) { + if (std::find(worker_phy_x.begin(), worker_phy_x.end(), x_coord) == worker_phy_x.end()) { + non_worker_cols.push_back(x_coord); + } + } + reassign_dram_interface_cores_for_blackhole(non_worker_cols, dram_interface_workers, full_grid_size_x); + return dram_interface_workers; + } + TT_THROW("Invalid Arch Name specified"); +} + +} // namespace tt_metal +} // namespace tt diff --git a/tt_metal/common/core_assignment.hpp b/tt_metal/common/core_assignment.hpp new file mode 100644 index 000000000000..d10bcdd3a10c --- /dev/null +++ b/tt_metal/common/core_assignment.hpp @@ -0,0 +1,23 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include "core_coord.hpp" +#include "tt_metal/llrt/tt_cluster.hpp" + +namespace tt { +namespace tt_metal { +// Returns an ordered list of DRAM Bank ID to optimally placed worker cores. Placing DRAM reader or writer +// kernels on these worker cores will minimize NOC congestion and the number of NOC hops required to complete +// a DRAM read or write. +// Worker cores are derived based on architecture, harvesting configurations and DRAM Controller placement. 
+std::vector get_optimal_dram_to_physical_worker_assignment( + ARCH arch, + const std::vector& dram_phy_coords, + uint32_t full_grid_size_x, + uint32_t full_grid_size_y, + std::vector worker_phy_x, + std::vector worker_phy_y); + +} // namespace tt_metal +} // namespace tt diff --git a/tt_metal/common/core_descriptor.hpp b/tt_metal/common/core_descriptor.hpp index d14d9f9a4456..7a7dc9b848d6 100644 --- a/tt_metal/common/core_descriptor.hpp +++ b/tt_metal/common/core_descriptor.hpp @@ -171,13 +171,4 @@ inline const std::vector& get_logical_dispatch_cores( return logical_dispatch_cores; } -/// @brief Get physical core coordinate from a logical location (device ID + core coordinate) -/// @param logical_location tt_cxy_pair describing chip and logical location core coordinate -/// @param core_type CoreType of core to translate -/// @return physical CoreCoord on the same chip as `logical_location` -inline CoreCoord get_physical_core_coordinate(const tt_cxy_pair& logical_location, const CoreType& core_type) { - const metal_SocDescriptor& soc_desc = tt::Cluster::instance().get_soc_desc(logical_location.chip); - return soc_desc.get_physical_core_from_logical_core(CoreCoord(logical_location.x, logical_location.y), core_type); -} - } // namespace tt diff --git a/tt_metal/hw/firmware/src/brisc.cc b/tt_metal/hw/firmware/src/brisc.cc index 5554f2edcf37..f376b9746e7d 100644 --- a/tt_metal/hw/firmware/src/brisc.cc +++ b/tt_metal/hw/firmware/src/brisc.cc @@ -392,9 +392,10 @@ int main() { // Querying the noc_index is safe here, since the RUN_MSG_RESET_READ_PTR go signal is currently guaranteed // to only be seen after a RUN_MSG_GO signal, which will set the noc_index to a valid value. // For future proofing, the noc_index value is initialized to 0, to ensure an invalid NOC txn is not issued. 
- uint64_t dispatch_addr = - NOC_XY_ADDR(NOC_X(mailboxes->go_message.master_x), - NOC_Y(mailboxes->go_message.master_y), DISPATCH_MESSAGE_ADDR + mailboxes->go_message.dispatch_message_offset); + uint64_t dispatch_addr = NOC_XY_ADDR( + NOC_X(mailboxes->go_message.master_x), + NOC_Y(mailboxes->go_message.master_y), + DISPATCH_MESSAGE_ADDR + mailboxes->go_message.dispatch_message_offset); mailboxes->go_message.signal = RUN_MSG_DONE; // Notify dispatcher that this has been done DEBUG_SANITIZE_NOC_ADDR(noc_index, dispatch_addr, 4); @@ -512,9 +513,10 @@ int main() { if (launch_msg_address->kernel_config.mode == DISPATCH_MODE_DEV) { // Set launch message to invalid, so that the next time this slot is encountered, kernels are only run if a valid launch message is sent. launch_msg_address->kernel_config.enables = 0; - uint64_t dispatch_addr = - NOC_XY_ADDR(NOC_X(mailboxes->go_message.master_x), - NOC_Y(mailboxes->go_message.master_y), DISPATCH_MESSAGE_ADDR + mailboxes->go_message.dispatch_message_offset); + uint64_t dispatch_addr = NOC_XY_ADDR( + NOC_X(mailboxes->go_message.master_x), + NOC_Y(mailboxes->go_message.master_y), + DISPATCH_MESSAGE_ADDR + mailboxes->go_message.dispatch_message_offset); DEBUG_SANITIZE_NOC_ADDR(noc_index, dispatch_addr, 4); // Only executed if watcher is enabled. Ensures that we don't report stale data due to invalid launch // messages in the ring buffer. 
Must be executed before the atomic increment, as after that the launch diff --git a/tt_metal/hw/firmware/src/erisc.cc b/tt_metal/hw/firmware/src/erisc.cc index 44d760a069c2..883b615c9c7e 100644 --- a/tt_metal/hw/firmware/src/erisc.cc +++ b/tt_metal/hw/firmware/src/erisc.cc @@ -92,9 +92,10 @@ void __attribute__((noinline)) Application(void) { if (launch_msg_address->kernel_config.mode == DISPATCH_MODE_DEV) { launch_msg_address->kernel_config.enables = 0; - uint64_t dispatch_addr = - NOC_XY_ADDR(NOC_X(mailboxes->go_message.master_x), - NOC_Y(mailboxes->go_message.master_y), DISPATCH_MESSAGE_ADDR + mailboxes->go_message.dispatch_message_offset); + uint64_t dispatch_addr = NOC_XY_ADDR( + NOC_X(mailboxes->go_message.master_x), + NOC_Y(mailboxes->go_message.master_y), + DISPATCH_MESSAGE_ADDR + mailboxes->go_message.dispatch_message_offset); CLEAR_PREVIOUS_LAUNCH_MESSAGE_ENTRY_FOR_WATCHER(); internal_::notify_dispatch_core_done(dispatch_addr); mailboxes->launch_msg_rd_ptr = (launch_msg_rd_ptr + 1) & (launch_msg_buffer_num_entries - 1); @@ -105,9 +106,10 @@ void __attribute__((noinline)) Application(void) { } else if (go_message_signal == RUN_MSG_RESET_READ_PTR) { // Reset the launch message buffer read ptr mailboxes->launch_msg_rd_ptr = 0; - uint64_t dispatch_addr = - NOC_XY_ADDR(NOC_X(mailboxes->go_message.master_x), - NOC_Y(mailboxes->go_message.master_y), DISPATCH_MESSAGE_ADDR + mailboxes->go_message.dispatch_message_offset); + uint64_t dispatch_addr = NOC_XY_ADDR( + NOC_X(mailboxes->go_message.master_x), + NOC_Y(mailboxes->go_message.master_y), + DISPATCH_MESSAGE_ADDR + mailboxes->go_message.dispatch_message_offset); mailboxes->go_message.signal = RUN_MSG_DONE; internal_::notify_dispatch_core_done(dispatch_addr); } else { diff --git a/tt_metal/hw/firmware/src/idle_erisc.cc b/tt_metal/hw/firmware/src/idle_erisc.cc index 455629e95c7b..a425dd5c49d7 100644 --- a/tt_metal/hw/firmware/src/idle_erisc.cc +++ b/tt_metal/hw/firmware/src/idle_erisc.cc @@ -170,9 +170,10 @@ int 
main() { // Notify dispatcher core that it has completed if (launch_msg_address->kernel_config.mode == DISPATCH_MODE_DEV) { launch_msg_address->kernel_config.enables = 0; - uint64_t dispatch_addr = - NOC_XY_ADDR(NOC_X(mailboxes->go_message.master_x), - NOC_Y(mailboxes->go_message.master_x), DISPATCH_MESSAGE_ADDR + mailboxes->go_message.dispatch_message_offset); + uint64_t dispatch_addr = NOC_XY_ADDR( + NOC_X(mailboxes->go_message.master_x), + NOC_Y(mailboxes->go_message.master_x), + DISPATCH_MESSAGE_ADDR + mailboxes->go_message.dispatch_message_offset); DEBUG_SANITIZE_NOC_ADDR(noc_index, dispatch_addr, 4); CLEAR_PREVIOUS_LAUNCH_MESSAGE_ENTRY_FOR_WATCHER(); noc_fast_atomic_increment(noc_index, NCRISC_AT_CMD_BUF, dispatch_addr, NOC_UNICAST_WRITE_VC, 1, 31 /*wrap*/, false /*linked*/); diff --git a/tt_metal/hw/inc/blackhole/noc/noc_parameters.h b/tt_metal/hw/inc/blackhole/noc/noc_parameters.h index 265466d2f282..d32631565f0b 100644 --- a/tt_metal/hw/inc/blackhole/noc/noc_parameters.h +++ b/tt_metal/hw/inc/blackhole/noc/noc_parameters.h @@ -5,6 +5,11 @@ #ifndef _NOC_PARAMETERS_H_ #define _NOC_PARAMETERS_H_ +// Coordinate Virtualization is not currently supported on BH (requires syseng support for updating FW). +#define VIRTUAL_TENSIX_START_X 0 +#define VIRTUAL_TENSIX_START_Y 0 +#define COORDINATE_VIRTUALIZATION_ENABLED 0 + #define NUM_NOCS 2 #define NUM_TENSIXES 140 diff --git a/tt_metal/hw/inc/blackhole/noc_nonblocking_api.h b/tt_metal/hw/inc/blackhole/noc_nonblocking_api.h index e21d47417c6f..27e4217f00da 100644 --- a/tt_metal/hw/inc/blackhole/noc_nonblocking_api.h +++ b/tt_metal/hw/inc/blackhole/noc_nonblocking_api.h @@ -14,6 +14,10 @@ #define NOC_0_X(noc_index, noc_size_x, x) (noc_index == 0 ? (x) : (noc_size_x - 1 - (x))) #define NOC_0_Y(noc_index, noc_size_y, y) (noc_index == 0 ? 
(y) : (noc_size_y - 1 - (y))) +#define NOC_0_X_PHYS_COORD(noc_index, noc_size_x, x) NOC_0_X(noc_index, noc_size_x, x) +#define NOC_0_Y_PHYS_COORD(noc_index, noc_size_y, y) NOC_0_Y(noc_index, noc_size_y, y) +#define MY_NOC_ENCODING(noc_index) NOC_CMD_BUF_READ_REG(noc_index, 0, NOC_NODE_ID) + //// /*TODO: RT review this file, currently using wormhole b0 copy, check if any changes needed for BH*/ constexpr uint32_t DYNAMIC_NOC_NCRISC_WR_CMD_BUF = 2; // all writes share cmd buf diff --git a/tt_metal/hw/inc/dataflow_api.h b/tt_metal/hw/inc/dataflow_api.h index c6877db3e0a1..bd256d6a9ad1 100644 --- a/tt_metal/hw/inc/dataflow_api.h +++ b/tt_metal/hw/inc/dataflow_api.h @@ -2006,3 +2006,17 @@ void noc_async_read_barrier_with_trid(uint32_t trid, uint8_t noc = noc_index) { #endif WAYPOINT("NBTD"); } + +template +FORCE_INLINE +uint64_t get_noc_addr_from_bank_id(uint32_t bank_id, uint32_t bank_address_offset, uint8_t noc = noc_index) { + // Use addrgen tables to convert bank_ids to physical NOC coordinates + uint64_t noc_addr = 0; + if constexpr (DRAM) { + noc_addr = dram_bank_to_noc_xy[noc_index][bank_id]; + bank_address_offset += bank_to_dram_offset[bank_id]; + } else { + noc_addr = l1_bank_to_noc_xy[noc_index][bank_id]; + } + return (noc_addr << NOC_ADDR_COORD_SHIFT) | (bank_address_offset); +} diff --git a/tt_metal/hw/inc/debug/sanitize_noc.h b/tt_metal/hw/inc/debug/sanitize_noc.h index f01427da4921..3a1cc835f025 100644 --- a/tt_metal/hw/inc/debug/sanitize_noc.h +++ b/tt_metal/hw/inc/debug/sanitize_noc.h @@ -41,38 +41,77 @@ typedef bool debug_sanitize_noc_cast_t; typedef bool debug_sanitize_noc_which_core_t; // Helper function to get the core type from noc coords. 
-AddressableCoreType get_core_type(uint8_t noc_id, uint8_t x, uint8_t y) { +AddressableCoreType get_core_type(uint8_t noc_id, uint8_t x, uint8_t y, bool& is_virtual_coord) { core_info_msg_t tt_l1_ptr* core_info = GET_MAILBOX_ADDRESS_DEV(core_info); - + // Check if the target NOC endpoint is a valid non-Tensix core in the Physical Coordinate Space for (uint32_t idx = 0; idx < MAX_NON_WORKER_CORES; idx++) { uint8_t core_x = core_info->non_worker_cores[idx].x; uint8_t core_y = core_info->non_worker_cores[idx].y; - if (x == NOC_0_X(noc_id, core_info->noc_size_x, (uint32_t)core_x) && - y == NOC_0_Y(noc_id, core_info->noc_size_y, (uint32_t)core_y)) { + if (x == NOC_0_X_PHYS_COORD(noc_id, core_info->noc_size_x, (uint32_t)core_x) && + y == NOC_0_Y_PHYS_COORD(noc_id, core_info->noc_size_y, (uint32_t)core_y)) { + is_virtual_coord = false; return core_info->non_worker_cores[idx].type; } } + if constexpr (COORDINATE_VIRTUALIZATION_ENABLED) { + // Was not a valid non-Tensix Physical Coordinate. Check if endpoint maps to a valid non-worker Virtual + // Coordinate. + for (uint32_t idx = 0; idx < MAX_VIRTUAL_NON_WORKER_CORES; idx++) { + uint8_t core_x = core_info->virtual_non_worker_cores[idx].x; + uint8_t core_y = core_info->virtual_non_worker_cores[idx].y; + + if (x == NOC_0_X(noc_id, core_info->noc_size_x, (uint32_t)core_x) && + y == NOC_0_Y(noc_id, core_info->noc_size_y, (uint32_t)core_y)) { + is_virtual_coord = true; + return core_info->virtual_non_worker_cores[idx].type; + } + } + } + // Check if coordinate maps to a harvested row in the physical space. 
for (uint32_t idx = 0; idx < MAX_HARVESTED_ROWS; idx++) { uint16_t harvested_y = core_info->harvested_y[idx]; - if (y == NOC_0_Y(noc_id, core_info->noc_size_y, (uint32_t)harvested_y)) { + if (y == NOC_0_Y_PHYS_COORD(noc_id, core_info->noc_size_y, (uint32_t)harvested_y)) { + is_virtual_coord = false; return AddressableCoreType::HARVESTED; } } + if constexpr (COORDINATE_VIRTUALIZATION_ENABLED) { + // Check if coordinate maps to a harvested row in the virtual space. + for (uint32_t idx = 0; idx < MAX_HARVESTED_ROWS; idx++) { + uint16_t virtual_harvested_y = core_info->virtual_harvested_y[idx]; + if (y == NOC_0_Y(noc_id, core_info->noc_size_y, (uint32_t)virtual_harvested_y)) { + is_virtual_coord = true; + return AddressableCoreType::HARVESTED; + } + } + } - // Tensix + // Check if NOC endpoint is valid in the Tensix Physical Coordinate Space. if (noc_id == 0) { - if (x >= NOC_0_X(noc_id, core_info->noc_size_x, (uint32_t)1) && - x <= NOC_0_X(noc_id, core_info->noc_size_x, (uint32_t)core_info->noc_size_x - 1) && - y >= NOC_0_Y(noc_id, core_info->noc_size_y, (uint32_t)1) && - y <= NOC_0_Y(noc_id, core_info->noc_size_y, (uint32_t)core_info->noc_size_y - 1)) { + if (x >= NOC_0_X_PHYS_COORD(noc_id, core_info->noc_size_x, (uint32_t)0) && + x <= NOC_0_X_PHYS_COORD(noc_id, core_info->noc_size_x, (uint32_t)core_info->noc_size_x - 1) && + y >= NOC_0_Y_PHYS_COORD(noc_id, core_info->noc_size_y, (uint32_t)0) && + y <= NOC_0_Y_PHYS_COORD(noc_id, core_info->noc_size_y, (uint32_t)core_info->noc_size_y - 1)) { + is_virtual_coord = false; return AddressableCoreType::TENSIX; } } else { - if (x <= NOC_0_X(noc_id, core_info->noc_size_x, (uint32_t)1) && - x >= NOC_0_X(noc_id, core_info->noc_size_x, (uint32_t)core_info->noc_size_x - 1) && - y <= NOC_0_Y(noc_id, core_info->noc_size_y, (uint32_t)1) && - y >= NOC_0_Y(noc_id, core_info->noc_size_y, (uint32_t)core_info->noc_size_y - 1)) { + if (x <= NOC_0_X_PHYS_COORD(noc_id, core_info->noc_size_x, (uint32_t)0) && + x >= NOC_0_X_PHYS_COORD(noc_id, 
core_info->noc_size_x, (uint32_t)core_info->noc_size_x - 1) && + y <= NOC_0_Y_PHYS_COORD(noc_id, core_info->noc_size_y, (uint32_t)0) && + y >= NOC_0_Y_PHYS_COORD(noc_id, core_info->noc_size_y, (uint32_t)core_info->noc_size_y - 1)) { + is_virtual_coord = false; + return AddressableCoreType::TENSIX; + } + } + if constexpr (COORDINATE_VIRTUALIZATION_ENABLED) { + // Check if NOC endpoint is valid in the Tensix Virtual Coordinate Space. + if (x >= NOC_0_X(noc_id, core_info->noc_size_x, (uint32_t)VIRTUAL_TENSIX_START_X) && + x <= NOC_0_X(noc_id, core_info->noc_size_x, (uint32_t)VIRTUAL_TENSIX_START_X + core_info->noc_size_x - 1) && + y >= NOC_0_Y(noc_id, core_info->noc_size_y, (uint32_t)VIRTUAL_TENSIX_START_Y) && + y <= NOC_0_Y(noc_id, core_info->noc_size_y, (uint32_t)VIRTUAL_TENSIX_START_Y + core_info->noc_size_y - 1)) { + is_virtual_coord = true; return AddressableCoreType::TENSIX; } } @@ -210,22 +249,40 @@ uint32_t debug_sanitize_noc_addr( y = (uint8_t)NOC_UNICAST_ADDR_Y(noc_addr); } uint64_t noc_local_addr = NOC_LOCAL_ADDR(noc_addr); - AddressableCoreType core_type = get_core_type(noc_id, x, y); - + bool is_virtual_coord = false; + AddressableCoreType core_type = get_core_type(noc_id, x, y, is_virtual_coord); // Extra check for multicast if (multicast) { uint8_t x_end = (uint8_t)NOC_MCAST_ADDR_END_X(noc_addr); uint8_t y_end = (uint8_t)NOC_MCAST_ADDR_END_Y(noc_addr); - - AddressableCoreType end_core_type = get_core_type(noc_id, x_end, y_end); + bool is_virtual_coord_end = false; + AddressableCoreType end_core_type = get_core_type(noc_id, x_end, y_end, is_virtual_coord_end); // Multicast supports workers only uint16_t return_code = DebugSanitizeNocOK; if (core_type != AddressableCoreType::TENSIX || end_core_type != AddressableCoreType::TENSIX) { return_code = DebugSanitizeNocMulticastNonWorker; } - if (x > x_end || y > y_end) { - return_code = DebugSanitizeNocMulticastInvalidRange; + if (is_virtual_coord != is_virtual_coord_end) { + return_code = 
DebugSanitizeNocMixedVirtualandPhysical; + } + if (is_virtual_coord && is_virtual_coord_end) { + // If coordinates are in virtual space, start can be greater than end, when using NOC1. + // This is because NOC0 and NOC1 endpoints are identical in virtual space, but order of + // start and end coords is still flipped between NOC0 and NOC1. + if (noc_id == 0) { + if (x > x_end || y > y_end) { + return_code = DebugSanitizeNocMulticastInvalidRange; + } + } else { + if (x_end > x || y_end > y) { + return_code = DebugSanitizeNocMulticastInvalidRange; + } + } + } else { + if (x > x_end || y > y_end) { + return_code = DebugSanitizeNocMulticastInvalidRange; + } } debug_sanitize_post_noc_addr_and_hang( noc_id, noc_addr, l1_addr, noc_len, multicast, dir, DEBUG_SANITIZE_NOC_TARGET, return_code); diff --git a/tt_metal/hw/inc/dev_msgs.h b/tt_metal/hw/inc/dev_msgs.h index 40d1cacf48f7..09c28fa93a04 100644 --- a/tt_metal/hw/inc/dev_msgs.h +++ b/tt_metal/hw/inc/dev_msgs.h @@ -184,6 +184,7 @@ enum debug_sanitize_noc_return_code_enum { DebugSanitizeNocMulticastNonWorker = 7, DebugSanitizeNocMulticastInvalidRange = 8, DebugSanitizeNocAlignment = 9, + DebugSanitizeNocMixedVirtualandPhysical = 10, }; struct debug_assert_msg_t { @@ -299,7 +300,12 @@ struct addressable_core_t { }; // TODO: This can move into the hal eventually, currently sized for WH. -constexpr static std::uint32_t MAX_NON_WORKER_CORES = 36 + 1 + 16; +// This is the number of Ethernet cores on WH (Ethernet cores can be queried through Virtual Coordinates). +// All other Non Worker Cores are not accessible through virtual coordinates. Subject to change, depending on the arch. +constexpr static std::uint32_t MAX_VIRTUAL_NON_WORKER_CORES = 18; +// This is the total number of Non Worker Cores on WH (first term is Ethernet, second term is PCIe and last term is +// DRAM). 
+constexpr static std::uint32_t MAX_NON_WORKER_CORES = MAX_VIRTUAL_NON_WORKER_CORES + 1 + 16; constexpr static std::uint32_t MAX_HARVESTED_ROWS = 2; constexpr static std::uint8_t CORE_COORD_INVALID = 0xFF; struct core_info_msg_t { @@ -308,10 +314,12 @@ struct core_info_msg_t { volatile uint64_t noc_dram_addr_base; volatile uint64_t noc_dram_addr_end; addressable_core_t non_worker_cores[MAX_NON_WORKER_CORES]; + addressable_core_t virtual_non_worker_cores[MAX_VIRTUAL_NON_WORKER_CORES]; volatile uint8_t harvested_y[MAX_HARVESTED_ROWS]; + volatile uint8_t virtual_harvested_y[MAX_HARVESTED_ROWS]; volatile uint8_t noc_size_x; volatile uint8_t noc_size_y; - volatile uint8_t pad[29]; + volatile uint8_t pad[27]; }; constexpr uint32_t launch_msg_buffer_num_entries = 4; diff --git a/tt_metal/hw/inc/grayskull/noc/noc_parameters.h b/tt_metal/hw/inc/grayskull/noc/noc_parameters.h index 7eff21e6dbd1..dbba51a83efd 100644 --- a/tt_metal/hw/inc/grayskull/noc/noc_parameters.h +++ b/tt_metal/hw/inc/grayskull/noc/noc_parameters.h @@ -13,6 +13,11 @@ #define NOC_Y_SIZE 1 #endif +// Coordinate Virtualization is not supported on GS (feature does not exist in NOC Hardware). +#define VIRTUAL_TENSIX_START_X 0 +#define VIRTUAL_TENSIX_START_Y 0 +#define COORDINATE_VIRTUALIZATION_ENABLED 0 + #define NUM_NOCS 2 #define NUM_TENSIXES 120 diff --git a/tt_metal/hw/inc/grayskull/noc_nonblocking_api.h b/tt_metal/hw/inc/grayskull/noc_nonblocking_api.h index d9279dbbb521..1f709b8a5527 100644 --- a/tt_metal/hw/inc/grayskull/noc_nonblocking_api.h +++ b/tt_metal/hw/inc/grayskull/noc_nonblocking_api.h @@ -14,6 +14,9 @@ #define NOC_0_X(noc_index, noc_size_x, x) (noc_index == 0 ? (x) : (noc_size_x - 1 - (x))) #define NOC_0_Y(noc_index, noc_size_y, y) (noc_index == 0 ? 
(y) : (noc_size_y - 1 - (y))) +#define NOC_0_X_PHYS_COORD(noc_index, noc_size_x, x) NOC_0_X(noc_index, noc_size_x, x) +#define NOC_0_Y_PHYS_COORD(noc_index, noc_size_y, y) NOC_0_Y(noc_index, noc_size_y, y) +#define MY_NOC_ENCODING(noc_index) NOC_CMD_BUF_READ_REG(noc_index, 0, NOC_NODE_ID) //// constexpr uint32_t DYNAMIC_NOC_NCRISC_WR_CMD_BUF = 2; // all writes share cmd buf diff --git a/tt_metal/hw/inc/risc_common.h b/tt_metal/hw/inc/risc_common.h index 2407324dd539..a68e5c9cb4d8 100644 --- a/tt_metal/hw/inc/risc_common.h +++ b/tt_metal/hw/inc/risc_common.h @@ -21,6 +21,8 @@ #define NOC_Y(y) NOC_0_Y(noc_index, noc_size_y, (y)) #define DYNAMIC_NOC_X(noc, x) NOC_0_X(noc, noc_size_x, (x)) #define DYNAMIC_NOC_Y(noc, y) NOC_0_Y(noc, noc_size_y, (y)) +#define NOC_X_PHYS_COORD(x) NOC_0_X_PHYS_COORD(noc_index, noc_size_x, x) +#define NOC_Y_PHYS_COORD(y) NOC_0_Y_PHYS_COORD(noc_index, noc_size_y, y) #define TILE_WORD_2_BIT ((256 + 64 + 32) >> 4) #define TILE_WORD_4_BIT ((512 + 64 + 32) >> 4) @@ -139,7 +141,7 @@ inline uint32_t special_mult(uint32_t a, uint32_t special_b) { inline void risc_init() { for (uint32_t n = 0; n < NUM_NOCS; n++) { - uint32_t noc_id_reg = NOC_CMD_BUF_READ_REG(n, 0, NOC_NODE_ID); + uint32_t noc_id_reg = MY_NOC_ENCODING(n); my_x[n] = noc_id_reg & NOC_NODE_ID_MASK; my_y[n] = (noc_id_reg >> NOC_ADDR_NODE_ID_BITS) & NOC_NODE_ID_MASK; } diff --git a/tt_metal/hw/inc/wormhole/noc/noc_parameters.h b/tt_metal/hw/inc/wormhole/noc/noc_parameters.h index 34c899447cfb..43d8c3428f36 100644 --- a/tt_metal/hw/inc/wormhole/noc/noc_parameters.h +++ b/tt_metal/hw/inc/wormhole/noc/noc_parameters.h @@ -13,6 +13,12 @@ #define NOC_Y_SIZE 1 #endif +// Coordinate Virtualization is fully supported by WH NOC Hardware and Firmware. +// Tensix cores start at a fixed coordinate in Virtual Space and are contiguous. 
+#define VIRTUAL_TENSIX_START_X 18 +#define VIRTUAL_TENSIX_START_Y 18 +#define COORDINATE_VIRTUALIZATION_ENABLED 1 + #define NUM_NOCS 2 #define NUM_TENSIXES 80 diff --git a/tt_metal/hw/inc/wormhole/noc_nonblocking_api.h b/tt_metal/hw/inc/wormhole/noc_nonblocking_api.h index f7a399670eb4..a1030a010992 100644 --- a/tt_metal/hw/inc/wormhole/noc_nonblocking_api.h +++ b/tt_metal/hw/inc/wormhole/noc_nonblocking_api.h @@ -11,9 +11,11 @@ #include "noc_overlay_parameters.h" // Helper functions to convert NoC coordinates to NoC-0 coordinates, used in metal as "physical" coordinates. -#define NOC_0_X(noc_index, noc_size_x, x) (noc_index == 0 ? (x) : (noc_size_x - 1 - (x))) -#define NOC_0_Y(noc_index, noc_size_y, y) (noc_index == 0 ? (y) : (noc_size_y - 1 - (y))) - +#define NOC_0_X(noc_index, noc_size_x, x) x +#define NOC_0_Y(noc_index, noc_size_y, y) y +#define NOC_0_X_PHYS_COORD(noc_index, noc_size_x, x) (noc_index == 0 ? (x) : (noc_size_x - 1 - (x))) +#define NOC_0_Y_PHYS_COORD(noc_index, noc_size_y, y) (noc_index == 0 ? 
(y) : (noc_size_y - 1 - (y))) +#define MY_NOC_ENCODING(noc_index) NOC_CMD_BUF_READ_REG(noc_index, 0, NOC_CFG(NOC_ID_LOGICAL)); //// // Use VC 1 for unicast writes, and VC 4 for mcast writes diff --git a/tt_metal/impl/allocator/allocator.cpp b/tt_metal/impl/allocator/allocator.cpp index bdbbe8dd3f92..aa43d84043f6 100644 --- a/tt_metal/impl/allocator/allocator.cpp +++ b/tt_metal/impl/allocator/allocator.cpp @@ -537,8 +537,8 @@ void Allocator::reset() { void AllocatorConfig::reset() { dram_bank_offsets.clear(); core_type_from_noc_coord_table.clear(); - worker_log_to_physical_routing_x.clear(); - worker_log_to_physical_routing_y.clear(); + worker_log_to_virtual_routing_x.clear(); + worker_log_to_virtual_routing_y.clear(); l1_bank_remap.clear(); } diff --git a/tt_metal/impl/allocator/allocator_types.hpp b/tt_metal/impl/allocator/allocator_types.hpp index 2dc0e92816e5..b4ad6bf960c1 100644 --- a/tt_metal/impl/allocator/allocator_types.hpp +++ b/tt_metal/impl/allocator/allocator_types.hpp @@ -44,8 +44,8 @@ struct AllocatorConfig { size_t l1_small_size = 0; size_t trace_region_size = 0; std::unordered_map core_type_from_noc_coord_table = {}; - std::unordered_map worker_log_to_physical_routing_x = {}; - std::unordered_map worker_log_to_physical_routing_y = {}; + std::unordered_map worker_log_to_virtual_routing_x = {}; + std::unordered_map worker_log_to_virtual_routing_y = {}; BankMapping l1_bank_remap = {}; // for remapping which l1 bank points to which bank if we assume normal row-major assignment CoreRangeSet compute_grid = {}; diff --git a/tt_metal/impl/allocator/l1_banking_allocator.cpp b/tt_metal/impl/allocator/l1_banking_allocator.cpp index 0cd0c38d984a..c79cd949a440 100644 --- a/tt_metal/impl/allocator/l1_banking_allocator.cpp +++ b/tt_metal/impl/allocator/l1_banking_allocator.cpp @@ -75,17 +75,17 @@ void init_compute_and_storage_l1_bank_manager(Allocator& allocator, const Alloca num_banks_t num_banks = compute_total_and_storage_only_num_l1_banks(alloc_config); auto 
logical_to_noc_coord = [&alloc_config](CoreCoord logical_core) { TT_ASSERT( - alloc_config.worker_log_to_physical_routing_x.find(logical_core.x) != - alloc_config.worker_log_to_physical_routing_x.end() and - alloc_config.worker_log_to_physical_routing_y.find(logical_core.y) != - alloc_config.worker_log_to_physical_routing_y.end(), + alloc_config.worker_log_to_virtual_routing_x.find(logical_core.x) != + alloc_config.worker_log_to_virtual_routing_x.end() and + alloc_config.worker_log_to_virtual_routing_y.find(logical_core.y) != + alloc_config.worker_log_to_virtual_routing_y.end(), "Cannot find log_coord=[.y={}, .x={}] in logical to routing coord lookup tables... invalid AllocatorConfig " "setup", logical_core.y, logical_core.x); CoreCoord noc_core({ - static_cast(alloc_config.worker_log_to_physical_routing_x.at(logical_core.x)), - static_cast(alloc_config.worker_log_to_physical_routing_y.at(logical_core.y)), + static_cast(alloc_config.worker_log_to_virtual_routing_x.at(logical_core.x)), + static_cast(alloc_config.worker_log_to_virtual_routing_y.at(logical_core.y)), }); TT_ASSERT( alloc_config.core_type_from_noc_coord_table.find(noc_core) != diff --git a/tt_metal/impl/buffers/buffer.cpp b/tt_metal/impl/buffers/buffer.cpp index b1e5ec3e3374..3502254d8d53 100644 --- a/tt_metal/impl/buffers/buffer.cpp +++ b/tt_metal/impl/buffers/buffer.cpp @@ -425,27 +425,6 @@ CoreCoord Buffer::logical_core_from_bank_id(uint32_t bank_id) const { return allocator::logical_core_from_bank_id(*this->allocator_, bank_id); } -CoreCoord Buffer::noc_coordinates(uint32_t bank_id) const { - switch (this->buffer_type_) { - case BufferType::DRAM: - case BufferType::TRACE: { - auto dram_channel = this->dram_channel_from_bank_id(bank_id); - return this->device_->dram_core_from_dram_channel(dram_channel); - } - case BufferType::L1: // fallthrough - case BufferType::L1_SMALL: { - auto logical_core = this->logical_core_from_bank_id(bank_id); - return 
this->device_->worker_core_from_logical_core(logical_core); - } - case BufferType::SYSTEM_MEMORY: { - TT_THROW("Host buffer is located in system memory! Cannot retrieve NoC coordinates for it"); - } break; - default: TT_THROW("Unsupported buffer type!"); - } -} - -CoreCoord Buffer::noc_coordinates() const { return this->noc_coordinates(0); } - DeviceAddr Buffer::page_address(uint32_t bank_id, uint32_t page_index) const { uint32_t num_banks = allocator::num_banks(*this->allocator_, this->buffer_type_); TT_FATAL(bank_id < num_banks, "Invalid Bank ID: {} exceeds total numbers of banks ({})!", bank_id, num_banks); @@ -454,6 +433,21 @@ DeviceAddr Buffer::page_address(uint32_t bank_id, uint32_t page_index) const { return translate_page_address(offset, bank_id); } +DeviceAddr Buffer::bank_local_page_address(uint32_t bank_id, uint32_t page_index) const { + uint32_t num_banks = allocator::num_banks(*this->allocator_, this->buffer_type_); + TT_FATAL(bank_id < num_banks, "Invalid Bank ID: {} exceeds total numbers of banks ({})!", bank_id, num_banks); + uint32_t offset; + if (is_sharded(this->buffer_layout())) { + auto shard_spec = this->shard_spec(); + uint32_t pages_offset_within_bank = page_index % shard_spec.size(); + offset = (round_up(this->page_size(), this->alignment()) * pages_offset_within_bank); + } else { + uint32_t pages_offset_within_bank = page_index / num_banks; + offset = (round_up(this->page_size(), this->alignment()) * pages_offset_within_bank); + } + return this->address() + offset; +} + uint32_t Buffer::alignment() const { return this->allocator_->config.alignment; } diff --git a/tt_metal/impl/buffers/buffer.hpp b/tt_metal/impl/buffers/buffer.hpp index d3fdc9f60aa3..2866dbcceb6a 100644 --- a/tt_metal/impl/buffers/buffer.hpp +++ b/tt_metal/impl/buffers/buffer.hpp @@ -221,13 +221,9 @@ class Buffer final { CoreCoord logical_core_from_bank_id(uint32_t bank_id) const; - CoreCoord noc_coordinates(uint32_t bank_id) const; - - // returns NoC coordinates of first 
bank buffer is in - CoreCoord noc_coordinates() const; - DeviceAddr page_address(uint32_t bank_id, uint32_t page_index) const; + DeviceAddr bank_local_page_address(uint32_t bank_id, uint32_t page_index) const; uint32_t alignment() const; DeviceAddr aligned_page_size() const; DeviceAddr aligned_size() const; diff --git a/tt_metal/impl/debug/dprint_server.cpp b/tt_metal/impl/debug/dprint_server.cpp index aac472a1d1a2..7a72c8758b6b 100644 --- a/tt_metal/impl/debug/dprint_server.cpp +++ b/tt_metal/impl/debug/dprint_server.cpp @@ -551,7 +551,7 @@ void DebugPrintServerContext::AttachDevice(Device* device) { // skip prints entirely to prevent kernel code from hanging waiting for the print buffer to be // flushed from the host. for (auto& logical_core : all_cores) { - CoreCoord phys_core = device->physical_core_from_logical_core(logical_core); + CoreCoord phys_core = device->virtual_core_from_logical_core(logical_core.coord, logical_core.type); for (int hart_index = 0; hart_index < GetNumRiscs(logical_core); hart_index++) { WriteInitMagic(device, phys_core, hart_index, false); } @@ -623,7 +623,7 @@ void DebugPrintServerContext::AttachDevice(Device* device) { CoreCoord phys_core; bool valid_logical_core = true; try { - phys_core = device->physical_core_from_logical_core(logical_core, core_type); + phys_core = device->virtual_core_from_logical_core(logical_core, core_type); } catch (std::runtime_error& error) { valid_logical_core = false; } @@ -653,7 +653,7 @@ void DebugPrintServerContext::AttachDevice(Device* device) { // Write print enable magic for the cores the user specified. 
uint32_t hart_mask = tt::llrt::OptionsG.get_feature_riscv_mask(tt::llrt::RunTimeDebugFeatureDprint); for (auto& logical_core : print_cores_sanitized) { - CoreCoord phys_core = device->physical_core_from_logical_core(logical_core); + CoreCoord phys_core = device->virtual_core_from_logical_core(logical_core.coord, logical_core.type); for (int hart_index = 0; hart_index < GetNumRiscs(logical_core); hart_index++) { if (hart_mask & (1 << hart_index)) { WriteInitMagic(device, phys_core, hart_index, true); @@ -700,7 +700,7 @@ void DebugPrintServerContext::DetachDevice(Device* device) { // Check all dprint-enabled cores on this device for outstanding prints. outstanding_prints = false; for (auto& logical_core : device_to_core_range_.at(device)) { - CoreCoord phys_core = device->physical_core_from_logical_core(logical_core); + CoreCoord phys_core = device->virtual_core_from_logical_core(logical_core.coord, logical_core.type); for (int risc_id = 0; risc_id < GetNumRiscs(logical_core); risc_id++) { if (risc_mask & (1 << risc_id)) { // No need to check if risc is not dprint-enabled. @@ -762,7 +762,7 @@ void DebugPrintServerContext::DetachDevice(Device* device) { // When detaching a device, disable prints on it. CoreDescriptorSet all_cores = GetAllCores(device); for (auto& logical_core : all_cores) { - CoreCoord phys_core = device->physical_core_from_logical_core(logical_core); + CoreCoord phys_core = device->virtual_core_from_logical_core(logical_core.coord, logical_core.type); for (int hart_index = 0; hart_index < GetNumRiscs(logical_core); hart_index++) { WriteInitMagic(device, phys_core, hart_index, false); } @@ -791,7 +791,7 @@ void DebugPrintServerContext::ClearSignals() { bool DebugPrintServerContext::PeekOneHartNonBlocking( Device* device, const CoreDescriptor& logical_core, int hart_id, bool new_data_this_iter) { // If init magic isn't cleared for this risc, then dprint isn't enabled on it, don't read it. 
- CoreCoord phys_core = device->physical_core_from_logical_core(logical_core); + CoreCoord phys_core = device->virtual_core_from_logical_core(logical_core.coord, logical_core.type); if (!CheckInitMagicCleared(device, phys_core, hart_id)) { return false; } diff --git a/tt_metal/impl/debug/noc_logging.cpp b/tt_metal/impl/debug/noc_logging.cpp index bd73b0a26715..ea14307f2379 100644 --- a/tt_metal/impl/debug/noc_logging.cpp +++ b/tt_metal/impl/debug/noc_logging.cpp @@ -39,7 +39,7 @@ void PrintNocData(noc_data_t noc_data, const string& file_name) { } void DumpCoreNocData(Device* device, const CoreDescriptor& logical_core, noc_data_t& noc_data) { - CoreCoord phys_core = device->physical_core_from_logical_core(logical_core); + CoreCoord phys_core = device->virtual_core_from_logical_core(logical_core.coord, logical_core.type); for (int risc_id = 0; risc_id < GetNumRiscs(logical_core); risc_id++) { // Read out the DPRINT buffer, we stored our data in the "data field" uint64_t addr = GetDprintBufAddr(device, phys_core, risc_id); @@ -98,7 +98,7 @@ void ClearNocData(Device* device) { CoreDescriptorSet all_cores = GetAllCores(device); for (const CoreDescriptor& logical_core : all_cores) { - CoreCoord phys_core = device->physical_core_from_logical_core(logical_core); + CoreCoord phys_core = device->virtual_core_from_logical_core(logical_core.coord, logical_core.type); for (int risc_id = 0; risc_id < GetNumRiscs(logical_core); risc_id++) { uint64_t addr = GetDprintBufAddr(device, phys_core, risc_id); std::vector initbuf = std::vector(DPRINT_BUFFER_SIZE / sizeof(uint32_t), 0); diff --git a/tt_metal/impl/debug/sanitize_noc_host.hpp b/tt_metal/impl/debug/sanitize_noc_host.hpp index cea090ff1974..860bb2fca591 100644 --- a/tt_metal/impl/debug/sanitize_noc_host.hpp +++ b/tt_metal/impl/debug/sanitize_noc_host.hpp @@ -32,6 +32,10 @@ static bool coord_found_p(CoreCoord range, CoreCoord core) { return core.x >= 1 && core.x <= range.x && core.y >= 1 && core.y <= range.y; } +static bool 
coord_found_p(std::unordered_set coords, CoreCoord core) { + return coords.find(core) != coords.end(); +} + static string noc_address(CoreCoord core, uint64_t a, uint32_t l) { std::stringstream ss; ss << "noc{" << core.str() << ", 0x" << std::setfill('0') << std::setw(8) << std::hex << a << ", " << std::dec << l @@ -55,7 +59,13 @@ static void print_stack_trace(void) { } static void watcher_sanitize_host_noc( - const char* what, const metal_SocDescriptor& soc_d, const CoreCoord& core, uint64_t addr, uint32_t lbytes) { + const char* what, + const metal_SocDescriptor& soc_d, + const std::unordered_set& virtual_worker_cores, + const std::unordered_set& virtual_eth_cores, + const CoreCoord& core, + uint64_t addr, + uint32_t lbytes) { if (coord_found_p(soc_d.get_pcie_cores(), core)) { TT_THROW("Host watcher: bad {} NOC coord {}", what, core.str()); } else if (coord_found_p(soc_d.get_dram_cores(), core)) { @@ -66,12 +76,12 @@ static void watcher_sanitize_host_noc( print_stack_trace(); TT_THROW("Host watcher: bad {} dram address {}", what, noc_address(core, addr, lbytes)); } - } else if (coord_found_p(soc_d.get_physical_ethernet_cores(), core)) { + } else if (coord_found_p(virtual_eth_cores, core)) { if (!DEBUG_VALID_ETH_ADDR(addr, lbytes)) { print_stack_trace(); TT_THROW("Host watcher: bad {} eth address {}", what, noc_address(core, addr, lbytes)); } - } else if (coord_found_p(soc_d.grid_size, core)) { + } else if (coord_found_p(virtual_worker_cores, core)) { if (!DEBUG_VALID_WORKER_ADDR(addr, lbytes)) { print_stack_trace(); TT_THROW("Host watcher: bad {} worker address {}", what, noc_address(core, addr, lbytes)); @@ -84,13 +94,23 @@ static void watcher_sanitize_host_noc( } void watcher_sanitize_host_noc_read( - const metal_SocDescriptor& soc_d, const CoreCoord& core, uint64_t addr, uint32_t lbytes) { - watcher_sanitize_host_noc("read", soc_d, core, addr, lbytes); + const metal_SocDescriptor& soc_d, + const std::unordered_set& virtual_worker_cores, + const 
std::unordered_set& virtual_eth_cores, + const CoreCoord& core, + uint64_t addr, + uint32_t lbytes) { + watcher_sanitize_host_noc("read", soc_d, virtual_worker_cores, virtual_eth_cores, core, addr, lbytes); } void watcher_sanitize_host_noc_write( - const metal_SocDescriptor& soc_d, const CoreCoord& core, uint64_t addr, uint32_t lbytes) { - watcher_sanitize_host_noc("write", soc_d, core, addr, lbytes); + const metal_SocDescriptor& soc_d, + const std::unordered_set& virtual_worker_cores, + const std::unordered_set& virtual_eth_cores, + const CoreCoord& core, + uint64_t addr, + uint32_t lbytes) { + watcher_sanitize_host_noc("write", soc_d, virtual_worker_cores, virtual_eth_cores, core, addr, lbytes); } } // namespace tt diff --git a/tt_metal/impl/debug/watcher_device_reader.cpp b/tt_metal/impl/debug/watcher_device_reader.cpp index d764bf8a2a62..c03b7548b79c 100644 --- a/tt_metal/impl/debug/watcher_device_reader.cpp +++ b/tt_metal/impl/debug/watcher_device_reader.cpp @@ -76,15 +76,11 @@ static uint32_t get_riscv_stack_size(const CoreDescriptor& core, uint32_t type) static string get_noc_target_str( Device* device, CoreDescriptor& core, int noc, const debug_sanitize_noc_addr_msg_t* san) { auto get_core_and_mem_type = [](Device* device, CoreCoord& noc_coord, int noc) -> std::pair { - // Get the physical coord from the noc coord - const metal_SocDescriptor& soc_d = tt::Cluster::instance().get_soc_desc(device->id()); - CoreCoord phys_core = { - tt::tt_metal::hal.noc_coordinate(noc, soc_d.grid_size.x, noc_coord.x), - tt::tt_metal::hal.noc_coordinate(noc, soc_d.grid_size.y, noc_coord.y)}; - + // Get the virtual coord from the noc coord + CoreCoord virtual_core = device->virtual_noc_coordinate(noc, noc_coord); CoreType core_type; try { - core_type = device->core_type_from_physical_core(phys_core); + core_type = device->core_type_from_virtual_core(virtual_core); } catch (std::runtime_error& e) { // We may not be able to get a core type if the physical coords are bad. 
return {"Unknown", ""}; @@ -304,13 +300,13 @@ void WatcherDeviceReader::DumpCore(CoreDescriptor& logical_core, bool is_active_ // Watcher only treats ethernet + worker cores. bool is_eth_core = (logical_core.type == CoreType::ETH); CoreDescriptor core; - core.coord = device->physical_core_from_logical_core(logical_core.coord, logical_core.type); + core.coord = device->virtual_core_from_logical_core(logical_core.coord, logical_core.type); core.type = logical_core.type; // Print device id, core coords (logical) string core_type = is_eth_core ? "ethnet" : "worker"; string core_str = fmt::format( - "Device {} {} core(x={:2},y={:2}) phys(x={:2},y={:2})", + "Device {} {} core(x={:2},y={:2}) virtual(x={:2},y={:2})", device->id(), core_type, logical_core.coord.x, @@ -476,6 +472,10 @@ void WatcherDeviceReader::DumpNocSanitizeStatus( error_msg = get_noc_target_str(device, core, noc, san); error_msg += " (invalid address alignment in NOC transaction)."; break; + case DebugSanitizeNocMixedVirtualandPhysical: + error_msg = get_noc_target_str(device, core, noc, san); + error_msg += " (mixing virtual and physical coordinates in Mcast)."; + break; default: error_msg = fmt::format( "Watcher unexpected data corruption, noc debug state on core {}, unknown failure code: {}", diff --git a/tt_metal/impl/debug/watcher_server.cpp b/tt_metal/impl/debug/watcher_server.cpp index 8b92836a1fbd..9b81e7d13b01 100644 --- a/tt_metal/impl/debug/watcher_server.cpp +++ b/tt_metal/impl/debug/watcher_server.cpp @@ -283,7 +283,7 @@ void watcher_init(Device* device) { CoreCoord phys_core; bool valid_logical_core = true; try { - phys_core = device->physical_core_from_logical_core(logical_core, core_type); + phys_core = device->virtual_core_from_logical_core(logical_core, core_type); } catch (std::runtime_error& error) { valid_logical_core = false; } diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp index 2f8196d7c05e..f55271e3679b 100644 --- a/tt_metal/impl/device/device.cpp 
+++ b/tt_metal/impl/device/device.cpp @@ -5,7 +5,7 @@ #include #include #include "tt_metal/device.hpp" -#include "common/core_coord.hpp" +#include "common/core_assignment.hpp" #include "tt_metal/host_api.hpp" #include "tt_metal/impl/device/device.hpp" #include "tt_metal/impl/trace/trace.hpp" @@ -70,21 +70,12 @@ uint32_t Device::num_worker_cores(HalProgrammableCoreType core_type, SubDeviceId return this->active_sub_device_manager_->sub_device(sub_device_id).num_cores(core_type); } -std::vector Device::get_noc_encoding_for_active_eth_cores(NOC noc_index) { - auto active_ethernet_cores = this->get_active_ethernet_cores(true); - std::vector noc_encodings = {}; - noc_encodings.reserve(active_ethernet_cores.size()); - for (const auto& core : active_ethernet_cores) { - noc_encodings.push_back(this->get_noc_unicast_encoding(noc_index, ethernet_core_from_logical_core(core))); - } - return noc_encodings; -} /* Get all dispatch cores associated with this device. On return, my_dispatch_cores contains dispatch cores used by * this device (split between cores on this device itself and if this is a remote device, the mmio device dispatch * cores being used by this device). On return, other_dispatch_cores contains dispatch cores on this device that are * used by other (remote) devices. */ -void Device::get_associated_dispatch_phys_cores( +void Device::get_associated_dispatch_virtual_cores( std::unordered_map> &my_dispatch_cores, std::unordered_map> &other_dispatch_cores) { if (this->is_mmio_capable()) { @@ -96,54 +87,54 @@ void Device::get_associated_dispatch_phys_cores( if (device_id == this->id_) { //mmio device. 
bool dispatch_hd_allocated = false; - CoreCoord phys_core_dispatch_hd; + CoreCoord virtual_core_dispatch_hd; if (dispatch_core_manager::instance().is_dispatcher_core_allocated(device_id, curr_channel, cq_id)) { tt_cxy_pair dispatch_location = dispatch_core_manager::instance().dispatcher_core(device_id, curr_channel, cq_id); - phys_core_dispatch_hd = get_physical_core_coordinate(dispatch_location, dispatch_core_type); - my_dispatch_cores[this->id_].insert(phys_core_dispatch_hd); + virtual_core_dispatch_hd = this->virtual_core_from_logical_core(dispatch_location, dispatch_core_type); + my_dispatch_cores[this->id_].insert(virtual_core_dispatch_hd); dispatch_hd_allocated = true; - log_debug(tt::LogMetal, "MMIO Device Dispatch core: Logical: {} - Physical: {}", dispatch_location.str(), phys_core_dispatch_hd.str()); + log_debug(tt::LogMetal, "MMIO Device Dispatch core: Logical: {} - Physical: {}", dispatch_location.str(), virtual_core_dispatch_hd.str()); } // Include dispatch_s in the dispatch core location set, if its not on the same core as dispatch_hd if (dispatch_core_manager::instance().is_dispatcher_s_core_allocated(device_id, curr_channel, cq_id)) { tt_cxy_pair dispatch_s_location = dispatch_core_manager::instance().dispatcher_s_core(device_id, curr_channel, cq_id); - CoreCoord phys_core_dispatch_s = get_physical_core_coordinate(dispatch_s_location, dispatch_core_type); - if ((!dispatch_hd_allocated) or (phys_core_dispatch_s != phys_core_dispatch_hd)) { - my_dispatch_cores[dispatch_s_location.chip].insert(phys_core_dispatch_s); + CoreCoord virtual_core_dispatch_s = this->virtual_core_from_logical_core(dispatch_s_location, dispatch_core_type); + if ((!dispatch_hd_allocated) or (virtual_core_dispatch_s != virtual_core_dispatch_hd)) { + my_dispatch_cores[dispatch_s_location.chip].insert(virtual_core_dispatch_s); } } if (dispatch_core_manager::instance().is_prefetcher_core_allocated(device_id, curr_channel, cq_id)) { tt_cxy_pair prefetch_location = 
dispatch_core_manager::instance().prefetcher_core(device_id, curr_channel, cq_id); - CoreCoord phys_core = get_physical_core_coordinate(prefetch_location, dispatch_core_type); - my_dispatch_cores[this->id_].insert(phys_core); - log_debug(tt::LogMetal, "MMIO Device Prefetch core: Logical: {} - Physical: {}", prefetch_location.str(), phys_core.str()); + CoreCoord virtual_core = this->virtual_core_from_logical_core(prefetch_location, dispatch_core_type); + my_dispatch_cores[this->id_].insert(virtual_core); + log_debug(tt::LogMetal, "MMIO Device Prefetch core: Logical: {} - Physical: {}", prefetch_location.str(), virtual_core.str()); } } else if (tt::DevicePool::instance().is_device_active(device_id)) { //non mmio devices serviced by this mmio capable device. //skip remote dispatch cores only if respective remote device is active. if (dispatch_core_manager::instance().is_dispatcher_core_allocated(device_id, curr_channel, cq_id)) { tt_cxy_pair dispatch_location = dispatch_core_manager::instance().dispatcher_core(device_id, curr_channel, cq_id); - CoreCoord phys_core = get_physical_core_coordinate(dispatch_location, dispatch_core_type); - other_dispatch_cores[this->id_].insert(phys_core); - log_debug(tt::LogMetal, "Remote Device Dispatch core: Logical: {} - Physical: {} will keep running on MMIO Device.", dispatch_location.str(), phys_core.str()); + CoreCoord virtual_core = this->virtual_core_from_logical_core(dispatch_location, dispatch_core_type); + other_dispatch_cores[this->id_].insert(virtual_core); + log_debug(tt::LogMetal, "Remote Device Dispatch core: Logical: {} - Physical: {} will keep running on MMIO Device.", dispatch_location.str(), virtual_core.str()); } if (dispatch_core_manager::instance().is_prefetcher_core_allocated(device_id, curr_channel, cq_id)) { tt_cxy_pair prefetch_location = dispatch_core_manager::instance().prefetcher_core(device_id, curr_channel, cq_id); - CoreCoord phys_core = get_physical_core_coordinate(prefetch_location, 
dispatch_core_type); - other_dispatch_cores[this->id_].insert(phys_core); - log_debug(tt::LogMetal, "Remote Device Prefetch core: Logical: {} - Physical: {} will keep running on MMIO Device.", prefetch_location.str(), phys_core.str()); + CoreCoord virtual_core = this->virtual_core_from_logical_core(prefetch_location, dispatch_core_type); + other_dispatch_cores[this->id_].insert(virtual_core); + log_debug(tt::LogMetal, "Remote Device Prefetch core: Logical: {} - Physical: {} will keep running on MMIO Device.", prefetch_location.str(), virtual_core.str()); } if (dispatch_core_manager::instance().is_mux_core_allocated(device_id, curr_channel, cq_id)) { tt_cxy_pair mux_location = dispatch_core_manager::instance().mux_core(device_id, curr_channel, cq_id); - CoreCoord phys_core = get_physical_core_coordinate(mux_location, dispatch_core_type); - other_dispatch_cores[this->id_].insert(phys_core); - log_debug(tt::LogMetal, "Remote Device Mux core: Logical: {} - Physical: {} will keep running on MMIO Device.", mux_location.str(), phys_core.str()); + CoreCoord virtual_core = this->virtual_core_from_logical_core(mux_location, dispatch_core_type); + other_dispatch_cores[this->id_].insert(virtual_core); + log_debug(tt::LogMetal, "Remote Device Mux core: Logical: {} - Physical: {} will keep running on MMIO Device.", mux_location.str(), virtual_core.str()); } if (dispatch_core_manager::instance().is_demux_core_allocated(device_id, curr_channel, cq_id)) { tt_cxy_pair demux_location = dispatch_core_manager::instance().demux_core(device_id, curr_channel, cq_id); - CoreCoord phys_core = get_physical_core_coordinate(demux_location, dispatch_core_type); - other_dispatch_cores[this->id_].insert(phys_core); - log_debug(tt::LogMetal, "Remote Device Demux core: Logical: {} - Physical: {} will keep running on MMIO Device.", demux_location.str(), phys_core.str()); + CoreCoord virtual_core = this->virtual_core_from_logical_core(demux_location, dispatch_core_type); + 
other_dispatch_cores[this->id_].insert(virtual_core); + log_debug(tt::LogMetal, "Remote Device Demux core: Logical: {} - Physical: {} will keep running on MMIO Device.", demux_location.str(), virtual_core.str()); } } } @@ -157,47 +148,47 @@ void Device::get_associated_dispatch_phys_cores( for (uint8_t cq_id = 0; cq_id < num_hw_cqs; cq_id++) { if (dispatch_core_manager::instance().is_dispatcher_core_allocated(device_id, curr_channel, cq_id)) { tt_cxy_pair dispatch_location = dispatch_core_manager::instance().dispatcher_core(device_id, curr_channel, cq_id); - CoreCoord phys_core = get_physical_core_coordinate(dispatch_location, dispatch_core_type); - my_dispatch_cores[dispatch_location.chip].insert(phys_core); - log_debug(tt::LogMetal, "Remote Device Dispatch core: Logical: {} - Physical: {} will be reset on MMIO Device.", dispatch_location.str(), phys_core.str()); + CoreCoord virtual_core = this->virtual_core_from_logical_core(dispatch_location, dispatch_core_type); + my_dispatch_cores[dispatch_location.chip].insert(virtual_core); + log_debug(tt::LogMetal, "Remote Device Dispatch core: Logical: {} - Physical: {} will be reset on MMIO Device.", dispatch_location.str(), virtual_core.str()); } if (dispatch_core_manager::instance().is_prefetcher_core_allocated(device_id, curr_channel, cq_id)) { tt_cxy_pair prefetch_location = dispatch_core_manager::instance().prefetcher_core(device_id, curr_channel, cq_id); - CoreCoord phys_core = get_physical_core_coordinate(prefetch_location, dispatch_core_type); - my_dispatch_cores[prefetch_location.chip].insert(phys_core); - log_debug(tt::LogMetal, "Remote Device Prefetch core: Logical: {} - Physical: {} will be reset on MMIO Device.", prefetch_location.str(), phys_core.str()); + CoreCoord virtual_core = this->virtual_core_from_logical_core(prefetch_location, dispatch_core_type); + my_dispatch_cores[prefetch_location.chip].insert(virtual_core); + log_debug(tt::LogMetal, "Remote Device Prefetch core: Logical: {} - Physical: {} will 
be reset on MMIO Device.", prefetch_location.str(), virtual_core.str()); } if (dispatch_core_manager::instance().is_mux_core_allocated(device_id, curr_channel, cq_id)) { tt_cxy_pair mux_location = dispatch_core_manager::instance().mux_core(device_id, curr_channel, cq_id); - CoreCoord phys_core = get_physical_core_coordinate(mux_location, dispatch_core_type); - my_dispatch_cores[mux_location.chip].insert(phys_core); - log_debug(tt::LogMetal, "Remote Device Mux core: Logical: {} - Physical: {} will be reset on MMIO Device.", mux_location.str(), phys_core.str()); + CoreCoord virtual_core = this->virtual_core_from_logical_core(mux_location, dispatch_core_type); + my_dispatch_cores[mux_location.chip].insert(virtual_core); + log_debug(tt::LogMetal, "Remote Device Mux core: Logical: {} - Physical: {} will be reset on MMIO Device.", mux_location.str(), virtual_core.str()); } if (dispatch_core_manager::instance().is_demux_core_allocated(device_id, curr_channel, cq_id)) { tt_cxy_pair demux_location = dispatch_core_manager::instance().demux_core(device_id, curr_channel, cq_id); - CoreCoord phys_core = get_physical_core_coordinate(demux_location, dispatch_core_type); - my_dispatch_cores[demux_location.chip].insert(phys_core); - log_debug(tt::LogMetal, "Remote Device Demux core: Logical: {} - Physical: {} will be reset on MMIO Device.", demux_location.str(), phys_core.str()); + CoreCoord virtual_core = this->virtual_core_from_logical_core(demux_location, dispatch_core_type); + my_dispatch_cores[demux_location.chip].insert(virtual_core); + log_debug(tt::LogMetal, "Remote Device Demux core: Logical: {} - Physical: {} will be reset on MMIO Device.", demux_location.str(), virtual_core.str()); } - CoreCoord phys_core; + CoreCoord virtual_core; tt_cxy_pair dispatch_location = dispatch_core_manager::instance().dispatcher_d_core(device_id, curr_channel, cq_id); - phys_core = get_physical_core_coordinate(dispatch_location, dispatch_core_type); - 
my_dispatch_cores[dispatch_location.chip].insert(phys_core); + virtual_core = this->virtual_core_from_logical_core(dispatch_location, dispatch_core_type); + my_dispatch_cores[dispatch_location.chip].insert(virtual_core); // Include dispatch_s in the dispatch core location set, if its not on the same core as dispatch_d tt_cxy_pair dispatch_s_location = dispatch_core_manager::instance().dispatcher_s_core(device_id, curr_channel, cq_id); - CoreCoord phys_core_dispatch_s = get_physical_core_coordinate(dispatch_s_location, dispatch_core_type); - if (phys_core_dispatch_s != phys_core) { - my_dispatch_cores[dispatch_s_location.chip].insert(phys_core_dispatch_s); + CoreCoord virtual_core_dispatch_s = this->virtual_core_from_logical_core(dispatch_s_location, dispatch_core_type); + if (virtual_core_dispatch_s != virtual_core) { + my_dispatch_cores[dispatch_s_location.chip].insert(virtual_core_dispatch_s); } tt_cxy_pair prefetch_location = dispatch_core_manager::instance().prefetcher_d_core(device_id, curr_channel, cq_id); - phys_core = get_physical_core_coordinate(prefetch_location, dispatch_core_type); - my_dispatch_cores[dispatch_location.chip].insert(phys_core); + virtual_core = this->virtual_core_from_logical_core(prefetch_location, dispatch_core_type); + my_dispatch_cores[dispatch_location.chip].insert(virtual_core); tt_cxy_pair mux_location = dispatch_core_manager::instance().mux_d_core(device_id, curr_channel, cq_id); - phys_core = get_physical_core_coordinate(mux_location, dispatch_core_type); - my_dispatch_cores[dispatch_location.chip].insert(phys_core); + virtual_core = this->virtual_core_from_logical_core(mux_location, dispatch_core_type); + my_dispatch_cores[dispatch_location.chip].insert(virtual_core); tt_cxy_pair demux_location = dispatch_core_manager::instance().demux_d_core(device_id, curr_channel, cq_id); - phys_core = get_physical_core_coordinate(demux_location, dispatch_core_type); - my_dispatch_cores[dispatch_location.chip].insert(phys_core); + 
virtual_core = this->virtual_core_from_logical_core(demux_location, dispatch_core_type); + my_dispatch_cores[dispatch_location.chip].insert(virtual_core); } } } @@ -248,8 +239,8 @@ std::unique_ptr Device::initialize_allocator(size_t l1_small_size, si .l1_small_size = align(l1_small_size, hal.get_alignment(HalMemType::L1)), .trace_region_size = align(trace_region_size, hal.get_alignment(HalMemType::DRAM)), .core_type_from_noc_coord_table = {}, // Populated later - .worker_log_to_physical_routing_x = soc_desc.worker_log_to_physical_routing_x, - .worker_log_to_physical_routing_y = soc_desc.worker_log_to_physical_routing_y, + .worker_log_to_virtual_routing_x = tt::Cluster::instance().get_worker_logical_to_virtual_x(this->id()), + .worker_log_to_virtual_routing_y = tt::Cluster::instance().get_worker_logical_to_virtual_y(this->id()), .l1_bank_remap = {l1_bank_remap.begin(), l1_bank_remap.end()}, .compute_grid = CoreRangeSet(CoreRange(CoreCoord(0, 0), CoreCoord(compute_size.x - 1, compute_size.y - 1))), .alignment = std::max(hal.get_alignment(HalMemType::DRAM), hal.get_alignment(HalMemType::L1)), @@ -266,7 +257,7 @@ std::unique_ptr Device::initialize_allocator(size_t l1_small_size, si } // Initialize core_type_from_noc_coord_table table for (const auto& core: soc_desc.physical_cores) { - config.core_type_from_noc_coord_table.insert({core.first, AllocCoreType::Invalid}); + config.core_type_from_noc_coord_table.insert({this->virtual_core_from_physical_core(core.first, core.second.type), AllocCoreType::Invalid}); } for (const CoreCoord& core : tt::get_logical_compute_cores(id_, num_hw_cqs_, dispatch_core_config)) { @@ -280,7 +271,7 @@ std::unique_ptr Device::initialize_allocator(size_t l1_small_size, si config.core_type_from_noc_coord_table[noc_coord] = AllocCoreType::StorageOnly; } for (const CoreCoord &core : tt::get_logical_dispatch_cores(id_, num_hw_cqs_, dispatch_core_config)) { - const auto noc_coord = this->physical_core_from_logical_core(core, dispatch_core_type); + 
const auto noc_coord = this->virtual_core_from_logical_core(core, dispatch_core_type); config.core_type_from_noc_coord_table[noc_coord] = AllocCoreType::Dispatch; } for (const auto &core : soc_desc.get_logical_ethernet_cores()) { @@ -410,7 +401,7 @@ void Device::build_firmware() { jit_build_set(this->firmware_build_states_, nullptr); } -void Device::initialize_device_bank_to_noc_tables(const HalProgrammableCoreType &core_type, CoreCoord phys_core) +void Device::initialize_device_bank_to_noc_tables(const HalProgrammableCoreType &core_type, CoreCoord virtual_core) { const uint32_t dram_to_noc_sz_in_bytes = dram_bank_to_noc_xy_.size() * sizeof(uint16_t); const uint32_t l1_to_noc_sz_in_bytes = l1_bank_to_noc_xy_.size() * sizeof(uint16_t); @@ -423,26 +414,26 @@ void Device::initialize_device_bank_to_noc_tables(const HalProgrammableCoreType TT_ASSERT((dram_to_noc_sz_in_bytes + l1_to_noc_sz_in_bytes + dram_offset_sz_in_bytes + l1_offset_sz_in_bytes) <= mem_bank_to_noc_size, "Size of bank_to_noc table is greater than available space"); - tt::Cluster::instance().write_core(&dram_bank_to_noc_xy_[0], dram_to_noc_sz_in_bytes, tt_cxy_pair(this->id(), phys_core), mem_bank_to_noc_addr); + tt::Cluster::instance().write_core(&dram_bank_to_noc_xy_[0], dram_to_noc_sz_in_bytes, tt_cxy_pair(this->id(), virtual_core), mem_bank_to_noc_addr); uint64_t l1_noc_addr = mem_bank_to_noc_addr + dram_to_noc_sz_in_bytes; - tt::Cluster::instance().write_core(&l1_bank_to_noc_xy_[0], l1_to_noc_sz_in_bytes, tt_cxy_pair(this->id(), phys_core), l1_noc_addr); + tt::Cluster::instance().write_core(&l1_bank_to_noc_xy_[0], l1_to_noc_sz_in_bytes, tt_cxy_pair(this->id(), virtual_core), l1_noc_addr); uint64_t dram_offset_addr = l1_noc_addr + l1_to_noc_sz_in_bytes; - tt::Cluster::instance().write_core(&dram_bank_offset_map_[0], dram_offset_sz_in_bytes, tt_cxy_pair(this->id(), phys_core), dram_offset_addr); + tt::Cluster::instance().write_core(&dram_bank_offset_map_[0], dram_offset_sz_in_bytes, 
tt_cxy_pair(this->id(), virtual_core), dram_offset_addr); uint64_t l1_offset_addr = dram_offset_addr + dram_offset_sz_in_bytes; - tt::Cluster::instance().write_core(&l1_bank_offset_map_[0], l1_offset_sz_in_bytes, tt_cxy_pair(this->id(), phys_core), l1_offset_addr); + tt::Cluster::instance().write_core(&l1_bank_offset_map_[0], l1_offset_sz_in_bytes, tt_cxy_pair(this->id(), virtual_core), l1_offset_addr); } -void Device::initialize_firmware(const HalProgrammableCoreType &core_type, CoreCoord phys_core, launch_msg_t *launch_msg, go_msg_t* go_msg) { +void Device::initialize_firmware(const HalProgrammableCoreType &core_type, CoreCoord virtual_core, launch_msg_t *launch_msg, go_msg_t* go_msg) { ZoneScoped; - this->initialize_device_bank_to_noc_tables(core_type, phys_core); + this->initialize_device_bank_to_noc_tables(core_type, virtual_core); uint32_t core_type_idx = hal.get_programmable_core_type_index(core_type); uint32_t processor_class_count = hal.get_processor_classes_count(core_type); switch (core_type) { case HalProgrammableCoreType::TENSIX: { - llrt::program_risc_startup_addr(this->id(), phys_core); + llrt::program_risc_startup_addr(this->id(), virtual_core); for (uint32_t processor_class = 0; processor_class < processor_class_count; processor_class++) { auto [build_idx, num_build_states] = this->build_processor_type_to_index(core_type_idx, processor_class); for (uint32_t riscv_id = build_idx; riscv_id < (build_idx + num_build_states); riscv_id++) { @@ -458,7 +449,7 @@ void Device::initialize_firmware(const HalProgrammableCoreType &core_type, CoreC } log_debug(LogDevice, "RISC {} fw binary size: {} in bytes", riscv_id, fw_size); if (not llrt::OptionsG.get_skip_loading_fw()) { - llrt::test_load_write_read_risc_binary(binary_mem, this->id(), phys_core, core_type_idx, processor_class, (riscv_id - build_idx)); + llrt::test_load_write_read_risc_binary(binary_mem, this->id(), virtual_core, core_type_idx, processor_class, (riscv_id - build_idx)); } } } @@ -471,7 +462,7 
@@ void Device::initialize_firmware(const HalProgrammableCoreType &core_type, CoreC if (dispatch_core_manager::instance().get_dispatch_core_type(this->id()) == CoreType::WORKER) { physical_dispatch_cores = this->worker_cores_from_logical_cores(dispatch_core_manager::instance().get_all_logical_dispatch_cores(this->id())); } - if (std::find(physical_dispatch_cores.begin(), physical_dispatch_cores.end(), phys_core) != physical_dispatch_cores.end()) { + if (std::find(physical_dispatch_cores.begin(), physical_dispatch_cores.end(), virtual_core) != physical_dispatch_cores.end()) { // Dispatch cores - Host writes launch messages launch_msg->kernel_config.mode = DISPATCH_MODE_HOST; } else { @@ -486,7 +477,7 @@ void Device::initialize_firmware(const HalProgrammableCoreType &core_type, CoreC case HalProgrammableCoreType::IDLE_ETH: { bool is_idle_eth = core_type == HalProgrammableCoreType::IDLE_ETH; if (is_idle_eth) { - tt::Cluster::instance().assert_risc_reset_at_core(tt_cxy_pair(this->id(), phys_core)); + tt::Cluster::instance().assert_risc_reset_at_core(tt_cxy_pair(this->id(), virtual_core)); } if (not llrt::OptionsG.get_skip_loading_fw()) { for (uint32_t processor_class = 0; processor_class < processor_class_count; processor_class++) { @@ -499,14 +490,14 @@ void Device::initialize_firmware(const HalProgrammableCoreType &core_type, CoreC (eriscv_id - build_idx)); uint32_t fw_size = binary_mem.get_text_size(); log_debug(LogDevice, "ERISC fw binary size: {} in bytes", fw_size); - llrt::test_load_write_read_risc_binary(binary_mem, this->id(), phys_core, core_type_idx, processor_class, (eriscv_id - build_idx)); + llrt::test_load_write_read_risc_binary(binary_mem, this->id(), virtual_core, core_type_idx, processor_class, (eriscv_id - build_idx)); } } } if (is_idle_eth) { - llrt::program_risc_startup_addr(this->id(), phys_core); + llrt::program_risc_startup_addr(this->id(), virtual_core); } else { - llrt::launch_erisc_app_fw_on_core(this->id(), phys_core); + 
llrt::launch_erisc_app_fw_on_core(this->id(), virtual_core); } // Ethernet worker core. Launch messages will be sent by FD infra if it's enabled // Idle ethernet core. Used by FD infra. Host will write launch messages during init. @@ -528,12 +519,12 @@ void Device::initialize_firmware(const HalProgrammableCoreType &core_type, CoreC // worker cores (Tensix and active eth) configured with DISPATCH_MODE_DEV // When using Slow Dispatch, all cores initialized with DISPATCH_MODE_HOST std::vector init_launch_msg_data(launch_msg_buffer_num_entries, *launch_msg); - tt::Cluster::instance().write_core(init_launch_msg_data.data(), launch_msg_buffer_num_entries * sizeof(launch_msg_t), tt_cxy_pair(this->id(), phys_core), this->get_dev_addr(phys_core, HalL1MemAddrType::LAUNCH)); - uint32_t go_addr = this->get_dev_addr(phys_core, HalL1MemAddrType::GO_MSG); - tt::Cluster::instance().write_core(go_msg, sizeof(go_msg_t), tt_cxy_pair(this->id(), phys_core), go_addr); - uint64_t launch_msg_buffer_read_ptr_addr = this->get_dev_addr(phys_core, HalL1MemAddrType::LAUNCH_MSG_BUFFER_RD_PTR); + tt::Cluster::instance().write_core(init_launch_msg_data.data(), launch_msg_buffer_num_entries * sizeof(launch_msg_t), tt_cxy_pair(this->id(), virtual_core), this->get_dev_addr(virtual_core, HalL1MemAddrType::LAUNCH)); + uint32_t go_addr = this->get_dev_addr(virtual_core, HalL1MemAddrType::GO_MSG); + tt::Cluster::instance().write_core(go_msg, sizeof(go_msg_t), tt_cxy_pair(this->id(), virtual_core), go_addr); + uint64_t launch_msg_buffer_read_ptr_addr = this->get_dev_addr(virtual_core, HalL1MemAddrType::LAUNCH_MSG_BUFFER_RD_PTR); uint32_t zero = 0; - tt::Cluster::instance().write_core(&zero, sizeof(uint32_t), tt_cxy_pair(this->id(), phys_core), launch_msg_buffer_read_ptr_addr); + tt::Cluster::instance().write_core(&zero, sizeof(uint32_t), tt_cxy_pair(this->id(), virtual_core), launch_msg_buffer_read_ptr_addr); } void Device::reset_cores() { @@ -549,16 +540,16 @@ void Device::reset_cores() { go_msg_t 
go_msg; std::memset(&go_msg, 0, sizeof(go_msg_t)); for (const auto ð_core : this->get_active_ethernet_cores()) { - CoreCoord physical_core = this->ethernet_core_from_logical_core(eth_core); + CoreCoord virtual_core = this->ethernet_core_from_logical_core(eth_core); std::vector data(sizeof(launch_msg_t) / sizeof(uint32_t)); std::vector go_signal_data(sizeof(go_msg_t) / sizeof(uint32_t)); DeviceAddr launch_addr = hal.get_dev_addr(HalProgrammableCoreType::ACTIVE_ETH, HalL1MemAddrType::LAUNCH); DeviceAddr go_signal_addr = hal.get_dev_addr(HalProgrammableCoreType::ACTIVE_ETH, HalL1MemAddrType::GO_MSG); data = tt::llrt::read_hex_vec_from_core( - this->id(), physical_core, launch_addr, sizeof(launch_msg_t)); + this->id(), virtual_core, launch_addr, sizeof(launch_msg_t)); go_signal_data = tt::llrt::read_hex_vec_from_core( - this->id(), physical_core, go_signal_addr, sizeof(go_msg_t)); + this->id(), virtual_core, go_signal_addr, sizeof(go_msg_t)); launch_msg_t *launch_msg = (launch_msg_t *)(&data[0]); go_msg_t * go_signal = (go_msg_t *)(&go_signal_data[0]); if (kernel_still_running(launch_msg, go_signal)) { @@ -566,30 +557,30 @@ void Device::reset_cores() { tt::LogMetal, "While initializing Device {}, ethernet tunneler core {} on Device {} detected as still running, issuing exit signal.", this->id(), - physical_core.str(), + virtual_core.str(), this->id()); launch_msg->kernel_config.exit_erisc_kernel = 1; - llrt::write_launch_msg_to_core(this->id(), physical_core, launch_msg, &go_msg, launch_addr, false); - device_to_early_exit_cores[this->id()].insert(physical_core); + llrt::write_launch_msg_to_core(this->id(), virtual_core, launch_msg, &go_msg, launch_addr, false); + device_to_early_exit_cores[this->id()].insert(virtual_core); } } - this->get_associated_dispatch_phys_cores(dispatch_cores, other_dispatch_cores); + this->get_associated_dispatch_virtual_cores(dispatch_cores, other_dispatch_cores); // Ignore other_dispatch_cores, they will be reset by the devices that use 
them. for (auto &id_and_cores : dispatch_cores) { for (auto it = id_and_cores.second.begin(); it != id_and_cores.second.end(); it++) { - const auto &phys_core = *it; + const auto &virtual_core = *it; // Only need to manually reset ethernet dispatch cores, tensix cores are all reset below. - if (llrt::is_ethernet_core(phys_core, id_and_cores.first)) { + if (tt::Cluster::instance().is_ethernet_core(virtual_core, id_and_cores.first)) { // Ethernet cores won't be reset, so just signal the dispatch cores to early exit. std::vector data(sizeof(launch_msg_t) / sizeof(uint32_t)); std::vector go_signal_data(sizeof(go_msg_t) / sizeof(uint32_t)); DeviceAddr launch_addr = hal.get_dev_addr(HalProgrammableCoreType::IDLE_ETH, HalL1MemAddrType::LAUNCH); DeviceAddr go_signal_addr = hal.get_dev_addr(HalProgrammableCoreType::ACTIVE_ETH, HalL1MemAddrType::GO_MSG); data = tt::llrt::read_hex_vec_from_core( - id_and_cores.first, phys_core, launch_addr, sizeof(launch_msg_t)); + id_and_cores.first, virtual_core, launch_addr, sizeof(launch_msg_t)); go_signal_data = tt::llrt::read_hex_vec_from_core( - this->id(), phys_core, go_signal_addr, sizeof(go_msg_t)); + this->id(), virtual_core, go_signal_addr, sizeof(go_msg_t)); launch_msg_t *launch_msg = (launch_msg_t *)(&data[0]); go_msg_t * go_signal = (go_msg_t *)(&go_signal_data[0]); if (kernel_still_running(launch_msg, go_signal)) { @@ -597,11 +588,11 @@ void Device::reset_cores() { tt::LogMetal, "While initializing device {}, ethernet dispatch core {} on Device {} detected as still running, issuing exit signal.", this->id(), - phys_core.str(), + virtual_core.str(), id_and_cores.first); launch_msg->kernel_config.exit_erisc_kernel = 1; - llrt::write_launch_msg_to_core(id_and_cores.first, phys_core, launch_msg, &go_msg, launch_addr, false); - device_to_early_exit_cores[id_and_cores.first].insert(phys_core); + llrt::write_launch_msg_to_core(id_and_cores.first, virtual_core, launch_msg, &go_msg, launch_addr, false); + 
device_to_early_exit_cores[id_and_cores.first].insert(virtual_core); } } } @@ -665,22 +656,40 @@ void Device::initialize_and_launch_firmware() { const std::vector &pcie_cores = soc_d.get_pcie_cores(); const std::vector &dram_cores = soc_d.get_dram_cores(); const std::vector ð_cores = soc_d.get_physical_ethernet_cores(); + // The SOC descriptor can list a dram core multiple times, depending on how GDDR is assigned to banks + // Get a list of unique DRAM cores. + std::unordered_set unique_dram_cores(dram_cores.begin(), dram_cores.end()); TT_ASSERT( - pcie_cores.size() + dram_cores.size() + eth_cores.size() <= MAX_NON_WORKER_CORES, + pcie_cores.size() + unique_dram_cores.size() + eth_cores.size() <= MAX_NON_WORKER_CORES, "Detected more pcie/dram/eth cores than fit in the device mailbox."); + TT_ASSERT( + eth_cores.size() <= MAX_VIRTUAL_NON_WORKER_CORES, + "Detected more eth cores (virtual non-workers) than can fit in device mailbox."); for (int idx = 0; idx < MAX_NON_WORKER_CORES; idx++) { core_info->non_worker_cores[idx] = {CORE_COORD_INVALID, CORE_COORD_INVALID, AddressableCoreType::UNKNOWN}; } + for (int idx = 0; idx < MAX_VIRTUAL_NON_WORKER_CORES; idx++) { + core_info->virtual_non_worker_cores[idx] = {CORE_COORD_INVALID, CORE_COORD_INVALID, AddressableCoreType::UNKNOWN}; + } + int non_worker_cores_idx = 0; for (const CoreCoord &core : pcie_cores) { core_info->non_worker_cores[non_worker_cores_idx++] = {core.x, core.y, AddressableCoreType::PCIE}; } - for (const CoreCoord &core : dram_cores) { + for (const CoreCoord &core : unique_dram_cores) { core_info->non_worker_cores[non_worker_cores_idx++] = {core.x, core.y, AddressableCoreType::DRAM}; } for (const CoreCoord &core : eth_cores) { core_info->non_worker_cores[non_worker_cores_idx++] = {core.x, core.y, AddressableCoreType::ETH}; } + if (hal.is_coordinate_virtualization_enabled()) { + // Track Virtual Non Worker Cores (In this case only Eth) separately + uint32_t virtual_non_worker_cores_idx = 0; + for (const 
CoreCoord &core : eth_cores) { + auto virtual_core = this->virtual_core_from_physical_core(core, CoreType::ETH); + core_info->virtual_non_worker_cores[virtual_non_worker_cores_idx++] = {virtual_core.x, virtual_core.y, AddressableCoreType::ETH}; + } + } // Determine which noc-coords are harvested // TODO(PGK/Almeet): fix this w/ new UMD @@ -695,6 +704,13 @@ void Device::initialize_and_launch_firmware() { TT_ASSERT(harvested_rows.size() <= MAX_HARVESTED_ROWS, "Detected more harvested rows than fit in mailbox."); for (int idx = 0; idx < MAX_HARVESTED_ROWS; idx++) { core_info->harvested_y[idx] = (idx < harvested_rows.size()) ? harvested_rows[idx] : CORE_COORD_INVALID; + // Populate harvested rows in virtual coordinate space if virtualization is supported by HW. + // Harvested rows in the virtual space are placed at the end of the worker grid, + if (hal.is_coordinate_virtualization_enabled() and idx < harvested_rows.size()) { + core_info->virtual_harvested_y[idx] = (hal.get_virtual_worker_start_y() + this->logical_grid_size().y + harvested_rows.size() - (idx + 1)); + } else { + core_info->virtual_harvested_y[idx] = CORE_COORD_INVALID; + } } core_info->noc_size_x = soc_d.grid_size.x; @@ -721,10 +737,10 @@ void Device::initialize_and_launch_firmware() { // Clear erisc sync info std::vector zero_vec_erisc_init(eth_l1_mem::address_map::ERISC_APP_SYNC_INFO_SIZE / sizeof(uint32_t), 0); for (const auto ð_core : this->get_active_ethernet_cores()) { - CoreCoord physical_core = this->ethernet_core_from_logical_core(eth_core); + CoreCoord virtual_core = this->ethernet_core_from_logical_core(eth_core); llrt::write_hex_vec_to_core( - this->id(), physical_core, zero_vec_erisc_init, eth_l1_mem::address_map::ERISC_APP_SYNC_INFO_BASE); + this->id(), virtual_core, zero_vec_erisc_init, eth_l1_mem::address_map::ERISC_APP_SYNC_INFO_BASE); } // Load erisc app base FW to eth cores @@ -784,11 +800,11 @@ void Device::clear_l1_state() { // Clear erisc sync info for (const auto ð_core : 
this->get_active_ethernet_cores()) { - CoreCoord physical_core = this->ethernet_core_from_logical_core(eth_core); + CoreCoord virtual_core = this->ethernet_core_from_logical_core(eth_core); llrt::write_hex_vec_to_core( this->id(), - physical_core, + virtual_core, zero_vec_above_tile_header_buffer, eth_l1_mem::address_map::TILE_HEADER_BUFFER_BASE); @@ -801,11 +817,11 @@ void Device::configure_kernel_variant( const string& path, const std::vector& compile_args, CoreCoord kernel_core, - CoreCoord kernel_physical_core, + CoreCoord kernel_virtual_core, CoreType dispatch_core_type, - CoreCoord upstream_physical_core, - CoreCoord downstream_physical_core, - CoreCoord downstream_slave_physical_core, + CoreCoord upstream_virtual_core, + CoreCoord downstream_virtual_core, + CoreCoord downstream_slave_virtual_core, std::map defines_in, NOC my_noc_index, NOC upstream_noc_index, @@ -814,25 +830,28 @@ void Device::configure_kernel_variant( bool send_to_brisc, bool force_watcher_no_inline) { - const auto& grid_size = this->grid_size(); - // TODO: just pass in the programmable index uint32_t programmable_core_type_index = (dispatch_core_type == CoreType::WORKER) ? hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX) : is_active_eth_core ? 
hal.get_programmable_core_type_index(HalProgrammableCoreType::ACTIVE_ETH) : hal.get_programmable_core_type_index(HalProgrammableCoreType::IDLE_ETH); + auto my_virtual_noc_coords = this->virtual_noc_coordinate(my_noc_index, kernel_virtual_core); + auto upstream_virtual_noc_coords = this->virtual_noc_coordinate(upstream_noc_index, upstream_virtual_core); + auto downstream_virtual_noc_coords = this->virtual_noc_coordinate(downstream_noc_index, downstream_virtual_core); + auto downstream_slave_virtual_noc_coords = this->virtual_noc_coordinate(downstream_noc_index, downstream_slave_virtual_core); + std::map defines = { {"DISPATCH_KERNEL", "1"}, - {"MY_NOC_X", std::to_string(tt::tt_metal::hal.noc_coordinate(my_noc_index, grid_size.x, kernel_physical_core.x))}, - {"MY_NOC_Y", std::to_string(tt::tt_metal::hal.noc_coordinate(my_noc_index, grid_size.y, kernel_physical_core.y))}, + {"MY_NOC_X", std::to_string(my_virtual_noc_coords.x)}, + {"MY_NOC_Y", std::to_string(my_virtual_noc_coords.y)}, {"UPSTREAM_NOC_INDEX", std::to_string(upstream_noc_index)}, - {"UPSTREAM_NOC_X", std::to_string(tt::tt_metal::hal.noc_coordinate(upstream_noc_index, grid_size.x, upstream_physical_core.x))}, - {"UPSTREAM_NOC_Y", std::to_string(tt::tt_metal::hal.noc_coordinate(upstream_noc_index, grid_size.y, upstream_physical_core.y))}, - {"DOWNSTREAM_NOC_X", std::to_string(tt::tt_metal::hal.noc_coordinate(downstream_noc_index, grid_size.x, downstream_physical_core.x))}, - {"DOWNSTREAM_NOC_Y", std::to_string(tt::tt_metal::hal.noc_coordinate(downstream_noc_index, grid_size.y, downstream_physical_core.y))}, - {"DOWNSTREAM_SLAVE_NOC_X", std::to_string(tt::tt_metal::hal.noc_coordinate(downstream_noc_index, grid_size.x, downstream_slave_physical_core.x))}, - {"DOWNSTREAM_SLAVE_NOC_Y", std::to_string(tt::tt_metal::hal.noc_coordinate(downstream_noc_index, grid_size.y, downstream_slave_physical_core.y))}, + {"UPSTREAM_NOC_X", std::to_string(upstream_virtual_noc_coords.x)}, + {"UPSTREAM_NOC_Y", 
std::to_string(upstream_virtual_noc_coords.y)}, + {"DOWNSTREAM_NOC_X", std::to_string(downstream_virtual_noc_coords.x)}, + {"DOWNSTREAM_NOC_Y", std::to_string(downstream_virtual_noc_coords.y)}, + {"DOWNSTREAM_SLAVE_NOC_X", std::to_string(downstream_slave_virtual_noc_coords.x)}, + {"DOWNSTREAM_SLAVE_NOC_Y", std::to_string(downstream_slave_virtual_noc_coords.y)}, {"FD_CORE_TYPE", std::to_string(programmable_core_type_index)}, }; if (force_watcher_no_inline) { @@ -896,7 +915,7 @@ void Device::update_workers_build_settings(std::vector> 4; @@ -971,8 +990,8 @@ void Device::update_workers_build_settings(std::vector> 4; // 3: rx_queue_size_words for (uint32_t i = 0; i < fwd_vc_count; i++) { - compile_args[4 + i] = packet_switch_4B_pack(tunneler_settings.eth_partner_physical_core.x, - tunneler_settings.eth_partner_physical_core.y, + compile_args[4 + i] = packet_switch_4B_pack(tunneler_settings.eth_partner_virtual_core.x, + tunneler_settings.eth_partner_virtual_core.y, i, (uint32_t)DispatchRemoteNetworkType::ETH); // 4 - 13: remote_receiver fwd vcs @@ -1031,8 +1050,8 @@ void Device::update_workers_build_settings(std::vector(device_worker_variants[DispatchWorkerType::DEMUX][0]); - compile_args[4 + return_vc] = packet_switch_4B_pack(demux_settings.worker_physical_core.x, - demux_settings.worker_physical_core.y, + compile_args[4 + return_vc] = packet_switch_4B_pack(demux_settings.worker_virtual_core.x, + demux_settings.worker_virtual_core.y, 0,//demux input, (uint32_t)DispatchRemoteNetworkType::NOC0); // 5: remote_receiver return vc compile_args[14 + return_vc * 2] = demux_settings.cb_start_address >> 4; // 8: remote_receiver_queue_start_addr_words return vc @@ -1042,8 +1061,8 @@ void Device::update_workers_build_settings(std::vector(device_worker_variants[DispatchWorkerType::MUX_D][0]); uint32_t prefetch_d_count = device_worker_variants[DispatchWorkerType::PREFETCH_D].size(); - compile_args[4 + return_vc] = packet_switch_4B_pack(mux_d_settings.worker_physical_core.x, - 
mux_d_settings.worker_physical_core.y, + compile_args[4 + return_vc] = packet_switch_4B_pack(mux_d_settings.worker_virtual_core.x, + mux_d_settings.worker_virtual_core.y, mux_d_settings.semaphores.size(),//mux_d input. This is return path from next tunnel stop towards mmio device. //mux_d iput 0 is driven by local Dispatch D (uint32_t)DispatchRemoteNetworkType::NOC0); // 5: remote_receiver return vc @@ -1066,8 +1085,8 @@ void Device::update_workers_build_settings(std::vector demux input, 1=> demux_d output to local Prefetch D, 2=> demux_d output to tunneler (to next tunnel stop) (uint32_t)DispatchRemoteNetworkType::NOC0); // 10: remote_sender fwd vcs } @@ -1080,8 +1099,8 @@ void Device::update_workers_build_settings(std::vector> 4; // 8, 10, 12, 14: remote_tx_queue_start_addr_words x compile_args[arg_index++] = settings.cb_size_bytes >> 4; // 9, 11, 13, 15: remote_tx_queue_size_words x } - compile_args[16] = tunneler_settings.worker_physical_core.x; // 16: remote_rx_x - compile_args[17] = tunneler_settings.worker_physical_core.y; // 17: remote_rx_y + compile_args[16] = tunneler_settings.worker_virtual_core.x; // 16: remote_rx_x + compile_args[17] = tunneler_settings.worker_virtual_core.y; // 17: remote_rx_y compile_args[18] = tunneler_settings.vc_count * 2 - 1; // 18: remote_rx_queue_id compile_args[19] = (uint32_t)DispatchRemoteNetworkType::NOC0; // 19: tx_network_type uint32_t dest_map_array[4] = {0, 1, 2, 3}; @@ -1153,12 +1172,12 @@ void Device::update_workers_build_settings(std::vector> 4; // 2: rx_queue_size_words compile_args[3] = 2; // 3: demux_fan_out - compile_args[4] = packet_switch_4B_pack((uint32_t)demux_1_settings.worker_physical_core.x, - (uint32_t)demux_1_settings.worker_physical_core.y, + compile_args[4] = packet_switch_4B_pack((uint32_t)demux_1_settings.worker_virtual_core.x, + (uint32_t)demux_1_settings.worker_virtual_core.y, 0, (uint32_t)DispatchRemoteNetworkType::NOC0); // 4,5,6,7: remote_tx_x_info - compile_args[5] = 
packet_switch_4B_pack((uint32_t)demux_2_settings.worker_physical_core.x, - (uint32_t)demux_2_settings.worker_physical_core.y, + compile_args[5] = packet_switch_4B_pack((uint32_t)demux_2_settings.worker_virtual_core.x, + (uint32_t)demux_2_settings.worker_virtual_core.y, 0, (uint32_t)DispatchRemoteNetworkType::NOC0); // 4,5,6,7: remote_tx_x_info @@ -1167,8 +1186,8 @@ void Device::update_workers_build_settings(std::vector> 4; // 10: remote_tx_queue_start_addr_words x compile_args[11] = demux_2_settings.cb_size_bytes >> 4; // 11: remote_tx_queue_size_words x - compile_args[16] = tunneler_settings.worker_physical_core.x; // 16: remote_rx_x - compile_args[17] = tunneler_settings.worker_physical_core.y; // 17: remote_rx_y + compile_args[16] = tunneler_settings.worker_virtual_core.x; // 16: remote_rx_x + compile_args[17] = tunneler_settings.worker_virtual_core.y; // 17: remote_rx_y compile_args[18] = tunneler_settings.vc_count * 2 - 1; // 18: remote_rx_queue_id compile_args[19] = (uint32_t)DispatchRemoteNetworkType::NOC0; // 19: tx_network_type @@ -1194,16 +1213,16 @@ void Device::update_workers_build_settings(std::vector(device_worker_variants[DispatchWorkerType::DISPATCH][i]); - demux_1_compile_args[4 + i] = packet_switch_4B_pack((uint32_t)settings.worker_physical_core.x, - (uint32_t)settings.worker_physical_core.y, + demux_1_compile_args[4 + i] = packet_switch_4B_pack((uint32_t)settings.worker_virtual_core.x, + (uint32_t)settings.worker_virtual_core.y, 0, (uint32_t)DispatchRemoteNetworkType::NOC0); // 4,5,6,7: remote_tx_x_info demux_1_compile_args[8 + i * 2] = settings.cb_start_address >> 4; // 8, 10, 12, 14: remote_tx_queue_start_addr_words x demux_1_compile_args[9 + i * 2] = settings.cb_size_bytes >> 4; // 9, 11, 13, 15: remote_tx_queue_size_words x } - demux_1_compile_args[16] = demux_settings.worker_physical_core.x; // 16: remote_rx_x - demux_1_compile_args[17] = demux_settings.worker_physical_core.y; // 17: remote_rx_y + demux_1_compile_args[16] = 
demux_settings.worker_virtual_core.x; // 16: remote_rx_x + demux_1_compile_args[17] = demux_settings.worker_virtual_core.y; // 17: remote_rx_y demux_1_compile_args[18] = 1; // 18: remote_rx_queue_id demux_1_compile_args[19] = (uint32_t)DispatchRemoteNetworkType::NOC0; // 19: tx_network_type uint32_t dest_map_array[4] = {0, 1, 2, 3}; @@ -1236,16 +1255,16 @@ void Device::update_workers_build_settings(std::vector(device_worker_variants[DispatchWorkerType::DISPATCH][i + demux_1_fanout]); - demux_2_compile_args[4 + i] = packet_switch_4B_pack((uint32_t)settings.worker_physical_core.x, - (uint32_t)settings.worker_physical_core.y, + demux_2_compile_args[4 + i] = packet_switch_4B_pack((uint32_t)settings.worker_virtual_core.x, + (uint32_t)settings.worker_virtual_core.y, 0, (uint32_t)DispatchRemoteNetworkType::NOC0); // 4,5,6,7: remote_tx_x_info demux_2_compile_args[8 + i * 2] = settings.cb_start_address >> 4; // 8, 10, 12, 14: remote_tx_queue_start_addr_words x demux_2_compile_args[9 + i * 2] = settings.cb_size_bytes >> 4; // 9, 11, 13, 15: remote_tx_queue_size_words x } - demux_2_compile_args[16] = demux_settings.worker_physical_core.x; // 16: remote_rx_x - demux_2_compile_args[17] = demux_settings.worker_physical_core.y; // 17: remote_rx_y + demux_2_compile_args[16] = demux_settings.worker_virtual_core.x; // 16: remote_rx_x + demux_2_compile_args[17] = demux_settings.worker_virtual_core.y; // 17: remote_rx_y demux_2_compile_args[18] = 2; // 18: remote_rx_queue_id demux_2_compile_args[19] = (uint32_t)DispatchRemoteNetworkType::NOC0; // 19: tx_network_type dest_endpoint_output_map = packet_switch_dest_pack(dest_map_array, 4); @@ -1282,12 +1301,12 @@ void Device::update_workers_build_settings(std::vector(device_worker_variants[DispatchWorkerType::PREFETCH][dispatch_idx]); - auto prefetch_physical_core = prefetch_h_settings.worker_physical_core; + auto prefetch_virtual_core = prefetch_h_settings.worker_virtual_core; auto dispatch_core_type = settings.dispatch_core_type; 
uint32_t host_completion_queue_wr_ptr = dispatch_constants::get(dispatch_core_type).get_host_command_queue_addr(CommandQueueHostAddrType::COMPLETION_Q_WR); uint32_t dev_completion_queue_wr_ptr = dispatch_constants::get(dispatch_core_type).get_device_command_queue_addr(CommandQueueDeviceAddrType::COMPLETION_Q_WR); uint32_t dev_completion_queue_rd_ptr = dispatch_constants::get(dispatch_core_type).get_device_command_queue_addr(CommandQueueDeviceAddrType::COMPLETION_Q_RD); - settings.upstream_cores.push_back(demux_settings.worker_physical_core); + settings.upstream_cores.push_back(demux_settings.worker_virtual_core); settings.downstream_cores.push_back(tt_cxy_pair(0, 0, 0)); settings.compile_args.resize(31); auto& compile_args = settings.compile_args; @@ -1307,7 +1326,7 @@ void Device::update_workers_build_settings(std::vector(device_worker_variants[DispatchWorkerType::DISPATCH][dispatch_idx]); auto prefetch_h_settings = std::get<1>(device_worker_variants[DispatchWorkerType::PREFETCH][dispatch_idx]); - auto prefetch_physical_core = prefetch_h_settings.worker_physical_core; + auto prefetch_virtual_core = prefetch_h_settings.worker_virtual_core; auto dispatch_core_type = settings.dispatch_core_type; uint32_t host_completion_queue_wr_ptr = dispatch_constants::get(dispatch_core_type).get_host_command_queue_addr(CommandQueueHostAddrType::COMPLETION_Q_WR); uint32_t dev_completion_queue_wr_ptr = dispatch_constants::get(dispatch_core_type).get_device_command_queue_addr(CommandQueueDeviceAddrType::COMPLETION_Q_WR); uint32_t dev_completion_queue_rd_ptr = dispatch_constants::get(dispatch_core_type).get_device_command_queue_addr(CommandQueueDeviceAddrType::COMPLETION_Q_RD); - settings.upstream_cores.push_back(demux_settings.worker_physical_core); + settings.upstream_cores.push_back(demux_settings.worker_virtual_core); settings.downstream_cores.push_back(tt_cxy_pair(0, 0, 0)); settings.compile_args.resize(31); auto& compile_args = settings.compile_args; @@ -1362,7 +1381,7 @@ void 
Device::update_workers_build_settings(std::vector 1) { auto &us_tunneler_remote_settings = std::get<1>(device_worker_variants[DispatchWorkerType::US_TUNNELER_REMOTE][0]); - auto mux_d_sender = us_tunneler_remote_settings.worker_physical_core; + auto mux_d_sender = us_tunneler_remote_settings.worker_virtual_core; compile_args[47] = (return_vc << 24) | ((us_tunneler_remote_settings.vc_count * 2 - 1) << 16) | (mux_d_sender.y << 8) | (mux_d_sender.x); - log_debug(tt::LogMetal, "Tunnelr Inner Device {} will send done to {}", tunneler_settings.worker_physical_core.str(), mux_d_sender.str()); + log_debug(tt::LogMetal, "Tunnelr Inner Device {} will send done to {}", tunneler_settings.worker_virtual_core.str(), mux_d_sender.str()); } break; @@ -1497,8 +1516,8 @@ void Device::update_workers_build_settings(std::vector(device_worker_variants[DispatchWorkerType::PREFETCH_D][p + prefetch_d_connected]); - compile_args[4 + demux_output_idx] = packet_switch_4B_pack(prefetch_d_setting.worker_physical_core.x, - prefetch_d_setting.worker_physical_core.y, + compile_args[4 + demux_output_idx] = packet_switch_4B_pack(prefetch_d_setting.worker_virtual_core.x, + prefetch_d_setting.worker_virtual_core.y, 0, // prefetch_d input queue id (uint32_t)DispatchRemoteNetworkType::NOC0); // 4: remote_tx_0_info compile_args[8 + demux_output_cb_info_idx] = prefetch_d_setting.cb_start_address >> 4; @@ -1511,8 +1530,8 @@ void Device::update_workers_build_settings(std::vector(device_worker_variants[DispatchWorkerType::US_TUNNELER_REMOTE][0]); for (int i = 0; i < vcs_per_demux_d; i++) { - compile_args[4 + demux_output_idx + i] = packet_switch_4B_pack((uint32_t)us_tunneler_remote_settings.worker_physical_core.x, - (uint32_t)us_tunneler_remote_settings.worker_physical_core.y, + compile_args[4 + demux_output_idx + i] = packet_switch_4B_pack((uint32_t)us_tunneler_remote_settings.worker_virtual_core.x, + (uint32_t)us_tunneler_remote_settings.worker_virtual_core.y, remote_tunneler_vcs_connected, 
(uint32_t)DispatchRemoteNetworkType::NOC0); // 5: remote_tx_1_info compile_args[8 + (demux_output_idx + i) * 2] = (us_tunneler_remote_settings.cb_start_address + remote_tunneler_vcs_connected * us_tunneler_remote_settings.cb_size_bytes) >> 4; // 10: remote_tx_queue_start_addr_words 1 @@ -1527,8 +1546,8 @@ void Device::update_workers_build_settings(std::vector(device_worker_variants[DispatchWorkerType::DISPATCH_D][prefetch_d_idx]); // 1 to 1 mapping bw prefetch_d and dispatch_d auto dispatch_s_settings = std::get<1>(device_worker_variants[DispatchWorkerType::DISPATCH_S][prefetch_d_idx]); // 1 to 1 mapping bw prefetch_d and dispatch_s auto dispatch_core_type = prefetch_d_settings.dispatch_core_type; - prefetch_d_settings.upstream_cores.push_back(demux_d_settings.worker_physical_core); - prefetch_d_settings.downstream_cores.push_back(dispatch_d_settings.worker_physical_core); - prefetch_d_settings.downstream_cores.push_back(dispatch_s_settings.worker_physical_core); + prefetch_d_settings.upstream_cores.push_back(demux_d_settings.worker_virtual_core); + prefetch_d_settings.downstream_cores.push_back(dispatch_d_settings.worker_virtual_core); + prefetch_d_settings.downstream_cores.push_back(dispatch_s_settings.worker_virtual_core); uint32_t pcie_alignment = hal.get_alignment(HalMemType::HOST); uint32_t scratch_db_base = (prefetch_d_settings.cb_start_address + prefetch_d_settings.cb_size_bytes + pcie_alignment - 1) & (~(pcie_alignment - 1)); uint32_t scratch_db_size = dispatch_constants::get(dispatch_core_type).scratch_db_size(); @@ -1668,9 +1687,9 @@ void Device::update_workers_build_settings(std::vector(device_worker_variants[DispatchWorkerType::PREFETCH_D][dispatch_s_idx]); // 1 to 1 mapping bw prefetch_d and dispatch_s auto dispatch_d_settings = std::get<1>(device_worker_variants[DispatchWorkerType::DISPATCH_D][dispatch_s_idx]); // 1 to 1 mapping bw dispatch_d and dispatch_s - dispatch_s_settings.upstream_cores.push_back(prefetch_d_settings.worker_physical_core); - 
dispatch_s_settings.downstream_cores.push_back(dispatch_d_settings.worker_physical_core); + dispatch_s_settings.upstream_cores.push_back(prefetch_d_settings.worker_virtual_core); + dispatch_s_settings.downstream_cores.push_back(dispatch_d_settings.worker_virtual_core); auto dispatch_core_type = dispatch_s_settings.dispatch_core_type; uint32_t dispatch_message_base_addr = dispatch_constants::get(dispatch_core_type).get_device_command_queue_addr(CommandQueueDeviceAddrType::DISPATCH_MESSAGE); uint32_t dispatch_s_sync_sem_base_addr = dispatch_constants::get(dispatch_core_type).get_device_command_queue_addr(CommandQueueDeviceAddrType::DISPATCH_S_SYNC_SEM); @@ -1762,8 +1781,8 @@ void Device::update_workers_build_settings(std::vector(dispatch_d_settings); - compile_args[4 + mux_d_input_idx] = packet_switch_4B_pack(dispatch_d_setting.worker_physical_core.x, - dispatch_d_setting.worker_physical_core.y, + compile_args[4 + mux_d_input_idx] = packet_switch_4B_pack(dispatch_d_setting.worker_virtual_core.x, + dispatch_d_setting.worker_virtual_core.y, 1, DispatchRemoteNetworkType::NOC0); // 4,5,6,7: src x info mux_d_input_idx++; @@ -1773,8 +1792,8 @@ void Device::update_workers_build_settings(std::vector(us_tunneler_remote_settings); - compile_args[4 + mux_d_input_idx] = packet_switch_4B_pack(us_tunneler_remote_setting.worker_physical_core.x, - us_tunneler_remote_setting.worker_physical_core.y, + compile_args[4 + mux_d_input_idx] = packet_switch_4B_pack(us_tunneler_remote_setting.worker_virtual_core.x, + us_tunneler_remote_setting.worker_virtual_core.y, us_tunneler_remote_setting.vc_count * 2 - 1, DispatchRemoteNetworkType::NOC0); // 4,5,6,7: src x info mux_d_input_idx++; @@ -1785,8 +1804,8 @@ void Device::update_workers_build_settings(std::vector> 4; // 8: remote_tx_queue_start_addr_words compile_args[9] = tunneler_settings.cb_size_bytes >> 4; // 9: remote_tx_queue_size_words - compile_args[10] = tunneler_settings.worker_physical_core.x; // 10: remote_tx_x - compile_args[11] = 
tunneler_settings.worker_physical_core.y; // 11: remote_tx_y + compile_args[10] = tunneler_settings.worker_virtual_core.x; // 10: remote_tx_x + compile_args[11] = tunneler_settings.worker_virtual_core.y; // 11: remote_tx_y compile_args[12] = tunneler_settings.vc_count - 1; // 12: remote_tx_queue_id compile_args[13] = (uint32_t)DispatchRemoteNetworkType::NOC0; // 13: tx_network_type compile_args[14] = 0; // 14: test_results_addr (disabled) @@ -1867,7 +1886,7 @@ void Device::setup_tunnel_for_remote_devices() { settings.dispatch_core_type = dispatch_core_type; tt_cxy_pair prefetch_location = dispatch_core_manager::instance().prefetcher_core(device_id, channel, cq_id); - settings.worker_physical_core = tt_cxy_pair(prefetch_location.chip, get_physical_core_coordinate(prefetch_location, dispatch_core_type)); + settings.worker_virtual_core = tt_cxy_pair(prefetch_location.chip, this->virtual_core_from_logical_core(prefetch_location, dispatch_core_type)); settings.kernel_file = "tt_metal/impl/dispatch/kernels/cq_prefetch.cpp"; //prefetch needs three semaphores. settings.semaphores.push_back(0); @@ -1885,7 +1904,7 @@ void Device::setup_tunnel_for_remote_devices() { for (uint32_t cq_id = 0; cq_id < num_hw_cqs; cq_id++) { tt_cxy_pair dispatch_location = dispatch_core_manager::instance().dispatcher_core(device_id, channel, cq_id); - settings.worker_physical_core = tt_cxy_pair(dispatch_location.chip, get_physical_core_coordinate(dispatch_location, dispatch_core_type)); + settings.worker_virtual_core = tt_cxy_pair(dispatch_location.chip, this->virtual_core_from_logical_core(dispatch_location, dispatch_core_type)); settings.kernel_file = "tt_metal/impl/dispatch/kernels/cq_dispatch.cpp"; //dispatch needs one semaphore. 
settings.semaphores.push_back(0); @@ -1935,7 +1954,7 @@ void Device::setup_tunnel_for_remote_devices() { //N300, T3K 1, 2 CQ case settings.semaphores = std::vector(num_prefetchers); tt_cxy_pair mux_location = dispatch_core_manager::instance().mux_core(device_id, channel, 0); - settings.worker_physical_core = tt_cxy_pair(mux_location.chip, get_physical_core_coordinate(mux_location, dispatch_core_type)); + settings.worker_virtual_core = tt_cxy_pair(mux_location.chip, this->virtual_core_from_logical_core(mux_location, dispatch_core_type)); settings.kernel_file = "tt_metal/impl/dispatch/kernels/vc_packet_router.cpp"; settings.cb_size_bytes = dispatch_constants::get(dispatch_core_type).mux_buffer_size(num_hw_cqs); settings.cb_start_address = dispatch_constants::get(dispatch_core_type).dispatch_buffer_base(); @@ -1943,7 +1962,7 @@ void Device::setup_tunnel_for_remote_devices() { tunnel_core_allocations[MUX].push_back(std::make_tuple(mux_location, settings)); tt_cxy_pair demux_location = dispatch_core_manager::instance().demux_core(device_id, channel, 0); - settings.worker_physical_core = tt_cxy_pair(demux_location.chip, get_physical_core_coordinate(demux_location, dispatch_core_type)); + settings.worker_virtual_core = tt_cxy_pair(demux_location.chip, this->virtual_core_from_logical_core(demux_location, dispatch_core_type)); settings.kernel_file = "tt_metal/impl/dispatch/kernels/packet_demux.cpp"; settings.cb_start_address = hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::UNRESERVED); settings.cb_size_bytes = 0x10000; @@ -1952,7 +1971,7 @@ void Device::setup_tunnel_for_remote_devices() { //TG, TGG 1, 2 CQ case settings.semaphores = std::vector(MAX_SWITCH_FAN_IN); tt_cxy_pair mux_location = dispatch_core_manager::instance().mux_core(device_id, channel, 0); - settings.worker_physical_core = tt_cxy_pair(mux_location.chip, get_physical_core_coordinate(mux_location, dispatch_core_type)); + settings.worker_virtual_core = tt_cxy_pair(mux_location.chip, 
this->virtual_core_from_logical_core(mux_location, dispatch_core_type)); settings.kernel_file = "tt_metal/impl/dispatch/kernels/vc_packet_router.cpp"; settings.cb_start_address = dispatch_constants::get(dispatch_core_type).dispatch_buffer_base(); settings.cb_size_bytes = dispatch_constants::get(dispatch_core_type).mux_buffer_size(1); @@ -1960,12 +1979,12 @@ void Device::setup_tunnel_for_remote_devices() { tunnel_core_allocations[MUX].push_back(std::make_tuple(mux_location, settings)); if (num_prefetchers == 8) { tt_cxy_pair mux_location = dispatch_core_manager::instance().mux_core(device_id, channel, 1); - settings.worker_physical_core = tt_cxy_pair(mux_location.chip, get_physical_core_coordinate(mux_location, dispatch_core_type)); + settings.worker_virtual_core = tt_cxy_pair(mux_location.chip, this->virtual_core_from_logical_core(mux_location, dispatch_core_type)); tunnel_core_allocations[MUX].push_back(std::make_tuple(mux_location, settings)); } tt_cxy_pair demux_location = dispatch_core_manager::instance().demux_core(device_id, channel, 0); - settings.worker_physical_core = tt_cxy_pair(demux_location.chip, get_physical_core_coordinate(demux_location, dispatch_core_type)); + settings.worker_virtual_core = tt_cxy_pair(demux_location.chip, this->virtual_core_from_logical_core(demux_location, dispatch_core_type)); settings.semaphores.clear(); settings.kernel_file = "tt_metal/impl/dispatch/kernels/packet_demux.cpp"; settings.cb_start_address = hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::UNRESERVED); @@ -1974,14 +1993,14 @@ void Device::setup_tunnel_for_remote_devices() { settings.semaphores = std::vector(num_prefetchers / 2); demux_location = dispatch_core_manager::instance().demux_core(device_id, channel, 1); - settings.worker_physical_core = tt_cxy_pair(demux_location.chip, get_physical_core_coordinate(demux_location, dispatch_core_type)); + settings.worker_virtual_core = tt_cxy_pair(demux_location.chip, 
this->virtual_core_from_logical_core(demux_location, dispatch_core_type)); settings.kernel_file = "tt_metal/impl/dispatch/kernels/packet_demux.cpp"; settings.cb_start_address = hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::UNRESERVED); settings.cb_size_bytes = 0x10000; tunnel_core_allocations[DEMUX].push_back(std::make_tuple(demux_location, settings)); demux_location = dispatch_core_manager::instance().demux_core(device_id, channel, 2); - settings.worker_physical_core = tt_cxy_pair(demux_location.chip, get_physical_core_coordinate(demux_location, dispatch_core_type)); + settings.worker_virtual_core = tt_cxy_pair(demux_location.chip, this->virtual_core_from_logical_core(demux_location, dispatch_core_type)); settings.kernel_file = "tt_metal/impl/dispatch/kernels/packet_demux.cpp"; settings.cb_start_address = hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::UNRESERVED); settings.cb_size_bytes = 0x10000; @@ -1996,8 +2015,8 @@ void Device::setup_tunnel_for_remote_devices() { tt_cxy_pair us_location = dispatch_core_manager::instance().tunneler_core(us_device, device_id, channel, cq_id); tt_cxy_pair local_location = dispatch_core_manager::instance().us_tunneler_core_local(device_id, channel, cq_id); - settings.worker_physical_core = tt_cxy_pair(us_location.chip, get_physical_core_coordinate(us_location, CoreType::ETH)); - settings.eth_partner_physical_core = tt_cxy_pair(local_location.chip, get_physical_core_coordinate(local_location, CoreType::ETH)); + settings.worker_virtual_core = tt_cxy_pair(us_location.chip, this->virtual_core_from_logical_core(us_location, CoreType::ETH)); + settings.eth_partner_virtual_core = tt_cxy_pair(local_location.chip, this->virtual_core_from_logical_core(local_location, CoreType::ETH)); settings.kernel_file = "tt_metal/impl/dispatch/kernels/vc_eth_tunneler.cpp"; settings.cb_start_address = 0x19000; settings.cb_size_bytes = 0x4000; @@ -2007,9 +2026,9 @@ void Device::setup_tunnel_for_remote_devices() { 
settings.tunnel_stop = tunnel_stop; //swap the two etnernet link pair cores for downstream chip on the link pair. - tt_cxy_pair temp = settings.worker_physical_core; - settings.worker_physical_core = settings.eth_partner_physical_core; - settings.eth_partner_physical_core = temp; + tt_cxy_pair temp = settings.worker_virtual_core; + settings.worker_virtual_core = settings.eth_partner_virtual_core; + settings.eth_partner_virtual_core = temp; settings.kernel_file = "tt_metal/impl/dispatch/kernels/vc_eth_tunneler.cpp"; tunnel_core_allocations[US_TUNNELER_LOCAL].push_back(std::make_tuple(local_location, settings)); TT_ASSERT(us_location.chip == us_device, @@ -2021,7 +2040,7 @@ void Device::setup_tunnel_for_remote_devices() { settings.dispatch_core_type = dispatch_core_type; tt_cxy_pair mux_d_location = dispatch_core_manager::instance().mux_d_core(device_id, channel, cq_id); - settings.worker_physical_core = tt_cxy_pair(mux_d_location.chip, get_physical_core_coordinate(mux_d_location, dispatch_core_type)); + settings.worker_virtual_core = tt::Cluster::instance().get_virtual_coordinate_from_logical_coordinates(mux_d_location, dispatch_core_type); settings.kernel_file = "tt_metal/impl/dispatch/kernels/packet_mux.cpp"; settings.semaphores = std::vector(num_hw_cqs); settings.consumer_semaphore_id = 0; @@ -2033,7 +2052,7 @@ void Device::setup_tunnel_for_remote_devices() { uint32_t demux_vcs = settings.vc_count - 1; tt_cxy_pair demux_d_location = dispatch_core_manager::instance().demux_d_core(device_id, channel, 0); - settings.worker_physical_core = tt_cxy_pair(demux_d_location.chip, get_physical_core_coordinate(demux_d_location, dispatch_core_type)); + settings.worker_virtual_core = tt::Cluster::instance().get_virtual_coordinate_from_logical_coordinates(demux_d_location, dispatch_core_type); settings.kernel_file = "tt_metal/impl/dispatch/kernels/vc_packet_router.cpp"; settings.producer_semaphore_id = 0; settings.cb_start_address = 
hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::UNRESERVED); @@ -2045,7 +2064,7 @@ void Device::setup_tunnel_for_remote_devices() { if (tunnel.size() > 2 && demux_vcs > 1) { //TG/TGG 1-2 CQs demux_d_location = dispatch_core_manager::instance().demux_d_core(device_id, channel, 1); - settings.worker_physical_core = tt_cxy_pair(demux_d_location.chip, get_physical_core_coordinate(demux_d_location, dispatch_core_type)); + settings.worker_virtual_core = tt::Cluster::instance().get_virtual_coordinate_from_logical_coordinates(demux_d_location, dispatch_core_type); settings.kernel_file = "tt_metal/impl/dispatch/kernels/vc_packet_router.cpp"; settings.producer_semaphore_id = 0; settings.cb_start_address = hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::UNRESERVED); @@ -2063,7 +2082,7 @@ void Device::setup_tunnel_for_remote_devices() { settings.producer_semaphore_id = 2; settings.consumer_slave_semaphore_id = 3; tt_cxy_pair prefetch_d_location = dispatch_core_manager::instance().prefetcher_d_core(device_id, channel, cq_id); - settings.worker_physical_core = tt_cxy_pair(prefetch_d_location.chip, get_physical_core_coordinate(prefetch_d_location, dispatch_core_type)); + settings.worker_virtual_core = tt::Cluster::instance().get_virtual_coordinate_from_logical_coordinates(prefetch_d_location, dispatch_core_type); settings.kernel_file = "tt_metal/impl/dispatch/kernels/cq_prefetch.cpp"; settings.cb_start_address = dispatch_constants::get(dispatch_core_type).dispatch_buffer_base(); settings.cb_size_bytes = dispatch_constants::get(dispatch_core_type).prefetch_d_buffer_size(); @@ -2084,7 +2103,7 @@ void Device::setup_tunnel_for_remote_devices() { CoreCoord compute_grid_size = this->compute_with_storage_grid_size(); settings.num_compute_cores = uint32_t(compute_grid_size.x * compute_grid_size.y); tt_cxy_pair dispatch_d_location = dispatch_core_manager::instance().dispatcher_d_core(device_id, channel, cq_id); - settings.worker_physical_core = 
tt_cxy_pair(dispatch_d_location.chip, get_physical_core_coordinate(dispatch_d_location, dispatch_core_type)); + settings.worker_virtual_core = tt::Cluster::instance().get_virtual_coordinate_from_logical_coordinates(dispatch_d_location, dispatch_core_type); settings.kernel_file = "tt_metal/impl/dispatch/kernels/cq_dispatch.cpp"; tunnel_core_allocations[DISPATCH_D].push_back(std::make_tuple(dispatch_d_location, settings)); settings.semaphores.clear(); @@ -2107,7 +2126,8 @@ void Device::setup_tunnel_for_remote_devices() { settings.producer_semaphore_id = 0; // sync with producer (prefetcher) } tt_cxy_pair dispatch_s_location = dispatch_core_manager::instance().dispatcher_s_core(device_id, channel, cq_id); - settings.worker_physical_core = tt_cxy_pair(dispatch_s_location.chip, get_physical_core_coordinate(dispatch_s_location, dispatch_core_type)); + auto dispatch_s_virtual_coords = tt::Cluster::instance().get_virtual_coordinate_from_logical_coordinates(dispatch_s_location.chip, dispatch_s_location, dispatch_core_type); + settings.worker_virtual_core = tt::Cluster::instance().get_virtual_coordinate_from_logical_coordinates(dispatch_s_location, dispatch_core_type); settings.kernel_file = "tt_metal/impl/dispatch/kernels/cq_dispatch_slave.cpp"; tunnel_core_allocations[DISPATCH_S].push_back(std::make_tuple(dispatch_s_location, settings)); settings.semaphores.clear(); @@ -2172,7 +2192,7 @@ void Device::setup_tunnel_for_remote_devices() { { if (device_worker_variants[dwv].size()) { for (auto &[core, settings] : device_worker_variants[dwv]) { - log_debug(LogMetal, "Tunnel {} Stop {} is Device {}. Core {} - Physical {} will run {}.", t, tunnel_stop, tunnel_device, core.str(), settings.worker_physical_core.str(), magic_enum::enum_name((tt::tt_metal::DispatchWorkerType)dwv)); + log_debug(LogMetal, "Tunnel {} Stop {} is Device {}. 
Core {} - Physical {} will run {}.", t, tunnel_stop, tunnel_device, core.str(), settings.worker_virtual_core.str(), magic_enum::enum_name((tt::tt_metal::DispatchWorkerType)dwv)); for (uint32_t arg = 0; arg < settings.compile_args.size(); arg++) { log_debug(LogMetal, "CompileArgs[{}] = {}", arg, settings.compile_args[arg]); } @@ -2226,8 +2246,9 @@ void Device::compile_command_queue_programs() { CoreType dispatch_core_type = dispatch_core_config.get_core_type(); tt_cxy_pair prefetch_core = dispatch_core_manager::instance().prefetcher_core(device_id, channel, cq_id); tt_cxy_pair dispatch_core = dispatch_core_manager::instance().dispatcher_core(device_id, channel, cq_id); - CoreCoord prefetch_physical_core = get_physical_core_coordinate(prefetch_core, dispatch_core_type); - CoreCoord dispatch_physical_core = get_physical_core_coordinate(dispatch_core, dispatch_core_type); + CoreCoord prefetch_virtual_core = this->virtual_core_from_logical_core(prefetch_core, dispatch_core_type); + CoreCoord dispatch_virtual_core = this->virtual_core_from_logical_core(dispatch_core, dispatch_core_type); + uint32_t cq_start = dispatch_constants::get(dispatch_core_type).get_host_command_queue_addr(CommandQueueHostAddrType::UNRESERVED); uint32_t command_queue_start_addr = get_absolute_cq_offset(channel, cq_id, cq_size); @@ -2249,14 +2270,14 @@ void Device::compile_command_queue_programs() { // dispatch_s location and flow control vars initialized as invalid. Will be set if dispatch_s is enabled for the given configuration. 
tt_cxy_pair dispatch_s_core = tt_cxy_pair(0xff, 0xff, 0xff); - CoreCoord dispatch_s_physical_core = {0xff, 0xff}; + CoreCoord dispatch_s_virtual_core = {0xff, 0xff}; uint32_t dispatch_s_buffer_base = 0xff; uint32_t dispatch_s_sem = 0xff; // used by dispatch_s to sync with prefetch uint32_t dispatch_s_sync_sem_base_addr = dispatch_constants::get(dispatch_core_type).get_device_command_queue_addr(CommandQueueDeviceAddrType::DISPATCH_S_SYNC_SEM);; // used by dispatch_d to signal that dispatch_s can send go signal if (this->dispatch_s_enabled()) { // Skip allocating dispatch_s for multi-CQ configurations with ethernet dispatch dispatch_s_core = dispatch_core_manager::instance().dispatcher_s_core(device_id, channel, cq_id); - dispatch_s_physical_core = get_physical_core_coordinate(dispatch_s_core, dispatch_core_type); + dispatch_s_virtual_core = this->virtual_core_from_logical_core(dispatch_s_core, dispatch_core_type); uint32_t dispatch_buffer_base = dispatch_constants::get(dispatch_core_type).dispatch_buffer_base(); if (dispatch_core_type == CoreType::WORKER) { // dispatch_s is on the same Tensix core as dispatch_d. Shared resources. Offset CB start idx. 
@@ -2270,9 +2291,9 @@ void Device::compile_command_queue_programs() { } log_debug(LogDevice, "Dispatching out of {} cores", magic_enum::enum_name(dispatch_core_type)); - log_debug(LogDevice, "Prefetch HD logical location: {} physical core: {}", prefetch_core.str(), prefetch_physical_core.str()); - log_debug(LogDevice, "Dispatch HD logical location: {} physical core {}", dispatch_core.str(), dispatch_physical_core.str()); - log_debug(LogDevice, "Dispatch S logical location: {} physical core {}", dispatch_s_core.str(), dispatch_s_physical_core.str()); + log_debug(LogDevice, "Prefetch HD logical location: {} virtual core: {}", prefetch_core.str(), prefetch_virtual_core.str()); + log_debug(LogDevice, "Dispatch HD logical location: {} virtual core {}", dispatch_core.str(), dispatch_virtual_core.str()); + log_debug(LogDevice, "Dispatch S logical location: {} virtual core {}", dispatch_s_core.str(), dispatch_s_virtual_core.str()); std::vector prefetch_compile_args = { dispatch_constants::get(dispatch_core_type).dispatch_buffer_base(), @@ -2310,11 +2331,11 @@ void Device::compile_command_queue_programs() { "tt_metal/impl/dispatch/kernels/cq_prefetch.cpp", prefetch_compile_args, prefetch_core, - prefetch_physical_core, + prefetch_virtual_core, dispatch_core_type, CoreCoord{0, 0}, - dispatch_physical_core, - dispatch_s_physical_core, + dispatch_virtual_core, + dispatch_s_virtual_core, std::map {}, my_noc_index, my_noc_index, @@ -2369,11 +2390,11 @@ void Device::compile_command_queue_programs() { "tt_metal/impl/dispatch/kernels/cq_dispatch.cpp", dispatch_compile_args, dispatch_core, - dispatch_physical_core, + dispatch_virtual_core, dispatch_core_type, - prefetch_physical_core, + prefetch_virtual_core, CoreCoord{0, 0}, - dispatch_s_physical_core, + dispatch_s_virtual_core, std::map {}, my_noc_index, dispatch_upstream_noc_index, @@ -2399,10 +2420,10 @@ void Device::compile_command_queue_programs() { "tt_metal/impl/dispatch/kernels/cq_dispatch_slave.cpp", 
dispatch_s_compile_args, dispatch_s_core, - dispatch_s_physical_core, + dispatch_s_virtual_core, dispatch_core_type, - prefetch_physical_core, - dispatch_physical_core, + prefetch_virtual_core, + dispatch_virtual_core, CoreCoord{0, 0}, std::map {}, dispatch_s_noc_index, @@ -2466,7 +2487,7 @@ void Device::compile_command_queue_programs() { prefetch_settings.kernel_file, prefetch_settings.compile_args, prefetch_core, - prefetch_settings.worker_physical_core, + prefetch_settings.worker_virtual_core, prefetch_settings.dispatch_core_type, prefetch_settings.upstream_cores[0], prefetch_settings.downstream_cores[0], @@ -2558,7 +2579,7 @@ void Device::compile_command_queue_programs() { dispatch_settings.kernel_file, dispatch_settings.compile_args, dispatch_core, - dispatch_settings.worker_physical_core, + dispatch_settings.worker_virtual_core, dispatch_settings.dispatch_core_type, dispatch_settings.upstream_cores[0], CoreCoord{0xffffffff, 0xffffffff}, @@ -2647,7 +2668,7 @@ void Device::compile_command_queue_programs() { prefetch_d_settings.kernel_file, prefetch_d_settings.compile_args, prefetch_d_core, - prefetch_d_settings.worker_physical_core, + prefetch_d_settings.worker_virtual_core, prefetch_d_settings.dispatch_core_type, prefetch_d_settings.upstream_cores[0], prefetch_d_settings.downstream_cores[0], @@ -2675,7 +2696,7 @@ void Device::compile_command_queue_programs() { dispatch_d_settings.kernel_file, dispatch_d_settings.compile_args, dispatch_d_core, - dispatch_d_settings.worker_physical_core, + dispatch_d_settings.worker_virtual_core, dispatch_d_settings.dispatch_core_type, dispatch_d_settings.upstream_cores[0], dispatch_d_settings.downstream_cores[0], @@ -2698,7 +2719,7 @@ void Device::compile_command_queue_programs() { dispatch_s_settings.kernel_file, dispatch_s_settings.compile_args, dispatch_s_core, - dispatch_s_settings.worker_physical_core, + dispatch_s_settings.worker_virtual_core, dispatch_s_settings.dispatch_core_type, dispatch_s_settings.upstream_cores[0], 
dispatch_s_settings.downstream_cores[0], @@ -2919,8 +2940,8 @@ void Device::init_command_queue_device() { for (const CoreCoord &logical_dispatch_core : logical_dispatch_cores) { launch_msg_t msg = command_queue_program.kernels_on_core(logical_dispatch_core, index)->launch_msg; go_msg_t go_msg = command_queue_program.kernels_on_core(logical_dispatch_core, index)->go_msg; - CoreCoord phys_core = this->physical_core_from_logical_core(logical_dispatch_core, core_type); - tt::llrt::write_launch_msg_to_core(this->id(), phys_core, &msg, &go_msg, this->get_dev_addr(phys_core, HalL1MemAddrType::LAUNCH)); + CoreCoord virtual_core = this->virtual_core_from_logical_core(logical_dispatch_core, core_type); + tt::llrt::write_launch_msg_to_core(this->id(), virtual_core, &msg, &go_msg, this->get_dev_addr(virtual_core, HalL1MemAddrType::LAUNCH)); } } @@ -2936,8 +2957,8 @@ void Device::init_command_queue_device() { for (const CoreCoord &logical_dispatch_core : logical_dispatch_cores) { launch_msg_t msg = mmio_command_queue_program.kernels_on_core(logical_dispatch_core, index)->launch_msg; go_msg_t go_msg = mmio_command_queue_program.kernels_on_core(logical_dispatch_core, index)->go_msg; - CoreCoord phys_core = mmio_device->physical_core_from_logical_core(logical_dispatch_core, core_type); - tt::llrt::write_launch_msg_to_core(mmio_device_id, phys_core, &msg, &go_msg, mmio_device->get_dev_addr(phys_core, HalL1MemAddrType::LAUNCH)); + CoreCoord virtual_core = mmio_device->virtual_core_from_logical_core(logical_dispatch_core, core_type); + tt::llrt::write_launch_msg_to_core(mmio_device_id, virtual_core, &msg, &go_msg, mmio_device->get_dev_addr(virtual_core, HalL1MemAddrType::LAUNCH)); } } } @@ -3008,7 +3029,7 @@ bool Device::close() { std::unordered_map> not_done_dispatch_cores; std::unordered_map> cores_to_skip; - this->get_associated_dispatch_phys_cores(not_done_dispatch_cores, cores_to_skip); + this->get_associated_dispatch_virtual_cores(not_done_dispatch_cores, cores_to_skip); auto 
mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(this->id_); std::unordered_set wait_for_cores = not_done_dispatch_cores[mmio_device_id]; @@ -3037,12 +3058,12 @@ bool Device::close() { if (this->id_ != mmio_device_id) { for (auto it = not_done_dispatch_cores[mmio_device_id].begin(); it != not_done_dispatch_cores[mmio_device_id].end(); it++) { - const auto &phys_core = *it; - if(llrt::is_ethernet_core(phys_core, this->id_)) { - log_debug(tt::LogMetal, "Ethernet dispatch core {} on Device {} is idle. Closing Device {}", phys_core.str(), mmio_device_id, this->id()); + const auto &virtual_core = *it; + if(tt::Cluster::instance().is_ethernet_core(virtual_core, this->id_)) { + log_debug(tt::LogMetal, "Ethernet dispatch core {} on Device {} is idle. Closing Device {}", virtual_core.str(), mmio_device_id, this->id()); } else { - log_debug(tt::LogMetal, "Resetting core {} on Device {} when closing Device {}", phys_core.str(), mmio_device_id, this->id()); - tt::Cluster::instance().assert_risc_reset_at_core(tt_cxy_pair(mmio_device_id, phys_core)); + log_debug(tt::LogMetal, "Resetting core {} on Device {} when closing Device {}", virtual_core.str(), mmio_device_id, this->id()); + tt::Cluster::instance().assert_risc_reset_at_core(tt_cxy_pair(mmio_device_id, virtual_core)); } } } @@ -3102,15 +3123,6 @@ CoreCoord Device::dram_grid_size() const { return tt::Cluster::instance().get_soc_desc(id_).get_dram_grid_size(); } -CoreCoord Device::physical_core_from_logical_core(const CoreCoord &logical_coord, const CoreType &core_type) const { - const metal_SocDescriptor &soc_desc = tt::Cluster::instance().get_soc_desc(this->id_); - return soc_desc.get_physical_core_from_logical_core(logical_coord, core_type); -} - -CoreCoord Device::physical_core_from_logical_core(const CoreDescriptor &logical_core) const { - return physical_core_from_logical_core(logical_core.coord, logical_core.type); -} - CoreType Device::core_type_from_physical_core(const CoreCoord &physical_coord) 
const { const metal_SocDescriptor &soc_desc = tt::Cluster::instance().get_soc_desc(this->id_); if (soc_desc.physical_cores.find(physical_coord) == soc_desc.physical_cores.end()) @@ -3119,7 +3131,32 @@ CoreType Device::core_type_from_physical_core(const CoreCoord &physical_coord) c return soc_desc.physical_cores.at(physical_coord).type; } -CoreCoord Device::worker_core_from_logical_core(const CoreCoord &logical_core) const { +CoreType Device::core_type_from_virtual_core(const CoreCoord &virtual_coord) const { + if (tt::Cluster::instance().is_worker_core(virtual_coord, this->id_)) { + return CoreType::WORKER; + } else if (tt::Cluster::instance().is_ethernet_core(virtual_coord, this->id_)) { + return CoreType::ETH; + } + return this->core_type_from_physical_core(virtual_coord); +} + + +CoreCoord Device::virtual_noc_coordinate(uint8_t noc_index, CoreCoord coord) const { + if (coord.x >= this->grid_size().x || coord.y >= this->grid_size().y) { + // Coordinate already in virtual space: NOC0 and NOC1 are the same + return coord; + } else { + const auto& grid_size = this->grid_size(); + // Coordinate in Physical Space. Convert to Virtual. 
+ CoreCoord phys_coord = { + hal.noc_coordinate(noc_index, grid_size.x, coord.x), + hal.noc_coordinate(noc_index, grid_size.y, coord.y) + }; + return this->virtual_core_from_physical_core(phys_coord, this->core_type_from_physical_core(phys_coord)); + } +} + +CoreCoord Device::physical_worker_core_from_logical_core(const CoreCoord &logical_core) const { const metal_SocDescriptor &soc_desc = tt::Cluster::instance().get_soc_desc(this->id_); return soc_desc.get_physical_tensix_core_from_logical(logical_core); } @@ -3127,66 +3164,64 @@ CoreCoord Device::worker_core_from_logical_core(const CoreCoord &logical_core) c std::vector Device::worker_cores_from_logical_cores(const std::vector &logical_cores) const { std::vector worker_cores(logical_cores.size()); for (std::size_t idx = 0; idx < logical_cores.size(); idx++) - worker_cores[idx] = worker_core_from_logical_core(logical_cores[idx]); + worker_cores[idx] = this->worker_core_from_logical_core(logical_cores[idx]); return worker_cores; } -CoreCoord Device::dram_core_from_logical_core(const CoreCoord &logical_core) const { - const metal_SocDescriptor &soc_desc = tt::Cluster::instance().get_soc_desc(this->id_); - return soc_desc.get_physical_dram_core_from_logical(logical_core); +std::vector Device::ethernet_cores_from_logical_cores(const std::vector &logical_cores) const { + std::vector eth_cores(logical_cores.size()); + for (std::size_t idx = 0; idx < logical_cores.size(); idx++) { + eth_cores[idx] = this->ethernet_core_from_logical_core(logical_cores[idx]); + } + return eth_cores; } - -std::vector Device::dram_cores_from_logical_cores(const std::vector &logical_cores) const { - std::vector dram_cores(logical_cores.size()); - for (std::size_t idx = 0; idx < logical_cores.size(); idx++) - dram_cores[idx] = dram_core_from_logical_core(logical_cores[idx]); - - return dram_cores; +CoreCoord Device::virtual_core_from_logical_core(const CoreCoord &logical_coord, const CoreType& core_type) const { + return 
tt::Cluster::instance().get_virtual_coordinate_from_logical_coordinates(this->id_, logical_coord, core_type); } -CoreCoord Device::ethernet_core_from_logical_core(const CoreCoord &logical_core) const { - return tt::Cluster::instance().ethernet_core_from_logical_core(id_, logical_core); +CoreCoord Device::virtual_core_from_physical_core(const CoreCoord &physical_coord, const CoreType& core_type) const { + return tt::Cluster::instance().get_virtual_coordinate_from_physical_coordinates(this->id_, physical_coord, core_type); } -CoreCoord Device::logical_core_from_ethernet_core(const CoreCoord &physical_core) const { - const metal_SocDescriptor &soc_desc = tt::Cluster::instance().get_soc_desc(this->id_); - return soc_desc.get_logical_ethernet_core_from_physical(physical_core); +CoreCoord Device::worker_core_from_logical_core(const CoreCoord &logical_core) const { + return this->virtual_core_from_logical_core(logical_core, CoreType::WORKER); } -std::vector Device::ethernet_cores_from_logical_cores(const std::vector &logical_cores) const { - std::vector ethernet_cores(logical_cores.size()); +CoreCoord Device::ethernet_core_from_logical_core(const CoreCoord &logical_core) const { + return this->virtual_core_from_logical_core(logical_core, CoreType::ETH); +} - for (std::size_t idx = 0; idx < logical_cores.size(); idx++) - ethernet_cores[idx] = ethernet_core_from_logical_core(logical_cores[idx]); - return ethernet_cores; +CoreCoord Device::logical_core_from_ethernet_core(const CoreCoord ðernet_core) const { + return tt::Cluster::instance().get_logical_ethernet_core_from_virtual(this->id(), ethernet_core); } -uint32_t Device::get_noc_unicast_encoding(uint8_t noc_index, const CoreCoord& physical_core) const { - const auto& grid_size = this->grid_size(); +uint32_t Device::get_noc_unicast_encoding(uint8_t noc_index, const CoreCoord& core) const { + auto virtual_noc_coord = this->virtual_noc_coordinate(noc_index, core); return tt::tt_metal::hal.noc_xy_encoding( - 
tt::tt_metal::hal.noc_coordinate(noc_index, grid_size.x, physical_core.x), - tt::tt_metal::hal.noc_coordinate(noc_index, grid_size.y, physical_core.y) + virtual_noc_coord.x, + virtual_noc_coord.y ); } -uint32_t Device::get_noc_multicast_encoding(uint8_t noc_index, const CoreRange& physical_cores) const { - const auto& grid_size = this->grid_size(); +uint32_t Device::get_noc_multicast_encoding(uint8_t noc_index, const CoreRange& cores) const { + auto virtual_noc_start = this->virtual_noc_coordinate(noc_index, cores.start_coord); + auto virtual_noc_end = this->virtual_noc_coordinate(noc_index, cores.end_coord); // NOC 1 mcasts from bottom left to top right, so we need to reverse the coords if (noc_index == 0) { return tt::tt_metal::hal.noc_multicast_encoding( - tt::tt_metal::hal.noc_coordinate(noc_index, grid_size.x, physical_cores.start_coord.x), - tt::tt_metal::hal.noc_coordinate(noc_index, grid_size.y, physical_cores.start_coord.y), - tt::tt_metal::hal.noc_coordinate(noc_index, grid_size.x, physical_cores.end_coord.x), - tt::tt_metal::hal.noc_coordinate(noc_index, grid_size.y, physical_cores.end_coord.y) + virtual_noc_start.x, + virtual_noc_start.y, + virtual_noc_end.x, + virtual_noc_end.y ); } else { return tt::tt_metal::hal.noc_multicast_encoding( - tt::tt_metal::hal.noc_coordinate(noc_index, grid_size.x, physical_cores.end_coord.x), - tt::tt_metal::hal.noc_coordinate(noc_index, grid_size.y, physical_cores.end_coord.y), - tt::tt_metal::hal.noc_coordinate(noc_index, grid_size.x, physical_cores.start_coord.x), - tt::tt_metal::hal.noc_coordinate(noc_index, grid_size.y, physical_cores.start_coord.y) + virtual_noc_end.x, + virtual_noc_end.y, + virtual_noc_start.x, + virtual_noc_start.y ); } } @@ -3613,8 +3648,9 @@ void Device::generate_device_bank_to_noc_tables() l1_bank_to_noc_xy_.reserve(tt::tt_metal::hal.get_num_nocs() * l1_noc_coord_per_bank.size()); for (unsigned int noc = 0; noc < tt::tt_metal::hal.get_num_nocs(); noc++) { for (unsigned int bank_id = 0; bank_id 
< l1_noc_coord_per_bank.size(); bank_id++) { - uint16_t noc_x = tt::tt_metal::hal.noc_coordinate(noc, soc_d.grid_size.x, l1_noc_coord_per_bank[bank_id].x); - uint16_t noc_y = tt::tt_metal::hal.noc_coordinate(noc, soc_d.grid_size.y, l1_noc_coord_per_bank[bank_id].y); + auto l1_noc_coords = this->virtual_noc_coordinate(noc, l1_noc_coord_per_bank[bank_id]); + uint16_t noc_x = l1_noc_coords.x; + uint16_t noc_y = l1_noc_coords.y; uint16_t xy = ((noc_y << NOC_ADDR_NODE_ID_BITS) | noc_x) << NOC_COORD_REG_OFFSET; l1_bank_to_noc_xy_.push_back(xy); } @@ -3706,6 +3742,60 @@ const std::vector &Device::get_sub_device_ids() const { return this->active_sub_device_manager_->get_sub_device_ids(); } +std::vector Device::get_optimal_dram_bank_to_logical_worker_assignment() { + // Top level function that users (ex: Op Writers) can use to assign Tensix Worker cores + // as DRAM readers or writers. Returns logical coordinates of optimally placed workers. + // This function queries Physical Coordinates (only exposed directly to the Device class) + // and passes them to logic in core_assignment.cpp to derive the most optimal core placement + // based on architecture specific logic and Physical Grid configuration. 
+ if (not this->optimal_dram_bank_to_logical_worker_assignment_.size()) { + uint32_t full_grid_size_x = this->grid_size().x; + uint32_t full_grid_size_y = this->grid_size().y; + + auto compute_with_storage_grid_size = this->compute_with_storage_grid_size(); + uint32_t num_cores_x = compute_with_storage_grid_size.x; + uint32_t num_cores_y = compute_with_storage_grid_size.y; + // Get physical coordinates of DRAM Controller NOC end-points + uint32_t num_dram_banks = this->num_dram_channels(); + std::vector dram_phy_coords; + for (int i = 0; i < num_dram_banks; ++i) { + dram_phy_coords.push_back(dram_core_from_dram_channel(i)); + } + // Get all logical cores in the worker grid + std::vector all_worker_cores_logical; + for (int i = 0; i < num_cores_x; ++i) { + for (int j = 0; j < num_cores_y; ++j) { + all_worker_cores_logical.push_back(CoreCoord(i, j)); + } + } + // Get the physical rows and cols (y, x) in the worker grid + std::vector worker_phy_y = std::vector(num_cores_y); + for (int i = 0; i < num_cores_y; ++i) { + auto core_phy = this->physical_worker_core_from_logical_core(CoreCoord(0, i)); + worker_phy_y.at(i) = core_phy.y; + } + std::vector worker_phy_x = std::vector(num_cores_x); + for (int i = 0; i < num_cores_x; ++i) { + auto core_phy = this->physical_worker_core_from_logical_core(CoreCoord(i, 0)); + worker_phy_x.push_back(core_phy.x); + } + // Get optimal placement of worker cores interfacing with DRAM Controllers in physical coordinate space + auto physical_worker_cores = get_optimal_dram_to_physical_worker_assignment(this->arch(), dram_phy_coords, full_grid_size_x, full_grid_size_y, worker_phy_x, worker_phy_y); + // Convert to physical worker coordinates to logical. This gets returned to the user. 
+ for (int i = 0; i < physical_worker_cores.size(); ++i) { + for (int j = 0; j < all_worker_cores_logical.size(); ++j) { + auto core = this->physical_worker_core_from_logical_core(all_worker_cores_logical[j]); + if (physical_worker_cores[i] == core) { + this->optimal_dram_bank_to_logical_worker_assignment_.push_back(all_worker_cores_logical[j]); + } + } + } + } + return this->optimal_dram_bank_to_logical_worker_assignment_; +} + + + size_t v1::GetNumAvailableDevices() { return tt::Cluster::instance().number_of_user_devices(); } size_t v1::GetNumPCIeDevices() { return tt::Cluster::instance().number_of_pci_devices(); } diff --git a/tt_metal/impl/device/device.hpp b/tt_metal/impl/device/device.hpp index 616a831e0462..f2ef56a3a830 100644 --- a/tt_metal/impl/device/device.hpp +++ b/tt_metal/impl/device/device.hpp @@ -64,6 +64,12 @@ class Device { private: static_assert(detail::SubDeviceManager::MAX_NUM_SUB_DEVICES <= dispatch_constants::DISPATCH_MESSAGE_ENTRIES, "MAX_NUM_SUB_DEVICES must be less than or equal to dispatch_constants::DISPATCH_MESSAGE_ENTRIES"); static constexpr uint32_t DEFAULT_NUM_SUB_DEVICES = 1; + + CoreCoord physical_worker_core_from_logical_core(const CoreCoord &logical_core) const; + CoreCoord dram_core_from_dram_channel(uint32_t dram_channel) const; + CoreType core_type_from_physical_core(const CoreCoord &physical_core) const; + CoreCoord virtual_core_from_physical_core(const CoreCoord &physical_coord, const CoreType& core_type) const; + public: // friend void tt_gdb(Device* device, int chip_id, const vector cores, vector ops); Device () = delete; @@ -109,22 +115,21 @@ class Device { CoreCoord dram_grid_size() const; - CoreCoord physical_core_from_logical_core(const CoreCoord &logical_core, const CoreType &core_type) const; - CoreCoord physical_core_from_logical_core(const CoreDescriptor &logical_core) const; - CoreType core_type_from_physical_core(const CoreCoord &physical_core) const; + CoreType core_type_from_virtual_core(const CoreCoord& 
virtual_coord) const; + + CoreCoord virtual_noc_coordinate(uint8_t noc_index, CoreCoord coord) const; - CoreCoord worker_core_from_logical_core(const CoreCoord &logical_core) const; std::vector worker_cores_from_logical_cores(const std::vector &logical_cores) const; + std::vector ethernet_cores_from_logical_cores(const std::vector &logical_cores) const; + std::vector get_optimal_dram_bank_to_logical_worker_assignment(); - CoreCoord dram_core_from_logical_core(const CoreCoord &logical_core) const; - std::vector dram_cores_from_logical_cores(const std::vector &logical_cores) const; + CoreCoord virtual_core_from_logical_core(const CoreCoord &logical_coord, const CoreType& core_type) const; + + CoreCoord worker_core_from_logical_core(const CoreCoord &logical_core) const; // Ethernet API CoreCoord ethernet_core_from_logical_core(const CoreCoord &logical_core) const; - CoreCoord logical_core_from_ethernet_core(const CoreCoord &physical_core) const; - - std::vector ethernet_cores_from_logical_cores(const std::vector &logical_cores) const; - std::vector get_noc_encoding_for_active_eth_cores(NOC noc_index); + CoreCoord logical_core_from_ethernet_core(const CoreCoord ðernet_core) const; std::unordered_set get_ethernet_connected_device_ids() const { return tt::Cluster::instance().get_ethernet_connected_device_ids(this->id_); @@ -167,7 +172,6 @@ class Device { uint32_t dram_channel_from_bank_id(uint32_t bank_id) const; uint32_t dram_channel_from_bank_id(uint32_t bank_id, SubDeviceId sub_device_id) const; - CoreCoord dram_core_from_dram_channel(uint32_t dram_channel) const; CoreCoord logical_core_from_dram_channel(uint32_t dram_channel) const; uint32_t dram_channel_from_logical_core(const CoreCoord& logical_core) const; @@ -210,8 +214,9 @@ class Device { // core.y represents different channels along one const std::set ðernet_cores() const { return this->ethernet_cores_; } - uint32_t get_noc_unicast_encoding(uint8_t noc_index, const CoreCoord& physical_core) const; - uint32_t 
get_noc_multicast_encoding(uint8_t noc_index, const CoreRange& physical_cores) const; + + uint32_t get_noc_unicast_encoding(uint8_t noc_index, const CoreCoord& core) const; + uint32_t get_noc_multicast_encoding(uint8_t noc_index, const CoreRange& cores) const; const std::unordered_set &get_allocated_buffers() const; const std::unordered_set &get_allocated_buffers(SubDeviceId sub_device_id) const; @@ -259,19 +264,19 @@ class Device { void initialize_build(); void initialize_device_kernel_defines(); void build_firmware(); - void initialize_device_bank_to_noc_tables(const HalProgrammableCoreType &core_type, CoreCoord phys_core); - void initialize_firmware(const HalProgrammableCoreType &core_type, CoreCoord phys_core, launch_msg_t *launch_msg, go_msg_t* go_msg); + void initialize_device_bank_to_noc_tables(const HalProgrammableCoreType &core_type, CoreCoord virtual_core); + void initialize_firmware(const HalProgrammableCoreType &core_type, CoreCoord virtual_core, launch_msg_t *launch_msg, go_msg_t* go_msg); void reset_cores(); void initialize_and_launch_firmware(); void init_command_queue_host(); void init_command_queue_device(); void initialize_synchronous_sw_cmd_queue(); - void configure_kernel_variant(Program& program, const string& path, const std::vector& compile_args, CoreCoord kernel_core, CoreCoord Kernel_physical_core, - CoreType dispatch_core_type, CoreCoord upstream_physical_core, CoreCoord downstream_physical_core, CoreCoord downstream_slave_physical_core, std::map defines_in, NOC my_noc_index, NOC upstream_noc_index, NOC downstream_noc_index, bool is_active_eth_core = false, bool send_to_brisc = false, bool force_watcher_no_inline = false); + void configure_kernel_variant(Program& program, const string& path, const std::vector& compile_args, CoreCoord kernel_core, CoreCoord kernel_virtual_core, + CoreType dispatch_core_type, CoreCoord upstream_virtual_core, CoreCoord downstream_virtual_core, CoreCoord downstream_slave_virtual_core, std::map defines_in, NOC 
my_noc_index, NOC upstream_noc_index, NOC downstream_noc_index, bool is_active_eth_core = false, bool send_to_brisc = false, bool force_watcher_no_inline = false); void compile_command_queue_programs(); void configure_command_queue_programs(); void clear_l1_state(); - void get_associated_dispatch_phys_cores( + void get_associated_dispatch_virtual_cores( std::unordered_map> &my_dispatch_cores, std::unordered_map> &other_dispatch_cores); std::pair build_processor_type_to_index(uint32_t programmable_core, uint32_t processor_class) const; @@ -315,7 +320,7 @@ class Device { std::set compute_cores_; std::set storage_only_cores_; std::set ethernet_cores_; - + std::vector optimal_dram_bank_to_logical_worker_assignment_; // SystemMemoryManager is the interface to the hardware command queue std::vector> hw_command_queues_; std::vector> sw_command_queues_; @@ -354,9 +359,9 @@ class Device { uint32_t trace_buffers_size = 0; void update_dispatch_cores_for_multi_cq_eth_dispatch(); - HalProgrammableCoreType get_programmable_core_type(CoreCoord phys_core) const; + HalProgrammableCoreType get_programmable_core_type(CoreCoord virtual_core) const; template - T get_dev_addr(CoreCoord phys_core, HalL1MemAddrType addr_type) const; + T get_dev_addr(CoreCoord virtual_core, HalL1MemAddrType addr_type) const; // Returns address where allocator starts allocating buffer template T get_base_allocator_addr(const HalMemType &mem_type) const; @@ -406,12 +411,12 @@ class Device { } // namespace v0 -inline HalProgrammableCoreType Device::get_programmable_core_type(CoreCoord phys_core) const { +inline HalProgrammableCoreType Device::get_programmable_core_type(CoreCoord virtual_core) const { HalProgrammableCoreType programmable_core_type = HalProgrammableCoreType::TENSIX; - if (tt::llrt::is_ethernet_core(phys_core, this->id_)) { + if (tt::Cluster::instance().is_ethernet_core(virtual_core, this->id_)) { // Eth pcores have a different address, but only active ones. 
- CoreCoord logical_core = this->logical_core_from_ethernet_core(phys_core); + CoreCoord logical_core = this->logical_core_from_ethernet_core(virtual_core); if (this->is_active_ethernet_core(logical_core)) { programmable_core_type = HalProgrammableCoreType::ACTIVE_ETH; } else { @@ -423,8 +428,8 @@ inline HalProgrammableCoreType Device::get_programmable_core_type(CoreCoord phys } template -inline T Device::get_dev_addr(CoreCoord phys_core, HalL1MemAddrType addr_type) const { - return hal.get_dev_addr(this->get_programmable_core_type(phys_core), addr_type); +inline T Device::get_dev_addr(CoreCoord virtual_core, HalL1MemAddrType addr_type) const { + return hal.get_dev_addr(this->get_programmable_core_type(virtual_core), addr_type); } template @@ -446,11 +451,11 @@ std::vector> Device::extract_dst_noc_mu std::vector> dst_noc_multicast_info; dst_noc_multicast_info.reserve(ranges.size()); for (const CoreRange& core_range : ranges) { - CoreCoord physical_start = this->physical_core_from_logical_core(core_range.start_coord, core_type); - CoreCoord physical_end = this->physical_core_from_logical_core(core_range.end_coord, core_type); + CoreCoord virtual_start = this->virtual_core_from_logical_core(core_range.start_coord, core_type); + CoreCoord virtual_end = this->virtual_core_from_logical_core(core_range.end_coord, core_type); uint32_t num_receivers = core_range.size(); - dst_noc_multicast_info.push_back(std::make_pair(CoreRange(physical_start, physical_end), num_receivers)); + dst_noc_multicast_info.push_back(std::make_pair(CoreRange(virtual_start, virtual_end), num_receivers)); } return dst_noc_multicast_info; } diff --git a/tt_metal/impl/dispatch/command_queue.cpp b/tt_metal/impl/dispatch/command_queue.cpp index e0ef8b96cfc4..d677a362cd72 100644 --- a/tt_metal/impl/dispatch/command_queue.cpp +++ b/tt_metal/impl/dispatch/command_queue.cpp @@ -100,10 +100,10 @@ void EnqueueReadInterleavedBufferCommand::add_prefetch_relay(HugepageDeviceComma void 
EnqueueReadShardedBufferCommand::add_prefetch_relay(HugepageDeviceCommand& command) { uint32_t padded_page_size = this->buffer.aligned_page_size(); - const CoreCoord physical_core = - this->buffer.device()->physical_core_from_logical_core(this->core, this->buffer.core_type()); + const CoreCoord virtual_core = + this->buffer.device()->virtual_core_from_logical_core(this->core, this->buffer.core_type()); command.add_prefetch_relay_linear( - this->device->get_noc_unicast_encoding(this->noc_index, physical_core), + this->device->get_noc_unicast_encoding(this->noc_index, virtual_core), padded_page_size * this->pages_to_read, this->bank_base_address); } @@ -240,13 +240,13 @@ void EnqueueWriteInterleavedBufferCommand::add_buffer_data(HugepageDeviceCommand void EnqueueWriteShardedBufferCommand::add_dispatch_write(HugepageDeviceCommand& command_sequence) { uint32_t data_size_bytes = this->pages_to_write * this->padded_page_size; - const CoreCoord physical_core = - this->buffer.device()->physical_core_from_logical_core(this->core, this->buffer.core_type()); + const CoreCoord virtual_core = + this->buffer.device()->virtual_core_from_logical_core(this->core, this->buffer.core_type()); bool flush_prefetch = true; command_sequence.add_dispatch_write_linear( flush_prefetch, 0, - this->device->get_noc_unicast_encoding(this->noc_index, physical_core), + this->device->get_noc_unicast_encoding(this->noc_index, virtual_core), this->bank_base_address, data_size_bytes); } @@ -656,10 +656,9 @@ void EnqueueProgramCommand::assemble_runtime_args_commands(ProgramCommandSequenc } } } - - CoreCoord physical_core = device->physical_core_from_logical_core(core_coord, core_type); + CoreCoord virtual_core = device->virtual_core_from_logical_core(core_coord, core_type); unique_sub_cmds.emplace_back(CQDispatchWritePackedUnicastSubCmd{ - .noc_xy_addr = this->device->get_noc_unicast_encoding(this->noc_index, physical_core)}); + .noc_xy_addr = this->device->get_noc_unicast_encoding(this->noc_index, 
virtual_core)}); } } } @@ -720,9 +719,9 @@ void EnqueueProgramCommand::assemble_runtime_args_commands(ProgramCommandSequenc unicast_sub_cmd.reserve(kernel->logical_cores().size()); for (auto& core_coord : kernel->logical_cores()) { // can make a vector of unicast encodings here - CoreCoord physical_core = device->ethernet_core_from_logical_core(core_coord); + CoreCoord virtual_core_coords = device->virtual_core_from_logical_core(core_coord, CoreType::ETH); unicast_sub_cmd.emplace_back(CQDispatchWritePackedUnicastSubCmd{ - .noc_xy_addr = this->device->get_noc_unicast_encoding(this->noc_index, physical_core)}); + .noc_xy_addr = this->device->get_noc_unicast_encoding(this->noc_index, virtual_core_coords)}); } } else { std::vector> dst_noc_multicast_info = @@ -890,8 +889,8 @@ void EnqueueProgramCommand::assemble_device_commands( uint32_t max_overall_index = 0; uint32_t remote_offset_index = program.get_program_config(index).local_cb_size / sizeof(uint32_t); for (const CoreRange& core_range : circular_buffers_unique_coreranges) { - const CoreCoord physical_start = device->worker_core_from_logical_core(core_range.start_coord); - const CoreCoord physical_end = device->worker_core_from_logical_core(core_range.end_coord); + const CoreCoord virtual_start = device->virtual_core_from_logical_core(core_range.start_coord, CoreType::WORKER); + const CoreCoord virtual_end = device->virtual_core_from_logical_core(core_range.end_coord, CoreType::WORKER); const uint32_t num_receivers = core_range.size(); auto& cb_config_payload = cb_config_payloads[i]; @@ -924,7 +923,7 @@ void EnqueueProgramCommand::assemble_device_commands( } multicast_cb_config_sub_cmds.emplace_back(CQDispatchWritePackedMulticastSubCmd{ .noc_xy_addr = this->device->get_noc_multicast_encoding( - this->noc_index, CoreRange(physical_start, physical_end)), + this->noc_index, CoreRange(virtual_start, virtual_end)), .num_mcast_dests = (uint32_t)core_range.size()}); 
multicast_cb_config_data.emplace_back(cb_config_payload.data(), max_index * sizeof(uint32_t)); max_overall_index = std::max(max_overall_index, max_index); @@ -1089,14 +1088,12 @@ void EnqueueProgramCommand::assemble_device_commands( kernel_group.launch_msg.kernel_config.host_assigned_id = program.get_runtime_id(); const void* launch_message_data = (const void*)(&kernel_group.launch_msg); for (const CoreRange& core_range : kernel_group.core_ranges.ranges()) { - CoreCoord physical_start = - device->physical_core_from_logical_core(core_range.start_coord, kernel_group.get_core_type()); - CoreCoord physical_end = - device->physical_core_from_logical_core(core_range.end_coord, kernel_group.get_core_type()); + CoreCoord virtual_start = device->virtual_core_from_logical_core(core_range.start_coord, kernel_group.get_core_type()); + CoreCoord virtual_end = device->virtual_core_from_logical_core(core_range.end_coord, kernel_group.get_core_type()); multicast_go_signal_sub_cmds.emplace_back(CQDispatchWritePackedMulticastSubCmd{ .noc_xy_addr = this->device->get_noc_multicast_encoding( - this->noc_index, CoreRange(physical_start, physical_end)), + this->noc_index, CoreRange(virtual_start, virtual_end)), .num_mcast_dests = (uint32_t)core_range.size()}); multicast_go_signal_data.emplace_back(launch_message_data, go_signal_sizeB); } @@ -1123,11 +1120,11 @@ void EnqueueProgramCommand::assemble_device_commands( for (const CoreRange& core_range : kernel_group.core_ranges.ranges()) { for (auto x = core_range.start_coord.x; x <= core_range.end_coord.x; x++) { for (auto y = core_range.start_coord.y; y <= core_range.end_coord.y; y++) { - CoreCoord physical_coord = device->physical_core_from_logical_core( + CoreCoord virtual_coord = device->virtual_core_from_logical_core( CoreCoord({x, y}), kernel_group.get_core_type()); unicast_go_signal_sub_cmds.emplace_back(CQDispatchWritePackedUnicastSubCmd{ .noc_xy_addr = - this->device->get_noc_unicast_encoding(this->noc_index, physical_coord)}); + 
this->device->get_noc_unicast_encoding(this->noc_index, virtual_coord)}); unicast_go_signal_data.emplace_back(launch_message_data, go_signal_sizeB); } } @@ -1738,9 +1735,9 @@ void EnqueueRecordEventCommand::process() { dispatch_location = dispatch_core_manager::instance().dispatcher_d_core(this->device->id(), channel, cq_id); } - CoreCoord dispatch_physical_core = get_physical_core_coordinate(dispatch_location, core_type); + CoreCoord dispatch_virtual_core = this->device->virtual_core_from_logical_core(dispatch_location, core_type); unicast_sub_cmds[cq_id] = CQDispatchWritePackedUnicastSubCmd{ - .noc_xy_addr = this->device->get_noc_unicast_encoding(this->noc_index, dispatch_physical_core)}; + .noc_xy_addr = this->device->get_noc_unicast_encoding(this->noc_index, dispatch_virtual_core)}; event_payloads[cq_id] = {event_payload.data(), event_payload.size() * sizeof(uint32_t)}; } @@ -1978,8 +1975,8 @@ HWCommandQueue::HWCommandQueue(Device* device, uint32_t id, NOC noc_index) : enqueue_program_dispatch_core = dispatch_core_manager::instance().dispatcher_d_core(device->id(), channel, id); } } - this->physical_enqueue_program_dispatch_core = - device->physical_core_from_logical_core(enqueue_program_dispatch_core, core_type); + this->virtual_enqueue_program_dispatch_core = + device->virtual_core_from_logical_core(enqueue_program_dispatch_core, core_type); tt_cxy_pair completion_q_writer_location = dispatch_core_manager::instance().completion_queue_writer_core(device->id(), channel, this->id); @@ -2057,8 +2054,8 @@ void HWCommandQueue::reset_worker_state(bool reset_launch_msg_state) { } go_msg_t reset_launch_message_read_ptr_go_signal; reset_launch_message_read_ptr_go_signal.signal = RUN_MSG_RESET_READ_PTR; - reset_launch_message_read_ptr_go_signal.master_x = (uint8_t)this->physical_enqueue_program_dispatch_core.x; - reset_launch_message_read_ptr_go_signal.master_y = (uint8_t)this->physical_enqueue_program_dispatch_core.y; + reset_launch_message_read_ptr_go_signal.master_x 
= (uint8_t)this->virtual_enqueue_program_dispatch_core.x; + reset_launch_message_read_ptr_go_signal.master_y = (uint8_t)this->virtual_enqueue_program_dispatch_core.y; for (uint8_t i = 0; i < num_sub_devices; ++i) { reset_launch_message_read_ptr_go_signal.dispatch_message_offset = (uint8_t)dispatch_constants::get(dispatch_core_type).get_dispatch_message_offset(i); uint32_t dispatch_message_addr = dispatch_message_base_addr + dispatch_constants::get(dispatch_core_type).get_dispatch_message_offset(i); @@ -2534,7 +2531,7 @@ void HWCommandQueue::enqueue_program(Program& program, bool blocking) { this->device, this->noc_index, program, - this->physical_enqueue_program_dispatch_core, + this->virtual_enqueue_program_dispatch_core, this->manager, this->get_config_buffer_mgr(sub_device_index), expected_workers_completed, @@ -2627,7 +2624,7 @@ void HWCommandQueue::enqueue_trace(const uint32_t trace_id, bool blocking) { auto trace_inst = this->device->get_trace(trace_id); auto command = EnqueueTraceCommand( - this->id, this->device, this->manager, trace_inst->desc, *trace_inst->buffer, this->expected_num_workers_completed, this->noc_index, this->physical_enqueue_program_dispatch_core); + this->id, this->device, this->manager, trace_inst->desc, *trace_inst->buffer, this->expected_num_workers_completed, this->noc_index, this->virtual_enqueue_program_dispatch_core); this->enqueue_command(command, false, {}); diff --git a/tt_metal/impl/dispatch/command_queue.hpp b/tt_metal/impl/dispatch/command_queue.hpp index 2ef1e886def1..661e3d6d4992 100644 --- a/tt_metal/impl/dispatch/command_queue.hpp +++ b/tt_metal/impl/dispatch/command_queue.hpp @@ -516,7 +516,7 @@ class HWCommandQueue { ~HWCommandQueue(); - CoreCoord physical_enqueue_program_dispatch_core; + CoreCoord virtual_enqueue_program_dispatch_core; CoreCoord completion_queue_writer_core; NOC noc_index; volatile bool is_dprint_server_hung(); diff --git a/tt_metal/impl/dispatch/command_queue_interface.hpp 
b/tt_metal/impl/dispatch/command_queue_interface.hpp index 4c1c3d231b7a..8204be59bb8f 100644 --- a/tt_metal/impl/dispatch/command_queue_interface.hpp +++ b/tt_metal/impl/dispatch/command_queue_interface.hpp @@ -479,19 +479,25 @@ class SystemMemoryManager { for (uint8_t cq_id = 0; cq_id < num_hw_cqs; cq_id++) { tt_cxy_pair prefetcher_core = tt::tt_metal::dispatch_core_manager::instance().prefetcher_core(device_id, channel, cq_id); - tt_cxy_pair prefetcher_physical_core = - tt_cxy_pair(prefetcher_core.chip, tt::get_physical_core_coordinate(prefetcher_core, core_type)); - this->prefetcher_cores[cq_id] = prefetcher_physical_core; + auto prefetcher_virtual = tt::Cluster::instance().get_virtual_coordinate_from_logical_coordinates(prefetcher_core.chip, CoreCoord(prefetcher_core.x, prefetcher_core.y), core_type); + this->prefetcher_cores[cq_id] = tt_cxy_pair(prefetcher_core.chip, prefetcher_virtual.x, prefetcher_virtual.y); this->prefetch_q_writers.emplace_back( - tt::Cluster::instance().get_static_tlb_writer(prefetcher_physical_core)); + tt::Cluster::instance().get_static_tlb_writer(this->prefetcher_cores[cq_id])); tt_cxy_pair completion_queue_writer_core = tt::tt_metal::dispatch_core_manager::instance().completion_queue_writer_core(device_id, channel, cq_id); + auto completion_queue_writer_virtual = + tt::Cluster::instance().get_virtual_coordinate_from_logical_coordinates( + completion_queue_writer_core.chip, + CoreCoord(completion_queue_writer_core.x, completion_queue_writer_core.y), + core_type); + const std::tuple completion_interface_tlb_data = tt::Cluster::instance() .get_tlb_data(tt_cxy_pair( completion_queue_writer_core.chip, - tt::get_physical_core_coordinate(completion_queue_writer_core, core_type))) + completion_queue_writer_virtual.x, + completion_queue_writer_virtual.y)) .value(); auto [completion_tlb_offset, completion_tlb_size] = completion_interface_tlb_data; this->completion_byte_addrs[cq_id] = completion_tlb_offset + completion_q_rd_ptr % 
completion_tlb_size; diff --git a/tt_metal/impl/dispatch/dispatch_core_manager.hpp b/tt_metal/impl/dispatch/dispatch_core_manager.hpp index db6118dffec7..3425388d9fb3 100644 --- a/tt_metal/impl/dispatch/dispatch_core_manager.hpp +++ b/tt_metal/impl/dispatch/dispatch_core_manager.hpp @@ -28,8 +28,8 @@ struct dispatch_worker_build_settings_t{ std::vector compile_args; std::vector upstream_cores; std::vector downstream_cores; - tt_cxy_pair worker_physical_core; - tt_cxy_pair eth_partner_physical_core; + tt_cxy_pair worker_virtual_core; + tt_cxy_pair eth_partner_virtual_core; CoreType dispatch_core_type; uint32_t command_queue_start_addr; uint32_t issue_queue_start_addr; diff --git a/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp b/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp index 49877dcf9ae1..647a6b484027 100644 --- a/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp +++ b/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp @@ -60,7 +60,8 @@ constexpr uint32_t downstream_noc_xy = uint32_t(NOC_XY_ENCODING(DOWNSTREAM_NOC_X constexpr uint32_t dispatch_s_noc_xy = uint32_t(NOC_XY_ENCODING(DOWNSTREAM_SLAVE_NOC_X, DOWNSTREAM_SLAVE_NOC_Y)); constexpr uint8_t my_noc_index = NOC_INDEX; constexpr uint32_t my_noc_xy = uint32_t(NOC_XY_ENCODING(MY_NOC_X, MY_NOC_Y)); -constexpr uint64_t pcie_noc_xy = uint64_t(NOC_XY_PCIE_ENCODING(NOC_X(PCIE_NOC_X), NOC_Y(PCIE_NOC_Y))); +constexpr uint64_t pcie_noc_xy = + uint64_t(NOC_XY_PCIE_ENCODING(NOC_X_PHYS_COORD(PCIE_NOC_X), NOC_Y_PHYS_COORD(PCIE_NOC_Y))); constexpr uint32_t dispatch_cb_page_size = 1 << dispatch_cb_log_page_size; constexpr uint32_t completion_queue_end_addr = completion_queue_base_addr + completion_queue_size; @@ -947,7 +948,7 @@ static inline bool process_cmd_d( switch (cmd->base.cmd_id) { case CQ_DISPATCH_CMD_WRITE_LINEAR: WAYPOINT("DWB"); - DPRINT << "cmd_write\n"; + DPRINT << "cmd_write_linear\n"; process_write(block_noc_writes_to_clear, block_next_start_addr); WAYPOINT("DWD"); break; diff --git 
a/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp b/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp index 4e4d7ce297cc..02cbbc964401 100644 --- a/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp +++ b/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp @@ -69,7 +69,8 @@ constexpr uint32_t my_noc_xy = uint32_t(NOC_XY_ENCODING(MY_NOC_X, MY_NOC_Y)); constexpr uint32_t upstream_noc_xy = uint32_t(NOC_XY_ENCODING(UPSTREAM_NOC_X, UPSTREAM_NOC_Y)); constexpr uint32_t downstream_noc_xy = uint32_t(NOC_XY_ENCODING(DOWNSTREAM_NOC_X, DOWNSTREAM_NOC_Y)); constexpr uint32_t dispatch_s_noc_xy = uint32_t(NOC_XY_ENCODING(DOWNSTREAM_SLAVE_NOC_X, DOWNSTREAM_SLAVE_NOC_Y)); -constexpr uint64_t pcie_noc_xy = uint64_t(NOC_XY_PCIE_ENCODING(NOC_X(PCIE_NOC_X), NOC_Y(PCIE_NOC_Y))); +constexpr uint64_t pcie_noc_xy = + uint64_t(NOC_XY_PCIE_ENCODING(NOC_X_PHYS_COORD(PCIE_NOC_X), NOC_Y_PHYS_COORD(PCIE_NOC_Y))); constexpr uint32_t downstream_cb_page_size = 1 << downstream_cb_log_page_size; constexpr uint32_t dispatch_s_cb_page_size = 1 << dispatch_s_cb_log_page_size; constexpr uint32_t downstream_cb_end = downstream_cb_base + (1 << downstream_cb_log_page_size) * downstream_cb_pages; diff --git a/tt_metal/impl/program/program.cpp b/tt_metal/impl/program/program.cpp index 216ffcae5b32..e76abaf56513 100644 --- a/tt_metal/impl/program/program.cpp +++ b/tt_metal/impl/program/program.cpp @@ -845,7 +845,7 @@ void detail::Program_::init_semaphores(const Device &device, const CoreCoord &lo for (auto semaphore : semaphores_on_core) { llrt::write_hex_vec_to_core( device.id(), - device.physical_core_from_logical_core(logical_core, core_type), + device.virtual_core_from_logical_core(logical_core, core_type), std::vector{semaphore.get().initial_value()}, addr + semaphore.get().offset()); } @@ -991,8 +991,8 @@ void detail::Program_::populate_dispatch_data(Device *device) { for (const CoreRange &core_range : ranges) { for (auto x = core_range.start_coord.x; x <= core_range.end_coord.x; x++) { for (auto y = 
core_range.start_coord.y; y <= core_range.end_coord.y; y++) { - CoreCoord physical_coord = device->physical_core_from_logical_core(CoreCoord({x, y}), core_type); - dst_noc_unicast_info.push_back(std::make_pair(physical_coord, /*num_mcast_dests=*/0)); + CoreCoord virtual_coord = device->virtual_core_from_logical_core(CoreCoord({x, y}), core_type); + dst_noc_unicast_info.push_back(std::make_pair(virtual_coord, /*num_mcast_dests=*/0)); } } } @@ -1559,7 +1559,6 @@ void detail::Program_::compile(Device *device, bool fd_bootloader_mode) { TT_FATAL(not on_dispatch_core, "Illegal kernel placement for {}, Kernels cannot be placed on dispatch cores!", kernel->name()); } }; - for (auto & kernels : kernels_) { for (auto &[id, kernel] : kernels) { validate_kernel_placement(kernel); @@ -1629,8 +1628,8 @@ void Program::set_runtime_id(uint64_t id) { pimpl_->set_runtime_id(id); } uint32_t detail::Program_::get_sem_base_addr(Device *device, CoreCoord logical_core, CoreType core_type) { - CoreCoord phys_core = device->physical_core_from_logical_core(logical_core, core_type); - HalProgrammableCoreType programmable_core_type = device->get_programmable_core_type(phys_core); + CoreCoord virtual_core = device->virtual_core_from_logical_core(logical_core, core_type); + HalProgrammableCoreType programmable_core_type = device->get_programmable_core_type(virtual_core); uint32_t index = hal.get_programmable_core_type_index(programmable_core_type); const auto &sub_device_ids = this->determine_sub_device_ids(device); // TODO: This restriction can be lifted once we have support for programs spanning multiple sub-devices @@ -1651,8 +1650,8 @@ uint32_t Program::get_sem_base_addr(Device *device, CoreCoord logical_core, Core uint32_t detail::Program_::get_cb_base_addr(Device *device, CoreCoord logical_core, CoreType core_type) { - CoreCoord phys_core = device->physical_core_from_logical_core(logical_core, core_type); - HalProgrammableCoreType programmable_core_type = 
device->get_programmable_core_type(phys_core); + CoreCoord virtual_core = device->virtual_core_from_logical_core(logical_core, core_type); + HalProgrammableCoreType programmable_core_type = device->get_programmable_core_type(virtual_core); uint32_t index = hal.get_programmable_core_type_index(programmable_core_type); const auto &sub_device_ids = this->determine_sub_device_ids(device); // TODO: This restriction can be lifted once this function is changed to return a vector of addresses @@ -1681,8 +1680,8 @@ void Program::set_last_used_command_queue_for_testing(HWCommandQueue *queue) { uint32_t detail::Program_::get_sem_size(Device *device, CoreCoord logical_core, CoreType core_type) const { - CoreCoord phys_core = device->physical_core_from_logical_core(logical_core, core_type); - HalProgrammableCoreType programmable_core_type = device->get_programmable_core_type(phys_core); + CoreCoord virtual_core = device->virtual_core_from_logical_core(logical_core, core_type); + HalProgrammableCoreType programmable_core_type = device->get_programmable_core_type(virtual_core); uint32_t index = hal.get_programmable_core_type_index(programmable_core_type); return this->program_configs_[index].sem_size; @@ -1694,8 +1693,8 @@ uint32_t Program::get_sem_size(Device *device, CoreCoord logical_core, CoreType uint32_t detail::Program_::get_cb_size(Device *device, CoreCoord logical_core, CoreType core_type) const { - CoreCoord phys_core = device->physical_core_from_logical_core(logical_core, core_type); - HalProgrammableCoreType programmable_core_type = device->get_programmable_core_type(phys_core); + CoreCoord virtual_core = device->virtual_core_from_logical_core(logical_core, core_type); + HalProgrammableCoreType programmable_core_type = device->get_programmable_core_type(virtual_core); uint32_t index = hal.get_programmable_core_type_index(programmable_core_type); return this->program_configs_[index].cb_size; diff --git a/tt_metal/impl/sub_device/sub_device_manager.cpp 
b/tt_metal/impl/sub_device/sub_device_manager.cpp index d2297c894a46..c1500f850640 100644 --- a/tt_metal/impl/sub_device/sub_device_manager.cpp +++ b/tt_metal/impl/sub_device/sub_device_manager.cpp @@ -257,8 +257,8 @@ void SubDeviceManager::populate_sub_allocators() { .l1_small_size = 0, .trace_region_size = 0, .core_type_from_noc_coord_table = {}, // Populated later - .worker_log_to_physical_routing_x = global_allocator_config.worker_log_to_physical_routing_x, - .worker_log_to_physical_routing_y = global_allocator_config.worker_log_to_physical_routing_y, + .worker_log_to_virtual_routing_x = global_allocator_config.worker_log_to_virtual_routing_x, + .worker_log_to_virtual_routing_y = global_allocator_config.worker_log_to_virtual_routing_y, .l1_bank_remap = std::move(l1_bank_remap), .compute_grid = compute_cores, .alignment = global_allocator_config.alignment, @@ -303,12 +303,12 @@ void SubDeviceManager::populate_noc_data() { this->num_noc_mcast_txns_[i] = tensix_cores.size(); this->noc_mcast_unicast_data_.resize(idx + this->num_noc_mcast_txns_[i] * 2); for (const auto& core_range : tensix_cores.ranges()) { - auto physical_start = - this->device_->physical_core_from_logical_core(core_range.start_coord, CoreType::WORKER); - auto physical_end = this->device_->physical_core_from_logical_core(core_range.end_coord, CoreType::WORKER); - auto physical_core_range = CoreRange(physical_start, physical_end); + auto virtual_start = + this->device_->virtual_core_from_logical_core(core_range.start_coord, CoreType::WORKER); + auto virtual_end = this->device_->virtual_core_from_logical_core(core_range.end_coord, CoreType::WORKER); + auto virtual_core_range = CoreRange(virtual_start, virtual_end); this->noc_mcast_unicast_data_[idx++] = - this->device_->get_noc_multicast_encoding(noc_index, physical_core_range); + this->device_->get_noc_multicast_encoding(noc_index, virtual_core_range); this->noc_mcast_unicast_data_[idx++] = core_range.size(); } this->noc_unicast_data_start_index_[i] 
= idx; @@ -317,9 +317,8 @@ void SubDeviceManager::populate_noc_data() { for (const auto& core_range : eth_cores.ranges()) { this->noc_mcast_unicast_data_.resize(idx + core_range.size()); for (const auto& core : core_range) { - auto physical_core = this->device_->physical_core_from_logical_core(core, CoreType::ETH); - this->noc_mcast_unicast_data_[idx++] = - this->device_->get_noc_unicast_encoding(noc_index, physical_core); + auto virtual_core = this->device_->virtual_core_from_logical_core(core, CoreType::ETH); + this->noc_mcast_unicast_data_[idx++] = this->device_->get_noc_unicast_encoding(noc_index, virtual_core); } } this->num_noc_unicast_txns_[i] = idx - this->noc_unicast_data_start_index_[i]; diff --git a/tt_metal/kernels/dataflow/reader_binary_diff_lengths.cpp b/tt_metal/kernels/dataflow/reader_binary_diff_lengths.cpp index 54162f862319..5aed4b52ac0c 100644 --- a/tt_metal/kernels/dataflow/reader_binary_diff_lengths.cpp +++ b/tt_metal/kernels/dataflow/reader_binary_diff_lengths.cpp @@ -6,14 +6,12 @@ #include "dataflow_api.h" void kernel_main() { - uint32_t src0_addr = get_arg_val(0); - uint32_t src0_noc_x = get_arg_val(1); - uint32_t src0_noc_y = get_arg_val(2); - uint32_t src0_num_tiles = get_arg_val(3); - uint32_t src1_addr = get_arg_val(4); - uint32_t src1_noc_x = get_arg_val(5); - uint32_t src1_noc_y = get_arg_val(6); - uint32_t src1_num_tiles = get_arg_val(7); + uint32_t src0_addr = get_arg_val(0); + uint32_t src0_bank_id = get_arg_val(1); + uint32_t src0_num_tiles = get_arg_val(2); + uint32_t src1_addr = get_arg_val(3); + uint32_t src1_bank_id = get_arg_val(4); + uint32_t src1_num_tiles = get_arg_val(5); constexpr uint32_t cb_id_in0 = 0; constexpr uint32_t cb_id_in1 = 1; @@ -31,7 +29,7 @@ void kernel_main() { // read ublocks from src0/src1 to CB0/CB1, then push ublocks to compute (unpacker) for (uint32_t i = 0; i < num_tiles; i += ublock_size_tiles) { if (i < src0_num_tiles) { - uint64_t src0_noc_addr = get_noc_addr(src0_noc_x, src0_noc_y, src0_addr); + 
uint64_t src0_noc_addr = get_noc_addr_from_bank_id(src0_bank_id, src0_addr); cb_reserve_back(cb_id_in0, ublock_size_tiles); l1_write_addr_in0 = get_write_ptr(cb_id_in0); @@ -46,7 +44,7 @@ void kernel_main() { } if (i < src1_num_tiles) { - uint64_t src1_noc_addr = get_noc_addr(src1_noc_x, src1_noc_y, src1_addr); + uint64_t src1_noc_addr = get_noc_addr_from_bank_id(src1_bank_id, src1_addr); cb_reserve_back(cb_id_in1, ublock_size_tiles); l1_write_addr_in1 = get_write_ptr(cb_id_in1); diff --git a/tt_metal/kernels/dataflow/reader_unary.cpp b/tt_metal/kernels/dataflow/reader_unary.cpp index 37ed368b8bb9..a79c750cb977 100644 --- a/tt_metal/kernels/dataflow/reader_unary.cpp +++ b/tt_metal/kernels/dataflow/reader_unary.cpp @@ -7,10 +7,9 @@ #include "dataflow_api.h" void kernel_main() { - uint32_t src_addr = get_arg_val(0); - uint32_t src_noc_x = get_arg_val(1); - uint32_t src_noc_y = get_arg_val(2); - uint32_t num_tiles = get_arg_val(3); + uint32_t src_addr = get_arg_val(0); + uint32_t bank_id = get_arg_val(1); + uint32_t num_tiles = get_arg_val(2); constexpr uint32_t cb_id_in0 = 0; @@ -20,7 +19,7 @@ void kernel_main() { // read a ublock of tiles from src to CB, and then push the ublock to unpacker for (uint32_t i = 0; i < num_tiles; i += ublock_size_tiles) { - uint64_t src_noc_addr = get_noc_addr(src_noc_x, src_noc_y, src_addr); + uint64_t src_noc_addr = get_noc_addr_from_bank_id(bank_id, src_addr); cb_reserve_back(cb_id_in0, ublock_size_tiles); uint32_t l1_write_addr = get_write_ptr(cb_id_in0); diff --git a/tt_metal/kernels/dataflow/writer_unary.cpp b/tt_metal/kernels/dataflow/writer_unary.cpp index 61c3cdc08d06..adddd7b20b3a 100644 --- a/tt_metal/kernels/dataflow/writer_unary.cpp +++ b/tt_metal/kernels/dataflow/writer_unary.cpp @@ -3,12 +3,12 @@ // SPDX-License-Identifier: Apache-2.0 #include "dataflow_api.h" +#include "debug/dprint.h" void kernel_main() { - uint32_t dst_addr = get_arg_val(0); - uint32_t dst_noc_x = get_arg_val(1); - uint32_t dst_noc_y = get_arg_val(2); 
- uint32_t num_tiles = get_arg_val(3); + uint32_t dst_addr = get_arg_val(0); + uint32_t bank_id = get_arg_val(1); + uint32_t num_tiles = get_arg_val(2); constexpr uint32_t cb_id_out0 = tt::CBIndex::c_16; @@ -17,7 +17,7 @@ void kernel_main() { uint32_t ublock_size_tiles = 1; for (uint32_t i = 0; i < num_tiles; i += ublock_size_tiles) { - uint64_t dst_noc_addr = get_noc_addr(dst_noc_x, dst_noc_y, dst_addr); + uint64_t dst_noc_addr = get_noc_addr_from_bank_id(bank_id, dst_addr); cb_wait_front(cb_id_out0, ublock_size_tiles); uint32_t l1_read_addr = get_read_ptr(cb_id_out0); diff --git a/tt_metal/kernels/dataflow/writer_unary_1.cpp b/tt_metal/kernels/dataflow/writer_unary_1.cpp new file mode 100644 index 000000000000..2ee5486e851c --- /dev/null +++ b/tt_metal/kernels/dataflow/writer_unary_1.cpp @@ -0,0 +1,30 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include "dataflow_api.h" + +void kernel_main() { + uint32_t dst_addr = get_arg_val(0); + uint32_t dst_x = get_arg_val(1); + uint32_t dst_y = get_arg_val(2); + uint32_t num_tiles = get_arg_val(3); + + constexpr uint32_t cb_id_out0 = 16; + + // single-tile ublocks + uint32_t ublock_size_bytes = get_tile_size(cb_id_out0); + uint32_t ublock_size_tiles = 1; + + for (uint32_t i = 0; i < num_tiles; i += ublock_size_tiles) { + uint64_t dst_noc_addr = get_noc_addr(dst_x, dst_y, dst_addr); + cb_wait_front(cb_id_out0, ublock_size_tiles); + uint32_t l1_read_addr = get_read_ptr(cb_id_out0); + noc_async_write(l1_read_addr, dst_noc_addr, ublock_size_bytes); + + noc_async_write_barrier(); + + cb_pop_front(cb_id_out0, ublock_size_tiles); + dst_addr += ublock_size_bytes; + } +} diff --git a/tt_metal/llrt/blackhole/bh_hal.cpp b/tt_metal/llrt/blackhole/bh_hal.cpp index 1096da8ec651..5c6513a264ad 100644 --- a/tt_metal/llrt/blackhole/bh_hal.cpp +++ b/tt_metal/llrt/blackhole/bh_hal.cpp @@ -77,7 +77,10 @@ void Hal::initialize_bh() { return NOC_MULTICAST_ENCODING(x_start, y_start, x_end, 
y_end); }; - num_nocs_ = NUM_NOCS; + this->num_nocs_ = NUM_NOCS; + this->coordinate_virtualization_enabled_ = COORDINATE_VIRTUALIZATION_ENABLED; + this->virtual_worker_start_x_ = VIRTUAL_TENSIX_START_X; + this->virtual_worker_start_y_ = VIRTUAL_TENSIX_START_Y; } } // namespace tt_metal diff --git a/tt_metal/llrt/grayskull/gs_hal.cpp b/tt_metal/llrt/grayskull/gs_hal.cpp index 71a889179b8d..03bb4e0c84e0 100644 --- a/tt_metal/llrt/grayskull/gs_hal.cpp +++ b/tt_metal/llrt/grayskull/gs_hal.cpp @@ -161,7 +161,10 @@ void Hal::initialize_gs() { return NOC_MULTICAST_ENCODING(x_start, y_start, x_end, y_end); }; - num_nocs_ = NUM_NOCS; + this->num_nocs_ = NUM_NOCS; + this->coordinate_virtualization_enabled_ = COORDINATE_VIRTUALIZATION_ENABLED; + this->virtual_worker_start_x_ = VIRTUAL_TENSIX_START_X; + this->virtual_worker_start_y_ = VIRTUAL_TENSIX_START_Y; } } // namespace tt_metal diff --git a/tt_metal/llrt/hal.hpp b/tt_metal/llrt/hal.hpp index 80e880026961..9344b6bd4ac5 100644 --- a/tt_metal/llrt/hal.hpp +++ b/tt_metal/llrt/hal.hpp @@ -149,6 +149,9 @@ class Hal { std::vector dram_sizes_; std::vector mem_alignments_; uint32_t num_nocs_; + bool coordinate_virtualization_enabled_; + uint32_t virtual_worker_start_x_; + uint32_t virtual_worker_start_y_; void initialize_gs(); void initialize_wh(); @@ -178,6 +181,9 @@ class Hal { return noc_multicast_encoding_func_(x_start, y_start, x_end, y_end); } + bool is_coordinate_virtualization_enabled() const { return this->coordinate_virtualization_enabled_; }; + std::uint32_t get_virtual_worker_start_x() const { return this->virtual_worker_start_x_; } + std::uint32_t get_virtual_worker_start_y() const { return this->virtual_worker_start_y_; } uint32_t get_programmable_core_type_count() const; HalProgrammableCoreType get_programmable_core_type(uint32_t core_type_index) const; uint32_t get_programmable_core_type_index(HalProgrammableCoreType programmable_core_type_index) const; diff --git a/tt_metal/llrt/llrt.cpp b/tt_metal/llrt/llrt.cpp 
index 888b40fe7442..ccea2588f93f 100644 --- a/tt_metal/llrt/llrt.cpp +++ b/tt_metal/llrt/llrt.cpp @@ -105,9 +105,8 @@ std::vector read_hex_vec_from_core(chip_id_t chip, const CoreCoord &co return read_hex_vec; } -CoreCoord logical_core_from_ethernet_core(chip_id_t chip_id, const CoreCoord &physical_core) { - const metal_SocDescriptor &soc_desc = tt::Cluster::instance().get_soc_desc(chip_id); - return soc_desc.get_logical_ethernet_core_from_physical(physical_core); +CoreCoord logical_core_from_ethernet_core(chip_id_t chip_id, const CoreCoord ðernet_core) { + return tt::Cluster::instance().get_logical_ethernet_core_from_virtual(chip_id, ethernet_core); } void write_launch_msg_to_core(chip_id_t chip, const CoreCoord core, launch_msg_t *msg, go_msg_t *go_msg, uint64_t base_addr, bool send_go) { @@ -177,7 +176,7 @@ uint32_t generate_risc_startup_addr(bool is_eth_core) { void program_risc_startup_addr(chip_id_t chip_id, const CoreCoord &core) { std::vector jump_to_fw; - jump_to_fw.push_back(generate_risc_startup_addr(is_ethernet_core(core, chip_id))); + jump_to_fw.push_back(generate_risc_startup_addr(tt::Cluster::instance().is_ethernet_core(core, chip_id))); write_hex_vec_to_core(chip_id, core, tt::stl::Span(jump_to_fw.data(), jump_to_fw.size()), 0); } @@ -188,7 +187,7 @@ bool test_load_write_read_risc_binary( uint32_t core_type_idx, uint32_t processor_class_idx, uint32_t processor_type_idx) { - assert(is_worker_core(core, chip_id) or is_ethernet_core(core, chip_id)); + assert(tt::Cluster::instance().is_worker_core(core, chip_id) or tt::Cluster::instance().is_ethernet_core(core, chip_id)); uint64_t local_init_addr = tt::tt_metal::hal.get_binary_local_init_addr(core_type_idx, processor_class_idx, processor_type_idx); @@ -225,7 +224,7 @@ CoreCoord get_core_for_dram_channel(int dram_channel_id, chip_id_t chip_id) { namespace internal_ { static bool check_if_riscs_on_specified_core_done(chip_id_t chip_id, const CoreCoord &core, int run_state) { - bool is_eth_core = 
is_ethernet_core(core, chip_id); + bool is_eth_core = tt::Cluster::instance().is_ethernet_core(core, chip_id); bool is_active_eth_core = false; bool is_inactive_eth_core = false; diff --git a/tt_metal/llrt/llrt.hpp b/tt_metal/llrt/llrt.hpp index 260d9793baf6..0bf814e58696 100644 --- a/tt_metal/llrt/llrt.hpp +++ b/tt_metal/llrt/llrt.hpp @@ -87,7 +87,7 @@ void write_hex_vec_to_core( std::vector read_hex_vec_from_core(chip_id_t chip, const CoreCoord &core, uint64_t addr, uint32_t size); -CoreCoord logical_core_from_ethernet_core(chip_id_t chip_id, CoreCoord &physical_core); +CoreCoord logical_core_from_ethernet_core(chip_id_t chip_id, CoreCoord ðernet_core); void write_launch_msg_to_core(chip_id_t chip, CoreCoord core, launch_msg_t *msg, go_msg_t * go_msg, uint64_t addr, bool send_go = true); @@ -95,18 +95,6 @@ void launch_erisc_app_fw_on_core(chip_id_t chip, CoreCoord core); void print_worker_cores(chip_id_t chip_id = 0); -inline bool is_worker_core(const CoreCoord &core, chip_id_t chip_id) { - const metal_SocDescriptor &soc_desc = tt::Cluster::instance().get_soc_desc(chip_id); - return std::find(soc_desc.physical_workers.begin(), soc_desc.physical_workers.end(), core) != - soc_desc.physical_workers.end(); -} - -inline bool is_ethernet_core(const CoreCoord &core, chip_id_t chip_id) { - const metal_SocDescriptor &soc_desc = tt::Cluster::instance().get_soc_desc(chip_id); - return std::find(soc_desc.physical_ethernet_cores.begin(), soc_desc.physical_ethernet_cores.end(), core) != - soc_desc.physical_ethernet_cores.end(); -} - uint32_t generate_risc_startup_addr(bool is_eth_core); void program_risc_startup_addr(chip_id_t chip_id, const CoreCoord &core); diff --git a/tt_metal/llrt/tt_cluster.cpp b/tt_metal/llrt/tt_cluster.cpp index d49d2ae4d5e4..e26404d9457c 100644 --- a/tt_metal/llrt/tt_cluster.cpp +++ b/tt_metal/llrt/tt_cluster.cpp @@ -179,6 +179,9 @@ void Cluster::initialize_device_drivers() { tt_device_params default_params; this->start_driver(default_params); + 
this->generate_virtual_to_umd_coord_mapping(); + this->generate_logical_to_virtual_coord_mapping(); + this->generate_virtual_to_profiler_flat_id_mapping(); } void Cluster::assert_risc_reset() { @@ -211,6 +214,10 @@ void Cluster::get_metal_desc_from_tt_desc( } } +const std::unordered_map& Cluster::get_virtual_routing_to_profiler_flat_id(chip_id_t chip_id) const { + return this->virtual_routing_to_profiler_flat_id_.at(this->get_board_type(chip_id)); +} + void Cluster::open_driver(const bool &skip_driver_allocs) { const std::string sdesc_path = get_soc_description_file(this->arch_, this->target_type_); @@ -308,6 +315,132 @@ const metal_SocDescriptor &Cluster::get_soc_desc(chip_id_t chip) const { return this->sdesc_per_chip_.at(chip); } +void Cluster::generate_virtual_to_umd_coord_mapping() { + // UMD APIs currently use a coordinate system that is not Physical, Virtual or Logical. + // TT-Metal uses Virtual Coordinates when programming txns on device. + // This mapping allows Cluster APIs to be consistent with the rest of TT-Metal, while correctly + // using UMD under the hood. + // This will be kept around until UMD supports generic coordinates in its APIs, at which point TT-Metal + // virtual coordinates can be passed to UMD directly. 
+ for (auto chip_id : this->cluster_desc_->get_all_chips()) { + this->virtual_worker_cores_[chip_id] = {}; + this->virtual_eth_cores_[chip_id] = {}; + for (auto& core_desc : this->get_soc_desc(chip_id).physical_cores) { + if (core_desc.second.type != CoreType::HARVESTED) { + CoreCoord virtual_coords = this->get_virtual_coordinate_from_physical_coordinates(chip_id, core_desc.first, core_desc.second.type); + tt_cxy_pair virtual_core = tt_cxy_pair(chip_id, virtual_coords.x, virtual_coords.y); + tt_cxy_pair umd_core = this->get_soc_desc(chip_id).convert_to_umd_coordinates(tt_cxy_pair(chip_id, core_desc.first.x, core_desc.first.y)); + this->virtual_to_umd_coord_mapping_[virtual_core] = umd_core; + if (core_desc.second.type == CoreType::WORKER) { + this->virtual_worker_cores_[chip_id].insert(virtual_coords); + } else if (core_desc.second.type == CoreType::ETH) { + this->virtual_eth_cores_[chip_id].insert(virtual_coords); + } + } + } + } +} + +void Cluster::generate_logical_to_virtual_coord_mapping() { + for (auto chip_id : this->cluster_desc_->get_all_chips()) { + auto board_type = this->get_board_type(chip_id); + if (this->worker_logical_to_virtual_x_.find(board_type) != this->worker_logical_to_virtual_x_.end()) { + continue; + } + auto& soc_desc = this->get_soc_desc(chip_id); + this->worker_logical_to_virtual_x_.insert({board_type, {}}); + this->worker_logical_to_virtual_y_.insert({board_type, {}}); + this->eth_logical_to_virtual_.insert({board_type, {}}); + for (auto x_coords : soc_desc.worker_log_to_routing_x) { + CoreCoord phys_core = soc_desc.get_physical_core_from_logical_core(CoreCoord(x_coords.first, 0), CoreType::WORKER); + CoreCoord virtual_coords = this->get_virtual_coordinate_from_physical_coordinates(chip_id, phys_core, CoreType::WORKER); + this->worker_logical_to_virtual_x_.at(board_type).insert({x_coords.first, virtual_coords.x}); + } + for (auto y_coords : soc_desc.worker_log_to_routing_y) { + CoreCoord phys_core = 
soc_desc.get_physical_core_from_logical_core(CoreCoord(0, y_coords.first), CoreType::WORKER); + CoreCoord virtual_coords = this->get_virtual_coordinate_from_physical_coordinates(chip_id, phys_core, CoreType::WORKER); + this->worker_logical_to_virtual_y_.at(board_type).insert({y_coords.first, virtual_coords.y}); + } + for (std::size_t log_eth_core_y = 0; log_eth_core_y < soc_desc.physical_ethernet_cores.size(); log_eth_core_y++) { + CoreCoord logical_eth_core = {0, log_eth_core_y}; + CoreCoord virtual_coords = this->get_virtual_coordinate_from_physical_coordinates(chip_id, soc_desc.physical_ethernet_cores.at(log_eth_core_y), CoreType::ETH); + this->eth_logical_to_virtual_.at(board_type).insert({logical_eth_core, virtual_coords}); + } + } + +} + +void Cluster::generate_virtual_to_profiler_flat_id_mapping() { +#if defined(TRACY_ENABLE) + for (auto chip_id : this->cluster_desc_->get_all_chips()) { + auto board_type = this->get_board_type(chip_id); + if (this->virtual_routing_to_profiler_flat_id_.find(board_type) != this->virtual_routing_to_profiler_flat_id_.end()) { + continue; + } + this->virtual_routing_to_profiler_flat_id_.insert({board_type, {}}); + auto& soc_desc = this->get_soc_desc(chip_id); + for (const auto& core_to_profiler_id : soc_desc.physical_routing_to_profiler_flat_id) { + if (std::find(soc_desc.physical_workers.begin(), soc_desc.physical_workers.end(), core_to_profiler_id.first) != soc_desc.physical_workers.end()) { + this->virtual_routing_to_profiler_flat_id_.at(board_type).insert({this->get_virtual_coordinate_from_physical_coordinates(chip_id, core_to_profiler_id.first, CoreType::WORKER), core_to_profiler_id.second}); + } else { + this->virtual_routing_to_profiler_flat_id_.at(board_type).insert({this->get_virtual_coordinate_from_physical_coordinates(chip_id, core_to_profiler_id.first, CoreType::ETH), core_to_profiler_id.second}); + } + } + } +#endif +} + +bool Cluster::is_worker_core(const CoreCoord &core, chip_id_t chip_id) const { + return 
this->virtual_worker_cores_.at(chip_id).find(core) != this->virtual_worker_cores_.at(chip_id).end(); +} + +bool Cluster::is_ethernet_core(const CoreCoord &core, chip_id_t chip_id) const { + + return this->virtual_eth_cores_.find(chip_id) != this->virtual_eth_cores_.end() and + this->virtual_eth_cores_.at(chip_id).find(core) != this->virtual_eth_cores_.at(chip_id).end(); +} + +const std::unordered_set& Cluster::get_virtual_worker_cores(chip_id_t chip_id) const { + return this->virtual_worker_cores_.at(chip_id); +} + +const std::unordered_set& Cluster::get_virtual_eth_cores(chip_id_t chip_id) const { + return this->virtual_eth_cores_.at(chip_id); +} + +CoreCoord Cluster::get_virtual_coordinate_from_logical_coordinates(chip_id_t chip_id, CoreCoord logical_coord, const CoreType& core_type) const { + auto board_type = this->get_board_type(chip_id); + if (core_type == CoreType::WORKER) { + return CoreCoord(this->worker_logical_to_virtual_x_.at(board_type).at(logical_coord.x), this->worker_logical_to_virtual_y_.at(board_type).at(logical_coord.y)); + } else if (core_type == CoreType::ETH) { + return this->eth_logical_to_virtual_.at(board_type).at(logical_coord); + } + auto& soc_desc = this->get_soc_desc(chip_id); + return soc_desc.get_physical_core_from_logical_core(logical_coord, core_type); +} + +tt_cxy_pair Cluster::get_virtual_coordinate_from_logical_coordinates(tt_cxy_pair logical_coordinate, const CoreType& core_type) const { + auto xy_virtual_coord = this->get_virtual_coordinate_from_logical_coordinates(logical_coordinate.chip, CoreCoord(logical_coordinate.x, logical_coordinate.y), core_type); + return tt_cxy_pair(logical_coordinate.chip, xy_virtual_coord); +} +CoreCoord Cluster::get_virtual_coordinate_from_physical_coordinates(chip_id_t chip_id, CoreCoord physical_coord, const CoreType& core_type) const { + auto& soc_desc = this->get_soc_desc(chip_id); + if (not (core_type == CoreType::WORKER or core_type == CoreType::ETH)) { + return physical_coord; + } + 
tt_cxy_pair virtual_chip_coord = soc_desc.convert_to_umd_coordinates(tt_cxy_pair(chip_id, physical_coord.x, physical_coord.y)); + std::size_t c = virtual_chip_coord.x; + std::size_t r = virtual_chip_coord.y; + this->driver_->translate_to_noc_table_coords(chip_id, r, c); + return CoreCoord{c, r}; +} + +CoreCoord Cluster::get_logical_ethernet_core_from_virtual(chip_id_t chip, CoreCoord core) const { + const metal_SocDescriptor &soc_desc = tt::Cluster::instance().get_soc_desc(chip); + auto phys_eth_core = this->virtual_to_umd_coord_mapping_.at(tt_cxy_pair(chip, core.x, core.y)); + return soc_desc.get_logical_ethernet_core_from_physical(phys_eth_core); +} + uint32_t Cluster::get_harvested_rows(chip_id_t chip) const { if (this->target_type_ == TargetDevice::Simulator) { return 0; @@ -332,16 +465,16 @@ int Cluster::get_device_aiclk(const chip_id_t &chip_id) const { return 0; } -void Cluster::deassert_risc_reset_at_core(const tt_cxy_pair &physical_chip_coord) const { - const metal_SocDescriptor &soc_desc = this->get_soc_desc(physical_chip_coord.chip); - tt_cxy_pair virtual_chip_coord = soc_desc.convert_to_umd_coordinates(physical_chip_coord); - this->driver_->deassert_risc_reset_at_core(virtual_chip_coord); +void Cluster::deassert_risc_reset_at_core(const tt_cxy_pair &core) const { + const metal_SocDescriptor &soc_desc = this->get_soc_desc(core.chip); + tt_cxy_pair umd_core = this->virtual_to_umd_coord_mapping_.at(core); + this->driver_->deassert_risc_reset_at_core(umd_core); } -void Cluster::assert_risc_reset_at_core(const tt_cxy_pair &physical_chip_coord) const { - const metal_SocDescriptor &soc_desc = this->get_soc_desc(physical_chip_coord.chip); - tt_cxy_pair virtual_chip_coord = soc_desc.convert_to_umd_coordinates(physical_chip_coord); - this->driver_->assert_risc_reset_at_core(virtual_chip_coord); +void Cluster::assert_risc_reset_at_core(const tt_cxy_pair &core) const { + const metal_SocDescriptor &soc_desc = this->get_soc_desc(core.chip); + tt_cxy_pair umd_core = 
this->virtual_to_umd_coord_mapping_.at(core); + this->driver_->assert_risc_reset_at_core(umd_core); } void Cluster::write_dram_vec(std::vector &vec, tt_target_dram dram, uint64_t addr, bool small_access) const { @@ -384,10 +517,11 @@ void Cluster::write_core( chip_id_t chip_id = core.chip; const metal_SocDescriptor &soc_desc = this->get_soc_desc(chip_id); if (tt::llrt::OptionsG.get_watcher_enabled()) { - tt::watcher_sanitize_host_noc_write(soc_desc, {core.x, core.y}, addr, sz_in_bytes); + tt::watcher_sanitize_host_noc_write(soc_desc, this->virtual_worker_cores_.at(chip_id), this->virtual_eth_cores_.at(chip_id), {core.x, core.y}, addr, sz_in_bytes); } - tt_cxy_pair virtual_core = soc_desc.convert_to_umd_coordinates(core); - this->driver_->write_to_device(mem_ptr, sz_in_bytes, virtual_core, addr, "LARGE_WRITE_TLB"); + + tt_cxy_pair umd_core = this->virtual_to_umd_coord_mapping_.at(core); + this->driver_->write_to_device(mem_ptr, sz_in_bytes, umd_core, addr, "LARGE_WRITE_TLB"); if (this->cluster_desc_->is_chip_remote(chip_id)) { this->driver_->wait_for_non_mmio_flush(chip_id); } @@ -399,11 +533,11 @@ void Cluster::read_core( const metal_SocDescriptor &soc_desc = this->get_soc_desc(chip_id); if (tt::llrt::OptionsG.get_watcher_enabled()) { - tt::watcher_sanitize_host_noc_read(soc_desc, {core.x, core.y}, addr, size_in_bytes); + tt::watcher_sanitize_host_noc_read(soc_desc, this->virtual_worker_cores_.at(chip_id), this->virtual_eth_cores_.at(chip_id), {core.x, core.y}, addr, size_in_bytes); } - tt_cxy_pair virtual_core = soc_desc.convert_to_umd_coordinates(core); - this->driver_->read_from_device(mem_ptr, virtual_core, addr, size_in_bytes, "LARGE_READ_TLB"); + tt_cxy_pair umd_core = this->virtual_to_umd_coord_mapping_.at(core); + this->driver_->read_from_device(mem_ptr, umd_core, addr, size_in_bytes, "LARGE_READ_TLB"); } void Cluster::read_core( @@ -418,10 +552,10 @@ void Cluster::write_reg(const std::uint32_t *mem_ptr, tt_cxy_pair target, uint64 const metal_SocDescriptor 
&soc_desc = this->get_soc_desc(chip_id); if (tt::llrt::OptionsG.get_watcher_enabled()) { - tt::watcher_sanitize_host_noc_write(soc_desc, {target.x, target.y}, addr, size_in_bytes); + tt::watcher_sanitize_host_noc_write(soc_desc, this->virtual_worker_cores_.at(chip_id), this->virtual_eth_cores_.at(chip_id), {target.x, target.y}, addr, size_in_bytes); } - tt_cxy_pair virtual_target = soc_desc.convert_to_umd_coordinates(target); - this->driver_->write_to_device(mem_ptr, size_in_bytes, virtual_target, addr, "REG_TLB"); + tt_cxy_pair umd_target = this->virtual_to_umd_coord_mapping_.at(target); + this->driver_->write_to_device(mem_ptr, size_in_bytes, umd_target, addr, "REG_TLB"); if (this->cluster_desc_->is_chip_remote(chip_id)) { this->driver_->wait_for_non_mmio_flush(chip_id); } @@ -433,10 +567,10 @@ void Cluster::read_reg(std::uint32_t *mem_ptr, tt_cxy_pair target, uint64_t addr const metal_SocDescriptor &soc_desc = this->get_soc_desc(chip_id); if (tt::llrt::OptionsG.get_watcher_enabled()) { - tt::watcher_sanitize_host_noc_read(soc_desc, {target.x, target.y}, addr, size_in_bytes); + tt::watcher_sanitize_host_noc_read(soc_desc, this->virtual_worker_cores_.at(chip_id), this->virtual_eth_cores_.at(chip_id), {target.x, target.y}, addr, size_in_bytes); } - tt_cxy_pair virtual_target = soc_desc.convert_to_umd_coordinates(target); - this->driver_->read_from_device(mem_ptr, virtual_target, addr, size_in_bytes, "REG_TLB"); + tt_cxy_pair umd_target = this->virtual_to_umd_coord_mapping_.at(target); + this->driver_->read_from_device(mem_ptr, umd_target, addr, size_in_bytes, "REG_TLB"); } void Cluster::write_sysmem( @@ -689,7 +823,6 @@ void Cluster::reserve_ethernet_cores_for_tunneling() { // only setup fd tunneling for devices grouped with same mmio device and if no bi dir // tunnel found between the two chips and if link distance between both chips to mmio // chip is not the same - tt_cxy_pair(chip_id, ethernet_core_from_logical_core(chip_id, eth_core)); log_debug( LogDevice, 
"Reserving {} for tunneling", @@ -863,18 +996,18 @@ void Cluster::set_internal_routing_info_for_ethernet_cores(bool enable_internal_ }; for (const auto &chip_id : non_mmio_devices) { for (const auto &[eth_core, routing_info] : this->device_eth_routing_info_.at(chip_id)) { - tt_cxy_pair eth_phys_core(chip_id, ethernet_core_from_logical_core(chip_id, eth_core)); + tt_cxy_pair virtual_eth_core(chip_id, get_virtual_coordinate_from_logical_coordinates(chip_id, eth_core, CoreType::ETH)); // Enable internal ethernet routing for non-mmio devices write_core( - (void *)&routing_info_enabled, sizeof(routing_info_t), eth_phys_core, routing_info_addr, false); + (void *)&routing_info_enabled, sizeof(routing_info_t), virtual_eth_core, routing_info_addr, false); } } for (const auto &chip_id : mmio_devices) { for (const auto &[eth_core, routing_info] : this->device_eth_routing_info_.at(chip_id)) { - tt_cxy_pair eth_phys_core(chip_id, ethernet_core_from_logical_core(chip_id, eth_core)); + tt_cxy_pair virtual_eth_core(chip_id, get_virtual_coordinate_from_logical_coordinates(chip_id, eth_core, CoreType::ETH)); // Enable internal ethernet routing for mmio devices write_core( - (void *)&routing_info_enabled, sizeof(routing_info_t), eth_phys_core, routing_info_addr, false); + (void *)&routing_info_enabled, sizeof(routing_info_t), virtual_eth_core, routing_info_addr, false); } } } else { @@ -885,18 +1018,18 @@ void Cluster::set_internal_routing_info_for_ethernet_cores(bool enable_internal_ }; for (const auto &chip_id : mmio_devices) { for (const auto &[eth_core, routing_info] : this->device_eth_routing_info_.at(chip_id)) { - tt_cxy_pair eth_phys_core(chip_id, ethernet_core_from_logical_core(chip_id, eth_core)); + tt_cxy_pair virtual_eth_core(chip_id, get_virtual_coordinate_from_logical_coordinates(chip_id, eth_core, CoreType::ETH)); // Disable internal ethernet routing for mmio devices write_core( - (void *)&routing_info_disabled, sizeof(routing_info_t), eth_phys_core, routing_info_addr, 
false); + (void *)&routing_info_disabled, sizeof(routing_info_t), virtual_eth_core, routing_info_addr, false); } } for (const auto &chip_id : non_mmio_devices) { for (const auto &[eth_core, routing_info] : this->device_eth_routing_info_.at(chip_id)) { - tt_cxy_pair eth_phys_core(chip_id, ethernet_core_from_logical_core(chip_id, eth_core)); + tt_cxy_pair virtual_eth_core(chip_id, get_virtual_coordinate_from_logical_coordinates(chip_id, eth_core, CoreType::ETH)); // Disable internal ethernet routing for non-mmio devices write_core( - (void *)&routing_info_disabled, sizeof(routing_info_t), eth_phys_core, routing_info_addr, false); + (void *)&routing_info_disabled, sizeof(routing_info_t), virtual_eth_core, routing_info_addr, false); } } } diff --git a/tt_metal/llrt/tt_cluster.hpp b/tt_metal/llrt/tt_cluster.hpp index 4d45aeba0d26..3b59f4cc7d06 100644 --- a/tt_metal/llrt/tt_cluster.hpp +++ b/tt_metal/llrt/tt_cluster.hpp @@ -61,6 +61,12 @@ class Cluster { ARCH arch() const { return this->arch_; } const metal_SocDescriptor &get_soc_desc(chip_id_t chip) const; + CoreCoord get_virtual_coordinate_from_logical_coordinates(chip_id_t chip_id, CoreCoord logical_coord, const CoreType& core_type) const; + CoreCoord get_virtual_coordinate_from_physical_coordinates(chip_id_t chip_id, CoreCoord physical_coord, const CoreType& core_type) const; + tt_cxy_pair get_virtual_coordinate_from_logical_coordinates(tt_cxy_pair logical_coordinate, const CoreType& core_type) const; + const std::unordered_set& get_virtual_worker_cores(chip_id_t chip_id) const; + const std::unordered_set& get_virtual_eth_cores(chip_id_t chip_id) const; + uint32_t get_harvested_rows(chip_id_t chip) const; uint32_t get_harvesting_mask(chip_id_t chip) const { return this->driver_->get_harvesting_masks_for_soc_descriptors().at(chip); @@ -90,9 +96,8 @@ class Cluster { std::optional> get_tlb_data(const tt_cxy_pair &target) const { tt::umd::Cluster *device = dynamic_cast(driver_.get()); - const metal_SocDescriptor 
&soc_desc = this->get_soc_desc(target.chip); - tt_cxy_pair virtual_chip_coord = soc_desc.convert_to_umd_coordinates(target); - return device->get_tlb_data_from_target(virtual_chip_coord); + tt_cxy_pair umd_target = this->virtual_to_umd_coord_mapping_.at(target); + return device->get_tlb_data_from_target(umd_target); } std::function get_fast_pcie_static_tlb_write_callable( @@ -106,9 +111,8 @@ class Cluster { // Allows for fast writes when targeting same device core by only doing the lookup once and avoiding repeated stack traversals tt::Writer get_static_tlb_writer(tt_cxy_pair target) const { tt::umd::Cluster *device = dynamic_cast(driver_.get()); - const metal_SocDescriptor &soc_desc = this->get_soc_desc(target.chip); - tt_cxy_pair virtual_target = soc_desc.convert_to_umd_coordinates(target); - return device->get_static_tlb_writer(virtual_target); + tt_cxy_pair umd_target = this->virtual_to_umd_coord_mapping_.at(target); + return device->get_static_tlb_writer(umd_target); } std::uint32_t get_numa_node_for_device(uint32_t device_id) const { @@ -210,6 +214,12 @@ class Cluster { // Returns Wormhole chip board type. 
BoardType get_board_type(chip_id_t chip_id) const; + bool is_worker_core(const CoreCoord &core, chip_id_t chip_id) const; + bool is_ethernet_core(const CoreCoord &core, chip_id_t chip_id) const; + CoreCoord get_logical_ethernet_core_from_virtual(chip_id_t chip, CoreCoord core) const; + const std::unordered_map& get_worker_logical_to_virtual_x(chip_id_t chip_id) const { return this->worker_logical_to_virtual_x_.at(this->get_board_type(chip_id)); }; + const std::unordered_map& get_worker_logical_to_virtual_y(chip_id_t chip_id) const { return this->worker_logical_to_virtual_y_.at(this->get_board_type(chip_id)); }; + const std::unordered_map& get_virtual_routing_to_profiler_flat_id(chip_id_t chip_id) const; private: Cluster(); ~Cluster(); @@ -226,7 +236,9 @@ class Cluster { void get_metal_desc_from_tt_desc( const std::unordered_map &input, const std::unordered_map &per_chip_id_harvesting_masks); - tt_cxy_pair convert_physical_cxy_to_virtual(const tt_cxy_pair &physical_cxy) const; + void generate_virtual_to_umd_coord_mapping(); + void generate_logical_to_virtual_coord_mapping(); + void generate_virtual_to_profiler_flat_id_mapping(); // Reserves ethernet cores in cluster for tunneling void reserve_ethernet_cores_for_tunneling(); @@ -256,7 +268,14 @@ class Cluster { std::unordered_map> devices_grouped_by_assoc_mmio_device_; // Save mapping of device id to associated MMIO device id for fast lookup std::unordered_map device_to_mmio_device_; - + // Data Structures Tracking Virtual Coordinates + std::unordered_map virtual_to_umd_coord_mapping_; + std::unordered_map> virtual_worker_cores_; + std::unordered_map> virtual_eth_cores_; + std::unordered_map> worker_logical_to_virtual_x_; + std::unordered_map> worker_logical_to_virtual_y_; + std::unordered_map> eth_logical_to_virtual_; + std::unordered_map> virtual_routing_to_profiler_flat_id_; // Flag to tell whether we are on a TG type of system. // If any device has to board type of GALAXY, we are on a TG cluster. 
bool is_tg_cluster_; diff --git a/tt_metal/llrt/wormhole/wh_hal.cpp b/tt_metal/llrt/wormhole/wh_hal.cpp index 4f20cfb9993e..a82bad6c6b3e 100644 --- a/tt_metal/llrt/wormhole/wh_hal.cpp +++ b/tt_metal/llrt/wormhole/wh_hal.cpp @@ -78,7 +78,10 @@ void Hal::initialize_wh() { return NOC_MULTICAST_ENCODING(x_start, y_start, x_end, y_end); }; - num_nocs_ = NUM_NOCS; + this->num_nocs_ = NUM_NOCS; + this->coordinate_virtualization_enabled_ = COORDINATE_VIRTUALIZATION_ENABLED; + this->virtual_worker_start_x_ = VIRTUAL_TENSIX_START_X; + this->virtual_worker_start_y_ = VIRTUAL_TENSIX_START_Y; } } // namespace tt_metal diff --git a/tt_metal/programming_examples/add_2_integers_in_compute/add_2_integers_in_compute.cpp b/tt_metal/programming_examples/add_2_integers_in_compute/add_2_integers_in_compute.cpp index 81e19e45f60b..66a83b7b9b87 100644 --- a/tt_metal/programming_examples/add_2_integers_in_compute/add_2_integers_in_compute.cpp +++ b/tt_metal/programming_examples/add_2_integers_in_compute/add_2_integers_in_compute.cpp @@ -29,15 +29,10 @@ int main(int argc, char** argv) { std::shared_ptr src1_dram_buffer = CreateBuffer(dram_config); std::shared_ptr dst_dram_buffer = CreateBuffer(dram_config); - auto src0_dram_noc_coord = src0_dram_buffer->noc_coordinates(); - auto src1_dram_noc_coord = src1_dram_buffer->noc_coordinates(); - auto dst_dram_noc_coord = dst_dram_buffer->noc_coordinates(); - uint32_t src0_dram_noc_x = src0_dram_noc_coord.x; - uint32_t src0_dram_noc_y = src0_dram_noc_coord.y; - uint32_t src1_dram_noc_x = src1_dram_noc_coord.x; - uint32_t src1_dram_noc_y = src1_dram_noc_coord.y; - uint32_t dst_dram_noc_x = dst_dram_noc_coord.x; - uint32_t dst_dram_noc_y = dst_dram_noc_coord.y; + // Since all interleaved buffers have size == page_size, they are entirely contained in the first DRAM bank + uint32_t src0_bank_id = 0; + uint32_t src1_bank_id = 0; + uint32_t dst_bank_id = 0; /* Use L1 circular buffers to set input and output buffers that the compute engine will use */ 
constexpr uint32_t src0_cb_index = CBIndex::c_0; @@ -102,14 +97,9 @@ int main(int argc, char** argv) { program, binary_reader_kernel_id, core, - {src0_dram_buffer->address(), - src1_dram_buffer->address(), - src0_dram_noc_x, - src0_dram_noc_y, - src1_dram_noc_x, - src1_dram_noc_y}); + {src0_dram_buffer->address(), src1_dram_buffer->address(), src0_bank_id, src1_bank_id}); SetRuntimeArgs(program, eltwise_binary_kernel_id, core, {}); - SetRuntimeArgs(program, unary_writer_kernel_id, core, {dst_dram_buffer->address(), dst_dram_noc_x, dst_dram_noc_y}); + SetRuntimeArgs(program, unary_writer_kernel_id, core, {dst_dram_buffer->address(), dst_bank_id}); EnqueueProgram(cq, program, false); Finish(cq); diff --git a/tt_metal/programming_examples/add_2_integers_in_compute/kernels/dataflow/reader_binary_1_tile.cpp b/tt_metal/programming_examples/add_2_integers_in_compute/kernels/dataflow/reader_binary_1_tile.cpp index 3b9a10aba37c..4f4fd8abe952 100644 --- a/tt_metal/programming_examples/add_2_integers_in_compute/kernels/dataflow/reader_binary_1_tile.cpp +++ b/tt_metal/programming_examples/add_2_integers_in_compute/kernels/dataflow/reader_binary_1_tile.cpp @@ -8,13 +8,11 @@ void kernel_main() { uint32_t src0_addr = get_arg_val(0); uint32_t src1_addr = get_arg_val(1); - uint32_t src0_dram_noc_x = get_arg_val(2); - uint32_t src0_dram_noc_y = get_arg_val(3); - uint32_t src1_dram_noc_x = get_arg_val(4); - uint32_t src1_dram_noc_y = get_arg_val(5); + uint32_t src0_bank_id = get_arg_val(2); + uint32_t src1_bank_id = get_arg_val(3); - uint64_t src0_noc_addr = get_noc_addr(src0_dram_noc_x, src0_dram_noc_y, src0_addr); - uint64_t src1_noc_addr = get_noc_addr(src1_dram_noc_x, src1_dram_noc_y, src1_addr); + uint64_t src0_noc_addr = get_noc_addr_from_bank_id(src0_bank_id, src0_addr); + uint64_t src1_noc_addr = get_noc_addr_from_bank_id(src1_bank_id, src1_addr); constexpr uint32_t cb_id_in0 = tt::CBIndex::c_0; constexpr uint32_t cb_id_in1 = tt::CBIndex::c_1; diff --git 
a/tt_metal/programming_examples/add_2_integers_in_compute/kernels/dataflow/writer_1_tile.cpp b/tt_metal/programming_examples/add_2_integers_in_compute/kernels/dataflow/writer_1_tile.cpp index 89aea6f18584..9bd9b460661d 100644 --- a/tt_metal/programming_examples/add_2_integers_in_compute/kernels/dataflow/writer_1_tile.cpp +++ b/tt_metal/programming_examples/add_2_integers_in_compute/kernels/dataflow/writer_1_tile.cpp @@ -6,10 +6,9 @@ void kernel_main() { uint32_t dst_addr = get_arg_val(0); - uint32_t dst_dram_noc_x = get_arg_val(1); - uint32_t dst_dram_noc_y = get_arg_val(2); + uint32_t dst_bank_id = get_arg_val(1); - uint64_t dst_noc_addr = get_noc_addr(dst_dram_noc_x, dst_dram_noc_y, dst_addr); + uint64_t dst_noc_addr = get_noc_addr_from_bank_id(dst_bank_id, dst_addr); constexpr uint32_t cb_id_out0 = tt::CBIndex::c_16; uint32_t ublock_size_bytes = get_tile_size(cb_id_out0); diff --git a/tt_metal/programming_examples/add_2_integers_in_riscv/add_2_integers_in_riscv.cpp b/tt_metal/programming_examples/add_2_integers_in_riscv/add_2_integers_in_riscv.cpp index a91e89e83d37..6716eed1d123 100644 --- a/tt_metal/programming_examples/add_2_integers_in_riscv/add_2_integers_in_riscv.cpp +++ b/tt_metal/programming_examples/add_2_integers_in_riscv/add_2_integers_in_riscv.cpp @@ -25,15 +25,10 @@ int main(int argc, char** argv) { std::shared_ptr src1_dram_buffer = CreateBuffer(dram_config); std::shared_ptr dst_dram_buffer = CreateBuffer(dram_config); - auto src0_dram_noc_coord = src0_dram_buffer->noc_coordinates(); - auto src1_dram_noc_coord = src1_dram_buffer->noc_coordinates(); - auto dst_dram_noc_coord = dst_dram_buffer->noc_coordinates(); - uint32_t src0_dram_noc_x = src0_dram_noc_coord.x; - uint32_t src0_dram_noc_y = src0_dram_noc_coord.y; - uint32_t src1_dram_noc_x = src1_dram_noc_coord.x; - uint32_t src1_dram_noc_y = src1_dram_noc_coord.y; - uint32_t dst_dram_noc_x = dst_dram_noc_coord.x; - uint32_t dst_dram_noc_y = dst_dram_noc_coord.y; + // Since all interleaved buffers 
have size == page_size, they are entirely contained in the first DRAM bank + uint32_t src0_bank_id = 0; + uint32_t src1_bank_id = 0; + uint32_t dst_bank_id = 0; /* Create source data and write to DRAM */ std::vector src0_vec(1, 14); @@ -67,17 +62,12 @@ int main(int argc, char** argv) { program, binary_reader_kernel_id, core, - { - src0_dram_buffer->address(), - src1_dram_buffer->address(), - dst_dram_buffer->address(), - src0_dram_noc_x, - src0_dram_noc_y, - src1_dram_noc_x, - src1_dram_noc_y, - dst_dram_noc_x, - dst_dram_noc_y, - }); + {src0_dram_buffer->address(), + src1_dram_buffer->address(), + dst_dram_buffer->address(), + src0_bank_id, + src1_bank_id, + dst_bank_id}); EnqueueProgram(cq, program, false); Finish(cq); diff --git a/tt_metal/programming_examples/add_2_integers_in_riscv/kernels/reader_writer_add_in_riscv.cpp b/tt_metal/programming_examples/add_2_integers_in_riscv/kernels/reader_writer_add_in_riscv.cpp index 61bf0ce554d4..8b7947af9052 100644 --- a/tt_metal/programming_examples/add_2_integers_in_riscv/kernels/reader_writer_add_in_riscv.cpp +++ b/tt_metal/programming_examples/add_2_integers_in_riscv/kernels/reader_writer_add_in_riscv.cpp @@ -6,17 +6,14 @@ void kernel_main() { uint32_t src0_dram = get_arg_val(0); uint32_t src1_dram = get_arg_val(1); uint32_t dst_dram = get_arg_val(2); - uint32_t src0_dram_noc_x = get_arg_val(3); - uint32_t src0_dram_noc_y = get_arg_val(4); - uint32_t src1_dram_noc_x = get_arg_val(5); - uint32_t src1_dram_noc_y = get_arg_val(6); - uint32_t dst_dram_noc_x = get_arg_val(7); - uint32_t dst_dram_noc_y = get_arg_val(8); + uint32_t src0_bank_id = get_arg_val(3); + uint32_t src1_bank_id = get_arg_val(4); + uint32_t dst_bank_id = get_arg_val(5); // NoC coords (x,y) depending on DRAM location on-chip - uint64_t src0_dram_noc_addr = get_noc_addr(src0_dram_noc_x, src0_dram_noc_y, src0_dram); - uint64_t src1_dram_noc_addr = get_noc_addr(src1_dram_noc_x, src1_dram_noc_y, src1_dram); - uint64_t dst_dram_noc_addr = 
get_noc_addr(dst_dram_noc_x, dst_dram_noc_y, dst_dram); + uint64_t src0_dram_noc_addr = get_noc_addr_from_bank_id(src0_bank_id, src0_dram); + uint64_t src1_dram_noc_addr = get_noc_addr_from_bank_id(src1_bank_id, src1_dram); + uint64_t dst_dram_noc_addr = get_noc_addr_from_bank_id(dst_bank_id, dst_dram); constexpr uint32_t cb_id_in0 = tt::CBIndex::c_0; // index=0 constexpr uint32_t cb_id_in1 = tt::CBIndex::c_1; // index=1 diff --git a/tt_metal/programming_examples/eltwise_binary/eltwise_binary.cpp b/tt_metal/programming_examples/eltwise_binary/eltwise_binary.cpp index 358c816abe90..98154a7c16ac 100644 --- a/tt_metal/programming_examples/eltwise_binary/eltwise_binary.cpp +++ b/tt_metal/programming_examples/eltwise_binary/eltwise_binary.cpp @@ -96,7 +96,10 @@ int main(int argc, char** argv) { std::shared_ptr src0_dram_buffer = CreateBuffer(dram_config); std::shared_ptr src1_dram_buffer = CreateBuffer(dram_config); std::shared_ptr dst_dram_buffer = CreateBuffer(dram_config); - + // Since all interleaved buffers have size == page_size, they are entirely contained in the first DRAM bank + uint32_t src0_bank_id = 0; + uint32_t src1_bank_id = 0; + uint32_t dst_bank_id = 0; /* * Use circular buffers to set input and output buffers that the * compute engine will use. 
@@ -182,25 +185,16 @@ int main(int argc, char** argv) { binary_reader_kernel_id, core, {src0_dram_buffer->address(), - static_cast(src0_dram_buffer->noc_coordinates().x), - static_cast(src0_dram_buffer->noc_coordinates().y), + src0_bank_id, num_tiles, src1_dram_buffer->address(), - static_cast(src1_dram_buffer->noc_coordinates().x), - static_cast(src1_dram_buffer->noc_coordinates().y), + src1_bank_id, num_tiles, 0}); SetRuntimeArgs(program, eltwise_binary_kernel_id, core, {num_tiles, 1}); - SetRuntimeArgs( - program, - unary_writer_kernel_id, - core, - {dst_dram_buffer->address(), - static_cast(dst_dram_buffer->noc_coordinates().x), - static_cast(dst_dram_buffer->noc_coordinates().y), - num_tiles}); + SetRuntimeArgs(program, unary_writer_kernel_id, core, {dst_dram_buffer->address(), dst_bank_id, num_tiles}); EnqueueProgram(cq, program, false); Finish(cq); @@ -268,25 +262,16 @@ int main(int argc, char** argv) { binary_reader_kernel_id, core, {src0_dram_buffer->address(), - static_cast(src0_dram_buffer->noc_coordinates().x), - static_cast(src0_dram_buffer->noc_coordinates().y), + src0_bank_id, num_tiles, src1_dram_buffer->address(), - static_cast(src1_dram_buffer->noc_coordinates().x), - static_cast(src1_dram_buffer->noc_coordinates().y), + src1_bank_id, num_tiles, 0}); SetRuntimeArgs(program_mul, eltwise_binary_kernel_id, core, {num_tiles, 1}); - SetRuntimeArgs( - program_mul, - unary_writer_kernel_id, - core, - {dst_dram_buffer->address(), - static_cast(dst_dram_buffer->noc_coordinates().x), - static_cast(dst_dram_buffer->noc_coordinates().y), - num_tiles}); + SetRuntimeArgs(program_mul, unary_writer_kernel_id, core, {dst_dram_buffer->address(), dst_bank_id, num_tiles}); /* * Execute. 
diff --git a/tt_metal/programming_examples/eltwise_sfpu/eltwise_sfpu.cpp b/tt_metal/programming_examples/eltwise_sfpu/eltwise_sfpu.cpp index 70af2275a2ae..d1a4752bcb0b 100644 --- a/tt_metal/programming_examples/eltwise_sfpu/eltwise_sfpu.cpp +++ b/tt_metal/programming_examples/eltwise_sfpu/eltwise_sfpu.cpp @@ -53,6 +53,9 @@ int main(int argc, char** argv) { std::shared_ptr dst_dram_buffer = CreateBuffer(dram_config); const uint32_t dram_buffer_dst_addr = dst_dram_buffer->address(); + // Since all interleaved buffers have size == page_size, they are entirely contained in the first DRAM bank + uint32_t src0_bank_id = 0; + uint32_t dst_bank_id = 0; /* * Use circular buffers to set input and output buffers that the @@ -129,19 +132,11 @@ int main(int argc, char** argv) { core, { src0_dram_buffer->address(), - static_cast(src0_dram_buffer->noc_coordinates().x), - static_cast(src0_dram_buffer->noc_coordinates().y), + src0_bank_id, num_tiles, }); - SetRuntimeArgs( - program, - unary_writer_kernel_id, - core, - {dst_dram_buffer->address(), - static_cast(dst_dram_buffer->noc_coordinates().x), - static_cast(dst_dram_buffer->noc_coordinates().y), - num_tiles}); + SetRuntimeArgs(program, unary_writer_kernel_id, core, {dst_dram_buffer->address(), dst_bank_id, num_tiles}); EnqueueProgram(cq, program, false); Finish(cq); diff --git a/tt_metal/programming_examples/loopback/kernels/loopback_dram_copy.cpp b/tt_metal/programming_examples/loopback/kernels/loopback_dram_copy.cpp index a84d23fb46e5..0e364122eb28 100644 --- a/tt_metal/programming_examples/loopback/kernels/loopback_dram_copy.cpp +++ b/tt_metal/programming_examples/loopback/kernels/loopback_dram_copy.cpp @@ -7,21 +7,19 @@ void kernel_main() { std::uint32_t l1_buffer_addr = get_arg_val(0); - std::uint32_t dram_buffer_src_addr = get_arg_val(1); - std::uint32_t dram_src_noc_x = get_arg_val(2); - std::uint32_t dram_src_noc_y = get_arg_val(3); + std::uint32_t dram_buffer_src_addr = get_arg_val(1); + std::uint32_t 
dram_buffer_src_bank = get_arg_val(2); - std::uint32_t dram_buffer_dst_addr = get_arg_val(4); - std::uint32_t dram_dst_noc_x = get_arg_val(5); - std::uint32_t dram_dst_noc_y = get_arg_val(6); + std::uint32_t dram_buffer_dst_addr = get_arg_val(3); + std::uint32_t dram_buffer_dst_bank = get_arg_val(4); - std::uint32_t dram_buffer_size = get_arg_val(7); + std::uint32_t dram_buffer_size = get_arg_val(5); - std::uint64_t dram_buffer_src_noc_addr = get_noc_addr(dram_src_noc_x, dram_src_noc_y, dram_buffer_src_addr); + std::uint64_t dram_buffer_src_noc_addr = get_noc_addr_from_bank_id(dram_buffer_src_bank, dram_buffer_src_addr); noc_async_read(dram_buffer_src_noc_addr, l1_buffer_addr, dram_buffer_size); noc_async_read_barrier(); - std::uint64_t dram_buffer_dst_noc_addr = get_noc_addr(dram_dst_noc_x, dram_dst_noc_y, dram_buffer_dst_addr); + std::uint64_t dram_buffer_dst_noc_addr = get_noc_addr_from_bank_id(dram_buffer_dst_bank, dram_buffer_dst_addr); noc_async_write(l1_buffer_addr, dram_buffer_dst_noc_addr, dram_buffer_size); noc_async_write_barrier(); } diff --git a/tt_metal/programming_examples/loopback/loopback.cpp b/tt_metal/programming_examples/loopback/loopback.cpp index 7ad21566b44a..77f20b22be95 100644 --- a/tt_metal/programming_examples/loopback/loopback.cpp +++ b/tt_metal/programming_examples/loopback/loopback.cpp @@ -66,6 +66,10 @@ int main(int argc, char** argv) { auto output_dram_buffer = CreateBuffer(dram_config); const uint32_t output_dram_buffer_addr = output_dram_buffer->address(); + // Since all interleaved buffers have size == page_size, they are entirely contained in the first DRAM bank + const uint32_t input_bank_id = 0; + const uint32_t output_bank_id = 0; + /* * Create input data and runtime arguments, then execute */ @@ -76,11 +80,9 @@ int main(int argc, char** argv) { const std::vector runtime_args = { l1_buffer->address(), input_dram_buffer->address(), - static_cast(input_dram_buffer->noc_coordinates().x), - 
static_cast(input_dram_buffer->noc_coordinates().y), + input_bank_id, output_dram_buffer->address(), - static_cast(output_dram_buffer->noc_coordinates().x), - static_cast(output_dram_buffer->noc_coordinates().y), + output_bank_id, l1_buffer->size()}; SetRuntimeArgs(program, dram_copy_kernel_id, core, runtime_args); diff --git a/tt_metal/tools/profiler/profiler.cpp b/tt_metal/tools/profiler/profiler.cpp index e6b13e0a798a..6c8b9ea18470 100644 --- a/tt_metal/tools/profiler/profiler.cpp +++ b/tt_metal/tools/profiler/profiler.cpp @@ -34,9 +34,7 @@ void DeviceProfiler::readRiscProfilerResults( int riscCount; profiler_msg_t* profiler_msg; - const metal_SocDescriptor& soc_d = tt::Cluster::instance().get_soc_desc(device_id); - auto ethCores = soc_d.get_physical_ethernet_cores(); - if (std::find(ethCores.begin(), ethCores.end(), worker_core) == ethCores.end()) { + if (tt::Cluster::instance().is_worker_core(worker_core, device_id)) { profiler_msg = hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::PROFILER); CoreType = HalProgrammableCoreType::TENSIX; riscCount = 5; @@ -47,7 +45,7 @@ void DeviceProfiler::readRiscProfilerResults( riscCount = 1; } - uint32_t coreFlatID = soc_d.physical_routing_to_profiler_flat_id.at(worker_core); + uint32_t coreFlatID = tt::Cluster::instance().get_virtual_routing_to_profiler_flat_id(device_id).at(worker_core); uint32_t startIndex = coreFlatID * MAX_RISCV_PER_CORE * PROFILER_FULL_HOST_VECTOR_SIZE_PER_RISC; std::vector control_buffer = tt::llrt::read_hex_vec_from_core( diff --git a/tt_metal/tools/profiler/tt_metal_profiler.cpp b/tt_metal/tools/profiler/tt_metal_profiler.cpp index e90e9caa236d..5c9df9b95265 100644 --- a/tt_metal/tools/profiler/tt_metal_profiler.cpp +++ b/tt_metal/tools/profiler/tt_metal_profiler.cpp @@ -65,12 +65,9 @@ void setControlBuffer(uint32_t device_id, std::vector& control_buffer) control_buffer[kernel_profiler::CORE_COUNT_PER_DRAM] = soc_d.profiler_ceiled_core_count_perf_dram_bank; - auto ethCores = 
soc_d.get_physical_ethernet_cores(); - for (auto& core : soc_d.physical_routing_to_profiler_flat_id) { + for (auto& core : tt::Cluster::instance().get_virtual_routing_to_profiler_flat_id(device_id)) { profiler_msg_t* profiler_msg; - // TODO: clean this up when HAL is more complete (one lookup w/ type) - if (std::find(ethCores.begin(), ethCores.end(), core.first) == ethCores.end()) { - // Tensix + if (tt::Cluster::instance().is_worker_core(core.first, device_id)) { profiler_msg = hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::PROFILER); } else { @@ -334,8 +331,8 @@ void DumpDeviceProfileResults(Device* device, bool lastDump) { workerCores.push_back(curr_core); } for (const CoreCoord& core : device->get_active_ethernet_cores(true)) { - auto physicalCore = device->physical_core_from_logical_core(core, CoreType::ETH); - workerCores.push_back(physicalCore); + auto virtualCore = device->virtual_core_from_logical_core(core, CoreType::ETH); + workerCores.push_back(virtualCore); } device->push_work( [device, workerCores, lastDump]() mutable { DumpDeviceProfileResults(device, workerCores, lastDump); }); @@ -355,10 +352,10 @@ void DumpDeviceProfileResults(Device* device, std::vector& worker_cor auto device_num_hw_cqs = device->num_hw_cqs(); for (const CoreCoord& core : tt::get_logical_dispatch_cores(device_id, device_num_hw_cqs, dispatch_core_config)) { - const auto curr_core = device->physical_core_from_logical_core(core, dispatch_core_type); + const auto curr_core = device->virtual_core_from_logical_core(core, dispatch_core_type); worker_cores.push_back(curr_core); } - for (const CoreCoord& core : tt::Cluster::instance().get_soc_desc(device_id).physical_ethernet_cores) { + for (const CoreCoord& core : tt::Cluster::instance().get_virtual_eth_cores(device_id)) { worker_cores.push_back(core); } } @@ -393,7 +390,7 @@ void DumpDeviceProfileResults(Device* device, std::vector& worker_cor } for (const CoreCoord& core : tt::get_logical_dispatch_cores(device_id, 
device_num_hw_cqs, dispatch_core_config)) { - const auto curr_core = device->physical_core_from_logical_core(core, dispatch_core_type); + const auto curr_core = device->virtual_core_from_logical_core(core, dispatch_core_type); profiler_msg_t* profiler_msg = device->get_dev_addr(curr_core, HalL1MemAddrType::PROFILER); std::vector control_buffer = tt::llrt::read_hex_vec_from_core( @@ -410,17 +407,16 @@ void DumpDeviceProfileResults(Device* device, std::vector& worker_cor if (waitForDispatch) { continue; } - for (const CoreCoord& core : - tt::Cluster::instance().get_soc_desc(device_id).physical_ethernet_cores) { + for (const CoreCoord& virtual_core : tt::Cluster::instance().get_virtual_eth_cores(device_id)) { profiler_msg_t* profiler_msg = - device->get_dev_addr(core, HalL1MemAddrType::PROFILER); + device->get_dev_addr(virtual_core, HalL1MemAddrType::PROFILER); std::vector control_buffer = tt::llrt::read_hex_vec_from_core( device_id, - core, + virtual_core, reinterpret_cast(profiler_msg->control_vector), kernel_profiler::PROFILER_L1_CONTROL_BUFFER_SIZE); if (control_buffer[kernel_profiler::PROFILER_DONE] == 0) { - unfinishedCore = core; + unfinishedCore = virtual_core; waitForDispatch = true; continue; } diff --git a/tt_metal/tt_metal.cpp b/tt_metal/tt_metal.cpp index e59f14430cd6..c3e1b8b71bc8 100644 --- a/tt_metal/tt_metal.cpp +++ b/tt_metal/tt_metal.cpp @@ -312,7 +312,7 @@ bool WriteToDeviceL1( std::vector& host_buffer, CoreType core_type) { ZoneScoped; - auto worker_core = device->physical_core_from_logical_core(logical_core, core_type); + auto worker_core = device->virtual_core_from_logical_core(logical_core, core_type); llrt::write_hex_vec_to_core(device->id(), worker_core, host_buffer, address); return true; } @@ -426,17 +426,22 @@ void WriteToDeviceSharded(Buffer& buffer, tt::stl::Span host_buff const auto& buffer_page_mapping = *buffer.get_buffer_page_mapping(); auto total_pages = buffer.num_pages(); + std::vector page; + page.resize(page_size / 
sizeof(uint32_t)); for (int host_page_id = 0; host_page_id < total_pages; host_page_id++) { auto dev_page_id = buffer_page_mapping.host_page_to_dev_page_mapping_[host_page_id]; auto core = buffer_page_mapping.all_cores_[buffer_page_mapping.dev_page_to_core_mapping_[dev_page_id]]; auto bank_id = device->bank_ids_from_logical_core(buffer.buffer_type(), core)[0]; auto absolute_address = buffer.sharded_page_address(bank_id, dev_page_id); + auto bank_local_address = buffer.bank_local_page_address(bank_id, dev_page_id); auto data_index = host_page_id * page_size; - std::vector page; - page.insert(page.end(), host_buffer.begin() + data_index, host_buffer.begin() + data_index + page_size); - - auto noc_coordinates = buffer.noc_coordinates(bank_id); - llrt::write_hex_vec_to_core(device->id(), noc_coordinates, page, absolute_address); + std::memcpy(page.data(), host_buffer.data() + data_index, page_size); + if (buffer.is_l1()) { + auto core_coordinates = device->worker_core_from_logical_core(buffer.logical_core_from_bank_id(bank_id)); + llrt::write_hex_vec_to_core(device->id(), core_coordinates, page, absolute_address); + } else { + WriteToDeviceDRAMChannel(device, bank_id, bank_local_address, page); + } } } @@ -455,16 +460,22 @@ void WriteToDeviceInterleavedContiguous(const Buffer& buffer, tt::stl::Spannum_banks(buffer.buffer_type()); uint32_t bank_index = 0; int data_index = 0; + std::vector page; + page.resize(page_size / sizeof(uint32_t)); for (int page_index = 0; page_index < num_pages; page_index++) { auto absolute_address = buffer.page_address(bank_index, page_index); - std::vector page; - page.insert(page.end(), host_buffer.begin() + data_index, host_buffer.begin() + data_index + page_size); + // Get address offset of buffer in bank. Required when writing to DRAM. 
+ auto bank_local_address = buffer.bank_local_page_address(bank_index, page_index); + std::memcpy(page.data(), host_buffer.data() + data_index, page_size); switch (buffer.buffer_type()) { case BufferType::DRAM: + WriteToDeviceDRAMChannel(device, bank_index, bank_local_address, page); + break; case BufferType::L1: case BufferType::L1_SMALL: { - auto noc_coordinates = buffer.noc_coordinates(bank_index); - llrt::write_hex_vec_to_core(device->id(), noc_coordinates, page, absolute_address); + auto core_coordinates = + device->worker_core_from_logical_core(buffer.logical_core_from_bank_id(bank_index)); + llrt::write_hex_vec_to_core(device->id(), core_coordinates, page, absolute_address); } break; default: TT_THROW("Unsupported buffer type to write to device!"); } @@ -509,26 +520,30 @@ void ReadFromDeviceInterleavedContiguous(const Buffer& buffer, uint8_t* host_buf size_t host_idx = 0; uint32_t bank_index = 0; - std::vector page; + std::vector page; + page.resize(page_size / sizeof(uint32_t)); for (int page_index = 0; page_index < num_pages; page_index++) { auto absolute_address = buffer.page_address(bank_index, page_index); + // Get address offset of buffer in bank. Required when reading from DRAM. 
+ auto bank_local_address = buffer.bank_local_page_address(bank_index, page_index); page.clear(); switch (buffer.buffer_type()) { case BufferType::DRAM: case BufferType::TRACE: + ReadFromDeviceDRAMChannel(device, bank_index, bank_local_address, page_size, page); + break; case BufferType::L1: case BufferType::L1_SMALL: { - auto noc_coordinates = buffer.noc_coordinates(bank_index); - page.resize(page_size); - tt::Cluster::instance().read_core( - page.data(), page_size, tt_cxy_pair(device->id(), noc_coordinates), absolute_address); + auto core_coordinates = + device->worker_core_from_logical_core(buffer.logical_core_from_bank_id(bank_index)); + tt::Cluster::instance().read_core(page.data(), page_size, tt_cxy_pair(device->id(), core_coordinates), absolute_address); } break; default: TT_THROW("Unsupported buffer type to read from device!"); } // Copy page into host buffer - std::memcpy(host_buffer + host_idx, page.data(), page.size()); - host_idx += page.size(); + std::memcpy(host_buffer + host_idx, page.data(), page_size); + host_idx += page_size; bank_index = (bank_index + 1) % num_banks; } @@ -543,10 +558,18 @@ void read_pages_to_host_helper( const uint32_t& dev_page_id, const uint32_t& bank_id) { auto absolute_address = dev_buffer.sharded_page_address(bank_id, dev_page_id); - auto noc_coordinates = dev_buffer.noc_coordinates(bank_id); uint32_t host_buffer_start = host_page_id * page_size; - tt::Cluster::instance().read_core( - host_buffer + host_buffer_start, page_size, tt_cxy_pair(device->id(), noc_coordinates), absolute_address); + if (dev_buffer.is_l1()) { + auto core_coordinates = device->worker_core_from_logical_core(dev_buffer.logical_core_from_bank_id(bank_id)); + tt::Cluster::instance().read_core( + host_buffer + host_buffer_start, page_size, tt_cxy_pair(device->id(), core_coordinates), absolute_address); + } else { + std::vector page; + page.resize(page_size / sizeof(uint32_t)); + auto bank_local_address = dev_buffer.bank_local_page_address(bank_id, 
dev_page_id); + ReadFromDeviceDRAMChannel(device, bank_id, bank_local_address, page_size, page); + std::memcpy(host_buffer + host_buffer_start, page.data(), page_size); + } } void ReadFromDeviceSharded(Buffer& buffer, uint8_t* host_buffer, bool shard_order) { @@ -669,7 +692,7 @@ void LaunchProgram(Device* device, Program& program, bool wait_until_cores_done) go_msg_t* go_msg = &program.kernels_on_core(logical_core, programmable_core_type_index)->go_msg; msg->kernel_config.host_assigned_id = program.get_runtime_id(); - auto physical_core = device->physical_core_from_logical_core(logical_core, core_type); + auto physical_core = device->virtual_core_from_logical_core(logical_core, core_type); not_done_cores.insert(physical_core); tt::llrt::write_launch_msg_to_core( device->id(), @@ -697,7 +720,7 @@ void WaitProgramDone(Device* device, Program& program) { const auto& logical_cores = logical_cores_used_in_program[index]; CoreType core_type = hal.get_core_type(index); for (const auto& logical_core : logical_cores) { - auto physical_core = device->physical_core_from_logical_core(logical_core, core_type); + auto physical_core = device->virtual_core_from_logical_core(logical_core, core_type); not_done_cores.insert(physical_core); } } @@ -726,8 +749,7 @@ bool ConfigureDeviceWithProgram(Device* device, Program& program, bool fd_bootlo CoreType core_type = hal.get_core_type(index); for (const auto& logical_core : logical_cores) { KernelGroup* kernel_group = program.kernels_on_core(logical_core, index); - CoreCoord physical_core = device->physical_core_from_logical_core(logical_core, core_type); - + CoreCoord physical_core = device->virtual_core_from_logical_core(logical_core, core_type); ConfigureKernelGroup(program, index, kernel_group, device, logical_core); // TODO: add support for CB for ethernet cores if (core_type == CoreType::WORKER) { @@ -785,7 +807,7 @@ void WriteRuntimeArgsToDevice(Device* device, Program& program) { for (auto x = core_range.start_coord.x; x <= 
core_range.end_coord.x; x++) { for (auto y = core_range.start_coord.y; y <= core_range.end_coord.y; y++) { CoreCoord logical_core(x, y); - auto physical_core = device->physical_core_from_logical_core(logical_core, core_type); + auto physical_core = device->virtual_core_from_logical_core(logical_core, core_type); for (int dispatch_class = 0; dispatch_class < processor_classes; dispatch_class++) { auto& optional_id = kg.kernel_ids[dispatch_class]; if (optional_id) { diff --git a/ttnn/cpp/ttnn/operations/ccl/barrier/device/host/barrier_full_worker_grid.cpp b/ttnn/cpp/ttnn/operations/ccl/barrier/device/host/barrier_full_worker_grid.cpp index d9fabe0fe58c..92edce9eff77 100644 --- a/ttnn/cpp/ttnn/operations/ccl/barrier/device/host/barrier_full_worker_grid.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/barrier/device/host/barrier_full_worker_grid.cpp @@ -66,16 +66,16 @@ static std::tuple, std::array, std::array< erisc_semaphore_address, start_semaphore_address, erisc_buffer_address, - static_cast(device->physical_core_from_logical_core(sem_init_core, CoreType::WORKER).x), - static_cast(device->physical_core_from_logical_core(sem_init_core, CoreType::WORKER).y), + static_cast(device->virtual_core_from_logical_core(sem_init_core, CoreType::WORKER).x), + static_cast(device->virtual_core_from_logical_core(sem_init_core, CoreType::WORKER).y), worker_sem0}; const std::array sender_rt_args = { static_cast(is_starting_core ? 
1 : 0), // is_ring_start eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE, // handshake_addr erisc_buffer_address, erisc_semaphore_address, - static_cast(device->physical_core_from_logical_core(sem_init_core, CoreType::WORKER).x), - static_cast(device->physical_core_from_logical_core(sem_init_core, CoreType::WORKER).y), + static_cast(device->virtual_core_from_logical_core(sem_init_core, CoreType::WORKER).x), + static_cast(device->virtual_core_from_logical_core(sem_init_core, CoreType::WORKER).y), worker_sem1}; // sample size const std::array sem_id_args = { worker_sem0, diff --git a/ttnn/cpp/ttnn/operations/ccl/common/types/ccl_types_args_emitters.cpp b/ttnn/cpp/ttnn/operations/ccl/common/types/ccl_types_args_emitters.cpp index c2b142e9b1c9..ed5ba80cb770 100644 --- a/ttnn/cpp/ttnn/operations/ccl/common/types/ccl_types_args_emitters.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/common/types/ccl_types_args_emitters.cpp @@ -95,11 +95,11 @@ static std::pair, std::vector> shard_noc_cores_f std::vector logical_to_noc_row_map; std::vector logical_to_noc_col_map; for (uint32_t y = core_range.start_coord.y; y <= core_range.end_coord.y; y++) { - CoreCoord noc_core = d->physical_core_from_logical_core(CoreCoord(0, y), CoreType::WORKER); + CoreCoord noc_core = d->virtual_core_from_logical_core(CoreCoord(0, y), CoreType::WORKER); logical_to_noc_row_map.push_back(noc_core.y); } for (uint32_t x = core_range.start_coord.x; x <= core_range.end_coord.x; x++) { - CoreCoord noc_core = d->physical_core_from_logical_core(CoreCoord(x, 0), CoreType::WORKER); + CoreCoord noc_core = d->virtual_core_from_logical_core(CoreCoord(x, 0), CoreType::WORKER); logical_to_noc_col_map.push_back(noc_core.x); } diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp index 7c0544a8c692..83a9c06ab0ad 100644 --- 
a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp @@ -720,9 +720,6 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl( // act uint32_t act_dram_addr = src0_dram_buffer->address(); - auto act_dram_noc_xy = src0_dram_buffer->noc_coordinates(); - uint32_t act_noc_x = act_dram_noc_xy.x; - uint32_t act_noc_y = act_dram_noc_xy.y; assert(act_matrix_width_ntiles % act_block_w_ntiles == 0); assert(act_block_h_ntiles % out_subblock_h_ntiles == 0); diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_width_sharded_program_factory.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_width_sharded_program_factory.cpp index 41f00c99ff08..9e6b7d9130e1 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_width_sharded_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_width_sharded_program_factory.cpp @@ -407,9 +407,6 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_width_sharded_v2_impl( // act uint32_t act_dram_addr = src0_dram_buffer->address(); - auto act_dram_noc_xy = src0_dram_buffer->noc_coordinates(); - uint32_t act_noc_x = act_dram_noc_xy.x; - uint32_t act_noc_y = act_dram_noc_xy.y; TT_FATAL( act_block_h_ntiles % out_subblock_h_ntiles == 0, diff --git a/ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/dataflow/reshard_same_height_reader.cpp b/ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/dataflow/reshard_same_height_reader.cpp index 174f71e22b77..3b4c6c306166 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/dataflow/reshard_same_height_reader.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/dataflow/reshard_same_height_reader.cpp @@ -7,6 +7,7 @@ void kernel_main() { constexpr uint32_t shard_cb_id = get_compile_time_arg_val(0); + constexpr bool read_from_dram = 
get_compile_time_arg_val(1); const uint32_t total_num_sticks = get_arg_val(0); const uint32_t local_stride_bytes = get_arg_val(1); @@ -25,10 +26,9 @@ void kernel_main() { uint32_t write_offset = args[args_idx++]; uint32_t l1_write_addr = base_write_addr + write_offset; - uint32_t x_coord = args[args_idx++]; - uint32_t y_coord = args[args_idx++]; + uint32_t bank_id = args[args_idx++]; uint32_t read_offset = base_read_addr + args[args_idx++]; - uint64_t noc_read_addr = get_noc_addr(x_coord, y_coord, read_offset); + uint64_t noc_read_addr = get_noc_addr_from_bank_id(bank_id, read_offset); for (uint32_t j = 0; j < total_num_sticks; ++j) { noc_async_read(noc_read_addr, l1_write_addr, read_size); diff --git a/ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/dataflow/reshard_same_height_writer.cpp b/ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/dataflow/reshard_same_height_writer.cpp index a52753903a59..ee573927dba8 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/dataflow/reshard_same_height_writer.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/dataflow/reshard_same_height_writer.cpp @@ -7,6 +7,7 @@ void kernel_main() { constexpr uint32_t shard_cb_id = get_compile_time_arg_val(0); + constexpr bool write_to_dram = get_compile_time_arg_val(1); const uint32_t total_num_sticks = get_arg_val(0); const uint32_t local_stride_bytes = get_arg_val(1); @@ -25,10 +26,9 @@ void kernel_main() { uint32_t read_offset = args[args_idx++]; uint32_t l1_read_addr = base_l1_read_addr + read_offset; - uint32_t x_coord = args[args_idx++]; - uint32_t y_coord = args[args_idx++]; + uint32_t bank_id = args[args_idx++]; uint32_t write_offset = base_write_addr + args[args_idx++]; - uint64_t noc_write_addr = get_noc_addr(x_coord, y_coord, write_offset); + uint64_t noc_write_addr = get_noc_addr_from_bank_id(bank_id, write_offset); for (uint32_t j = 0; j < total_num_sticks; ++j) { noc_async_write(l1_read_addr, 
noc_write_addr, write_size); diff --git a/ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/dataflow/reshard_same_width_reader.cpp b/ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/dataflow/reshard_same_width_reader.cpp index 64871ad90e85..835773e8a0a3 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/dataflow/reshard_same_width_reader.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/dataflow/reshard_same_width_reader.cpp @@ -6,7 +6,9 @@ #include "dataflow_api.h" void kernel_main() { - constexpr uint32_t shard_cb_id = get_compile_time_arg_val(0); + + constexpr uint32_t shard_cb_id = get_compile_time_arg_val(0); + constexpr bool read_from_dram = get_compile_time_arg_val(1); uint32_t src_addr = get_arg_val(0); uint32_t write_offset = get_arg_val(1); @@ -16,12 +18,10 @@ void kernel_main() { uint32_t l1_write_addr = get_write_ptr(shard_cb_id) + write_offset; for (uint32_t i = 0; i < num_reads; ++i) { - uint32_t x_coord = args[args_idx++]; - uint32_t y_coord = args[args_idx++]; + uint32_t bank_id = args[args_idx++]; uint32_t addr = src_addr + args[args_idx++]; - uint64_t src_noc_addr = get_noc_addr(x_coord, y_coord, addr); uint32_t read_size = args[args_idx++]; - noc_async_read(src_noc_addr, l1_write_addr, read_size); + noc_async_read(get_noc_addr_from_bank_id(bank_id, addr), l1_write_addr, read_size); l1_write_addr += read_size; } noc_async_read_barrier(); diff --git a/ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/dataflow/reshard_same_width_writer.cpp b/ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/dataflow/reshard_same_width_writer.cpp index 396173cac789..4c1f4f9d59f7 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/dataflow/reshard_same_width_writer.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/dataflow/reshard_same_width_writer.cpp @@ -6,7 +6,9 @@ #include "dataflow_api.h" void kernel_main() { - constexpr 
uint32_t shard_cb_id = get_compile_time_arg_val(0); + + constexpr uint32_t shard_cb_id = get_compile_time_arg_val(0); + constexpr bool write_to_dram = get_compile_time_arg_val(1); uint32_t dst_addr = get_arg_val(0); uint32_t read_offset = get_arg_val(1); @@ -16,12 +18,10 @@ void kernel_main() { uint32_t l1_read_addr = get_read_ptr(shard_cb_id) + read_offset; for (uint32_t i = 0; i < num_writes; ++i) { - uint32_t x_coord = args[args_idx++]; - uint32_t y_coord = args[args_idx++]; + uint32_t bank_id = args[args_idx++]; uint32_t addr = dst_addr + args[args_idx++]; - uint64_t dst_noc_addr = get_noc_addr(x_coord, y_coord, addr); uint32_t write_size = args[args_idx++]; - noc_async_write(l1_read_addr, dst_noc_addr, write_size); + noc_async_write(l1_read_addr, get_noc_addr_from_bank_id(bank_id, addr), write_size); l1_read_addr += write_size; } noc_async_write_barrier(); diff --git a/ttnn/cpp/ttnn/operations/data_movement/sharded/reshard/device/reshard_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/sharded/reshard/device/reshard_program_factory.cpp index 713e8aebb4e6..8ba24ad224da 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/sharded/reshard/device/reshard_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/sharded/reshard/device/reshard_program_factory.cpp @@ -341,11 +341,18 @@ operation::ProgramWithCallbacks reshard_multi_core_same_width(const Tensor& inpu ? 
"ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/dataflow/reshard_same_width_reader.cpp" : "ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/dataflow/reshard_same_width_writer.cpp"; - tt::tt_metal::KernelHandle kernel_id_0 = - tt::tt_metal::CreateKernel(program, kernel_name, all_cores, tt::tt_metal::ReaderDataMovementConfig({cb_index})); + bool interface_with_dram = (remote_core_type == CoreType::DRAM); + tt::tt_metal::KernelHandle kernel_id_0 = tt::tt_metal::CreateKernel( + program, + kernel_name, + all_cores, + tt::tt_metal::ReaderDataMovementConfig({cb_index, interface_with_dram})); - tt::tt_metal::KernelHandle kernel_id_1 = - tt::tt_metal::CreateKernel(program, kernel_name, all_cores, tt::tt_metal::WriterDataMovementConfig({cb_index})); + tt::tt_metal::KernelHandle kernel_id_1 = tt::tt_metal::CreateKernel( + program, + kernel_name, + all_cores, + tt::tt_metal::WriterDataMovementConfig({cb_index, interface_with_dram})); tt::tt_metal::CircularBufferConfig cb_config = tt::tt_metal::CircularBufferConfig(total_size, {{cb_index, data_format}}) @@ -359,7 +366,6 @@ operation::ProgramWithCallbacks reshard_multi_core_same_width(const Tensor& inpu auto remote_buffer_type = remote_tensor.buffer()->buffer_type(); auto bank_id = device->bank_ids_from_logical_core(remote_buffer_type, remote_cores[remote_core_idx])[0]; uint32_t bank_offset = device->bank_offset(remote_buffer_type, bank_id); - auto remote_core = device->physical_core_from_logical_core(remote_cores[remote_core_idx], remote_core_type); std::array kernels = {kernel_id_0, kernel_id_1}; uint32_t local_units_left = num_units; @@ -382,17 +388,13 @@ operation::ProgramWithCallbacks reshard_multi_core_same_width(const Tensor& inpu bank_id = device->bank_ids_from_logical_core(remote_buffer_type, remote_cores[remote_core_idx])[0]; bank_offset = device->bank_offset(remote_buffer_type, bank_id); - remote_core = - device->physical_core_from_logical_core(remote_cores[remote_core_idx], 
remote_core_type); } uint32_t units_to_transfer = std::min(remote_core_units_rem, local_units_to_transfer); - auto remote_core = - device->physical_core_from_logical_core(remote_cores[remote_core_idx], remote_core_type); + bank_id = device->bank_ids_from_logical_core(remote_buffer_type, remote_cores[remote_core_idx])[0]; kernel_args.insert( kernel_args.end(), - {static_cast(remote_core.x), - static_cast(remote_core.y), - (remote_units_per_shard - remote_core_units_rem) * unit_size + bank_offset, + {bank_id, + (remote_units_per_shard - remote_core_units_rem) * unit_size, units_to_transfer * unit_size}); local_units_per_core -= units_to_transfer; local_units_to_transfer -= units_to_transfer; @@ -481,18 +483,17 @@ operation::ProgramWithCallbacks reshard_multi_core_generic(const Tensor& input, std::vector physical_core_coords; physical_core_coords.reserve(grid.x * grid.y); for (uint32_t i = 0; i < grid.x; i++) { - auto physical_input_core = device->physical_core_from_logical_core(CoreCoord(i, 0), input_core_type); + auto physical_input_core = device->virtual_core_from_logical_core(CoreCoord(i, 0), input_core_type); physical_core_coords.push_back(physical_input_core.x); } for (uint32_t i = 0; i < grid.y; i++) { - auto physical_input_core = device->physical_core_from_logical_core(CoreCoord(0, i), input_core_type); + auto physical_input_core = device->virtual_core_from_logical_core(CoreCoord(0, i), input_core_type); physical_core_coords.push_back(physical_input_core.y); } for (const auto& core : cores) { auto page_stride_vector = output_core_to_page_range_pair.at(core); uint32_t num_ranges = page_stride_vector.size(); - std::vector runtime_args = physical_core_coords; auto runtime_args_0 = get_runtime_args_for_given_ranges( physical_core_coords, page_stride_vector, @@ -540,8 +541,7 @@ operation::ProgramWithCallbacks reshard_multi_core_generic(const Tensor& input, struct WidthShardedRuntimeArgs { uint32_t write_size; uint32_t read_offset; - uint32_t x_coord; - uint32_t 
y_coord; + uint32_t bank_id; uint32_t write_offset; }; @@ -591,14 +591,10 @@ compute_width_sharded_reshard_runtime_args( auto bank_id = device->bank_ids_from_logical_core(remote_buffer_type, remote_cores[current_remote_core_idx])[0]; auto bank_offset = device->bank_offset(remote_buffer_type, bank_id); - const auto& remote_core = - device->physical_core_from_logical_core(remote_cores[current_remote_core_idx], remote_core_type); - core_args.emplace_back( element_size * transfer_size, element_size * local_shard_offset, - remote_core.x, - remote_core.y, + bank_id, element_size * remote_shard_offset + bank_offset); local_shard_offset += transfer_size; @@ -640,7 +636,7 @@ operation::ProgramWithCallbacks reshard_multi_core_same_height(const Tensor& inp const auto local_core_type = local_tensor.buffer()->core_type(); const auto remote_core_type = remote_tensor.buffer()->core_type(); - + bool interface_with_dram = (remote_core_type == CoreType::DRAM); const auto local_cores = corerange_to_cores( local_shard_spec.grid, std::nullopt, local_shard_spec.orientation == ShardOrientation::ROW_MAJOR); const auto remote_cores = corerange_to_cores( @@ -667,11 +663,11 @@ operation::ProgramWithCallbacks reshard_multi_core_same_height(const Tensor& inp ? 
"ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/dataflow/reshard_same_height_reader.cpp" : "ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/dataflow/reshard_same_height_writer.cpp"; - tt::tt_metal::KernelHandle kernel_id_0 = - tt::tt_metal::CreateKernel(program, kernel_name, all_cores, tt::tt_metal::ReaderDataMovementConfig({cb_index})); + tt::tt_metal::KernelHandle kernel_id_0 = tt::tt_metal::CreateKernel( + program, kernel_name, all_cores, tt::tt_metal::ReaderDataMovementConfig({cb_index, interface_with_dram})); - tt::tt_metal::KernelHandle kernel_id_1 = - tt::tt_metal::CreateKernel(program, kernel_name, all_cores, tt::tt_metal::WriterDataMovementConfig({cb_index})); + tt::tt_metal::KernelHandle kernel_id_1 = tt::tt_metal::CreateKernel( + program, kernel_name, all_cores, tt::tt_metal::WriterDataMovementConfig({cb_index, interface_with_dram})); uint32_t remote_address = remote_tensor.buffer()->address(); auto remote_buffer_type = remote_tensor.buffer()->buffer_type(); @@ -709,7 +705,7 @@ operation::ProgramWithCallbacks reshard_multi_core_same_height(const Tensor& inp args_for_all_segments.size()}; for (const auto& args : args_for_all_segments) { const std::vector segment_kernel_0 = { - args.write_size, args.read_offset, args.x_coord, args.y_coord, args.write_offset}; + args.write_size, args.read_offset, args.bank_id, args.write_offset}; runtime_args_0.insert(runtime_args_0.end(), segment_kernel_0.begin(), segment_kernel_0.end()); // Adjust read and write offsets to the correct stick address because we are splitting work across 2 kernels @@ -717,7 +713,7 @@ operation::ProgramWithCallbacks reshard_multi_core_same_height(const Tensor& inp const uint32_t adjusted_write_offset = args.write_offset + total_num_sticks_kernel_0 * remote_stride_bytes; const std::vector segment_kernel_1 = { - args.write_size, adjusted_read_offset, args.x_coord, args.y_coord, adjusted_write_offset}; + args.write_size, adjusted_read_offset, args.bank_id, 
adjusted_write_offset}; runtime_args_1.insert(runtime_args_1.end(), segment_kernel_1.begin(), segment_kernel_1.end()); } SetRuntimeArgs(program, kernel_id_0, local_cores[core_idx], runtime_args_0); diff --git a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_dram_sharded_program_factory.cpp b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_dram_sharded_program_factory.cpp index db90fe852634..f4bea017571c 100644 --- a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_dram_sharded_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_dram_sharded_program_factory.cpp @@ -22,381 +22,6 @@ namespace reuse_dram_sharded_optimized_helpers { using ttnn::operations::unary::UnaryOpType; using ttnn::operations::unary::UnaryWithParam; -void get_dram_reader_core_coords_grayskull( - tt::tt_metal::Device* device, CoreRangeSet& all_cores, std::vector& all_cores_ordered) { - // hardcoded for grayskull - uint32_t full_grid_size_y = 12; - - // get all the logical coord - auto compute_with_storage_grid_size = device->compute_with_storage_grid_size(); - uint32_t num_cores_x = compute_with_storage_grid_size.x; - uint32_t num_cores_y = compute_with_storage_grid_size.y; - - // get dram banks and coords - uint32_t num_banks = device->num_dram_channels(); - uint32_t max_bank_id = num_banks - 1; - std::vector dram_coord_phy; - for (int i = 0; i < num_banks; ++i) { - dram_coord_phy.push_back(device->dram_core_from_dram_channel(i)); - } - - // get worker logical coords - std::vector all_worker_cores_logical; - for (int i = 0; i < num_cores_x; ++i) { - for (int j = 0; j < num_cores_y; ++j) { - all_worker_cores_logical.push_back(CoreCoord(i, j)); - } - } - - // get y coords of the workers - std::vector all_worker_cores_y_physical; - uint32_t max_worker_y_physical = 0; - uint32_t min_worker_y_physical = 10000; - for (int i = 0; i < num_cores_y; ++i) { - auto core_phy = 
device->worker_core_from_logical_core(CoreCoord(0, i)); - all_worker_cores_y_physical.push_back(core_phy.y); - if (core_phy.y > max_worker_y_physical) { - max_worker_y_physical = core_phy.y; - } - if (core_phy.y < min_worker_y_physical) { - min_worker_y_physical = core_phy.y; - } - } - - // get the harvested rows, we treat dram and eth cores as harvested as well - std::vector harvested_rows; - for (int i = 0; i < full_grid_size_y; ++i) { - auto y = i; - - if (std::find(all_worker_cores_y_physical.begin(), all_worker_cores_y_physical.end(), y) == - all_worker_cores_y_physical.end()) { - harvested_rows.push_back(y); - } - } - - // get the ajacent cores of DRAM banks - std::vector adj_core_physical; - for (int i = 0; i < num_banks; ++i) { - auto dram_core = dram_coord_phy[i]; - uint32_t adj_core_x = dram_core.x; - uint32_t adj_core_y = dram_core.y + 1; - adj_core_physical.push_back(CoreCoord(adj_core_x, adj_core_y)); - } - - // move worker if they are in the harvested rows - for (auto& coord : adj_core_physical) { - auto y = coord.y; - - // if row is harvested, move core down by 1 - while (std::find(harvested_rows.begin(), harvested_rows.end(), y) != harvested_rows.end() and - y < (full_grid_size_y - 1)) { - y += 1; - } - - coord.y = y; - } - - // find the logical coord from physical coord - std::vector adj_core_logical_realloc; - for (int i = 0; i < adj_core_physical.size(); ++i) { - for (int j = 0; j < all_worker_cores_logical.size(); ++j) { - auto core = device->worker_core_from_logical_core(all_worker_cores_logical[j]); - if (adj_core_physical[i] == core) { - adj_core_logical_realloc.push_back(all_worker_cores_logical[j]); - } - } - } - - // create sets - std::set all_cores_set; - for (int i = 0; i < num_banks; ++i) { - all_cores_set.insert(CoreRange(adj_core_logical_realloc[i])); - } - all_cores = CoreRangeSet(all_cores_set); - all_cores_ordered = adj_core_logical_realloc; -} - -void get_dram_reader_core_coords_wormhole_b0( - tt::tt_metal::Device* device, 
CoreRangeSet& all_cores, std::vector& all_cores_ordered) { - // hardcoded for wh_b0 - uint32_t full_grid_size_y = 12; - uint32_t x_step = 3; - - // get all the logical coord - auto compute_with_storage_grid_size = device->compute_with_storage_grid_size(); - uint32_t num_cores_x = compute_with_storage_grid_size.x; - uint32_t num_cores_y = compute_with_storage_grid_size.y; - - // get dram banks and coords - uint32_t num_banks = device->num_dram_channels(); - uint32_t max_bank_id = num_banks - 1; - std::vector dram_coord_phy; - dram_coord_phy.reserve(num_banks); - for (int i = 0; i < num_banks; ++i) { - dram_coord_phy.push_back(device->dram_core_from_dram_channel(i)); - } - - // get worker logical coords - std::vector all_worker_cores_logical; - all_worker_cores_logical.reserve(num_cores_x * num_cores_y); - for (int i = 0; i < num_cores_x; ++i) { - for (int j = 0; j < num_cores_y; ++j) { - all_worker_cores_logical.push_back(CoreCoord(i, j)); - } - } - - // get y coords of the workers - std::vector all_worker_cores_y_physical; - all_worker_cores_y_physical.reserve(num_cores_y); - uint32_t max_worker_y_physical = 0; - uint32_t min_worker_y_physical = 10000; - for (int i = 0; i < num_cores_y; ++i) { - auto core_phy = device->worker_core_from_logical_core(CoreCoord(0, i)); - all_worker_cores_y_physical.push_back(core_phy.y); - if (core_phy.y > max_worker_y_physical) { - max_worker_y_physical = core_phy.y; - } - if (core_phy.y < min_worker_y_physical) { - min_worker_y_physical = core_phy.y; - } - } - - // get the harvested rows, we treat dram and eth cores as harvested as well - std::vector harvested_rows; - for (int i = 0; i < full_grid_size_y; ++i) { - auto y = i; - - if (std::find(all_worker_cores_y_physical.begin(), all_worker_cores_y_physical.end(), y) == - all_worker_cores_y_physical.end()) { - harvested_rows.push_back(y); - } - } - - // get the ajacent cores of DRAM banks - std::vector adj_core_physical; - adj_core_physical.reserve(num_banks); - for (int i = 0; i < 
num_banks; ++i) { - auto dram_core = dram_coord_phy[i]; - uint32_t adj_core_x = dram_core.x + 1; - uint32_t adj_core_y = dram_core.y; - adj_core_physical.push_back(CoreCoord(adj_core_x, adj_core_y)); - } - - // split the adjacent coords into two groups, because DRAM banks has two cols - std::vector adj_core_physical_g1; - adj_core_physical_g1.reserve(num_banks); - std::vector adj_core_physical_y_g1; - adj_core_physical_y_g1.reserve(num_banks); - std::vector adj_core_physical_g2; - adj_core_physical_g2.reserve(num_banks); - std::vector adj_core_physical_y_g2; - adj_core_physical_y_g2.reserve(num_banks); - for (auto core : adj_core_physical) { - if (core.x == adj_core_physical.front().x) { - adj_core_physical_g1.push_back(core); - } else { - adj_core_physical_g2.push_back(core); - } - } - std::vector indices_g1(adj_core_physical_g1.size()); - std::vector indices_g2(adj_core_physical_g2.size()); - std::iota(indices_g1.begin(), indices_g1.end(), 0); - std::iota(indices_g2.begin(), indices_g2.end(), 0); - std::sort(indices_g1.begin(), indices_g1.end(), [&adj_core_physical_g1](int i1, int i2) { - return adj_core_physical_g1[i1].y < adj_core_physical_g1[i2].y; - }); - std::sort(indices_g2.begin(), indices_g2.end(), [&adj_core_physical_g2](int i1, int i2) { - return adj_core_physical_g2[i1].y < adj_core_physical_g2[i2].y; - }); - std::rotate(indices_g1.begin(), indices_g1.end() - 1, indices_g1.end()); - std::rotate(indices_g2.begin(), indices_g2.end() - 1, indices_g2.end()); - - std::vector indices_g1_realloc(adj_core_physical_g1.size()); - std::vector indices_g2_realloc(adj_core_physical_g2.size()); - for (int new_index = 0; new_index < indices_g1.size(); ++new_index) { - indices_g1_realloc[indices_g1[new_index]] = new_index; - } - for (int new_index = 0; new_index < indices_g2.size(); ++new_index) { - indices_g2_realloc[indices_g2[new_index]] = new_index; - } - - std::sort(adj_core_physical_g1.begin(), adj_core_physical_g1.end(), [](const CoreCoord& a, const CoreCoord& 
b) { - return a.y < b.y; - }); - std::sort(adj_core_physical_g2.begin(), adj_core_physical_g2.end(), [](const CoreCoord& a, const CoreCoord& b) { - return a.y < b.y; - }); - std::rotate(adj_core_physical_g1.begin(), adj_core_physical_g1.end() - 1, adj_core_physical_g1.end()); - std::rotate(adj_core_physical_g2.begin(), adj_core_physical_g2.end() - 1, adj_core_physical_g2.end()); - - for (auto core : adj_core_physical_g1) { - adj_core_physical_y_g1.push_back(core.y); - } - for (auto core : adj_core_physical_g2) { - adj_core_physical_y_g2.push_back(core.y); - } - - // move the workers, if they are on harvested rows - auto process_group = [&](std::vector& group, std::vector& group_y, uint32_t x_step) { - for (auto& coord : group) { - auto y = coord.y; - - if (std::find(harvested_rows.begin(), harvested_rows.end(), y) != harvested_rows.end() || - std::count(group_y.begin(), group_y.end(), y) >= 2) { - auto adjust_coord = [&](int start, int end, int step) { - bool found_new_row = false; - for (int j = start; step > 0 ? j <= end : j >= end; j += step) { - if (std::find(harvested_rows.begin(), harvested_rows.end(), j) == harvested_rows.end() && - std::count(group_y.begin(), group_y.end(), j) == 0) { - coord.y = j; - coord.x += x_step; - x_step--; - found_new_row = true; - break; - } - } - if (not found_new_row) { - for (int j = start; step > 0 ? 
j <= end : j >= end; j += step) { - if (std::find(harvested_rows.begin(), harvested_rows.end(), j) == harvested_rows.end()) { - coord.y = j; - coord.x += x_step; - x_step--; - found_new_row = true; - break; - } - } - } - }; - - if (y >= max_bank_id) { - adjust_coord(max_worker_y_physical, min_worker_y_physical, -1); - } else { - adjust_coord(min_worker_y_physical, max_worker_y_physical, 1); - } - } - } - }; - // move the workers, if they are on harvested rows - process_group(adj_core_physical_g1, adj_core_physical_y_g1, x_step); - process_group(adj_core_physical_g2, adj_core_physical_y_g2, x_step); - - // merge two group into one - std::vector adj_core_physical_realloc; - adj_core_physical_realloc.reserve(num_banks); - for (int i = 0; i < indices_g1_realloc.size(); ++i) { - adj_core_physical_realloc.push_back(adj_core_physical_g1[indices_g1_realloc[i]]); - } - for (int i = 0; i < indices_g2_realloc.size(); ++i) { - adj_core_physical_realloc.push_back(adj_core_physical_g2[indices_g2_realloc[i]]); - } - - // find the logical coord from physical coord - std::vector adj_core_logical_realloc; - adj_core_logical_realloc.reserve(num_banks); - for (int i = 0; i < adj_core_physical_realloc.size(); ++i) { - for (int j = 0; j < all_worker_cores_logical.size(); ++j) { - auto core = device->worker_core_from_logical_core(all_worker_cores_logical[j]); - if (adj_core_physical_realloc[i] == core) { - adj_core_logical_realloc.push_back(all_worker_cores_logical[j]); - } - } - } - - // create sets - std::set all_cores_set; - for (int i = 0; i < num_banks; ++i) { - all_cores_set.insert(CoreRange(adj_core_logical_realloc[i])); - } - all_cores = CoreRangeSet(all_cores_set); - all_cores_ordered = adj_core_logical_realloc; -} - -void get_dram_reader_core_coords_blackhole( - tt_metal::Device* device, CoreRangeSet& all_cores, std::vector& all_cores_ordered) { - // hardcoded for blackhole - uint32_t full_grid_size_x = 17; - - // get all the logical coord - auto compute_with_storage_grid_size 
= device->compute_with_storage_grid_size(); - uint32_t num_cores_x = compute_with_storage_grid_size.x; - uint32_t num_cores_y = compute_with_storage_grid_size.y; - - // get dram banks and coords - uint32_t num_banks = device->num_dram_channels(); - uint32_t max_bank_id = num_banks - 1; - std::vector dram_coord_phy; - for (int i = 0; i < num_banks; ++i) { - dram_coord_phy.push_back(device->dram_core_from_dram_channel(i)); - } - - // get worker logical coords - std::vector all_worker_cores_logical; - for (int i = 0; i < num_cores_x; ++i) { - for (int j = 0; j < num_cores_y; ++j) { - all_worker_cores_logical.push_back(CoreCoord(i, j)); - } - } - - // get x coords of the workers - std::vector all_worker_cores_x_physical; - for (int i = 0; i < num_cores_x; ++i) { - auto core_phy = device->worker_core_from_logical_core(CoreCoord(i, 0)); - all_worker_cores_x_physical.push_back(core_phy.x); - } - - // get the harvested cols, we treat dram and eth cores as harvested as well - std::vector harvested_cols; - for (int i = 0; i < full_grid_size_x; ++i) { - auto x = i; - - if (std::find(all_worker_cores_x_physical.begin(), all_worker_cores_x_physical.end(), x) == - all_worker_cores_x_physical.end()) { - harvested_cols.push_back(x); - } - } - - // get the ajacent cores of DRAM banks - std::vector adj_core_physical; - for (int i = 0; i < num_banks; ++i) { - auto dram_core = dram_coord_phy[i]; - uint32_t adj_core_x = dram_core.x + 1; - uint32_t adj_core_y = dram_core.y; - adj_core_physical.push_back(CoreCoord(adj_core_x, adj_core_y)); - } - - // move worker if they are in the harvested cols - for (auto& coord : adj_core_physical) { - auto x = coord.x; - - // if col is harvested, move core right by 1 - while (std::find(harvested_cols.begin(), harvested_cols.end(), x) != harvested_cols.end() and - x < (full_grid_size_x - 1)) { - x += 1; - } - - coord.x = x; - } - - // find the logical coord from physical coord - std::vector adj_core_logical_realloc; - for (int i = 0; i < 
adj_core_physical.size(); ++i) { - for (int j = 0; j < all_worker_cores_logical.size(); ++j) { - auto core = device->worker_core_from_logical_core(all_worker_cores_logical[j]); - if (adj_core_physical[i] == core) { - adj_core_logical_realloc.push_back(all_worker_cores_logical[j]); - } - } - } - - // create sets - std::set all_cores_set; - for (int i = 0; i < num_banks; ++i) { - all_cores_set.insert(CoreRange(adj_core_logical_realloc[i])); - } - all_cores = CoreRangeSet(all_cores_set); - all_cores_ordered = adj_core_logical_realloc; -} - void get_max_page_size_and_num_pages(uint32_t num_tiles, uint32_t tile_size, uint32_t& page_size, uint32_t& num_pages) { uint64_t total_size = static_cast(num_tiles) * tile_size; @@ -419,6 +44,15 @@ void move_common_entries(std::vector& v1, std::vector& v2, } } +void get_optimal_dram_bank_to_reader_assignment(Device* device, std::vector& all_worker_cores_ordered, CoreRangeSet& all_worker_cores) { + all_worker_cores_ordered = device->get_optimal_dram_bank_to_logical_worker_assignment(); + std::set all_cores_set; + for (const auto& worker_core : all_worker_cores_ordered) { + all_cores_set.insert(CoreRange(worker_core)); + } + all_worker_cores = CoreRangeSet(all_cores_set); +} + operation::ProgramWithCallbacks create_program_dram_sharded( tt::tt_metal::Device* device, const CoreRangeSet& all_storage_cores, @@ -463,18 +97,9 @@ operation::ProgramWithCallbacks create_program_dram_sharded( tt_metal::Program program{}; // get the dram readers - CoreRangeSet all_worker_cores; std::vector all_worker_cores_ordered; - - if (device->arch() == tt::ARCH::WORMHOLE_B0) { - get_dram_reader_core_coords_wormhole_b0(device, all_worker_cores, all_worker_cores_ordered); - } else if (device->arch() == tt::ARCH::GRAYSKULL) { - get_dram_reader_core_coords_grayskull(device, all_worker_cores, all_worker_cores_ordered); - } else if (device->arch() == tt::ARCH::BLACKHOLE) { - get_dram_reader_core_coords_blackhole(device, all_worker_cores, 
all_worker_cores_ordered); - } else { - TT_THROW("Device not supported"); - } + CoreRangeSet all_worker_cores; + get_optimal_dram_bank_to_reader_assignment(device, all_worker_cores_ordered, all_worker_cores); // dram banks uint32_t num_dram_banks = all_worker_cores_ordered.size(); From d2065c6381ed8ef70644b54207e806fc5a2268ae Mon Sep 17 00:00:00 2001 From: Jack Date: Mon, 9 Dec 2024 02:03:34 +0000 Subject: [PATCH 37/59] #0: Refactore flash decode kernel for future scaling --- .../device/kernels/compute/compute_common.hpp | 616 ++++++++++++++++++ .../kernels/compute/sdpa_flash_decode.cpp | 531 ++------------- .../kernels/dataflow/dataflow_common.hpp | 304 +++++++++ .../kernels/dataflow/reader_decode_all.cpp | 54 +- .../kernels/dataflow/writer_decode_all.cpp | 232 +------ .../device/{ => kernels}/rt_args_common.hpp | 0 6 files changed, 967 insertions(+), 770 deletions(-) create mode 100644 ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/kernels/compute/compute_common.hpp create mode 100644 ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/kernels/dataflow/dataflow_common.hpp rename ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/{ => kernels}/rt_args_common.hpp (100%) diff --git a/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/kernels/compute/compute_common.hpp b/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/kernels/compute/compute_common.hpp new file mode 100644 index 000000000000..4e3b7dff1fb3 --- /dev/null +++ b/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/kernels/compute/compute_common.hpp @@ -0,0 +1,616 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#define REDUCE_OP (PoolType::MAX) +#define REDUCE_DIM (ReduceDim::REDUCE_ROW) + +#include "compute_kernel_api.h" +#include "compute_kernel_api/eltwise_binary.h" +#include "compute_kernel_api/eltwise_unary/exp.h" +#include "compute_kernel_api/eltwise_unary/recip.h" +#include "compute_kernel_api/bcast.h" +#include "compute_kernel_api/tile_move_copy.h" +#include "compute_kernel_api/matmul.h" +#include "compute_kernel_api/reduce.h" + +/****************************************************************************** + * * + * Common Functions for Compute Kernels * + * * + ******************************************************************************/ + +/****************************************************************************** + * Generic Compute Functions * + ******************************************************************************/ +void max_block_inplace(uint32_t in0, uint32_t in1, uint32_t num_tiles) { + // inputs come in full, outputs go out full + copy_tile_to_dst_init_short(in0); + max_tile_init(); + + constexpr uint32_t dst_reg_0 = 0; + constexpr uint32_t dst_reg_1 = 1; + cb_wait_front(in0, num_tiles); + cb_wait_front(in1, num_tiles); + for (uint32_t i = 0; i < num_tiles; ++i) { + acquire_dst(); + copy_tile(in0, 0, dst_reg_0); + copy_tile(in1, i, dst_reg_1); + cb_pop_front(in0, 1); + cb_reserve_back(in0, 1); + max_tile(dst_reg_0, dst_reg_1); + pack_tile(dst_reg_0, in0); + cb_push_back(in0, 1); + release_dst(); + } +} + +void max_block(uint32_t in0, uint32_t in1, uint32_t out_cb, uint32_t num_tiles) { + // inputs come in full, outputs go out full + copy_tile_to_dst_init_short(in0); + max_tile_init(); + + constexpr uint32_t dst_reg_0 = 0; + constexpr uint32_t dst_reg_1 = 1; + cb_wait_front(in0, num_tiles); + cb_wait_front(in1, num_tiles); + cb_reserve_back(out_cb, num_tiles); + for (uint32_t i = 0; i < num_tiles; ++i) { + acquire_dst(); + copy_tile(in0, i, dst_reg_0); + copy_tile(in1, i, 
dst_reg_1); + max_tile(dst_reg_0, dst_reg_1); + pack_tile(dst_reg_0, out_cb, i); + release_dst(); + } + cb_push_back(out_cb, num_tiles); +} + +template < + PoolType pool_type, + ReduceDim reduce_dim, + uint32_t in0_cb, + uint32_t scale_cb, + uint32_t out_cb, + uint32_t rows, + uint32_t cols> +void reduce_c() { + // Precondition: in0_cb has rows*cols produced. in0_cb has tiles in row-major order + // Precondition: scale_cb has 1 produced + // Precondition: out_cb has rows free + // Postcondition: in0_cb has rows*cols produced + // Precondition: scale_cb has 1 produced + // Postcondition: out_cb has rows produced + + reduce_init_delta(in0_cb, scale_cb, out_cb); + + const uint32_t num_tiles = rows * cols; + cb_wait_front(scale_cb, 1); + cb_wait_front(in0_cb, num_tiles); + cb_reserve_back(out_cb, rows); + + constexpr uint32_t reduce_dst_idx = 0; + + for (uint32_t i = 0; i < rows; i++) { + acquire_dst(); + for (uint32_t j = 0; j < cols; j++) { + reduce_tile(in0_cb, scale_cb, i * cols + j, 0, reduce_dst_idx); + } + + cb_reserve_back(out_cb, 1); + pack_tile(reduce_dst_idx, out_cb); + cb_push_back(out_cb, 1); + release_dst(); + } + + reduce_revert_delta(out_cb); +} + +void recip_block_inplace(uint32_t in_cb, uint32_t num_tiles) { + // Precondition: in_cb has num_tiles produced + // Postcondition: in_cb has num_tiles produced + copy_tile_to_dst_init_short(in_cb); + recip_tile_init(); + + cb_wait_front(in_cb, num_tiles); + for (uint32_t i = 0; i < num_tiles; ++i) { + acquire_dst(); + copy_tile(in_cb, 0, 0); + cb_pop_front(in_cb, 1); + recip_tile(0); + cb_reserve_back(in_cb, 1); + pack_tile(0, in_cb); + cb_push_back(in_cb, 1); + release_dst(); + } +} + +void sub_exp_block_bcast_cols_inplace(uint32_t in0_cb, uint32_t in1_cb, uint32_t rows, uint32_t cols) { + // Precondition: in0_cb has rows*cols produced + // Precondition: in1_cb has rows produced + // Postcondition: in0_cb has rows*cols produced + // Postcondition: in1_cb has rows produced + + 
sub_bcast_cols_init_short(in0_cb, in1_cb); + exp_tile_init(); + cb_wait_front(in0_cb, rows * cols); + cb_wait_front(in1_cb, rows); + + constexpr uint32_t dst_tiles = SUB_EXP_GRANULARITY; + uint32_t granularity = cols >> LOG2_SUB_EXP_GRANULARITY; + for (uint32_t i = 0; i < rows; ++i) { + for (uint32_t u = 0; u < granularity; u++) { + tile_regs_acquire(); + for (uint32_t j = 0; j < dst_tiles; ++j) { + sub_tiles_bcast_cols(in0_cb, in1_cb, j, i, j); + exp_tile(j); + } + tile_regs_commit(); + cb_pop_front(in0_cb, dst_tiles); + cb_reserve_back(in0_cb, dst_tiles); + tile_regs_wait(); + for (uint32_t j = 0; j < dst_tiles; ++j) { + pack_tile(j, in0_cb); + } + cb_push_back(in0_cb, dst_tiles); + tile_regs_release(); + } + } +} + +void mul_block_bcast_cols_inplace(uint32_t in0_cb, uint32_t in1_cb, uint32_t rows, uint32_t cols) { + // Precondition: in0_cb has rows*cols produced + // Precondition: in1_cb has rows produced + // Postcondition: in0_cb has rows*cols produced + // Postcondition: in1_cb has rows consumed + + uint32_t num_tiles = rows * cols; + mul_bcast_cols_init_short(in0_cb, in1_cb); + cb_wait_front(in0_cb, num_tiles); + cb_wait_front(in1_cb, rows); + for (uint32_t i = 0; i < rows; ++i) { + for (uint32_t j = 0; j < cols; ++j) { + acquire_dst(); + mul_tiles_bcast_cols(in0_cb, in1_cb, 0, i, 0); + cb_pop_front(in0_cb, 1); + cb_reserve_back(in0_cb, 1); + pack_tile(0, in0_cb); + cb_push_back(in0_cb, 1); + release_dst(); + } + } + cb_pop_front(in1_cb, rows); +} + +void mul_block_bcast_scalar_inplace(uint32_t in0_cb, uint32_t in1_scalar_cb, uint32_t num_tiles) { + // Precondition: in0_cb has num_tiles produced + // Precondition: in1_scalar_cb has 1 produced + // Postcondition: in0_cb has num_tiles produced + // Postcondition: in1_scalar_cb has 1 produced + + constexpr uint32_t dst_tiles = MUL_BCAST_GRANULARITY; + uint32_t granularity = num_tiles >> LOG2_MUL_BCAST_GRANULARITY; + reconfig_data_format(in0_cb, in1_scalar_cb); + mul_tiles_bcast_scalar_init_short(); + 
cb_wait_front(in0_cb, num_tiles); + cb_wait_front(in1_scalar_cb, 1); + for (uint32_t g = 0; g < granularity; ++g) { + acquire_dst(); + for (uint32_t i = 0; i < dst_tiles; ++i) { + mul_tiles_bcast_scalar(in0_cb, in1_scalar_cb, i, 0, i); + } + cb_pop_front(in0_cb, dst_tiles); + cb_reserve_back(in0_cb, dst_tiles); + for (uint32_t i = 0; i < dst_tiles; ++i) { + pack_tile(i, in0_cb); + } + cb_push_back(in0_cb, dst_tiles); + release_dst(); + } +} + +template +void add_block_inplace(uint32_t in0_cb, uint32_t in1_cb, uint32_t num_tiles) { + // Precondition: in0_cb and in1_cb have num_tiles produced + // Postcondition: in0_cb has num_tiles produced + // Postcondition: in1_cb has num_tiles consumed + + add_tiles_init(); + cb_wait_front(in0_cb, num_tiles); + cb_wait_front(in1_cb, num_tiles); + for (uint32_t i = 0; i < num_tiles; i++) { + acquire_dst(); + add_tiles(in0_cb, in1_cb, 0, i, 0); + cb_pop_front(in0_cb, 1); + cb_reserve_back(in0_cb, 1); + pack_tile(0, in0_cb); + cb_push_back(in0_cb, 1); + release_dst(); + } + if (pop_in1) { + cb_pop_front(in1_cb, num_tiles); + } +} + +void add_block(uint32_t in0_cb, uint32_t in1_cb, uint32_t out_cb, uint32_t num_tiles) { + // Precondition: in0_cb and in1_cb have num_tiles produced + // Postcondition: in0_cb has num_tiles produced + // Postcondition: in1_cb has num_tiles consumed + + add_tiles_init(); + cb_wait_front(in0_cb, num_tiles); + cb_wait_front(in1_cb, num_tiles); + cb_reserve_back(out_cb, num_tiles); + for (uint32_t i = 0; i < num_tiles; i++) { + acquire_dst(); + add_tiles(in0_cb, in1_cb, i, i, 0); + pack_tile(0, out_cb, i); + release_dst(); + } + cb_push_back(out_cb, num_tiles); + + cb_pop_front(in0_cb, num_tiles); + cb_pop_front(in1_cb, num_tiles); +} + +void mul_block_inplace(uint32_t in0_cb, uint32_t in1_cb, uint32_t num_tiles) { + // Precondition: in0_cb and in1_cb have num_tiles produced + // Postcondition: in0_cb has num_tiles produced + // Postcondition: in1_cb has num_tiles produced + + mul_tiles_init(); + 
cb_wait_front(in0_cb, num_tiles); + cb_wait_front(in1_cb, num_tiles); + for (uint32_t i = 0; i < num_tiles; i++) { + acquire_dst(); + mul_tiles(in0_cb, in1_cb, 0, i, 0); + cb_pop_front(in0_cb, 1); + cb_reserve_back(in0_cb, 1); + pack_tile(0, in0_cb); + cb_push_back(in0_cb, 1); + release_dst(); + } +} + +void sub_exp_block(uint32_t in0_cb, uint32_t in1_cb, uint32_t out_cb, uint32_t num_tiles) { + // Precondition: in0_cb and in1_cb have num_tiles produced + // Postcondition: out_cb has num_tiles produced + // Postcondition: in0_cb and in1_cb has num_tiles produced + sub_tiles_init(); + exp_tile_init(); + cb_wait_front(in0_cb, num_tiles); + cb_wait_front(in1_cb, num_tiles); + cb_reserve_back(out_cb, num_tiles); + + for (uint32_t i = 0; i < num_tiles; i++) { + acquire_dst(); + sub_tiles(in0_cb, in1_cb, i, i, 0); + exp_tile(0); + pack_tile(0, out_cb); + cb_push_back(out_cb, 1); + release_dst(); + } +} + +void copy_block(uint32_t in_cb, uint32_t out_cb, uint32_t num_tiles) { + // Precondition: in_cb has num_tiles produced + // Precondition: out_cb has num_tiles free + // Postcondition: in_cb has num_tiles consumed + // Postcondition: out_cb has num_tiles produced + + copy_tile_to_dst_init_short(in_cb); + + cb_wait_front(in_cb, num_tiles); + cb_reserve_back(out_cb, num_tiles); + +#pragma GCC unroll 0 + for (uint32_t i = 0; i < num_tiles; i++) { + acquire_dst(); + copy_tile(in_cb, i, 0 /*dst*/); + pack_tile(0, out_cb); + cb_push_back(out_cb, 1); + release_dst(); + } + cb_pop_front(in_cb, num_tiles); +} + +ALWI void cb_matmul_blocks( + const uint32_t& in0_cb, + const uint32_t& in1_cb, + const uint32_t& out_cb, + const uint32_t& M, + const uint32_t& N, + const uint32_t& K, + const uint32_t& num_blocks, + const uint32_t& in0_num_subblocks, + const uint32_t& in1_num_subblocks, + const uint32_t& in0_block_w, + const uint32_t& subblock_h, + const uint32_t& subblock_w, + const bool& transpose) { + // precondition: in0_cb has M*K produced + // preconditino: in1_cb has K*N produced 
+ // postcondition: in0_cb is full, in1_cb is empty + // postcondition: out_cb has M*N produced + + mm_block_init_short( + in0_cb, in1_cb, transpose /*transpose*/, subblock_w /*ct_dim*/, subblock_h /*rt_dim*/, in0_block_w /*kt_dim*/); + + reconfig_data_format(in1_cb, in0_cb); + cb_wait_front(in1_cb, K * N); + + uint32_t output_num_tiles = M * N; + uint32_t out_subblock_num_tiles = subblock_h * subblock_w; + uint32_t in0_index_offset = 0; + + for (uint32_t in0_subblock = 0; in0_subblock < in0_num_subblocks; ++in0_subblock) { + uint32_t in1_index_offset = 0; + for (uint32_t in1_subblock = 0; in1_subblock < in1_num_subblocks; ++in1_subblock) { + tile_regs_acquire(); + + uint32_t dst_index = 0; + uint32_t in0_index = in0_index_offset; + uint32_t in1_index = in1_index_offset; + + for (uint32_t inner_dim = 0; inner_dim < in0_block_w; inner_dim++) { + matmul_block( + in0_cb, in1_cb, in0_index, in1_index, dst_index, transpose, subblock_w, subblock_h, in0_block_w); + in0_index++; + in1_index += N; + } + tile_regs_commit(); + + cb_reserve_back(out_cb, out_subblock_num_tiles); + tile_regs_wait(); + for (uint32_t i = 0; i < out_subblock_num_tiles; i++) { + pack_tile(i, out_cb); + } + tile_regs_release(); + cb_push_back(out_cb, out_subblock_num_tiles); + // in1_index_offset += in1_subblock * subblock_w; + // in1_index_offset = (in1_subblock+1) * subblock_w; + in1_index_offset += subblock_w; + } + in0_index_offset += subblock_h * in0_block_w; + } + cb_pop_front(in1_cb, K * N); +} + +/****************************************************************************** + * Flash Decode Functions * + ******************************************************************************/ + +/** + * Flash attention computation loop + * + * Template Parameters: + * @tparam St - Total sequence length in tiles + * @tparam DHt - Head dimension in tiles + * @tparam Sq_chunk_t - Query chunk size in tiles + * @tparam Sk_chunk_t - Key chunk size in tiles + * @tparam qk_in0_block_w - QK matmul block width 
+ * @tparam qk_subblock_w - QK matmul subblock width + * @tparam qk_subblock_h - QK matmul subblock height + * @tparam qk_in0_num_subblocks - QK input0 subblocks + * @tparam qk_in1_num_subblocks - QK input1 subblocks + * @tparam qk_num_blocks - QK number of blocks + * @tparam out_in0_block_w - Output matmul block width + * @tparam out_subblock_w - Output matmul subblock width + * @tparam out_subblock_h - Output matmul subblock height + * @tparam out_in0_num_subblocks - Output input0 subblocks + * @tparam out_in1_num_subblocks - Output input1 subblocks + * @tparam out_num_blocks - Output number of blocks + * @tparam is_causal - Whether to use causal attention (if mask is applied) + * @tparam use_attention_mask - Whether to use attention mask for non-causal attention + * + * Circular Buffer Parameters: + * @tparam cb_q_in - Query input buffer + * @tparam cb_k_in - Key input buffer + * @tparam cb_v_in - Value input buffer + * @tparam cb_mask_in - Mask input buffer + * @tparam cb_scale_in - Scale input buffer + * @tparam cb_identity_scale_in - Identity scale buffer + * @tparam cb_qk_im - QK intermediate buffer + * @tparam cb_out_im - Output intermediate buffer + * @tparam cb_out_accumulate_im - Output accumulate buffer + * @tparam cb_cur_max - Current max buffer + * @tparam cb_prev_max - Previous max buffer + * @tparam cb_cur_sum - Current sum buffer + * @tparam cb_prev_sum - Previous sum buffer + * @tparam cb_exp_max_diff - Exp max diff buffer + * @tparam cb_out_o - Output O buffer + * @tparam cb_out_m - Output M buffer + * @tparam cb_out_l - Output L buffer + * + * Runtime Parameters: + * @param k_chunk_start - Start index of key chunk + * @param k_chunk_end - End index of key chunk + * @param do_reduce - Whether to perform reduction + * @param qk_chunk_tiles - Number of QK chunk tiles + * @param out_chunk_tiles - Number of output chunk tiles + */ +template < + // Compile-time dimension parameters + uint32_t St, + uint32_t DHt, + uint32_t Sq_chunk_t, + uint32_t 
Sk_chunk_t, + // QK matmul block parameters + uint32_t qk_in0_block_w, + uint32_t qk_subblock_w, + uint32_t qk_subblock_h, + uint32_t qk_in0_num_subblocks, + uint32_t qk_in1_num_subblocks, + uint32_t qk_num_blocks, + // Output matmul block parameters + uint32_t out_in0_block_w, + uint32_t out_subblock_w, + uint32_t out_subblock_h, + uint32_t out_in0_num_subblocks, + uint32_t out_in1_num_subblocks, + uint32_t out_num_blocks, + // Attention parameters + bool is_causal, + bool use_attention_mask, + // Circular buffer indices + uint32_t cb_q_in, + uint32_t cb_k_in, + uint32_t cb_v_in, + uint32_t cb_mask_in, + uint32_t cb_scale_in, + uint32_t cb_identity_scale_in, + uint32_t cb_qk_im, + uint32_t cb_out_im, + uint32_t cb_out_accumulate_im, + uint32_t cb_cur_max, + uint32_t cb_prev_max, + uint32_t cb_cur_sum, + uint32_t cb_prev_sum, + uint32_t cb_exp_max_diff, + uint32_t cb_out_o, + uint32_t cb_out_m, + uint32_t cb_out_l> +void flash_attention_loop( + // Runtime parameters + uint32_t k_chunk_start, + uint32_t k_chunk_end, + bool do_reduce, + uint32_t qk_chunk_tiles, + uint32_t out_chunk_tiles) { + for (uint32_t k_chunk = k_chunk_start; k_chunk < k_chunk_end; ++k_chunk) { + /* QK = Q_CHUNK @ K_CHUNK */ + reconfig_data_format(cb_q_in, cb_k_in); // DEBUG + pack_reconfig_data_format(cb_qk_im); + cb_matmul_blocks( + cb_q_in, + cb_k_in, + cb_qk_im, + Sq_chunk_t, + Sk_chunk_t, + DHt, + qk_num_blocks, + qk_in0_num_subblocks, + qk_in1_num_subblocks, + qk_in0_block_w, + qk_subblock_h, + qk_subblock_w, + true /*transpose*/); + + /* QK *= SCALE */ + mul_block_bcast_scalar_inplace(cb_qk_im, cb_scale_in, qk_chunk_tiles); + + if constexpr (is_causal) { + // For decode, we only apply mask at the last chunk on reducer core for causal mode + if (k_chunk == k_chunk_end - 1 && do_reduce) { + /* QK += MASK */ + reconfig_data_format(cb_qk_im, cb_mask_in); + add_block_inplace(cb_qk_im, cb_mask_in, qk_chunk_tiles); + } + } else { + if constexpr (use_attention_mask) { + 
reconfig_data_format(cb_qk_im, cb_mask_in); + add_block_inplace(cb_qk_im, cb_mask_in, qk_chunk_tiles); + } + } + + reconfig_data_format(cb_qk_im, cb_identity_scale_in); + pack_reconfig_data_format(cb_cur_max); + reduce_c< + PoolType::MAX, + ReduceDim::REDUCE_ROW, + cb_qk_im, + cb_identity_scale_in, + cb_cur_max, + Sq_chunk_t, + Sk_chunk_t>(); + + if (k_chunk > k_chunk_start) { + reconfig_data_format(cb_cur_max, cb_prev_max); + max_block_inplace(cb_cur_max, cb_prev_max, Sq_chunk_t); + } + /* QK -= cb_cur_max */ + /* QK = exp(QK)*/ + reconfig_data_format(cb_qk_im, cb_cur_max); + pack_reconfig_data_format(cb_qk_im); + sub_exp_block_bcast_cols_inplace(cb_qk_im, cb_cur_max, Sq_chunk_t, Sk_chunk_t); + + /* cb_cur_sum = sum(cb_qk_im, dim=-1) */ + reconfig_data_format(cb_qk_im, cb_identity_scale_in); + pack_reconfig_data_format(cb_cur_sum); + reduce_c< + PoolType::SUM, + ReduceDim::REDUCE_ROW, + cb_qk_im, + cb_identity_scale_in, + cb_cur_sum, + Sq_chunk_t, + Sk_chunk_t>(); + + /* OUT_IM = QK @ V_CHUNK */ + reconfig_data_format(cb_qk_im, cb_v_in); // DEBUG + pack_reconfig_data_format(cb_out_im); + cb_matmul_blocks( + cb_qk_im, + cb_v_in, + cb_out_im, + Sq_chunk_t, + DHt, + Sk_chunk_t, + out_num_blocks, + out_in0_num_subblocks, + out_in1_num_subblocks, + out_in0_block_w, + out_subblock_h, + out_subblock_w, + false /*transpose*/); + reconfig_data_format_srca(cb_out_im); + cb_pop_front(cb_qk_im, qk_chunk_tiles); + + /* OUT_ACC += OUT_IM */ + if (k_chunk == k_chunk_start) { + reconfig_data_format_srca(cb_out_im); + pack_reconfig_data_format(cb_out_accumulate_im); + copy_block(cb_out_im, cb_out_accumulate_im, out_chunk_tiles); + } else { + reconfig_data_format(cb_prev_max, cb_cur_max); // DEBUG + pack_reconfig_data_format(cb_exp_max_diff); + /* cb_exp_max_diff = torch.exp(cb_prev_max - cb_cur_max) */ + sub_exp_block(cb_prev_max, cb_cur_max, cb_exp_max_diff, Sq_chunk_t); + cb_pop_front(cb_prev_max, Sq_chunk_t); + + /* cb_prev_sum *= cb_exp_max_diff */ + 
mul_block_inplace(cb_prev_sum, cb_exp_max_diff, Sq_chunk_t); + + /* cb_out_accumulate_im *= cb_exp_max_diff */ + reconfig_data_format(cb_out_accumulate_im, cb_exp_max_diff); // DEBUG + pack_reconfig_data_format(cb_out_accumulate_im); + mul_block_bcast_cols_inplace(cb_out_accumulate_im, cb_exp_max_diff, Sq_chunk_t, DHt); + + /* cb_cur_sum += cb_prev_sum */ + reconfig_data_format(cb_cur_sum, cb_prev_sum); // DEBUG + pack_reconfig_data_format(cb_cur_sum); + add_block_inplace(cb_cur_sum, cb_prev_sum, Sq_chunk_t); + + /* cb_out_accumulate_im += cb_out_im */ + reconfig_data_format(cb_out_accumulate_im, cb_out_im); // DEBUG + pack_reconfig_data_format(cb_out_accumulate_im); + add_block_inplace(cb_out_accumulate_im, cb_out_im, out_chunk_tiles); + } + + if (k_chunk < k_chunk_end - 1 || do_reduce) { + // Set cb_prev_sum and cb_prev_max + reconfig_data_format(cb_cur_max, cb_cur_max); // DEBUG + pack_reconfig_data_format(cb_prev_max); + copy_block(cb_cur_max, cb_prev_max, Sq_chunk_t); + copy_block(cb_cur_sum, cb_prev_sum, Sq_chunk_t); + + } else { + // Write o, m, l into cb_out + copy_block(cb_out_accumulate_im, cb_out_o, out_chunk_tiles); + copy_block(cb_cur_max, cb_out_m, Sq_chunk_t); + copy_block(cb_cur_sum, cb_out_l, Sq_chunk_t); + } + } +} diff --git a/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/kernels/compute/sdpa_flash_decode.cpp b/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/kernels/compute/sdpa_flash_decode.cpp index a7031164f4d8..33b5b006fd5b 100644 --- a/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/kernels/compute/sdpa_flash_decode.cpp +++ b/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/kernels/compute/sdpa_flash_decode.cpp @@ -16,361 +16,10 @@ #include "compute_kernel_api/matmul.h" #include "compute_kernel_api/reduce.h" -#include "../../rt_args_common.hpp" +#include "ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/kernels/rt_args_common.hpp" +#include "compute_common.hpp" namespace NAMESPACE { -void 
max_block_inplace(uint32_t in0, uint32_t in1, uint32_t num_tiles) { - // inputs come in full, outputs go out full - copy_tile_to_dst_init_short(in0); - max_tile_init(); - - constexpr uint32_t dst_reg_0 = 0; - constexpr uint32_t dst_reg_1 = 1; - cb_wait_front(in0, num_tiles); - cb_wait_front(in1, num_tiles); - for (uint32_t i = 0; i < num_tiles; ++i) { - acquire_dst(); - copy_tile(in0, 0, dst_reg_0); - copy_tile(in1, i, dst_reg_1); - cb_pop_front(in0, 1); - cb_reserve_back(in0, 1); - max_tile(dst_reg_0, dst_reg_1); - pack_tile(dst_reg_0, in0); - cb_push_back(in0, 1); - release_dst(); - } -} - -void max_block(uint32_t in0, uint32_t in1, uint32_t out_cb, uint32_t num_tiles) { - // inputs come in full, outputs go out full - copy_tile_to_dst_init_short(in0); - max_tile_init(); - - constexpr uint32_t dst_reg_0 = 0; - constexpr uint32_t dst_reg_1 = 1; - cb_wait_front(in0, num_tiles); - cb_wait_front(in1, num_tiles); - cb_reserve_back(out_cb, num_tiles); - for (uint32_t i = 0; i < num_tiles; ++i) { - acquire_dst(); - copy_tile(in0, i, dst_reg_0); - copy_tile(in1, i, dst_reg_1); - max_tile(dst_reg_0, dst_reg_1); - pack_tile(dst_reg_0, out_cb, i); - release_dst(); - } - cb_push_back(out_cb, num_tiles); -} - -template < - PoolType pool_type, - ReduceDim reduce_dim, - uint32_t in0_cb, - uint32_t scale_cb, - uint32_t out_cb, - uint32_t rows, - uint32_t cols> -void reduce_c() { - // Precondition: in0_cb has rows*cols produced. 
in0_cb has tiles in row-major order - // Precondition: scale_cb has 1 produced - // Precondition: out_cb has rows free - // Postcondition: in0_cb has rows*cols produced - // Precondition: scale_cb has 1 produced - // Postcondition: out_cb has rows produced - - reduce_init_delta(in0_cb, scale_cb, out_cb); - - const uint32_t num_tiles = rows * cols; - cb_wait_front(scale_cb, 1); - cb_wait_front(in0_cb, num_tiles); - cb_reserve_back(out_cb, rows); - - constexpr uint32_t reduce_dst_idx = 0; - - for (uint32_t i = 0; i < rows; i++) { - acquire_dst(); - for (uint32_t j = 0; j < cols; j++) { - reduce_tile(in0_cb, scale_cb, i * cols + j, 0, reduce_dst_idx); - } - - cb_reserve_back(out_cb, 1); - pack_tile(reduce_dst_idx, out_cb); - cb_push_back(out_cb, 1); - release_dst(); - } - - reduce_revert_delta(out_cb); -} - -void recip_block_inplace(uint32_t in_cb, uint32_t num_tiles) { - // Precondition: in_cb has num_tiles produced - // Postcondition: in_cb has num_tiles produced - copy_tile_to_dst_init_short(in_cb); - recip_tile_init(); - - cb_wait_front(in_cb, num_tiles); - for (uint32_t i = 0; i < num_tiles; ++i) { - acquire_dst(); - copy_tile(in_cb, 0, 0); - cb_pop_front(in_cb, 1); - recip_tile(0); - cb_reserve_back(in_cb, 1); - pack_tile(0, in_cb); - cb_push_back(in_cb, 1); - release_dst(); - } -} - -void sub_exp_block_bcast_cols_inplace(uint32_t in0_cb, uint32_t in1_cb, uint32_t rows, uint32_t cols) { - // Precondition: in0_cb has rows*cols produced - // Precondition: in1_cb has rows produced - // Postcondition: in0_cb has rows*cols produced - // Postcondition: in1_cb has rows produced - - sub_bcast_cols_init_short(in0_cb, in1_cb); - exp_tile_init(); - cb_wait_front(in0_cb, rows * cols); - cb_wait_front(in1_cb, rows); - - constexpr uint32_t dst_tiles = SUB_EXP_GRANULARITY; - uint32_t granularity = cols >> LOG2_SUB_EXP_GRANULARITY; - for (uint32_t i = 0; i < rows; ++i) { - for (uint32_t u = 0; u < granularity; u++) { - tile_regs_acquire(); - for (uint32_t j = 0; j < dst_tiles; 
++j) { - sub_tiles_bcast_cols(in0_cb, in1_cb, j, i, j); - exp_tile(j); - } - tile_regs_commit(); - cb_pop_front(in0_cb, dst_tiles); - cb_reserve_back(in0_cb, dst_tiles); - tile_regs_wait(); - for (uint32_t j = 0; j < dst_tiles; ++j) { - pack_tile(j, in0_cb); - } - cb_push_back(in0_cb, dst_tiles); - tile_regs_release(); - } - } -} - -void mul_block_bcast_cols_inplace(uint32_t in0_cb, uint32_t in1_cb, uint32_t rows, uint32_t cols) { - // Precondition: in0_cb has rows*cols produced - // Precondition: in1_cb has rows produced - // Postcondition: in0_cb has rows*cols produced - // Postcondition: in1_cb has rows consumed - - uint32_t num_tiles = rows * cols; - mul_bcast_cols_init_short(in0_cb, in1_cb); - cb_wait_front(in0_cb, num_tiles); - cb_wait_front(in1_cb, rows); - for (uint32_t i = 0; i < rows; ++i) { - for (uint32_t j = 0; j < cols; ++j) { - acquire_dst(); - mul_tiles_bcast_cols(in0_cb, in1_cb, 0, i, 0); - cb_pop_front(in0_cb, 1); - cb_reserve_back(in0_cb, 1); - pack_tile(0, in0_cb); - cb_push_back(in0_cb, 1); - release_dst(); - } - } - cb_pop_front(in1_cb, rows); -} - -void mul_block_bcast_scalar_inplace(uint32_t in0_cb, uint32_t in1_scalar_cb, uint32_t num_tiles) { - // Precondition: in0_cb has num_tiles produced - // Precondition: in1_scalar_cb has 1 produced - // Postcondition: in0_cb has num_tiles produced - // Postcondition: in1_scalar_cb has 1 produced - - constexpr uint32_t dst_tiles = MUL_BCAST_GRANULARITY; - uint32_t granularity = num_tiles >> LOG2_MUL_BCAST_GRANULARITY; - reconfig_data_format(in0_cb, in1_scalar_cb); - mul_tiles_bcast_scalar_init_short(); - cb_wait_front(in0_cb, num_tiles); - cb_wait_front(in1_scalar_cb, 1); - for (uint32_t g = 0; g < granularity; ++g) { - acquire_dst(); - for (uint32_t i = 0; i < dst_tiles; ++i) { - mul_tiles_bcast_scalar(in0_cb, in1_scalar_cb, i, 0, i); - } - cb_pop_front(in0_cb, dst_tiles); - cb_reserve_back(in0_cb, dst_tiles); - for (uint32_t i = 0; i < dst_tiles; ++i) { - pack_tile(i, in0_cb); - } - 
cb_push_back(in0_cb, dst_tiles); - release_dst(); - } -} - -template -void add_block_inplace(uint32_t in0_cb, uint32_t in1_cb, uint32_t num_tiles) { - // Precondition: in0_cb and in1_cb have num_tiles produced - // Postcondition: in0_cb has num_tiles produced - // Postcondition: in1_cb has num_tiles consumed - - add_tiles_init(); - cb_wait_front(in0_cb, num_tiles); - cb_wait_front(in1_cb, num_tiles); - for (uint32_t i = 0; i < num_tiles; i++) { - acquire_dst(); - add_tiles(in0_cb, in1_cb, 0, i, 0); - cb_pop_front(in0_cb, 1); - cb_reserve_back(in0_cb, 1); - pack_tile(0, in0_cb); - cb_push_back(in0_cb, 1); - release_dst(); - } - if (pop_in1) { - cb_pop_front(in1_cb, num_tiles); - } -} - -void add_block(uint32_t in0_cb, uint32_t in1_cb, uint32_t out_cb, uint32_t num_tiles) { - // Precondition: in0_cb and in1_cb have num_tiles produced - // Postcondition: in0_cb has num_tiles produced - // Postcondition: in1_cb has num_tiles consumed - - add_tiles_init(); - cb_wait_front(in0_cb, num_tiles); - cb_wait_front(in1_cb, num_tiles); - cb_reserve_back(out_cb, num_tiles); - for (uint32_t i = 0; i < num_tiles; i++) { - acquire_dst(); - add_tiles(in0_cb, in1_cb, i, i, 0); - pack_tile(0, out_cb, i); - release_dst(); - } - cb_push_back(out_cb, num_tiles); - - cb_pop_front(in0_cb, num_tiles); - cb_pop_front(in1_cb, num_tiles); -} - -void mul_block_inplace(uint32_t in0_cb, uint32_t in1_cb, uint32_t num_tiles) { - // Precondition: in0_cb and in1_cb have num_tiles produced - // Postcondition: in0_cb has num_tiles produced - // Postcondition: in1_cb has num_tiles produced - - mul_tiles_init(); - cb_wait_front(in0_cb, num_tiles); - cb_wait_front(in1_cb, num_tiles); - for (uint32_t i = 0; i < num_tiles; i++) { - acquire_dst(); - mul_tiles(in0_cb, in1_cb, 0, i, 0); - cb_pop_front(in0_cb, 1); - cb_reserve_back(in0_cb, 1); - pack_tile(0, in0_cb); - cb_push_back(in0_cb, 1); - release_dst(); - } -} - -void sub_exp_block(uint32_t in0_cb, uint32_t in1_cb, uint32_t out_cb, uint32_t num_tiles) { - 
// Precondition: in0_cb and in1_cb have num_tiles produced - // Postcondition: out_cb has num_tiles produced - // Postcondition: in0_cb and in1_cb has num_tiles produced - sub_tiles_init(); - exp_tile_init(); - cb_wait_front(in0_cb, num_tiles); - cb_wait_front(in1_cb, num_tiles); - cb_reserve_back(out_cb, num_tiles); - - for (uint32_t i = 0; i < num_tiles; i++) { - acquire_dst(); - sub_tiles(in0_cb, in1_cb, i, i, 0); - exp_tile(0); - pack_tile(0, out_cb); - cb_push_back(out_cb, 1); - release_dst(); - } -} - -void copy_block(uint32_t in_cb, uint32_t out_cb, uint32_t num_tiles) { - // Precondition: in_cb has num_tiles produced - // Precondition: out_cb has num_tiles free - // Postcondition: in_cb has num_tiles consumed - // Postcondition: out_cb has num_tiles produced - - copy_tile_to_dst_init_short(in_cb); - - cb_wait_front(in_cb, num_tiles); - cb_reserve_back(out_cb, num_tiles); - -#pragma GCC unroll 0 - for (uint32_t i = 0; i < num_tiles; i++) { - acquire_dst(); - copy_tile(in_cb, i, 0 /*dst*/); - pack_tile(0, out_cb); - cb_push_back(out_cb, 1); - release_dst(); - } - cb_pop_front(in_cb, num_tiles); -} - -ALWI void cb_matmul_blocks( - const uint32_t& in0_cb, - const uint32_t& in1_cb, - const uint32_t& out_cb, - const uint32_t& M, - const uint32_t& N, - const uint32_t& K, - const uint32_t& num_blocks, - const uint32_t& in0_num_subblocks, - const uint32_t& in1_num_subblocks, - const uint32_t& in0_block_w, - const uint32_t& subblock_h, - const uint32_t& subblock_w, - const bool& transpose) { - // precondition: in0_cb has M*K produced - // preconditino: in1_cb has K*N produced - // postcondition: in0_cb is full, in1_cb is empty - // postcondition: out_cb has M*N produced - - mm_block_init_short( - in0_cb, in1_cb, transpose /*transpose*/, subblock_w /*ct_dim*/, subblock_h /*rt_dim*/, in0_block_w /*kt_dim*/); - - reconfig_data_format(in1_cb, in0_cb); - cb_wait_front(in1_cb, K * N); - - uint32_t output_num_tiles = M * N; - uint32_t out_subblock_num_tiles = subblock_h * 
subblock_w; - uint32_t in0_index_offset = 0; - - for (uint32_t in0_subblock = 0; in0_subblock < in0_num_subblocks; ++in0_subblock) { - uint32_t in1_index_offset = 0; - for (uint32_t in1_subblock = 0; in1_subblock < in1_num_subblocks; ++in1_subblock) { - tile_regs_acquire(); - - uint32_t dst_index = 0; - uint32_t in0_index = in0_index_offset; - uint32_t in1_index = in1_index_offset; - - for (uint32_t inner_dim = 0; inner_dim < in0_block_w; inner_dim++) { - matmul_block( - in0_cb, in1_cb, in0_index, in1_index, dst_index, transpose, subblock_w, subblock_h, in0_block_w); - in0_index++; - in1_index += N; - } - tile_regs_commit(); - - cb_reserve_back(out_cb, out_subblock_num_tiles); - tile_regs_wait(); - for (uint32_t i = 0; i < out_subblock_num_tiles; i++) { - pack_tile(i, out_cb); - } - tile_regs_release(); - cb_push_back(out_cb, out_subblock_num_tiles); - // in1_index_offset += in1_subblock * subblock_w; - // in1_index_offset = (in1_subblock+1) * subblock_w; - in1_index_offset += subblock_w; - } - in0_index_offset += subblock_h * in0_block_w; - } - cb_pop_front(in1_cb, K * N); -} void MAIN { constexpr uint32_t St = get_compile_time_arg_val(0); @@ -481,141 +130,47 @@ void MAIN { cb_wait_front(cb_q_in, q_chunk_tiles); for (uint32_t cur_head_work = 0; cur_head_work < num_heads_per_core; ++cur_head_work) { - // loop while k_low < q_high - for (uint32_t k_chunk = k_chunk_start; k_chunk < k_chunk_end; ++k_chunk) { - /* QK = Q_CHUNK @ K_CHUNK */ - reconfig_data_format(cb_q_in, cb_k_in); // DEBUG - pack_reconfig_data_format(cb_qk_im); - cb_matmul_blocks( - cb_q_in, - cb_k_in, - cb_qk_im, - Sq_chunk_t, - Sk_chunk_t, - DHt, - qk_num_blocks, - qk_in0_num_subblocks, - qk_in1_num_subblocks, - qk_in0_block_w, - qk_subblock_h, - qk_subblock_w, - true /*transpose*/); - - /* QK *= SCALE */ - mul_block_bcast_scalar_inplace(cb_qk_im, cb_scale_in, qk_chunk_tiles); - - if constexpr (is_causal) { - // For decode, we only apply mask at the last chunk on reducer core for causal mode - if 
(k_chunk == k_chunk_end - 1 && do_reduce) { - /* QK += MASK */ - reconfig_data_format(cb_qk_im, cb_mask_in); - add_block_inplace(cb_qk_im, cb_mask_in, qk_chunk_tiles); - } - } else { - if constexpr (use_attention_mask) { - reconfig_data_format(cb_qk_im, cb_mask_in); - add_block_inplace(cb_qk_im, cb_mask_in, qk_chunk_tiles); - } - } - - reconfig_data_format(cb_qk_im, cb_identity_scale_in); - pack_reconfig_data_format(cb_cur_max); - reduce_c< - PoolType::MAX, - ReduceDim::REDUCE_ROW, - cb_qk_im, - cb_identity_scale_in, - cb_cur_max, - Sq_chunk_t, - Sk_chunk_t>(); - - if (k_chunk > k_chunk_start) { - reconfig_data_format(cb_cur_max, cb_prev_max); - max_block_inplace(cb_cur_max, cb_prev_max, Sq_chunk_t); - } - /* QK -= cb_cur_max */ - /* QK = exp(QK)*/ - reconfig_data_format(cb_qk_im, cb_cur_max); - pack_reconfig_data_format(cb_qk_im); - sub_exp_block_bcast_cols_inplace(cb_qk_im, cb_cur_max, Sq_chunk_t, Sk_chunk_t); - - /* cb_cur_sum = sum(cb_qk_im, dim=-1) */ - reconfig_data_format(cb_qk_im, cb_identity_scale_in); - pack_reconfig_data_format(cb_cur_sum); - reduce_c< - PoolType::SUM, - ReduceDim::REDUCE_ROW, - cb_qk_im, - cb_identity_scale_in, - cb_cur_sum, - Sq_chunk_t, - Sk_chunk_t>(); - - /* OUT_IM = QK @ V_CHUNK */ - reconfig_data_format(cb_qk_im, cb_v_in); // DEBUG - pack_reconfig_data_format(cb_out_im); - cb_matmul_blocks( - cb_qk_im, - cb_v_in, - cb_out_im, - Sq_chunk_t, - DHt, - Sk_chunk_t, - out_num_blocks, - out_in0_num_subblocks, - out_in1_num_subblocks, - out_in0_block_w, - out_subblock_h, - out_subblock_w, - false /*transpose*/); - reconfig_data_format_srca(cb_out_im); - cb_pop_front(cb_qk_im, qk_chunk_tiles); - - /* OUT_ACC += OUT_IM */ - if (k_chunk == k_chunk_start) { - reconfig_data_format_srca(cb_out_im); - pack_reconfig_data_format(cb_out_accumulate_im); - copy_block(cb_out_im, cb_out_accumulate_im, out_chunk_tiles); - } else { - reconfig_data_format(cb_prev_max, cb_cur_max); // DEBUG - pack_reconfig_data_format(cb_exp_max_diff); - /* cb_exp_max_diff 
= torch.exp(cb_prev_max - cb_cur_max) */ - sub_exp_block(cb_prev_max, cb_cur_max, cb_exp_max_diff, Sq_chunk_t); - cb_pop_front(cb_prev_max, Sq_chunk_t); - - /* cb_prev_sum *= cb_exp_max_diff */ - mul_block_inplace(cb_prev_sum, cb_exp_max_diff, Sq_chunk_t); - - /* cb_out_accumulate_im *= cb_exp_max_diff */ - reconfig_data_format(cb_out_accumulate_im, cb_exp_max_diff); // DEBUG - pack_reconfig_data_format(cb_out_accumulate_im); - mul_block_bcast_cols_inplace(cb_out_accumulate_im, cb_exp_max_diff, Sq_chunk_t, DHt); - - /* cb_cur_sum += cb_prev_sum */ - reconfig_data_format(cb_cur_sum, cb_prev_sum); // DEBUG - pack_reconfig_data_format(cb_cur_sum); - add_block_inplace(cb_cur_sum, cb_prev_sum, Sq_chunk_t); - - /* cb_out_accumulate_im += cb_out_im */ - reconfig_data_format(cb_out_accumulate_im, cb_out_im); // DEBUG - pack_reconfig_data_format(cb_out_accumulate_im); - add_block_inplace(cb_out_accumulate_im, cb_out_im, out_chunk_tiles); - } - - if (k_chunk < k_chunk_end - 1 || do_reduce) { - // Set cb_prev_sum and cb_prev_max - reconfig_data_format(cb_cur_max, cb_cur_max); // DEBUG - pack_reconfig_data_format(cb_prev_max); - copy_block(cb_cur_max, cb_prev_max, Sq_chunk_t); - copy_block(cb_cur_sum, cb_prev_sum, Sq_chunk_t); - - } else { - // Write o, m, l into cb_out - copy_block(cb_out_accumulate_im, cb_out_o, out_chunk_tiles); - copy_block(cb_cur_max, cb_out_m, Sq_chunk_t); - copy_block(cb_cur_sum, cb_out_l, Sq_chunk_t); - } - } + flash_attention_loop< + // Compile-time dimension parameters + St, + DHt, + Sq_chunk_t, + Sk_chunk_t, + // QK matmul block parameters + qk_in0_block_w, + qk_subblock_w, + qk_subblock_h, + qk_in0_num_subblocks, + qk_in1_num_subblocks, + qk_num_blocks, + // Output matmul block parameters + out_in0_block_w, + out_subblock_w, + out_subblock_h, + out_in0_num_subblocks, + out_in1_num_subblocks, + out_num_blocks, + // Attention parameters + is_causal, + use_attention_mask, + // Circular buffer indices + cb_q_in, + cb_k_in, + cb_v_in, + cb_mask_in, + 
cb_scale_in, + cb_identity_scale_in, + cb_qk_im, + cb_out_im, + cb_out_accumulate_im, + cb_cur_max, + cb_prev_max, + cb_cur_sum, + cb_prev_sum, + cb_exp_max_diff, + cb_out_o, + cb_out_m, + cb_out_l>(k_chunk_start, k_chunk_end, do_reduce, qk_chunk_tiles, out_chunk_tiles); // do reduction across intermediates from other cores if this is the reduction core if (do_reduce) { diff --git a/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/kernels/dataflow/dataflow_common.hpp b/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/kernels/dataflow/dataflow_common.hpp new file mode 100644 index 000000000000..731a30c2652e --- /dev/null +++ b/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/kernels/dataflow/dataflow_common.hpp @@ -0,0 +1,304 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "dataflow_api.h" +#include + +/****************************************************************************** + * * + * Common Functions for Dataflow Kernels * + * * + ******************************************************************************/ + +/****************************************************************************** + * Generic Utility Functions * + ******************************************************************************/ +template +constexpr uint32_t get_barrier_read_threshold() { + return ((512 / num_readers) * (1024 + 128)) / tile_bytes; +} + +/****************************************************************************** + * Page Cache Functions * + ******************************************************************************/ +template +uint32_t virtual_seq_tile_id_to_physical_tile_id( + uint32_t seq_tile_idx, uint32_t cur_head, volatile tt_l1_ptr const uint32_t* const page_table_ptr) { + // Given some index in the sequence tiles in range [0, max_seq_len_t] + // Return the physical tile id for that tile row + constexpr uint32_t block_stride = num_heads * block_size_t * Wt; + const 
uint32_t head_offset = cur_head * block_size_t * Wt; + + const uint32_t virtual_block = seq_tile_idx / block_size_t; + const uint32_t physical_block = page_table_ptr[virtual_block]; + const uint32_t block_row_offset = seq_tile_idx % block_size_t; + const uint32_t block_offset = block_row_offset * Wt; + return physical_block * block_stride + head_offset + block_offset; +} + +/****************************************************************************** + * Generic Tile Manipulation Functions * + ******************************************************************************/ +template +void copy_tile(uint64_t noc_read_addr_base, uint32_t q_write_ptr_base, uint32_t src_tile_id, uint32_t dst_tile_id) { + noc_async_read( + noc_read_addr_base + src_tile_id * tile_bytes, q_write_ptr_base + dst_tile_id * tile_bytes, tile_bytes); +} + +template +void fill_tile(uint32_t cb_id, uint32_t tile_id, uint32_t val) { + if (val == 0) { + constexpr uint32_t num_zeros_reads = 2048 / MEM_ZEROS_SIZE; + uint64_t zeros_noc_addr = get_noc_addr(MEM_ZEROS_BASE); + uint32_t write_addr = get_write_ptr(cb_id) + tile_id * tile_bytes; + volatile tt_l1_ptr uint32_t* ptr = reinterpret_cast(write_addr); + + // Fill tile with zeros + for (uint32_t i = 0; i < num_zeros_reads; ++i) { + noc_async_read(zeros_noc_addr, write_addr, MEM_ZEROS_SIZE); + write_addr += MEM_ZEROS_SIZE; + } + noc_async_read_barrier(); + } else { + // Fill 2 uint16 datums in each writes to optimize for performance + volatile tt_l1_ptr uint32_t* ptr = + reinterpret_cast(get_write_ptr(cb_id) + tile_id * tile_bytes); + constexpr int num_uint32_datums_tile = (32 * 32) / 2; + for (int k = 0; k < num_uint32_datums_tile; k++) { + ptr[k] = val; + } + } +} + +template +void fill_tile_partial(uint32_t cb_id, uint32_t tile_id, uint32_t cur_pos_in_tile, uint32_t partial_val) { + /* + We want to fill cur_pos_in_tile + 1 to the end + */ + + fill_tile(cb_id, tile_id, 0); + if (cur_pos_in_tile == 31 || partial_val == 0) { + return; + } + const 
uint16_t datum_val = partial_val >> 16; + volatile tt_l1_ptr uint16_t* uint16_ptr = + reinterpret_cast(get_write_ptr(cb_id) + tile_id * tile_bytes); + volatile tt_l1_ptr uint32_t* uint32_ptr = + reinterpret_cast(get_write_ptr(cb_id) + tile_id * tile_bytes); + int face_start = (cur_pos_in_tile < 15) ? 0 : 1; + uint32_t fill_pos_in_face = (cur_pos_in_tile + 1) % 16; + if (face_start == 0) { + // Fill 2 datums in each writes to optimize for performance + constexpr int num_uint32_datums_tile_face = (16 * 16) / 2; + for (int k = 1; k < 4; k += 2) { + uint32_t uint32_face_idx = k << 7; + for (int j = 0; j < num_uint32_datums_tile_face; j++) { + uint32_ptr[uint32_face_idx + j] = partial_val; + } + } + } + + // Again, optimizing performance by filling 2 uint16 datums in each write. + // If the fill_pos_in_face is odd then we fill that pos with single datum, + // otherwise we fill 2 datums in each write + bool is_odd_pos_filled = fill_pos_in_face % 2 == 1; + uint32_t fill_pos_in_uint32_face = (fill_pos_in_face + 1) >> 1; + constexpr uint32_t num_cols_in_face = 16; + constexpr uint32_t num_rows_in_face = 16; + constexpr uint32_t num_cols_in_uint32_face = num_cols_in_face >> 1; + for (int k = face_start; k < 4; k += 2) { + uint32_t uint16_face_idx = k << 8; + uint32_t uint32_face_idx = k << 7; + + for (uint32_t face_row_idx = 0; face_row_idx < num_rows_in_face; face_row_idx++) { + // Here, if the fill_pos_in_face is odd then we fill that pos with single uint16 value + if (is_odd_pos_filled) { + uint16_ptr[uint16_face_idx + (fill_pos_in_face + num_cols_in_face * face_row_idx)] = datum_val; + } + + for (uint32_t uint32_face_col_idx = fill_pos_in_uint32_face; uint32_face_col_idx < num_cols_in_uint32_face; + uint32_face_col_idx++) { + uint32_ptr[uint32_face_idx + (uint32_face_col_idx + num_cols_in_uint32_face * face_row_idx)] = + partial_val; + } + } + } +} + +/****************************************************************************** + * Attention Mask Functions * + 
******************************************************************************/ +template < + uint32_t cb_mask_in, + uint32_t mask_chunk_tiles, + uint32_t mask_tile_bytes, + uint32_t barrier_threshold, + uint32_t PNHt, + uint32_t Sk_chunk_t> +uint32_t read_mask_chunk(uint32_t PSt, uint32_t mask_start_tile_id, const InterleavedAddrGenFast mask_reader) { + // Read mask chunk + cb_reserve_back(cb_mask_in, mask_chunk_tiles); + uint32_t mask_write_ptr = get_write_ptr(cb_mask_in); + uint32_t barrier_count = 0; + for (uint32_t row = 0; row < PNHt; ++row) { + uint32_t mask_tile_id = mask_start_tile_id + row * PSt; + for (uint32_t col = 0; col < Sk_chunk_t; ++col) { + noc_async_read_tile(mask_tile_id, mask_reader, mask_write_ptr); + mask_tile_id++; + mask_write_ptr += mask_tile_bytes; + + if (++barrier_count == barrier_threshold) { + noc_async_read_barrier(); + barrier_count = 0; + } + } + } + noc_async_read_barrier(); + cb_push_back(cb_mask_in, mask_chunk_tiles); + mask_start_tile_id += mask_chunk_tiles; + return mask_start_tile_id; +} + +template +void generate_mask(uint32_t k_num_chunks, uint32_t PSt, uint32_t cur_pos) { + /* + example 1: 64 seqlen at cur_pos 40, 2 cores, 32 chunk size + PSt = 2 + k_num_chunks = 2 + Sk_chunk_t = 1 + cur_pos = 40 + cur_pos_in_chunk = 8 + cur_pos_in_chunk_t = 0 + cur_pos_in_tile = 8 + + example 2: 1024 seqlen at cur_pos 990, 2 cores, 128 chunk size + PSt = 32 + k_num_chunks = 8 + Sk_chunk_t = 4 + cur_pos = 990 + cur_pos_in_chunk = 94 + cur_pos_in_chunk_t = 2 + cur_pos_in_tile = 30 + + example 3: 64 seqlen at cur_pos 63, 2 cores, 32 chunk size + PSt = 2 + k_num_chunks = 2 + Sk_chunk_t = 1 + cur_pos = 63 + cur_pos_in_chunk = 31 + cur_pos_in_chunk_t = 0 + cur_pos_in_tile = 31 + + example 3: 64 seqlen at cur_pos 0, 2 cores, 32 chunk size + PSt = 2 + k_num_chunks = 2 + Sk_chunk_t = 1 + cur_pos = 0 + cur_pos_in_chunk = 0 + cur_pos_in_chunk_t = 0 + cur_pos_in_tile = 0 + */ + + uint32_t Sk_chunk_t = PSt / k_num_chunks; + // the cb_mask in is of 
size PNHt * Sk_chunk_t + uint32_t total_read_tiles = PNHt * Sk_chunk_t; + uint32_t cur_pos_in_chunk = cur_pos % (Sk_chunk_t * 32); + uint32_t cur_pos_in_chunk_t = cur_pos_in_chunk / 32; + uint32_t cur_pos_in_tile = cur_pos_in_chunk % 32; + constexpr uint32_t NEG_INF = 0xFF80FF80; // TODO: Make sure this is -inf + + cb_reserve_back(cb_mask_in, total_read_tiles); + + uint64_t noc_read_addr_base = get_noc_addr(get_read_ptr(cb_mask_in)); + uint32_t q_write_ptr_base = get_read_ptr(cb_mask_in); + constexpr uint32_t tile_bytes = get_tile_size(cb_mask_in); + + for (uint32_t i = 0; i < Sk_chunk_t; ++i) { + if (i < cur_pos_in_chunk_t) { + // fill with zero + if (i == 0) { + fill_tile(cb_mask_in, i, 0); + } else { + copy_tile( + noc_read_addr_base, q_write_ptr_base, 0, i); // copy from cb_mask_in[0] to cb_mask_in[i] + if (i == cur_pos_in_chunk_t - 1) { + noc_async_read_barrier(); + } + } + } else if (i == cur_pos_in_chunk_t) { + // fill with partial zero/-inf + fill_tile_partial(cb_mask_in, i, cur_pos_in_tile, NEG_INF); + } else { + // fill with -inf + if (i == cur_pos_in_chunk_t + 1) { + fill_tile(cb_mask_in, i, NEG_INF); + } else { + copy_tile( + noc_read_addr_base, + q_write_ptr_base, + cur_pos_in_chunk_t + 1, + i); // copy from cb_mask_in[cur_pos_in_chunk_t+1] to cb_mask_in[i] + if (i == Sk_chunk_t - 1) { + noc_async_read_barrier(); + } + } + } + for (uint32_t j = 1; j < PNHt; ++j) { + // copy from cb_mask_in[i] to cb_mask_in[j*Sk_chunk_t + i] + copy_tile(noc_read_addr_base, q_write_ptr_base, i, j * Sk_chunk_t + i); + if (j == PNHt - 1) { + noc_async_read_barrier(); + } + } + } + + cb_push_back(cb_mask_in, total_read_tiles); +} + +/****************************************************************************** + * Writer Kernel Specific Functions * + ******************************************************************************/ + +template < + uint32_t out_chunk_tiles, + uint32_t cb_out, + uint32_t cb_out_m, + uint32_t cb_out_l, + uint32_t cb_intermed_out, + uint32_t 
PNHt> +void worker_compute( + uint64_t in0_sender_semaphore_noc_addr, + uint32_t worker_id, + uint32_t reduce_core_noc_x, + uint32_t reduce_core_noc_y) { + uint32_t out_tile_id = 0; + + // Wait for compute to deliver output chunk + cb_wait_front(cb_out, out_chunk_tiles); + cb_wait_front(cb_out_m, PNHt); + cb_wait_front(cb_out_l, PNHt); + + // Write output chunk to reducer + constexpr uint32_t tile_bytes = get_tile_size(cb_out); + uint32_t worker_offset = worker_id * (out_chunk_tiles + 2 * PNHt) * tile_bytes; + constexpr uint32_t o_write_size = out_chunk_tiles * tile_bytes; + constexpr uint32_t ml_write_size = PNHt * tile_bytes; + uint64_t output_write_addr = + get_noc_addr(reduce_core_noc_x, reduce_core_noc_y, get_write_ptr(cb_intermed_out)) + worker_offset; + noc_async_write(get_read_ptr(cb_out), output_write_addr, o_write_size); + output_write_addr += o_write_size; + noc_async_write(get_read_ptr(cb_out_m), output_write_addr, ml_write_size); + output_write_addr += ml_write_size; + noc_async_write(get_read_ptr(cb_out_l), output_write_addr, ml_write_size); + + // increment semaphore + noc_async_write_barrier(); + noc_semaphore_inc(in0_sender_semaphore_noc_addr, 1); + + // pop front + cb_pop_front(cb_out, out_chunk_tiles); + cb_pop_front(cb_out_m, PNHt); + cb_pop_front(cb_out_l, PNHt); +} diff --git a/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/kernels/dataflow/reader_decode_all.cpp b/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/kernels/dataflow/reader_decode_all.cpp index 3c56e41f73a9..da3d06f9a6fc 100644 --- a/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/kernels/dataflow/reader_decode_all.cpp +++ b/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/kernels/dataflow/reader_decode_all.cpp @@ -6,58 +6,8 @@ #include "dataflow_api.h" #include -#include "../../rt_args_common.hpp" - -template -constexpr uint32_t get_barrier_read_threshold() { - return ((512 / num_readers) * (1024 + 128)) / tile_bytes; -} - -template -uint32_t 
virtual_seq_tile_id_to_physical_tile_id( - uint32_t seq_tile_idx, uint32_t cur_head, volatile tt_l1_ptr const uint32_t* const page_table_ptr) { - // Given some index in the sequence tiles in range [0, max_seq_len_t] - // Return the physical tile id for that tile row - constexpr uint32_t block_stride = num_heads * block_size_t * Wt; - const uint32_t head_offset = cur_head * block_size_t * Wt; - - const uint32_t virtual_block = seq_tile_idx / block_size_t; - const uint32_t physical_block = page_table_ptr[virtual_block]; - const uint32_t block_row_offset = seq_tile_idx % block_size_t; - const uint32_t block_offset = block_row_offset * Wt; - return physical_block * block_stride + head_offset + block_offset; -} - -template < - uint32_t cb_mask_in, - uint32_t mask_chunk_tiles, - uint32_t mask_tile_bytes, - uint32_t barrier_threshold, - uint32_t PNHt, - uint32_t Sk_chunk_t> -uint32_t read_mask_chunk(uint32_t PSt, uint32_t mask_start_tile_id, const InterleavedAddrGenFast mask_reader) { - // Read mask chunk - cb_reserve_back(cb_mask_in, mask_chunk_tiles); - uint32_t mask_write_ptr = get_write_ptr(cb_mask_in); - uint32_t barrier_count = 0; - for (uint32_t row = 0; row < PNHt; ++row) { - uint32_t mask_tile_id = mask_start_tile_id + row * PSt; - for (uint32_t col = 0; col < Sk_chunk_t; ++col) { - noc_async_read_tile(mask_tile_id, mask_reader, mask_write_ptr); - mask_tile_id++; - mask_write_ptr += mask_tile_bytes; - - if (++barrier_count == barrier_threshold) { - noc_async_read_barrier(); - barrier_count = 0; - } - } - } - noc_async_read_barrier(); - cb_push_back(cb_mask_in, mask_chunk_tiles); - mask_start_tile_id += mask_chunk_tiles; - return mask_start_tile_id; -} +#include "ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/kernels/rt_args_common.hpp" +#include "dataflow_common.hpp" void kernel_main() { /* diff --git a/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/kernels/dataflow/writer_decode_all.cpp 
b/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/kernels/dataflow/writer_decode_all.cpp index a59e87292411..7188ba77d7f4 100644 --- a/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/kernels/dataflow/writer_decode_all.cpp +++ b/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/kernels/dataflow/writer_decode_all.cpp @@ -7,236 +7,8 @@ #include "ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/dataflow/generate_reduce_scaler.hpp" #include "debug/assert.h" -#include "../../rt_args_common.hpp" - -template -constexpr uint32_t get_barrier_read_threshold() { - return ((512 / num_readers) * (1024 + 128)) / tile_bytes; -} - -template -void copy_tile(uint64_t noc_read_addr_base, uint32_t q_write_ptr_base, uint32_t src_tile_id, uint32_t dst_tile_id) { - noc_async_read( - noc_read_addr_base + src_tile_id * tile_bytes, q_write_ptr_base + dst_tile_id * tile_bytes, tile_bytes); -} - -template -void fill_tile(uint32_t cb_id, uint32_t tile_id, uint32_t val) { - if (val == 0) { - constexpr uint32_t num_zeros_reads = 2048 / MEM_ZEROS_SIZE; - uint64_t zeros_noc_addr = get_noc_addr(MEM_ZEROS_BASE); - uint32_t write_addr = get_write_ptr(cb_id) + tile_id * tile_bytes; - volatile tt_l1_ptr uint32_t* ptr = reinterpret_cast(write_addr); - - // Fill tile with zeros - for (uint32_t i = 0; i < num_zeros_reads; ++i) { - noc_async_read(zeros_noc_addr, write_addr, MEM_ZEROS_SIZE); - write_addr += MEM_ZEROS_SIZE; - } - noc_async_read_barrier(); - } else { - // Fill 2 uint16 datums in each writes to optimize for performance - volatile tt_l1_ptr uint32_t* ptr = - reinterpret_cast(get_write_ptr(cb_id) + tile_id * tile_bytes); - constexpr int num_uint32_datums_tile = (32 * 32) / 2; - for (int k = 0; k < num_uint32_datums_tile; k++) { - ptr[k] = val; - } - } -} - -template -void fill_tile_partial(uint32_t cb_id, uint32_t tile_id, uint32_t cur_pos_in_tile, uint32_t partial_val) { - /* - We want to fill cur_pos_in_tile + 1 to the end - */ - - fill_tile(cb_id, tile_id, 0); - if 
(cur_pos_in_tile == 31 || partial_val == 0) { - return; - } - const uint16_t datum_val = partial_val >> 16; - volatile tt_l1_ptr uint16_t* uint16_ptr = - reinterpret_cast(get_write_ptr(cb_id) + tile_id * tile_bytes); - volatile tt_l1_ptr uint32_t* uint32_ptr = - reinterpret_cast(get_write_ptr(cb_id) + tile_id * tile_bytes); - int face_start = (cur_pos_in_tile < 15) ? 0 : 1; - uint32_t fill_pos_in_face = (cur_pos_in_tile + 1) % 16; - if (face_start == 0) { - // Fill 2 datums in each writes to optimize for performance - constexpr int num_uint32_datums_tile_face = (16 * 16) / 2; - for (int k = 1; k < 4; k += 2) { - uint32_t uint32_face_idx = k << 7; - for (int j = 0; j < num_uint32_datums_tile_face; j++) { - uint32_ptr[uint32_face_idx + j] = partial_val; - } - } - } - - // Again, optimizing performance by filling 2 uint16 datums in each write. - // If the fill_pos_in_face is odd then we fill that pos with single datum, - // otherwise we fill 2 datums in each write - bool is_odd_pos_filled = fill_pos_in_face % 2 == 1; - uint32_t fill_pos_in_uint32_face = (fill_pos_in_face + 1) >> 1; - constexpr uint32_t num_cols_in_face = 16; - constexpr uint32_t num_rows_in_face = 16; - constexpr uint32_t num_cols_in_uint32_face = num_cols_in_face >> 1; - for (int k = face_start; k < 4; k += 2) { - uint32_t uint16_face_idx = k << 8; - uint32_t uint32_face_idx = k << 7; - - for (uint32_t face_row_idx = 0; face_row_idx < num_rows_in_face; face_row_idx++) { - // Here, if the fill_pos_in_face is odd then we fill that pos with single uint16 value - if (is_odd_pos_filled) { - uint16_ptr[uint16_face_idx + (fill_pos_in_face + num_cols_in_face * face_row_idx)] = datum_val; - } - - for (uint32_t uint32_face_col_idx = fill_pos_in_uint32_face; uint32_face_col_idx < num_cols_in_uint32_face; - uint32_face_col_idx++) { - uint32_ptr[uint32_face_idx + (uint32_face_col_idx + num_cols_in_uint32_face * face_row_idx)] = - partial_val; - } - } - } -} - -template -void generate_mask(uint32_t k_num_chunks, 
uint32_t PSt, uint32_t cur_pos) { - /* - example 1: 64 seqlen at cur_pos 40, 2 cores, 32 chunk size - PSt = 2 - k_num_chunks = 2 - Sk_chunk_t = 1 - cur_pos = 40 - cur_pos_in_chunk = 8 - cur_pos_in_chunk_t = 0 - cur_pos_in_tile = 8 - - example 2: 1024 seqlen at cur_pos 990, 2 cores, 128 chunk size - PSt = 32 - k_num_chunks = 8 - Sk_chunk_t = 4 - cur_pos = 990 - cur_pos_in_chunk = 94 - cur_pos_in_chunk_t = 2 - cur_pos_in_tile = 30 - - example 3: 64 seqlen at cur_pos 63, 2 cores, 32 chunk size - PSt = 2 - k_num_chunks = 2 - Sk_chunk_t = 1 - cur_pos = 63 - cur_pos_in_chunk = 31 - cur_pos_in_chunk_t = 0 - cur_pos_in_tile = 31 - - example 3: 64 seqlen at cur_pos 0, 2 cores, 32 chunk size - PSt = 2 - k_num_chunks = 2 - Sk_chunk_t = 1 - cur_pos = 0 - cur_pos_in_chunk = 0 - cur_pos_in_chunk_t = 0 - cur_pos_in_tile = 0 - */ - - uint32_t Sk_chunk_t = PSt / k_num_chunks; - // the cb_mask in is of size PNHt * Sk_chunk_t - uint32_t total_read_tiles = PNHt * Sk_chunk_t; - uint32_t cur_pos_in_chunk = cur_pos % (Sk_chunk_t * 32); - uint32_t cur_pos_in_chunk_t = cur_pos_in_chunk / 32; - uint32_t cur_pos_in_tile = cur_pos_in_chunk % 32; - constexpr uint32_t NEG_INF = 0xFF80FF80; // TODO: Make sure this is -inf - - cb_reserve_back(cb_mask_in, total_read_tiles); - - uint64_t noc_read_addr_base = get_noc_addr(get_read_ptr(cb_mask_in)); - uint32_t q_write_ptr_base = get_read_ptr(cb_mask_in); - constexpr uint32_t tile_bytes = get_tile_size(cb_mask_in); - - for (uint32_t i = 0; i < Sk_chunk_t; ++i) { - if (i < cur_pos_in_chunk_t) { - // fill with zero - if (i == 0) { - fill_tile(cb_mask_in, i, 0); - } else { - copy_tile( - noc_read_addr_base, q_write_ptr_base, 0, i); // copy from cb_mask_in[0] to cb_mask_in[i] - if (i == cur_pos_in_chunk_t - 1) { - noc_async_read_barrier(); - } - } - } else if (i == cur_pos_in_chunk_t) { - // fill with partial zero/-inf - fill_tile_partial(cb_mask_in, i, cur_pos_in_tile, NEG_INF); - } else { - // fill with -inf - if (i == cur_pos_in_chunk_t + 1) { - 
fill_tile(cb_mask_in, i, NEG_INF); - } else { - copy_tile( - noc_read_addr_base, - q_write_ptr_base, - cur_pos_in_chunk_t + 1, - i); // copy from cb_mask_in[cur_pos_in_chunk_t+1] to cb_mask_in[i] - if (i == Sk_chunk_t - 1) { - noc_async_read_barrier(); - } - } - } - for (uint32_t j = 1; j < PNHt; ++j) { - // copy from cb_mask_in[i] to cb_mask_in[j*Sk_chunk_t + i] - copy_tile(noc_read_addr_base, q_write_ptr_base, i, j * Sk_chunk_t + i); - if (j == PNHt - 1) { - noc_async_read_barrier(); - } - } - } - - cb_push_back(cb_mask_in, total_read_tiles); -} - -template < - uint32_t out_chunk_tiles, - uint32_t cb_out, - uint32_t cb_out_m, - uint32_t cb_out_l, - uint32_t cb_intermed_out, - uint32_t PNHt> -void worker_compute( - uint64_t in0_sender_semaphore_noc_addr, - uint32_t worker_id, - uint32_t reduce_core_noc_x, - uint32_t reduce_core_noc_y) { - uint32_t out_tile_id = 0; - - // Wait for compute to deliver output chunk - cb_wait_front(cb_out, out_chunk_tiles); - cb_wait_front(cb_out_m, PNHt); - cb_wait_front(cb_out_l, PNHt); - - // Write output chunk to reducer - constexpr uint32_t tile_bytes = get_tile_size(cb_out); - uint32_t worker_offset = worker_id * (out_chunk_tiles + 2 * PNHt) * tile_bytes; - constexpr uint32_t o_write_size = out_chunk_tiles * tile_bytes; - constexpr uint32_t ml_write_size = PNHt * tile_bytes; - uint64_t output_write_addr = - get_noc_addr(reduce_core_noc_x, reduce_core_noc_y, get_write_ptr(cb_intermed_out)) + worker_offset; - noc_async_write(get_read_ptr(cb_out), output_write_addr, o_write_size); - output_write_addr += o_write_size; - noc_async_write(get_read_ptr(cb_out_m), output_write_addr, ml_write_size); - output_write_addr += ml_write_size; - noc_async_write(get_read_ptr(cb_out_l), output_write_addr, ml_write_size); - - // increment semaphore - noc_async_write_barrier(); - noc_semaphore_inc(in0_sender_semaphore_noc_addr, 1); - - // pop front - cb_pop_front(cb_out, out_chunk_tiles); - cb_pop_front(cb_out_m, PNHt); - cb_pop_front(cb_out_l, 
PNHt); -} +#include "ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/kernels/rt_args_common.hpp" +#include "dataflow_common.hpp" void kernel_main() { constexpr uint32_t B = get_compile_time_arg_val(0); // batch size diff --git a/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/rt_args_common.hpp b/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/kernels/rt_args_common.hpp similarity index 100% rename from ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/rt_args_common.hpp rename to ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/kernels/rt_args_common.hpp From 3e9657669dc50d3588520107c50dca2882742c0e Mon Sep 17 00:00:00 2001 From: Allan Liu Date: Mon, 9 Dec 2024 19:07:22 +0000 Subject: [PATCH 38/59] Re-enable device test for N300 --- .../tt_metal/device/test_device_init_and_teardown.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tests/tt_metal/tt_metal/device/test_device_init_and_teardown.cpp b/tests/tt_metal/tt_metal/device/test_device_init_and_teardown.cpp index c37e3e7c80b4..f668b6a269dc 100644 --- a/tests/tt_metal/tt_metal/device/test_device_init_and_teardown.cpp +++ b/tests/tt_metal/tt_metal/device/test_device_init_and_teardown.cpp @@ -67,11 +67,6 @@ TEST_P(DeviceParamFixture, DeviceInitializeAndTeardown) { GTEST_SKIP(); } - // see issue #9594 - if (arch == tt::ARCH::WORMHOLE_B0 && num_devices > 1) { - GTEST_SKIP(); - } - ASSERT_TRUE(num_devices > 0); vector ids; for (unsigned int id = 0; id < num_devices; id++) { From 883bbf064bb4b7003ed51b01c8c0340b51c2a606 Mon Sep 17 00:00:00 2001 From: Radomir Djogo <159184120+rdjogoTT@users.noreply.github.com> Date: Tue, 10 Dec 2024 16:07:50 -0500 Subject: [PATCH 39/59] Update Binary SFPU Add Int32 for new format (#15849) ### Ticket [Link to Github Issue](https://github.com/tenstorrent/tt-metal/issues/15122) ### Problem description/What's changed Binary SFPU OP add int32 LLK is updated to work with int32 formatted in 2's complement. 
### Checklist - [x] Post commit CI passes: https://github.com/tenstorrent/tt-metal/actions/runs/12261691466 - [x] Blackhole Post commit (if applicable): https://github.com/tenstorrent/tt-metal/actions/runs/12261694920 - [x] New/Existing tests provide coverage for changes - will be added in following PR --- .../llk_api/llk_sfpu/ckernel_sfpu_add_int32.h | 4 ++-- .../llk_math_eltwise_binary_sfpu_add_int32.h | 4 ++-- .../llk_api/llk_sfpu/ckernel_sfpu_add_int32.h | 4 ++-- .../llk_math_eltwise_binary_sfpu_add_int32.h | 4 ++-- .../include/compute_kernel_api/add_int32_sfpu.h | 17 ++++++++++------- tt_metal/third_party/tt_llk_blackhole | 2 +- tt_metal/third_party/tt_llk_wormhole_b0 | 2 +- 7 files changed, 20 insertions(+), 17 deletions(-) diff --git a/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/ckernel_sfpu_add_int32.h b/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/ckernel_sfpu_add_int32.h index fff976fbf0b2..ac685fb0d759 100644 --- a/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/ckernel_sfpu_add_int32.h +++ b/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/ckernel_sfpu_add_int32.h @@ -13,9 +13,9 @@ using namespace sfpi; namespace ckernel { namespace sfpu { -template +template inline void calculate_add_int32(const uint dst_offset) { - _add_int32_(dst_offset); + _add_int32_(dst_offset); } } // namespace sfpu diff --git a/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/llk_math_eltwise_binary_sfpu_add_int32.h b/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/llk_math_eltwise_binary_sfpu_add_int32.h index db9d25799560..907847502f9b 100644 --- a/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/llk_math_eltwise_binary_sfpu_add_int32.h +++ b/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/llk_math_eltwise_binary_sfpu_add_int32.h @@ -17,11 +17,11 @@ inline void llk_math_eltwise_binary_sfpu_add_int32_init() { llk_math_eltwise_binary_sfpu_init(); } -template +template inline void llk_math_eltwise_binary_sfpu_add_int32( uint 
dst_index0, uint32_t dst_index1, int vector_mode = VectorMode::RC) { llk_math_eltwise_binary_sfpu_params( - ckernel::sfpu::calculate_add_int32, dst_index0, dst_index1, vector_mode); + ckernel::sfpu::calculate_add_int32, dst_index0, dst_index1, vector_mode); } } // namespace ckernel diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_add_int32.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_add_int32.h index fff976fbf0b2..ac685fb0d759 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_add_int32.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_add_int32.h @@ -13,9 +13,9 @@ using namespace sfpi; namespace ckernel { namespace sfpu { -template +template inline void calculate_add_int32(const uint dst_offset) { - _add_int32_(dst_offset); + _add_int32_(dst_offset); } } // namespace sfpu diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_binary_sfpu_add_int32.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_binary_sfpu_add_int32.h index db9d25799560..907847502f9b 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_binary_sfpu_add_int32.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_binary_sfpu_add_int32.h @@ -17,11 +17,11 @@ inline void llk_math_eltwise_binary_sfpu_add_int32_init() { llk_math_eltwise_binary_sfpu_init(); } -template +template inline void llk_math_eltwise_binary_sfpu_add_int32( uint dst_index0, uint32_t dst_index1, int vector_mode = VectorMode::RC) { llk_math_eltwise_binary_sfpu_params( - ckernel::sfpu::calculate_add_int32, dst_index0, dst_index1, vector_mode); + ckernel::sfpu::calculate_add_int32, dst_index0, dst_index1, vector_mode); } } // namespace ckernel diff --git a/tt_metal/include/compute_kernel_api/add_int32_sfpu.h b/tt_metal/include/compute_kernel_api/add_int32_sfpu.h index 891035558210..4de5ee5b55af 100644 
--- a/tt_metal/include/compute_kernel_api/add_int32_sfpu.h +++ b/tt_metal/include/compute_kernel_api/add_int32_sfpu.h @@ -24,15 +24,18 @@ namespace ckernel { * * Return value: None * - * | Argument | Description | Type | Valid Range | - * Required | - * |----------------|-----------------------------------------------------------------------|----------|-------------------------------------------------------|----------| - * | idst0 | The index of the tile in DST register buffer to use as first operand | uint32_t | Must be less - * than the size of the DST register buffer | True | | idst1 | The index of the tile in DST register buffer - * to use as second operand | uint32_t | Must be less than the size of the DST register buffer | True | + * | Argument | Description | Type | + * Valid Range | Required | + * |-----------------------|-----------------------------------------------------------------------------|----------|-------------------------------------------------------|----------| + * | idst0 | The index of the tile in DST register buffer to use as first operand | uint32_t | + * Must be less than the size of the DST register buffer | True | | idst1 | The index of the tile in + * DST register buffer to use as second operand | uint32_t | Must be less than the size of the DST register buffer + * | True | | sign_magnitude_format | Whether the Int32 values are in sign-magnitude format (not 2's complement) | + * bool | | False | */ +template ALWI void add_int32_tile(uint32_t idst0, uint32_t idst1) { - MATH((llk_math_eltwise_binary_sfpu_add_int32(idst0, idst1))); + MATH((llk_math_eltwise_binary_sfpu_add_int32(idst0, idst1))); } /** diff --git a/tt_metal/third_party/tt_llk_blackhole b/tt_metal/third_party/tt_llk_blackhole index 8b5afa5b0f92..7536fbacd75a 160000 --- a/tt_metal/third_party/tt_llk_blackhole +++ b/tt_metal/third_party/tt_llk_blackhole @@ -1 +1 @@ -Subproject commit 8b5afa5b0f92841f13d49263482bdde6aaeef4ca +Subproject commit 
7536fbacd75a4ad62047c63c9c54176fae079e06 diff --git a/tt_metal/third_party/tt_llk_wormhole_b0 b/tt_metal/third_party/tt_llk_wormhole_b0 index ed02df9eb4bb..0f57d4e9dec6 160000 --- a/tt_metal/third_party/tt_llk_wormhole_b0 +++ b/tt_metal/third_party/tt_llk_wormhole_b0 @@ -1 +1 @@ -Subproject commit ed02df9eb4bbfb37da1b9d9a8a129f1f6842a6cd +Subproject commit 0f57d4e9dec602b68671be8891e7af876285f275 From 208db3edff98f9aa5e2d67e475903c9669b8f164 Mon Sep 17 00:00:00 2001 From: Marko Bezulj <156311081+mbezuljTT@users.noreply.github.com> Date: Tue, 10 Dec 2024 22:23:32 +0100 Subject: [PATCH 40/59] Replace extern RunTimeOptions OptionG; with RunTimeOptions::get_instance() (#15860) ### Problem description > Note: global initialization order is non-deterministic > This is ok so long as this gets initialized before decisions are based on env state I was hitting this issue in a unit test in tt-mlir repo. ### What's changed Replace extern RunTimeOptions OptionG; with RunTimeOptions::get_instance() to avoid edge-case bugs with order of initialization of static variables.
### Checklist - [x] Post commit CI passes - [ ] Blackhole Post commit (if applicable) - [ ] Model regression CI testing passes (if applicable) - [ ] Device performance regression CI testing passes (if applicable) - [ ] **(For models and ops writers)** Full [new models](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) tests passes - [ ] New/Existing tests provide coverage for changes --- ...ogram_with_kernel_path_env_var_fixture.hpp | 12 +- .../tt_metal/common/command_queue_fixture.hpp | 6 +- .../tt_metal/common/device_fixture.hpp | 2 +- .../tt_metal/common/dispatch_fixture.hpp | 4 +- .../tt_metal/common/multi_device_fixture.hpp | 2 +- .../debug_tools/debug_tools_fixture.hpp | 98 ++++++++--------- .../dprint/test_invalid_print_core.cpp | 4 +- .../dprint/test_print_all_harts.cpp | 18 +-- .../debug_tools/dprint/test_print_tiles.cpp | 18 +-- .../device/test_device_init_and_teardown.cpp | 4 +- .../tt_metal/device/test_device_pool.cpp | 8 +- .../dispatch/dispatch_event/test_events.cpp | 4 +- .../dispatch/multi_command_queue_fixture.hpp | 6 +- .../dispatch/test_dispatcher.cpp | 2 +- .../dispatch/test_pgm_dispatch.cpp | 4 +- .../dispatch/test_prefetcher.cpp | 2 +- .../routing/test_bi_tunnel.cpp | 2 +- .../routing/test_mux_demux.cpp | 2 +- .../routing/test_mux_demux_2level.cpp | 2 +- .../routing/test_tunnel_1cq.cpp | 2 +- .../routing/test_tunnel_2cq.cpp | 2 +- .../routing/test_tx_rx.cpp | 2 +- .../routing/test_uni_tunnel.cpp | 2 +- .../routing/test_uni_tunnel_single_chip.cpp | 2 +- .../routing/test_vc_bi_tunnel_2ep.cpp | 2 +- .../routing/test_vc_bi_tunnel_4ep.cpp | 2 +- .../routing/test_vc_loopback_tunnel.cpp | 2 +- .../routing/test_vc_mux_demux.cpp | 2 +- .../routing/test_vc_uni_tunnel.cpp | 2 +- tests/tt_metal/tt_metal/test_clean_init.cpp | 2 +- tests/tt_metal/tt_metal/test_compile_args.cpp | 2 +- tt_metal/common/utils.cpp | 2 +- tt_metal/impl/debug/dprint_server.cpp | 54 +++++---- tt_metal/impl/debug/noc_logging.cpp | 10 +- 
tt_metal/impl/debug/watcher_device_reader.cpp | 20 ++-- tt_metal/impl/debug/watcher_server.cpp | 33 +++--- tt_metal/impl/device/device.cpp | 17 +-- tt_metal/impl/device/device_pool.cpp | 2 +- tt_metal/impl/dispatch/command_queue.cpp | 8 +- tt_metal/impl/dispatch/data_collection.cpp | 6 +- tt_metal/impl/program/program.cpp | 4 +- tt_metal/jit_build/build.cpp | 49 +++++---- tt_metal/jit_build/build.hpp | 2 +- tt_metal/jit_build/genfiles.cpp | 8 +- tt_metal/llrt/llrt.cpp | 4 +- tt_metal/llrt/rtoptions.cpp | 103 +++++++++--------- tt_metal/llrt/rtoptions.hpp | 44 ++++---- tt_metal/llrt/tt_cluster.cpp | 9 +- tt_metal/tools/profiler/profiler.cpp | 4 +- tt_metal/tools/profiler/tt_metal_profiler.cpp | 6 +- tt_metal/tools/watcher_dump/watcher_dump.cpp | 8 +- tt_metal/tt_metal.cpp | 4 +- 52 files changed, 323 insertions(+), 298 deletions(-) diff --git a/tests/tt_metal/tt_metal/api/compile_program_with_kernel_path_env_var_fixture.hpp b/tests/tt_metal/tt_metal/api/compile_program_with_kernel_path_env_var_fixture.hpp index 524210625b0a..56db54c130b5 100644 --- a/tests/tt_metal/tt_metal/api/compile_program_with_kernel_path_env_var_fixture.hpp +++ b/tests/tt_metal/tt_metal/api/compile_program_with_kernel_path_env_var_fixture.hpp @@ -38,18 +38,18 @@ class CompileProgramWithKernelPathEnvVarFixture : public ::testing::Test { } void setup_kernel_dir(const string& orig_kernel_file, const string& new_kernel_file) { - const string& kernel_dir = llrt::OptionsG.get_kernel_dir(); + const string& kernel_dir = llrt::RunTimeOptions::get_instance().get_kernel_dir(); const std::filesystem::path& kernel_file_path_under_kernel_dir(kernel_dir + new_kernel_file); const std::filesystem::path& dirs_under_kernel_dir = kernel_file_path_under_kernel_dir.parent_path(); std::filesystem::create_directories(dirs_under_kernel_dir); - const string& metal_root = llrt::OptionsG.get_root_dir(); + const string& metal_root = llrt::RunTimeOptions::get_instance().get_root_dir(); const std::filesystem::path& 
kernel_file_path_under_metal_root(metal_root + orig_kernel_file); std::filesystem::copy(kernel_file_path_under_metal_root, kernel_file_path_under_kernel_dir); } void cleanup_kernel_dir() { - const string& kernel_dir = llrt::OptionsG.get_kernel_dir(); + const string& kernel_dir = llrt::RunTimeOptions::get_instance().get_kernel_dir(); for (const std::filesystem::directory_entry& entry : std::filesystem::directory_iterator(kernel_dir)) { std::filesystem::remove_all(entry); } @@ -63,11 +63,11 @@ class CompileProgramWithKernelPathEnvVarFixture : public ::testing::Test { bool are_env_vars_set() { bool are_set = true; - if (!llrt::OptionsG.is_root_dir_specified()) { + if (!llrt::RunTimeOptions::get_instance().is_root_dir_specified()) { log_info(LogTest, "Skipping test: TT_METAL_HOME must be set"); are_set = false; } - if (!llrt::OptionsG.is_kernel_dir_specified()) { + if (!llrt::RunTimeOptions::get_instance().is_kernel_dir_specified()) { log_info(LogTest, "Skipping test: TT_METAL_KERNEL_PATH must be set"); are_set = false; } @@ -76,7 +76,7 @@ class CompileProgramWithKernelPathEnvVarFixture : public ::testing::Test { bool is_kernel_dir_valid() { bool is_valid = true; - const string& kernel_dir = llrt::OptionsG.get_kernel_dir(); + const string& kernel_dir = llrt::RunTimeOptions::get_instance().get_kernel_dir(); if (!this->does_path_exist(kernel_dir) || !this->is_path_a_directory(kernel_dir) || !this->is_dir_empty(kernel_dir)) { log_info(LogTest, "Skipping test: TT_METAL_KERNEL_PATH must be an existing, empty directory"); diff --git a/tests/tt_metal/tt_metal/common/command_queue_fixture.hpp b/tests/tt_metal/tt_metal/common/command_queue_fixture.hpp index b51338d1d0f8..efec2c625342 100644 --- a/tests/tt_metal/tt_metal/common/command_queue_fixture.hpp +++ b/tests/tt_metal/tt_metal/common/command_queue_fixture.hpp @@ -44,7 +44,7 @@ class CommandQueueFixture : public DispatchFixture { void create_device(const size_t trace_region_size = DEFAULT_TRACE_REGION_SIZE) { const 
chip_id_t device_id = 0; - const auto& dispatch_core_config = tt::llrt::OptionsG.get_dispatch_core_config(); + const auto& dispatch_core_config = tt::llrt::RunTimeOptions::get_instance().get_dispatch_core_config(); this->device_ = tt::tt_metal::CreateDevice(device_id, 1, DEFAULT_L1_SMALL_SIZE, trace_region_size, dispatch_core_config); } @@ -88,7 +88,7 @@ class CommandQueueSingleCardFixture : virtual public DispatchFixture { } void create_devices(const std::size_t trace_region_size = DEFAULT_TRACE_REGION_SIZE) { - const auto& dispatch_core_config = tt::llrt::OptionsG.get_dispatch_core_config(); + const auto& dispatch_core_config = tt::llrt::RunTimeOptions::get_instance().get_dispatch_core_config(); const chip_id_t mmio_device_id = 0; this->reserved_devices_ = tt::tt_metal::detail::CreateDevices( {mmio_device_id}, 1, DEFAULT_L1_SMALL_SIZE, trace_region_size, dispatch_core_config); @@ -143,7 +143,7 @@ class CommandQueueMultiDeviceFixture : public DispatchFixture { chip_ids.push_back(id); } - const auto& dispatch_core_config = tt::llrt::OptionsG.get_dispatch_core_config(); + const auto& dispatch_core_config = tt::llrt::RunTimeOptions::get_instance().get_dispatch_core_config(); reserved_devices_ = tt::tt_metal::detail::CreateDevices( chip_ids, 1, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, dispatch_core_config); for (const auto& [id, device] : reserved_devices_) { diff --git a/tests/tt_metal/tt_metal/common/device_fixture.hpp b/tests/tt_metal/tt_metal/common/device_fixture.hpp index 17dff1caa077..7de22aeb4914 100644 --- a/tests/tt_metal/tt_metal/common/device_fixture.hpp +++ b/tests/tt_metal/tt_metal/common/device_fixture.hpp @@ -44,7 +44,7 @@ class DeviceFixture : public DispatchFixture { } void create_devices(const std::vector& device_ids) { - const auto& dispatch_core_config = tt::llrt::OptionsG.get_dispatch_core_config(); + const auto& dispatch_core_config = tt::llrt::RunTimeOptions::get_instance().get_dispatch_core_config(); tt::DevicePool::initialize( 
device_ids, 1, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, dispatch_core_config); this->devices_ = tt::DevicePool::instance().get_all_active_devices(); diff --git a/tests/tt_metal/tt_metal/common/dispatch_fixture.hpp b/tests/tt_metal/tt_metal/common/dispatch_fixture.hpp index 57bfbcdb934d..c896b949c65c 100644 --- a/tests/tt_metal/tt_metal/common/dispatch_fixture.hpp +++ b/tests/tt_metal/tt_metal/common/dispatch_fixture.hpp @@ -75,10 +75,10 @@ class DispatchFixture : public ::testing::Test { } ids.push_back(id); } - const auto& dispatch_core_config = tt::llrt::OptionsG.get_dispatch_core_config(); + const auto& dispatch_core_config = tt::llrt::RunTimeOptions::get_instance().get_dispatch_core_config(); tt::DevicePool::initialize( ids, - tt::llrt::OptionsG.get_num_hw_cqs(), + tt::llrt::RunTimeOptions::get_instance().get_num_hw_cqs(), DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, dispatch_core_config); diff --git a/tests/tt_metal/tt_metal/common/multi_device_fixture.hpp b/tests/tt_metal/tt_metal/common/multi_device_fixture.hpp index 21cf4dc2943b..f878519b1d7c 100644 --- a/tests/tt_metal/tt_metal/common/multi_device_fixture.hpp +++ b/tests/tt_metal/tt_metal/common/multi_device_fixture.hpp @@ -38,7 +38,7 @@ class N300DeviceFixture : public MultiDeviceFixture { ids.push_back(id); } - const auto& dispatch_core_config = tt::llrt::OptionsG.get_dispatch_core_config(); + const auto& dispatch_core_config = tt::llrt::RunTimeOptions::get_instance().get_dispatch_core_config(); tt::DevicePool::initialize(ids, 1, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, dispatch_core_config); this->devices_ = tt::DevicePool::instance().get_all_active_devices(); } else { diff --git a/tests/tt_metal/tt_metal/debug_tools/debug_tools_fixture.hpp b/tests/tt_metal/tt_metal/debug_tools/debug_tools_fixture.hpp index f8189d9c98e9..9d4f67142584 100644 --- a/tests/tt_metal/tt_metal/debug_tools/debug_tools_fixture.hpp +++ b/tests/tt_metal/tt_metal/debug_tools/debug_tools_fixture.hpp @@ -15,7 
+15,7 @@ class DebugToolsFixture : public DispatchFixture { void TearDown() override { DispatchFixture::TearDown(); - tt::llrt::OptionsG.set_watcher_enabled(watcher_previous_enabled); + tt::llrt::RunTimeOptions::get_instance().set_watcher_enabled(watcher_previous_enabled); } template @@ -45,17 +45,17 @@ class DPrintFixture : public DebugToolsFixture { // The core range (physical) needs to be set >= the set of all cores // used by all tests using this fixture, so set dprint enabled for // all cores and all devices - tt::llrt::OptionsG.set_feature_enabled(tt::llrt::RunTimeDebugFeatureDprint, true); - tt::llrt::OptionsG.set_feature_all_cores( + tt::llrt::RunTimeOptions::get_instance().set_feature_enabled(tt::llrt::RunTimeDebugFeatureDprint, true); + tt::llrt::RunTimeOptions::get_instance().set_feature_all_cores( tt::llrt::RunTimeDebugFeatureDprint, CoreType::WORKER, tt::llrt::RunTimeDebugClassWorker); - tt::llrt::OptionsG.set_feature_all_cores( + tt::llrt::RunTimeOptions::get_instance().set_feature_all_cores( tt::llrt::RunTimeDebugFeatureDprint, CoreType::ETH, tt::llrt::RunTimeDebugClassWorker); - tt::llrt::OptionsG.set_feature_all_chips(tt::llrt::RunTimeDebugFeatureDprint, true); + tt::llrt::RunTimeOptions::get_instance().set_feature_all_chips(tt::llrt::RunTimeDebugFeatureDprint, true); // Send output to a file so the test can check after program is run. 
- tt::llrt::OptionsG.set_feature_file_name(tt::llrt::RunTimeDebugFeatureDprint, dprint_file_name); - tt::llrt::OptionsG.set_test_mode_enabled(true); - watcher_previous_enabled = tt::llrt::OptionsG.get_watcher_enabled(); - tt::llrt::OptionsG.set_watcher_enabled(false); + tt::llrt::RunTimeOptions::get_instance().set_feature_file_name(tt::llrt::RunTimeDebugFeatureDprint, dprint_file_name); + tt::llrt::RunTimeOptions::get_instance().set_test_mode_enabled(true); + watcher_previous_enabled = tt::llrt::RunTimeOptions::get_instance().get_watcher_enabled(); + tt::llrt::RunTimeOptions::get_instance().set_watcher_enabled(false); ExtraSetUp(); @@ -71,15 +71,15 @@ class DPrintFixture : public DebugToolsFixture { std::remove(dprint_file_name.c_str()); // Reset DPrint settings - tt::llrt::OptionsG.set_feature_cores(tt::llrt::RunTimeDebugFeatureDprint, {}); - tt::llrt::OptionsG.set_feature_enabled(tt::llrt::RunTimeDebugFeatureDprint, false); - tt::llrt::OptionsG.set_feature_all_cores( + tt::llrt::RunTimeOptions::get_instance().set_feature_cores(tt::llrt::RunTimeDebugFeatureDprint, {}); + tt::llrt::RunTimeOptions::get_instance().set_feature_enabled(tt::llrt::RunTimeDebugFeatureDprint, false); + tt::llrt::RunTimeOptions::get_instance().set_feature_all_cores( tt::llrt::RunTimeDebugFeatureDprint, CoreType::WORKER, tt::llrt::RunTimeDebugClassNoneSpecified); - tt::llrt::OptionsG.set_feature_all_cores( + tt::llrt::RunTimeOptions::get_instance().set_feature_all_cores( tt::llrt::RunTimeDebugFeatureDprint, CoreType::ETH, tt::llrt::RunTimeDebugClassNoneSpecified); - tt::llrt::OptionsG.set_feature_all_chips(tt::llrt::RunTimeDebugFeatureDprint, false); - tt::llrt::OptionsG.set_feature_file_name(tt::llrt::RunTimeDebugFeatureDprint, ""); - tt::llrt::OptionsG.set_test_mode_enabled(false); + tt::llrt::RunTimeOptions::get_instance().set_feature_all_chips(tt::llrt::RunTimeDebugFeatureDprint, false); + tt::llrt::RunTimeOptions::get_instance().set_feature_file_name(tt::llrt::RunTimeDebugFeatureDprint, 
""); + tt::llrt::RunTimeOptions::get_instance().set_test_mode_enabled(false); } void RunTestOnDevice( @@ -101,8 +101,8 @@ class DPrintDisableDevicesFixture : public DPrintFixture { protected: void ExtraSetUp() override { // For this test, mute each devices using the environment variable - tt::llrt::OptionsG.set_feature_all_chips(tt::llrt::RunTimeDebugFeatureDprint, false); - tt::llrt::OptionsG.set_feature_chip_ids(tt::llrt::RunTimeDebugFeatureDprint, {}); + tt::llrt::RunTimeOptions::get_instance().set_feature_all_chips(tt::llrt::RunTimeDebugFeatureDprint, false); + tt::llrt::RunTimeOptions::get_instance().set_feature_chip_ids(tt::llrt::RunTimeDebugFeatureDprint, {}); } }; @@ -135,20 +135,20 @@ class WatcherFixture : public DebugToolsFixture { bool test_mode_previous; void SetUp() override { // Enable watcher for this test, save the previous state so we can restore it later. - watcher_previous_enabled = tt::llrt::OptionsG.get_watcher_enabled(); - watcher_previous_interval = tt::llrt::OptionsG.get_watcher_interval(); - watcher_previous_dump_all = tt::llrt::OptionsG.get_watcher_dump_all(); - watcher_previous_append = tt::llrt::OptionsG.get_watcher_append(); - watcher_previous_auto_unpause = tt::llrt::OptionsG.get_watcher_auto_unpause(); - watcher_previous_noinline = tt::llrt::OptionsG.get_watcher_noinline(); - test_mode_previous = tt::llrt::OptionsG.get_test_mode_enabled(); - tt::llrt::OptionsG.set_watcher_enabled(true); - tt::llrt::OptionsG.set_watcher_interval(interval_ms); - tt::llrt::OptionsG.set_watcher_dump_all(false); - tt::llrt::OptionsG.set_watcher_append(false); - tt::llrt::OptionsG.set_watcher_auto_unpause(true); - tt::llrt::OptionsG.set_watcher_noinline(true); - tt::llrt::OptionsG.set_test_mode_enabled(true); + watcher_previous_enabled = tt::llrt::RunTimeOptions::get_instance().get_watcher_enabled(); + watcher_previous_interval = tt::llrt::RunTimeOptions::get_instance().get_watcher_interval(); + watcher_previous_dump_all = 
tt::llrt::RunTimeOptions::get_instance().get_watcher_dump_all(); + watcher_previous_append = tt::llrt::RunTimeOptions::get_instance().get_watcher_append(); + watcher_previous_auto_unpause = tt::llrt::RunTimeOptions::get_instance().get_watcher_auto_unpause(); + watcher_previous_noinline = tt::llrt::RunTimeOptions::get_instance().get_watcher_noinline(); + test_mode_previous = tt::llrt::RunTimeOptions::get_instance().get_test_mode_enabled(); + tt::llrt::RunTimeOptions::get_instance().set_watcher_enabled(true); + tt::llrt::RunTimeOptions::get_instance().set_watcher_interval(interval_ms); + tt::llrt::RunTimeOptions::get_instance().set_watcher_dump_all(false); + tt::llrt::RunTimeOptions::get_instance().set_watcher_append(false); + tt::llrt::RunTimeOptions::get_instance().set_watcher_auto_unpause(true); + tt::llrt::RunTimeOptions::get_instance().set_watcher_noinline(true); + tt::llrt::RunTimeOptions::get_instance().set_test_mode_enabled(true); tt::watcher_clear_log(); // Parent class initializes devices and any necessary flags @@ -160,12 +160,12 @@ class WatcherFixture : public DebugToolsFixture { DebugToolsFixture::TearDown(); // Reset watcher settings to their previous values - tt::llrt::OptionsG.set_watcher_interval(watcher_previous_interval); - tt::llrt::OptionsG.set_watcher_dump_all(watcher_previous_dump_all); - tt::llrt::OptionsG.set_watcher_append(watcher_previous_append); - tt::llrt::OptionsG.set_watcher_auto_unpause(watcher_previous_auto_unpause); - tt::llrt::OptionsG.set_watcher_noinline(watcher_previous_noinline); - tt::llrt::OptionsG.set_test_mode_enabled(test_mode_previous); + tt::llrt::RunTimeOptions::get_instance().set_watcher_interval(watcher_previous_interval); + tt::llrt::RunTimeOptions::get_instance().set_watcher_dump_all(watcher_previous_dump_all); + tt::llrt::RunTimeOptions::get_instance().set_watcher_append(watcher_previous_append); + tt::llrt::RunTimeOptions::get_instance().set_watcher_auto_unpause(watcher_previous_auto_unpause); + 
tt::llrt::RunTimeOptions::get_instance().set_watcher_noinline(watcher_previous_noinline); + tt::llrt::RunTimeOptions::get_instance().set_test_mode_enabled(test_mode_previous); tt::watcher_server_set_error_flag(false); } @@ -188,19 +188,19 @@ class WatcherDelayFixture : public WatcherFixture { std::map> delayed_cores; void SetUp() override { - tt::llrt::OptionsG.set_watcher_debug_delay(5000000); + tt::llrt::RunTimeOptions::get_instance().set_watcher_debug_delay(5000000); delayed_cores[CoreType::WORKER] = {{0, 0}, {1, 1}}; // Store the previous state of the watcher features - saved_target_selection[tt::llrt::RunTimeDebugFeatureReadDebugDelay] = tt::llrt::OptionsG.get_feature_targets(tt::llrt::RunTimeDebugFeatureReadDebugDelay); - saved_target_selection[tt::llrt::RunTimeDebugFeatureWriteDebugDelay] = tt::llrt::OptionsG.get_feature_targets(tt::llrt::RunTimeDebugFeatureWriteDebugDelay); - saved_target_selection[tt::llrt::RunTimeDebugFeatureAtomicDebugDelay] = tt::llrt::OptionsG.get_feature_targets(tt::llrt::RunTimeDebugFeatureAtomicDebugDelay); + saved_target_selection[tt::llrt::RunTimeDebugFeatureReadDebugDelay] = tt::llrt::RunTimeOptions::get_instance().get_feature_targets(tt::llrt::RunTimeDebugFeatureReadDebugDelay); + saved_target_selection[tt::llrt::RunTimeDebugFeatureWriteDebugDelay] = tt::llrt::RunTimeOptions::get_instance().get_feature_targets(tt::llrt::RunTimeDebugFeatureWriteDebugDelay); + saved_target_selection[tt::llrt::RunTimeDebugFeatureAtomicDebugDelay] = tt::llrt::RunTimeOptions::get_instance().get_feature_targets(tt::llrt::RunTimeDebugFeatureAtomicDebugDelay); // Enable read and write debug delay for the test core - tt::llrt::OptionsG.set_feature_enabled(tt::llrt::RunTimeDebugFeatureReadDebugDelay, true); - tt::llrt::OptionsG.set_feature_cores(tt::llrt::RunTimeDebugFeatureReadDebugDelay, delayed_cores); - tt::llrt::OptionsG.set_feature_enabled(tt::llrt::RunTimeDebugFeatureWriteDebugDelay, true); - 
tt::llrt::OptionsG.set_feature_cores(tt::llrt::RunTimeDebugFeatureWriteDebugDelay, delayed_cores); + tt::llrt::RunTimeOptions::get_instance().set_feature_enabled(tt::llrt::RunTimeDebugFeatureReadDebugDelay, true); + tt::llrt::RunTimeOptions::get_instance().set_feature_cores(tt::llrt::RunTimeDebugFeatureReadDebugDelay, delayed_cores); + tt::llrt::RunTimeOptions::get_instance().set_feature_enabled(tt::llrt::RunTimeDebugFeatureWriteDebugDelay, true); + tt::llrt::RunTimeOptions::get_instance().set_feature_cores(tt::llrt::RunTimeDebugFeatureWriteDebugDelay, delayed_cores); // Call parent WatcherFixture::SetUp(); @@ -211,8 +211,8 @@ class WatcherDelayFixture : public WatcherFixture { WatcherFixture::TearDown(); // Restore - tt::llrt::OptionsG.set_feature_targets(tt::llrt::RunTimeDebugFeatureReadDebugDelay, saved_target_selection[tt::llrt::RunTimeDebugFeatureReadDebugDelay]); - tt::llrt::OptionsG.set_feature_targets(tt::llrt::RunTimeDebugFeatureWriteDebugDelay, saved_target_selection[tt::llrt::RunTimeDebugFeatureWriteDebugDelay]); - tt::llrt::OptionsG.set_feature_targets(tt::llrt::RunTimeDebugFeatureAtomicDebugDelay, saved_target_selection[tt::llrt::RunTimeDebugFeatureAtomicDebugDelay]); + tt::llrt::RunTimeOptions::get_instance().set_feature_targets(tt::llrt::RunTimeDebugFeatureReadDebugDelay, saved_target_selection[tt::llrt::RunTimeDebugFeatureReadDebugDelay]); + tt::llrt::RunTimeOptions::get_instance().set_feature_targets(tt::llrt::RunTimeDebugFeatureWriteDebugDelay, saved_target_selection[tt::llrt::RunTimeDebugFeatureWriteDebugDelay]); + tt::llrt::RunTimeOptions::get_instance().set_feature_targets(tt::llrt::RunTimeDebugFeatureAtomicDebugDelay, saved_target_selection[tt::llrt::RunTimeDebugFeatureAtomicDebugDelay]); } }; diff --git a/tests/tt_metal/tt_metal/debug_tools/dprint/test_invalid_print_core.cpp b/tests/tt_metal/tt_metal/debug_tools/dprint/test_invalid_print_core.cpp index 47ba5193765f..ba54542237c6 100644 --- 
a/tests/tt_metal/tt_metal/debug_tools/dprint/test_invalid_print_core.cpp +++ b/tests/tt_metal/tt_metal/debug_tools/dprint/test_invalid_print_core.cpp @@ -16,12 +16,12 @@ TEST_F(DPrintFixture, TensixTestPrintInvalidCore) { // device setup, but not the print server should simply ignore the invalid cores. std::map> dprint_cores; dprint_cores[CoreType::WORKER] = {{0, 0}, {1, 1}, {100, 100}}; - tt::llrt::OptionsG.set_feature_cores(tt::llrt::RunTimeDebugFeatureDprint, dprint_cores); + tt::llrt::RunTimeOptions::get_instance().set_feature_cores(tt::llrt::RunTimeDebugFeatureDprint, dprint_cores); // We expect that even though illegal worker cores were requested, device setup did not hang. // So just make sure that device setup worked and then close the device. for (Device* device : this->devices_) { EXPECT_TRUE(device != nullptr); } - tt::llrt::OptionsG.set_feature_enabled(tt::llrt::RunTimeDebugFeatureDprint, false); + tt::llrt::RunTimeOptions::get_instance().set_feature_enabled(tt::llrt::RunTimeDebugFeatureDprint, false); } diff --git a/tests/tt_metal/tt_metal/debug_tools/dprint/test_print_all_harts.cpp b/tests/tt_metal/tt_metal/debug_tools/dprint/test_print_all_harts.cpp index f45111887e34..420f2abc0c37 100644 --- a/tests/tt_metal/tt_metal/debug_tools/dprint/test_print_all_harts.cpp +++ b/tests/tt_metal/tt_metal/debug_tools/dprint/test_print_all_harts.cpp @@ -161,22 +161,22 @@ static void RunTest(DPrintFixture* fixture, Device* device) { // failing test cases, although all three kernels simply print. 
KernelHandle brisc_print_kernel_id = CreateKernel( program, - llrt::OptionsG.get_root_dir() + "tests/tt_metal/tt_metal/test_kernels/misc/brisc_print.cpp", + llrt::RunTimeOptions::get_instance().get_root_dir() + + "tests/tt_metal/tt_metal/test_kernels/misc/brisc_print.cpp", core, - DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default} - ); + DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); KernelHandle ncrisc_print_kernel_id = CreateKernel( program, - llrt::OptionsG.get_root_dir() + "tests/tt_metal/tt_metal/test_kernels/misc/ncrisc_print.cpp", + llrt::RunTimeOptions::get_instance().get_root_dir() + + "tests/tt_metal/tt_metal/test_kernels/misc/ncrisc_print.cpp", core, - DataMovementConfig{.processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default} - ); + DataMovementConfig{.processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default}); KernelHandle trisc_print_kernel_id = CreateKernel( program, - llrt::OptionsG.get_root_dir() + "tests/tt_metal/tt_metal/test_kernels/misc/trisc_print.cpp", + llrt::RunTimeOptions::get_instance().get_root_dir() + + "tests/tt_metal/tt_metal/test_kernels/misc/trisc_print.cpp", core, - ComputeConfig{} - ); + ComputeConfig{}); // Run the program fixture->RunProgram(device, program); diff --git a/tests/tt_metal/tt_metal/debug_tools/dprint/test_print_tiles.cpp b/tests/tt_metal/tt_metal/debug_tools/dprint/test_print_tiles.cpp index e324a6f44bc3..2e33cb077dcd 100644 --- a/tests/tt_metal/tt_metal/debug_tools/dprint/test_print_tiles.cpp +++ b/tests/tt_metal/tt_metal/debug_tools/dprint/test_print_tiles.cpp @@ -124,22 +124,22 @@ static void RunTest(DPrintFixture* fixture, Device* device, tt::DataFormat data_ // Create kernels on device KernelHandle brisc_print_kernel_id = CreateKernel( program, - llrt::OptionsG.get_root_dir() + "tests/tt_metal/tt_metal/test_kernels/misc/print_tile.cpp", + llrt::RunTimeOptions::get_instance().get_root_dir() 
+ + "tests/tt_metal/tt_metal/test_kernels/misc/print_tile.cpp", core, - DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default} - ); + DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); KernelHandle ncrisc_print_kernel_id = CreateKernel( program, - llrt::OptionsG.get_root_dir() + "tests/tt_metal/tt_metal/test_kernels/misc/print_tile.cpp", + llrt::RunTimeOptions::get_instance().get_root_dir() + + "tests/tt_metal/tt_metal/test_kernels/misc/print_tile.cpp", core, - DataMovementConfig{.processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default} - ); + DataMovementConfig{.processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default}); KernelHandle trisc_print_kernel_id = CreateKernel( program, - llrt::OptionsG.get_root_dir() + "tests/tt_metal/tt_metal/test_kernels/misc/print_tile.cpp", + llrt::RunTimeOptions::get_instance().get_root_dir() + + "tests/tt_metal/tt_metal/test_kernels/misc/print_tile.cpp", core, - ComputeConfig{} - ); + ComputeConfig{}); // BRISC kernel needs dram info via rtargs tt_metal::SetRuntimeArgs(program, brisc_print_kernel_id, core, {dram_buffer_src_addr, (std::uint32_t)0}); diff --git a/tests/tt_metal/tt_metal/device/test_device_init_and_teardown.cpp b/tests/tt_metal/tt_metal/device/test_device_init_and_teardown.cpp index f668b6a269dc..04aa4b22d8a1 100644 --- a/tests/tt_metal/tt_metal/device/test_device_init_and_teardown.cpp +++ b/tests/tt_metal/tt_metal/device/test_device_init_and_teardown.cpp @@ -72,7 +72,7 @@ TEST_P(DeviceParamFixture, DeviceInitializeAndTeardown) { for (unsigned int id = 0; id < num_devices; id++) { ids.push_back(id); } - const auto& dispatch_core_config = tt::llrt::OptionsG.get_dispatch_core_config(); + const auto& dispatch_core_config = tt::llrt::RunTimeOptions::get_instance().get_dispatch_core_config(); tt::DevicePool::initialize(ids, 1, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, dispatch_core_config); const auto 
devices = tt::DevicePool::instance().get_all_active_devices(); for (auto device : devices) { @@ -91,7 +91,7 @@ TEST_P(DeviceParamFixture, TensixDeviceLoadBlankKernels) { for (unsigned int id = 0; id < num_devices; id++) { ids.push_back(id); } - const auto& dispatch_core_config = tt::llrt::OptionsG.get_dispatch_core_config(); + const auto& dispatch_core_config = tt::llrt::RunTimeOptions::get_instance().get_dispatch_core_config(); tt::DevicePool::initialize(ids, 1, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, dispatch_core_config); const auto devices = tt::DevicePool::instance().get_all_active_devices(); for (auto device : devices) { diff --git a/tests/tt_metal/tt_metal/device/test_device_pool.cpp b/tests/tt_metal/tt_metal/device/test_device_pool.cpp index 49d466cff5f2..9c204fc7abdb 100644 --- a/tests/tt_metal/tt_metal/device/test_device_pool.cpp +++ b/tests/tt_metal/tt_metal/device/test_device_pool.cpp @@ -13,7 +13,7 @@ TEST(DevicePool, DevicePoolOpenClose) { std::vector device_ids{0}; int num_hw_cqs = 1; int l1_small_size = 1024; - const auto& dispatch_core_config = llrt::OptionsG.get_dispatch_core_config(); + const auto& dispatch_core_config = llrt::RunTimeOptions::get_instance().get_dispatch_core_config(); DevicePool::initialize(device_ids, num_hw_cqs, l1_small_size, DEFAULT_TRACE_REGION_SIZE, dispatch_core_config); auto devices = DevicePool::instance().get_all_active_devices(); for (const auto& dev : devices) { @@ -41,7 +41,7 @@ TEST(DevicePool, DevicePoolReconfigDevices) { std::vector device_ids{0}; int num_hw_cqs = 1; int l1_small_size = 1024; - const auto& dispatch_core_config = llrt::OptionsG.get_dispatch_core_config(); + const auto& dispatch_core_config = llrt::RunTimeOptions::get_instance().get_dispatch_core_config(); DevicePool::initialize(device_ids, num_hw_cqs, l1_small_size, DEFAULT_TRACE_REGION_SIZE, dispatch_core_config); auto devices = DevicePool::instance().get_all_active_devices(); for (const auto& dev : devices) { @@ -73,7 +73,7 @@ 
TEST(DevicePool, DevicePoolAddDevices) { std::vector device_ids{0}; int num_hw_cqs = 1; int l1_small_size = 1024; - const auto& dispatch_core_config = llrt::OptionsG.get_dispatch_core_config(); + const auto& dispatch_core_config = llrt::RunTimeOptions::get_instance().get_dispatch_core_config(); DevicePool::initialize(device_ids, num_hw_cqs, l1_small_size, DEFAULT_TRACE_REGION_SIZE, dispatch_core_config); auto devices = DevicePool::instance().get_all_active_devices(); for (const auto& dev : devices) { @@ -107,7 +107,7 @@ TEST(DevicePool, DevicePoolReduceDevices) { std::vector device_ids{0, 1, 2, 3}; int num_hw_cqs = 1; int l1_small_size = 1024; - const auto& dispatch_core_config = llrt::OptionsG.get_dispatch_core_config(); + const auto& dispatch_core_config = llrt::RunTimeOptions::get_instance().get_dispatch_core_config(); DevicePool::initialize(device_ids, num_hw_cqs, l1_small_size, DEFAULT_TRACE_REGION_SIZE, dispatch_core_config); const auto devices = DevicePool::instance().get_all_active_devices(); for (const auto& dev : devices) { diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_event/test_events.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_event/test_events.cpp index 9880ceb375cd..c0f28c44dbe8 100644 --- a/tests/tt_metal/tt_metal/dispatch/dispatch_event/test_events.cpp +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_event/test_events.cpp @@ -122,7 +122,7 @@ TEST_F(CommandQueueEventFixture, TestEventsEnqueueRecordEventAndSynchronize) { // Negative test. Host syncing on a future event that isn't actually issued. // Ensure that expected hang is seen, which indicates event sync feature is working properly. TEST_F(CommandQueueEventFixture, TestEventsEnqueueRecordEventAndSynchronizeHang) { - tt::llrt::OptionsG.set_test_mode_enabled(true); // Required for finish hang breakout. + tt::llrt::RunTimeOptions::get_instance().set_test_mode_enabled(true); // Required for finish hang breakout. 
auto future_event = std::make_shared(); EnqueueRecordEvent(this->device_->command_queue(), future_event); @@ -153,7 +153,7 @@ TEST_F(CommandQueueEventFixture, TestEventsEnqueueRecordEventAndSynchronizeHang) TEST_F(CommandQueueEventFixture, TestEventsQueueWaitForEventHang) { // Skip this test until #7216 is implemented. GTEST_SKIP(); - tt::llrt::OptionsG.set_test_mode_enabled(true); // Required for finish hang breakout. + tt::llrt::RunTimeOptions::get_instance().set_test_mode_enabled(true); // Required for finish hang breakout. auto future_event = std::make_shared(); EnqueueRecordEvent(this->device_->command_queue(), future_event); diff --git a/tests/tt_metal/tt_metal/dispatch/multi_command_queue_fixture.hpp b/tests/tt_metal/tt_metal/dispatch/multi_command_queue_fixture.hpp index 75c4c3f5cca8..fa9c5cb73bd6 100644 --- a/tests/tt_metal/tt_metal/dispatch/multi_command_queue_fixture.hpp +++ b/tests/tt_metal/tt_metal/dispatch/multi_command_queue_fixture.hpp @@ -22,7 +22,7 @@ class MultiCommandQueueSingleDeviceFixture : public DispatchFixture { void SetUp() override { this->validate_dispatch_mode(); - this->num_cqs_ = tt::llrt::OptionsG.get_num_hw_cqs(); + this->num_cqs_ = tt::llrt::RunTimeOptions::get_instance().get_num_hw_cqs(); if (this->num_cqs_ != 2) { tt::log_info(tt::LogTest, "This suite must be run with TT_METAL_GTEST_NUM_HW_CQS=2"); GTEST_SKIP(); @@ -88,7 +88,7 @@ class MultiCommandQueueSingleDeviceTraceFixture : public MultiCommandQueueSingle void SetUp() override { this->validate_dispatch_mode(); - this->num_cqs_ = tt::llrt::OptionsG.get_num_hw_cqs(); + this->num_cqs_ = tt::llrt::RunTimeOptions::get_instance().get_num_hw_cqs(); if (this->num_cqs_ != 2) { tt::log_info(tt::LogTest, "This suite must be run with TT_METAL_GTEST_NUM_HW_CQS=2"); GTEST_SKIP(); @@ -118,7 +118,7 @@ class MultiCommandQueueMultiDeviceFixture : public DispatchFixture { GTEST_SKIP(); } - auto num_cqs = tt::llrt::OptionsG.get_num_hw_cqs(); + auto num_cqs = 
tt::llrt::RunTimeOptions::get_instance().get_num_hw_cqs(); if (num_cqs != 2) { tt::log_info(tt::LogTest, "This suite must be run with TT_METAL_GTEST_NUM_HW_CQS=2"); GTEST_SKIP(); diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_dispatcher.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_dispatcher.cpp index d22dd0bb90ee..0f54b3abb638 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_dispatcher.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_dispatcher.cpp @@ -688,7 +688,7 @@ int main(int argc, char** argv) { log_fatal(e.what()); } - tt::llrt::OptionsG.set_kernels_nullified(false); + tt::llrt::RunTimeOptions::get_instance().set_kernels_nullified(false); if (pass) { log_info(LogTest, "test_dispatcher.cpp - Test Passed"); diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch.cpp index 22127a8fb8a7..88ba15ff0030 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch.cpp @@ -253,7 +253,7 @@ void initialize_program(tt_metal::Device* device, tt_metal::Program& program, ui int main(int argc, char** argv) { init(argc, argv); - tt::llrt::OptionsG.set_kernels_nullified(true); + tt::llrt::RunTimeOptions::get_instance().set_kernels_nullified(true); bool pass = true; try { @@ -344,7 +344,7 @@ int main(int argc, char** argv) { log_fatal(e.what()); } - tt::llrt::OptionsG.set_kernels_nullified(false); + tt::llrt::RunTimeOptions::get_instance().set_kernels_nullified(false); if (pass) { log_info(LogTest, "Test Passed"); diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp index 1b6748e7768f..06ca4f3384c2 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp 
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp @@ -3492,7 +3492,7 @@ int main(int argc, char** argv) { log_fatal(e.what()); } - tt::llrt::OptionsG.set_kernels_nullified(false); + tt::llrt::RunTimeOptions::get_instance().set_kernels_nullified(false); if (pass) { log_info(LogTest, "test_prefetcher.cpp - Test Passed"); diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_bi_tunnel.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_bi_tunnel.cpp index afbd7ae3038f..5c798d609e14 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_bi_tunnel.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_bi_tunnel.cpp @@ -713,7 +713,7 @@ int main(int argc, char** argv) { log_fatal(e.what()); } - tt::llrt::OptionsG.set_kernels_nullified(false); + tt::llrt::RunTimeOptions::get_instance().set_kernels_nullified(false); if (pass) { log_info(LogTest, "Test Passed"); diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux.cpp index 10742be3d24a..7a92606736a6 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux.cpp @@ -578,7 +578,7 @@ int main(int argc, char **argv) { log_fatal(e.what()); } - tt::llrt::OptionsG.set_kernels_nullified(false); + tt::llrt::RunTimeOptions::get_instance().set_kernels_nullified(false); if (pass) { log_info(LogTest, "Test Passed"); diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux_2level.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux_2level.cpp index 5a0f0c198612..13810530fdf6 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux_2level.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux_2level.cpp @@ -595,7 +595,7 @@ int main(int argc, char **argv) { log_fatal(e.what()); 
} - tt::llrt::OptionsG.set_kernels_nullified(false); + tt::llrt::RunTimeOptions::get_instance().set_kernels_nullified(false); if (pass) { log_info(LogTest, "Test Passed"); diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tunnel_1cq.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tunnel_1cq.cpp index 547472f20f13..f883facbe080 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tunnel_1cq.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tunnel_1cq.cpp @@ -820,7 +820,7 @@ int main(int argc, char** argv) { log_fatal(e.what()); } - tt::llrt::OptionsG.set_kernels_nullified(false); + tt::llrt::RunTimeOptions::get_instance().set_kernels_nullified(false); if (pass) { log_info(LogTest, "Test Passed"); diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tunnel_2cq.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tunnel_2cq.cpp index 5562d50100d7..b455914329ee 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tunnel_2cq.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tunnel_2cq.cpp @@ -836,7 +836,7 @@ int main(int argc, char** argv) { log_fatal(e.what()); } - tt::llrt::OptionsG.set_kernels_nullified(false); + tt::llrt::RunTimeOptions::get_instance().set_kernels_nullified(false); if (pass) { log_info(LogTest, "Test Passed"); diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tx_rx.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tx_rx.cpp index 37ba0a6f1a70..09de8d75e784 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tx_rx.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tx_rx.cpp @@ -244,7 +244,7 @@ int main(int argc, char **argv) { log_fatal(e.what()); } - tt::llrt::OptionsG.set_kernels_nullified(false); + tt::llrt::RunTimeOptions::get_instance().set_kernels_nullified(false); if (pass) { log_info(LogTest, "Test Passed"); diff --git 
a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_uni_tunnel.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_uni_tunnel.cpp index b48b418822a5..ce36770ab4d8 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_uni_tunnel.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_uni_tunnel.cpp @@ -635,7 +635,7 @@ int main(int argc, char** argv) { log_fatal(e.what()); } - tt::llrt::OptionsG.set_kernels_nullified(false); + tt::llrt::RunTimeOptions::get_instance().set_kernels_nullified(false); if (pass) { log_info(LogTest, "Test Passed"); diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_uni_tunnel_single_chip.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_uni_tunnel_single_chip.cpp index 6c084f08576d..aed8e50c833a 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_uni_tunnel_single_chip.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_uni_tunnel_single_chip.cpp @@ -623,7 +623,7 @@ int main(int argc, char** argv) { log_fatal(e.what()); } - tt::llrt::OptionsG.set_kernels_nullified(false); + tt::llrt::RunTimeOptions::get_instance().set_kernels_nullified(false); if (pass) { log_info(LogTest, "Test Passed"); diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_2ep.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_2ep.cpp index b6299a066d73..c0594af1ff9f 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_2ep.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_2ep.cpp @@ -1406,7 +1406,7 @@ int main(int argc, char** argv) { log_fatal(e.what()); } - tt::llrt::OptionsG.set_kernels_nullified(false); + tt::llrt::RunTimeOptions::get_instance().set_kernels_nullified(false); if (pass) { log_info(LogTest, "Test Passed"); diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_4ep.cpp 
b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_4ep.cpp index 05a97b6c3f2b..ab830d20bb83 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_4ep.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_4ep.cpp @@ -1544,7 +1544,7 @@ int main(int argc, char** argv) { log_fatal(e.what()); } - tt::llrt::OptionsG.set_kernels_nullified(false); + tt::llrt::RunTimeOptions::get_instance().set_kernels_nullified(false); if (pass) { log_info(LogTest, "Test Passed"); diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_loopback_tunnel.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_loopback_tunnel.cpp index 68d308607f05..03119c6e726c 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_loopback_tunnel.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_loopback_tunnel.cpp @@ -1011,7 +1011,7 @@ int main(int argc, char **argv) { log_fatal(e.what()); } - tt::llrt::OptionsG.set_kernels_nullified(false); + tt::llrt::RunTimeOptions::get_instance().set_kernels_nullified(false); if (pass) { log_info(LogTest, "Test Passed"); diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_mux_demux.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_mux_demux.cpp index 85127049245e..afed803037c3 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_mux_demux.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_mux_demux.cpp @@ -619,7 +619,7 @@ int main(int argc, char **argv) { log_fatal(e.what()); } - tt::llrt::OptionsG.set_kernels_nullified(false); + tt::llrt::RunTimeOptions::get_instance().set_kernels_nullified(false); if (pass) { log_info(LogTest, "Test Passed"); diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_uni_tunnel.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_uni_tunnel.cpp index 22022fb90c0b..302de15eb4c8 100644 --- 
a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_uni_tunnel.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_uni_tunnel.cpp @@ -813,7 +813,7 @@ int main(int argc, char **argv) { log_fatal(e.what()); } - tt::llrt::OptionsG.set_kernels_nullified(false); + tt::llrt::RunTimeOptions::get_instance().set_kernels_nullified(false); if (pass) { log_info(LogTest, "Test Passed"); diff --git a/tests/tt_metal/tt_metal/test_clean_init.cpp b/tests/tt_metal/tt_metal/test_clean_init.cpp index cd3118ec3e89..b0f420be992a 100644 --- a/tests/tt_metal/tt_metal/test_clean_init.cpp +++ b/tests/tt_metal/tt_metal/test_clean_init.cpp @@ -37,7 +37,7 @@ int main(int argc, char** argv) { ids.push_back(id); } - const auto& dispatch_core_config = tt::llrt::OptionsG.get_dispatch_core_config(); + const auto& dispatch_core_config = tt::llrt::RunTimeOptions::get_instance().get_dispatch_core_config(); tt::DevicePool::initialize(ids, 1, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, dispatch_core_config); const auto devices = tt::DevicePool::instance().get_all_active_devices(); diff --git a/tests/tt_metal/tt_metal/test_compile_args.cpp b/tests/tt_metal/tt_metal/test_compile_args.cpp index 5a00d6dd9cf1..1258dfff522b 100644 --- a/tests/tt_metal/tt_metal/test_compile_args.cpp +++ b/tests/tt_metal/tt_metal/test_compile_args.cpp @@ -80,7 +80,7 @@ int main(int argc, char** argv) { std::distance(std::filesystem::directory_iterator(binary_path), std::filesystem::directory_iterator{}); TT_FATAL(num_built_kernels == 2, "Expected compute kernel test_compile_args to be compiled twice!"); - if (tt::llrt::OptionsG.get_watcher_enabled()) { + if (tt::llrt::RunTimeOptions::get_instance().get_watcher_enabled()) { // Test that the kernel_args.csv file was generated for both kernels log_info(LogTest, "Test kernel args logging"); auto kernel_args_path = binary_path.parent_path() / "kernel_args.csv"; diff --git a/tt_metal/common/utils.cpp b/tt_metal/common/utils.cpp index 
be7b8dbad938..20d7a5cca8fa 100644 --- a/tt_metal/common/utils.cpp +++ b/tt_metal/common/utils.cpp @@ -48,7 +48,7 @@ void create_file(const string& file_path_str) { const std::string& get_reports_dir() { static std::string outpath; if (outpath == "") { - outpath = tt::llrt::OptionsG.get_root_dir() + "/generated/reports/"; + outpath = tt::llrt::RunTimeOptions::get_instance().get_root_dir() + "/generated/reports/"; } return outpath; } diff --git a/tt_metal/impl/debug/dprint_server.cpp b/tt_metal/impl/debug/dprint_server.cpp index 7a72c8758b6b..d0e62e692102 100644 --- a/tt_metal/impl/debug/dprint_server.cpp +++ b/tt_metal/impl/debug/dprint_server.cpp @@ -460,9 +460,12 @@ DebugPrintServerContext::DebugPrintServerContext() { inst = this; // Read hart mask + log file from rtoptions - uint32_t hart_mask = tt::llrt::OptionsG.get_feature_riscv_mask(tt::llrt::RunTimeDebugFeatureDprint); - string file_name = tt::llrt::OptionsG.get_feature_file_name(tt::llrt::RunTimeDebugFeatureDprint); - bool one_file_per_risc = tt::llrt::OptionsG.get_feature_one_file_per_risc(tt::llrt::RunTimeDebugFeatureDprint); + uint32_t hart_mask = + tt::llrt::RunTimeOptions::get_instance().get_feature_riscv_mask(tt::llrt::RunTimeDebugFeatureDprint); + string file_name = + tt::llrt::RunTimeOptions::get_instance().get_feature_file_name(tt::llrt::RunTimeDebugFeatureDprint); + bool one_file_per_risc = + tt::llrt::RunTimeOptions::get_instance().get_feature_one_file_per_risc(tt::llrt::RunTimeDebugFeatureDprint); // One file per risc auto-generates the output files and ignores the env var for it. Print a warning if both are // specified just in case. @@ -473,7 +476,7 @@ DebugPrintServerContext::DebugPrintServerContext() { } // Set the output stream according to RTOptions, either a file name or stdout if none specified. 
- std::filesystem::path output_dir(tt::llrt::OptionsG.get_root_dir() + logfile_path); + std::filesystem::path output_dir(tt::llrt::RunTimeOptions::get_instance().get_root_dir() + logfile_path); std::filesystem::create_directories(output_dir); if (file_name != "" && !one_file_per_risc) { outfile_ = new ofstream(file_name); @@ -559,8 +562,9 @@ void DebugPrintServerContext::AttachDevice(Device* device) { // If RTOptions doesn't enable DPRINT on this device, return here and don't actually attach it // to the server. - std::vector chip_ids = tt::llrt::OptionsG.get_feature_chip_ids(tt::llrt::RunTimeDebugFeatureDprint); - if (!tt::llrt::OptionsG.get_feature_all_chips(tt::llrt::RunTimeDebugFeatureDprint)) { + std::vector chip_ids = + tt::llrt::RunTimeOptions::get_instance().get_feature_chip_ids(tt::llrt::RunTimeDebugFeatureDprint); + if (!tt::llrt::RunTimeOptions::get_instance().get_feature_all_chips(tt::llrt::RunTimeDebugFeatureDprint)) { if (std::find(chip_ids.begin(), chip_ids.end(), device->id()) == chip_ids.end()) { return; } @@ -569,8 +573,8 @@ void DebugPrintServerContext::AttachDevice(Device* device) { // Core range depends on whether dprint_all_cores flag is set. std::vector print_cores_sanitized; for (CoreType core_type : {CoreType::WORKER, CoreType::ETH}) { - if (tt::llrt::OptionsG.get_feature_all_cores(tt::llrt::RunTimeDebugFeatureDprint, core_type) == - tt::llrt::RunTimeDebugClassAll) { + if (tt::llrt::RunTimeOptions::get_instance().get_feature_all_cores( + tt::llrt::RunTimeDebugFeatureDprint, core_type) == tt::llrt::RunTimeDebugClassAll) { // Print from all cores of the given type, cores returned here are guaranteed to be valid. 
for (CoreDescriptor logical_core : all_cores) { if (logical_core.type == core_type) { @@ -583,8 +587,8 @@ void DebugPrintServerContext::AttachDevice(Device* device) { device->id(), tt::llrt::get_core_type_name(core_type)); } else if ( - tt::llrt::OptionsG.get_feature_all_cores(tt::llrt::RunTimeDebugFeatureDprint, core_type) == - tt::llrt::RunTimeDebugClassDispatch) { + tt::llrt::RunTimeOptions::get_instance().get_feature_all_cores( + tt::llrt::RunTimeDebugFeatureDprint, core_type) == tt::llrt::RunTimeDebugClassDispatch) { for (CoreDescriptor logical_core : dispatch_cores) { if (logical_core.type == core_type) { print_cores_sanitized.push_back(logical_core); @@ -596,8 +600,8 @@ void DebugPrintServerContext::AttachDevice(Device* device) { device->id(), tt::llrt::get_core_type_name(core_type)); } else if ( - tt::llrt::OptionsG.get_feature_all_cores(tt::llrt::RunTimeDebugFeatureDprint, core_type) == - tt::llrt::RunTimeDebugClassWorker) { + tt::llrt::RunTimeOptions::get_instance().get_feature_all_cores( + tt::llrt::RunTimeDebugFeatureDprint, core_type) == tt::llrt::RunTimeDebugClassWorker) { // For worker cores, take all cores and remove dispatch cores. for (CoreDescriptor logical_core : all_cores) { if (dispatch_cores.find(logical_core) == dispatch_cores.end()) { @@ -613,8 +617,8 @@ void DebugPrintServerContext::AttachDevice(Device* device) { tt::llrt::get_core_type_name(core_type)); } else { // No "all cores" option provided, which means print from the cores specified by the user - std::vector& print_cores = - tt::llrt::OptionsG.get_feature_cores(tt::llrt::RunTimeDebugFeatureDprint)[core_type]; + std::vector& print_cores = tt::llrt::RunTimeOptions::get_instance().get_feature_cores( + tt::llrt::RunTimeDebugFeatureDprint)[core_type]; // We should also validate that the cores the user specified are valid worker cores. 
for (auto& logical_core : print_cores) { @@ -651,7 +655,8 @@ void DebugPrintServerContext::AttachDevice(Device* device) { } // Write print enable magic for the cores the user specified. - uint32_t hart_mask = tt::llrt::OptionsG.get_feature_riscv_mask(tt::llrt::RunTimeDebugFeatureDprint); + uint32_t hart_mask = + tt::llrt::RunTimeOptions::get_instance().get_feature_riscv_mask(tt::llrt::RunTimeDebugFeatureDprint); for (auto& logical_core : print_cores_sanitized) { CoreCoord phys_core = device->virtual_core_from_logical_core(logical_core.coord, logical_core.type); for (int hart_index = 0; hart_index < GetNumRiscs(logical_core); hart_index++) { @@ -682,8 +687,9 @@ void DebugPrintServerContext::AttachDevice(Device* device) { void DebugPrintServerContext::DetachDevice(Device* device) { // Don't detach the device if it's disabled by env vars - in this case it wasn't attached. - std::vector chip_ids = tt::llrt::OptionsG.get_feature_chip_ids(tt::llrt::RunTimeDebugFeatureDprint); - if (!tt::llrt::OptionsG.get_feature_all_chips(tt::llrt::RunTimeDebugFeatureDprint)) { + std::vector chip_ids = + tt::llrt::RunTimeOptions::get_instance().get_feature_chip_ids(tt::llrt::RunTimeDebugFeatureDprint); + if (!tt::llrt::RunTimeOptions::get_instance().get_feature_all_chips(tt::llrt::RunTimeDebugFeatureDprint)) { if (std::find(chip_ids.begin(), chip_ids.end(), device->id()) == chip_ids.end()) { return; } @@ -691,7 +697,8 @@ void DebugPrintServerContext::DetachDevice(Device* device) { // When we detach a device, we should poll to make sure there's no outstanding prints. 
chip_id_t chip_id = device->id(); - uint32_t risc_mask = tt::llrt::OptionsG.get_feature_riscv_mask(tt::llrt::RunTimeDebugFeatureDprint); + uint32_t risc_mask = + tt::llrt::RunTimeOptions::get_instance().get_feature_riscv_mask(tt::llrt::RunTimeDebugFeatureDprint); bool outstanding_prints = true; while (outstanding_prints && !server_killed_due_to_hang_) { // Polling interval of 1ms @@ -776,7 +783,8 @@ void DebugPrintServerContext::ClearLogFile() { outfile_->close(); delete outfile_; - string file_name = tt::llrt::OptionsG.get_feature_file_name(tt::llrt::RunTimeDebugFeatureDprint); + string file_name = + tt::llrt::RunTimeOptions::get_instance().get_feature_file_name(tt::llrt::RunTimeDebugFeatureDprint); outfile_ = new ofstream(file_name); stream_ = outfile_ ? outfile_ : &cout; } @@ -1139,7 +1147,7 @@ void DebugPrintServerContext::PollPrintData(uint32_t hart_mask) { } catch (std::runtime_error& e) { // Depending on if test mode is enabled, catch and stop server, or // re-throw the exception. 
- if (tt::llrt::OptionsG.get_test_mode_enabled()) { + if (tt::llrt::RunTimeOptions::get_instance().get_test_mode_enabled()) { server_killed_due_to_hang_ = true; device_to_core_range_lock_.unlock(); return; // Stop the print loop @@ -1190,12 +1198,12 @@ void DebugPrintServerContext::TransferToAndFlushOutputStream( ostream* DebugPrintServerContext::GetOutputStream(const HartKey& hart_key) { ostream* output_stream = stream_; - if (tt::llrt::OptionsG.get_feature_one_file_per_risc(tt::llrt::RunTimeDebugFeatureDprint)) { + if (tt::llrt::RunTimeOptions::get_instance().get_feature_one_file_per_risc(tt::llrt::RunTimeDebugFeatureDprint)) { if (!risc_to_file_stream_[hart_key]) { const chip_id_t chip_id = get<0>(hart_key); const CoreDescriptor& logical_core = get<1>(hart_key); const int hart_id = get<2>(hart_key); - string filename = tt::llrt::OptionsG.get_root_dir() + logfile_path; + string filename = tt::llrt::RunTimeOptions::get_instance().get_root_dir() + logfile_path; filename += fmt::format( "device-{}_{}-core-{}-{}_{}.txt", chip_id, @@ -1225,7 +1233,7 @@ namespace tt { void DprintServerAttach(Device* device) { // Skip if DPRINT not enabled, and make sure profiler is not running. 
- if (!tt::llrt::OptionsG.get_feature_enabled(tt::llrt::RunTimeDebugFeatureDprint)) { + if (!tt::llrt::RunTimeOptions::get_instance().get_feature_enabled(tt::llrt::RunTimeDebugFeatureDprint)) { return; } TT_FATAL( diff --git a/tt_metal/impl/debug/noc_logging.cpp b/tt_metal/impl/debug/noc_logging.cpp index ea14307f2379..5857f61809f7 100644 --- a/tt_metal/impl/debug/noc_logging.cpp +++ b/tt_metal/impl/debug/noc_logging.cpp @@ -24,9 +24,9 @@ namespace tt { static string logfile_path = "generated/noc_data/"; void PrintNocData(noc_data_t noc_data, const string& file_name) { - std::filesystem::path output_dir(tt::llrt::OptionsG.get_root_dir() + logfile_path); + std::filesystem::path output_dir(tt::llrt::RunTimeOptions::get_instance().get_root_dir() + logfile_path); std::filesystem::create_directories(output_dir); - std::string filename = tt::llrt::OptionsG.get_root_dir() + logfile_path + file_name; + std::string filename = tt::llrt::RunTimeOptions::get_instance().get_root_dir() + logfile_path + file_name; std::ofstream outfile(filename); for (uint32_t idx = 0; idx < NOC_DATA_SIZE; idx++) { @@ -71,7 +71,7 @@ void DumpDeviceNocData(Device* device, noc_data_t& noc_data, noc_data_t& dispatc void DumpNocData(const std::vector& devices) { // Skip if feature is not enabled - if (!tt::llrt::OptionsG.get_record_noc_transfers()) { + if (!tt::llrt::RunTimeOptions::get_instance().get_record_noc_transfers()) { return; } @@ -87,13 +87,13 @@ void DumpNocData(const std::vector& devices) { void ClearNocData(Device* device) { // Skip if feature is not enabled - if (!tt::llrt::OptionsG.get_record_noc_transfers()) { + if (!tt::llrt::RunTimeOptions::get_instance().get_record_noc_transfers()) { return; } // This feature is incomatible with dprint since they share memory space TT_FATAL( - tt::llrt::OptionsG.get_feature_enabled(tt::llrt::RunTimeDebugFeatureDprint) == false, + tt::llrt::RunTimeOptions::get_instance().get_feature_enabled(tt::llrt::RunTimeDebugFeatureDprint) == false, "NOC 
transfer recording is incompatible with DPRINT"); CoreDescriptorSet all_cores = GetAllCores(device); diff --git a/tt_metal/impl/debug/watcher_device_reader.cpp b/tt_metal/impl/debug/watcher_device_reader.cpp index c03b7548b79c..b12921584bb3 100644 --- a/tt_metal/impl/debug/watcher_device_reader.cpp +++ b/tt_metal/impl/debug/watcher_device_reader.cpp @@ -139,7 +139,7 @@ WatcherDeviceReader::WatcherDeviceReader( FILE* f, Device* device, std::vector& kernel_names, void (*set_watcher_exception_message)(const string&)) : f(f), device(device), kernel_names(kernel_names), set_watcher_exception_message(set_watcher_exception_message) { // On init, read out eth link retraining register so that we can see if retraining has occurred. WH only for now. - if (device->arch() == ARCH::WORMHOLE_B0 && tt::llrt::OptionsG.get_watcher_enabled()) { + if (device->arch() == ARCH::WORMHOLE_B0 && tt::llrt::RunTimeOptions::get_instance().get_watcher_enabled()) { std::vector read_data; for (const CoreCoord& eth_core : device->get_active_ethernet_cores()) { CoreCoord phys_core = device->ethernet_core_from_logical_core(eth_core); @@ -152,7 +152,7 @@ WatcherDeviceReader::WatcherDeviceReader( WatcherDeviceReader::~WatcherDeviceReader() { // On close, read out eth link retraining register so that we can see if retraining has occurred. 
- if (device->arch() == ARCH::WORMHOLE_B0 && tt::llrt::OptionsG.get_watcher_enabled()) { + if (device->arch() == ARCH::WORMHOLE_B0 && tt::llrt::RunTimeOptions::get_instance().get_watcher_enabled()) { std::vector read_data; for (const CoreCoord& eth_core : device->get_active_ethernet_cores()) { CoreCoord phys_core = device->ethernet_core_from_logical_core(eth_core); @@ -272,7 +272,7 @@ void WatcherDeviceReader::Dump(FILE* file) { paused_cores_str += "\n"; fprintf(f, "%s", paused_cores_str.c_str()); log_info(LogLLRuntime, "{}Press ENTER to unpause core(s) and continue...", paused_cores_str); - if (!tt::llrt::OptionsG.get_watcher_auto_unpause()) { + if (!tt::llrt::RunTimeOptions::get_instance().get_watcher_auto_unpause()) { while (std::cin.get() != '\n') { ; } @@ -343,7 +343,7 @@ void WatcherDeviceReader::DumpCore(CoreDescriptor& logical_core, bool is_active_ if (enabled) { // Dump state only gathered if device is compiled w/ watcher - if (!tt::llrt::OptionsG.watcher_status_disabled()) { + if (!tt::llrt::RunTimeOptions::get_instance().watcher_status_disabled()) { DumpWaypoints(core, mbox_data, false); } // Ethernet cores have firmware that starts at address 0, so no need to check it for a @@ -351,16 +351,16 @@ void WatcherDeviceReader::DumpCore(CoreDescriptor& logical_core, bool is_active_ if (!is_eth_core) { DumpL1Status(core, &mbox_data->launch[launch_msg_read_ptr]); } - if (!tt::llrt::OptionsG.watcher_noc_sanitize_disabled()) { + if (!tt::llrt::RunTimeOptions::get_instance().watcher_noc_sanitize_disabled()) { const auto NUM_NOCS_ = tt::tt_metal::hal.get_num_nocs(); for (uint32_t noc = 0; noc < NUM_NOCS_; noc++) { DumpNocSanitizeStatus(core, core_str, mbox_data, noc); } } - if (!tt::llrt::OptionsG.watcher_assert_disabled()) { + if (!tt::llrt::RunTimeOptions::get_instance().watcher_assert_disabled()) { DumpAssertStatus(core, core_str, mbox_data); } - if (!tt::llrt::OptionsG.watcher_pause_disabled()) { + if 
(!tt::llrt::RunTimeOptions::get_instance().watcher_pause_disabled()) { DumpPauseStatus(core, core_str, mbox_data); } } @@ -369,7 +369,7 @@ void WatcherDeviceReader::DumpCore(CoreDescriptor& logical_core, bool is_active_ if (!is_eth_core) { // Dump state always available DumpLaunchMessage(core, mbox_data); - if (tt::llrt::OptionsG.get_watcher_dump_all()) { + if (tt::llrt::RunTimeOptions::get_instance().get_watcher_dump_all()) { // Reading registers while running can cause hangs, only read if // requested explicitly DumpSyncRegs(core); @@ -397,10 +397,10 @@ void WatcherDeviceReader::DumpCore(CoreDescriptor& logical_core, bool is_active_ // Ring buffer at the end because it can print a bunch of data, same for stack usage if (enabled) { - if (!tt::llrt::OptionsG.watcher_stack_usage_disabled()) { + if (!tt::llrt::RunTimeOptions::get_instance().watcher_stack_usage_disabled()) { DumpStackUsage(core, mbox_data); } - if (!tt::llrt::OptionsG.watcher_ring_buffer_disabled()) { + if (!tt::llrt::RunTimeOptions::get_instance().watcher_ring_buffer_disabled()) { DumpRingBuffer(core, mbox_data, false); } } diff --git a/tt_metal/impl/debug/watcher_server.cpp b/tt_metal/impl/debug/watcher_server.cpp index 9b81e7d13b01..3783fae96cc0 100644 --- a/tt_metal/impl/debug/watcher_server.cpp +++ b/tt_metal/impl/debug/watcher_server.cpp @@ -72,8 +72,8 @@ static double get_elapsed_secs() { void create_log_file() { FILE* f; - const char* fmode = tt::llrt::OptionsG.get_watcher_append() ? "a" : "w"; - std::filesystem::path output_dir(tt::llrt::OptionsG.get_root_dir() + watcher::logfile_path); + const char* fmode = tt::llrt::RunTimeOptions::get_instance().get_watcher_append() ? 
"a" : "w"; + std::filesystem::path output_dir(tt::llrt::RunTimeOptions::get_instance().get_root_dir() + watcher::logfile_path); std::filesystem::create_directories(output_dir); string fname = output_dir.string() + watcher::logfile_name; if ((f = fopen(fname.c_str(), fmode)) == nullptr) { @@ -110,8 +110,8 @@ void create_log_file() { void create_kernel_file() { FILE* f; - const char* fmode = tt::llrt::OptionsG.get_watcher_append() ? "a" : "w"; - std::filesystem::path output_dir(tt::llrt::OptionsG.get_root_dir() + watcher::logfile_path); + const char* fmode = tt::llrt::RunTimeOptions::get_instance().get_watcher_append() ? "a" : "w"; + std::filesystem::path output_dir(tt::llrt::RunTimeOptions::get_instance().get_root_dir() + watcher::logfile_path); std::filesystem::create_directories(output_dir); string fname = output_dir.string() + watcher::kernel_file_name; if ((f = fopen(fname.c_str(), fmode)) == nullptr) { @@ -139,7 +139,7 @@ static void watcher_loop(int sleep_usecs) { // Print to the user which features are disabled via env vars. string disabled_features = ""; - auto& disabled_features_set = tt::llrt::OptionsG.get_watcher_disabled_features(); + auto& disabled_features_set = tt::llrt::RunTimeOptions::get_instance().get_watcher_disabled_features(); if (!disabled_features_set.empty()) { for (auto& feature : disabled_features_set) { disabled_features += feature + ","; @@ -183,7 +183,7 @@ static void watcher_loop(int sleep_usecs) { dump(logfile); } catch (std::runtime_error& e) { // Depending on whether test mode is enabled, catch and stop server, or re-throw. - if (tt::llrt::OptionsG.get_test_mode_enabled()) { + if (tt::llrt::RunTimeOptions::get_instance().get_test_mode_enabled()) { watcher::watcher_killed_due_to_error = true; watcher::enabled = false; break; @@ -210,7 +210,7 @@ void watcher_init(Device* device) { watcher_msg_t* data = reinterpret_cast(&(watcher_init_val[0])); // Initialize watcher enable flag according to user setting. 
- data->enable = (tt::llrt::OptionsG.get_watcher_enabled()) ? WatcherEnabled : WatcherDisabled; + data->enable = (tt::llrt::RunTimeOptions::get_instance().get_watcher_enabled()) ? WatcherEnabled : WatcherDisabled; // Initialize debug status values to "unknown" for (int idx = 0; idx < MAX_RISCV_PER_CORE; idx++) { @@ -257,15 +257,15 @@ void watcher_init(Device* device) { for (tt::llrt::RunTimeDebugFeatures delay_feature = tt::llrt::RunTimeDebugFeatureReadDebugDelay; (int)delay_feature <= tt::llrt::RunTimeDebugFeatureAtomicDebugDelay; delay_feature = (tt::llrt::RunTimeDebugFeatures)((int)delay_feature + 1)) { - std::vector chip_ids = tt::llrt::OptionsG.get_feature_chip_ids(delay_feature); - bool this_chip_enabled = tt::llrt::OptionsG.get_feature_all_chips(delay_feature) || + std::vector chip_ids = tt::llrt::RunTimeOptions::get_instance().get_feature_chip_ids(delay_feature); + bool this_chip_enabled = tt::llrt::RunTimeOptions::get_instance().get_feature_all_chips(delay_feature) || std::find(chip_ids.begin(), chip_ids.end(), device->id()) != chip_ids.end(); if (this_chip_enabled) { static_assert(sizeof(debug_sanitize_noc_addr_msg_t) % sizeof(uint32_t) == 0); debug_insert_delays_msg_t delay_setup; // Create the mask based on the feature - uint32_t hart_mask = tt::llrt::OptionsG.get_feature_riscv_mask(delay_feature); + uint32_t hart_mask = tt::llrt::RunTimeOptions::get_instance().get_feature_riscv_mask(delay_feature); switch (delay_feature) { case tt::llrt::RunTimeDebugFeatureReadDebugDelay: delay_setup.read_delay_riscv_mask = hart_mask; break; case tt::llrt::RunTimeDebugFeatureWriteDebugDelay: @@ -278,7 +278,8 @@ void watcher_init(Device* device) { } for (CoreType core_type : {CoreType::WORKER, CoreType::ETH}) { - std::vector delayed_cores = tt::llrt::OptionsG.get_feature_cores(delay_feature)[core_type]; + std::vector delayed_cores = + tt::llrt::RunTimeOptions::get_instance().get_feature_cores(delay_feature)[core_type]; for (tt_xy_pair logical_core : delayed_cores) { 
CoreCoord phys_core; bool valid_logical_core = true; @@ -324,7 +325,7 @@ void watcher_init(Device* device) { delay.second.read_delay_riscv_mask, delay.second.write_delay_riscv_mask, delay.second.atomic_delay_riscv_mask, - tt::llrt::OptionsG.get_watcher_debug_delay()); + tt::llrt::RunTimeOptions::get_instance().get_watcher_debug_delay()); } debug_insert_delays_msg_t debug_delays_val_zero = {0, 0, 0, 0}; @@ -381,7 +382,7 @@ void watcher_init(Device* device) { void watcher_attach(Device* device) { const std::lock_guard lock(watcher::watch_mutex); - if (!watcher::enabled && tt::llrt::OptionsG.get_watcher_enabled()) { + if (!watcher::enabled && tt::llrt::RunTimeOptions::get_instance().get_watcher_enabled()) { watcher::create_log_file(); if (!watcher::kernel_file) { watcher::create_kernel_file(); @@ -391,7 +392,7 @@ void watcher_attach(Device* device) { watcher::enabled = true; - int sleep_usecs = tt::llrt::OptionsG.get_watcher_interval() * 1000; + int sleep_usecs = tt::llrt::RunTimeOptions::get_instance().get_watcher_interval() * 1000; std::thread watcher_thread = std::thread(&watcher::watcher_loop, sleep_usecs); watcher_thread.detach(); } @@ -466,7 +467,7 @@ void watcher_server_set_error_flag(bool val) { watcher::watcher_killed_due_to_er void watcher_clear_log() { watcher::create_log_file(); } string watcher_get_log_file_name() { - return tt::llrt::OptionsG.get_root_dir() + watcher::logfile_path + watcher::logfile_name; + return tt::llrt::RunTimeOptions::get_instance().get_root_dir() + watcher::logfile_path + watcher::logfile_name; } int watcher_get_dump_count() { return watcher::dump_count; } @@ -479,7 +480,7 @@ void watcher_dump() { } void watcher_read_kernel_ids_from_file() { - std::filesystem::path output_dir(tt::llrt::OptionsG.get_root_dir() + watcher::logfile_path); + std::filesystem::path output_dir(tt::llrt::RunTimeOptions::get_instance().get_root_dir() + watcher::logfile_path); string fname = output_dir.string() + watcher::kernel_file_name; FILE* f; if ((f = 
fopen(fname.c_str(), "r")) == nullptr) { diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp index f55271e3679b..e2aaf7fec52b 100644 --- a/tt_metal/impl/device/device.cpp +++ b/tt_metal/impl/device/device.cpp @@ -195,7 +195,7 @@ void Device::get_associated_dispatch_virtual_cores( void Device::initialize_cluster() { ZoneScoped; - if (llrt::OptionsG.get_clear_l1()) { + if (llrt::RunTimeOptions::get_instance().get_clear_l1()) { this->clear_l1_state(); } int ai_clk = tt::Cluster::instance().get_device_aiclk(this->id_); @@ -448,7 +448,8 @@ void Device::initialize_firmware(const HalProgrammableCoreType &core_type, CoreC launch_msg->kernel_config.ncrisc_kernel_size16 = (fw_size + 15) >> 4; } log_debug(LogDevice, "RISC {} fw binary size: {} in bytes", riscv_id, fw_size); - if (not llrt::OptionsG.get_skip_loading_fw()) { + + if (not llrt::RunTimeOptions::get_instance().get_skip_loading_fw()) { llrt::test_load_write_read_risc_binary(binary_mem, this->id(), virtual_core, core_type_idx, processor_class, (riscv_id - build_idx)); } } @@ -479,7 +480,7 @@ void Device::initialize_firmware(const HalProgrammableCoreType &core_type, CoreC if (is_idle_eth) { tt::Cluster::instance().assert_risc_reset_at_core(tt_cxy_pair(this->id(), virtual_core)); } - if (not llrt::OptionsG.get_skip_loading_fw()) { + if (not llrt::RunTimeOptions::get_instance().get_skip_loading_fw()) { for (uint32_t processor_class = 0; processor_class < processor_class_count; processor_class++) { auto [build_idx, num_build_states] = this->build_processor_type_to_index(core_type_idx, processor_class); for (uint32_t eriscv_id = build_idx; eriscv_id < (build_idx + num_build_states); eriscv_id++) { @@ -857,7 +858,7 @@ void Device::configure_kernel_variant( if (force_watcher_no_inline) { defines.insert({"WATCHER_NOINLINE", std::to_string(force_watcher_no_inline)}); } - if (llrt::OptionsG.watcher_dispatch_disabled()) { + if (llrt::RunTimeOptions::get_instance().watcher_dispatch_disabled()) { 
defines["FORCE_WATCHER_OFF"] = "1"; } if (!DPrintServerReadsDispatchCores(this)) { @@ -2343,7 +2344,7 @@ void Device::compile_command_queue_programs() { false, false, // TEMP: Disable function inlining on Prefetcher when watcher is enabled but no_inline is not specified to respect code space - tt::llrt::OptionsG.get_watcher_enabled() && (not tt::llrt::OptionsG.get_watcher_noinline()) + tt::llrt::RunTimeOptions::get_instance().get_watcher_enabled() && (not tt::llrt::RunTimeOptions::get_instance().get_watcher_noinline()) ); uint32_t tensix_worker_go_signal_addr = hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::GO_MSG); @@ -2499,7 +2500,7 @@ void Device::compile_command_queue_programs() { false, false, // TEMP: Disable function inlining on Prefetcher when watcher is enabled but no_inline is not specified to respect code space - tt::llrt::OptionsG.get_watcher_enabled() && (not tt::llrt::OptionsG.get_watcher_noinline()) + tt::llrt::RunTimeOptions::get_instance().get_watcher_enabled() && (not tt::llrt::RunTimeOptions::get_instance().get_watcher_noinline()) ); cq_id = (cq_id + 1) % num_hw_cqs; } @@ -2680,7 +2681,7 @@ void Device::compile_command_queue_programs() { false, false, // TEMP: Disable function inlining on Prefetcher when watcher is enabled but no_inline is not specified to respect code space - tt::llrt::OptionsG.get_watcher_enabled() && (not tt::llrt::OptionsG.get_watcher_noinline()) + tt::llrt::RunTimeOptions::get_instance().get_watcher_enabled() && (not tt::llrt::RunTimeOptions::get_instance().get_watcher_noinline()) ); cq_id = (cq_id + 1) % num_hw_cqs; } @@ -2915,7 +2916,7 @@ void Device::init_command_queue_host() { void Device::init_command_queue_device() { - if (llrt::OptionsG.get_skip_loading_fw()) { + if (llrt::RunTimeOptions::get_instance().get_skip_loading_fw()) { detail::EnablePersistentKernelCache(); this->compile_command_queue_programs(); detail::DisablePersistentKernelCache(); diff --git a/tt_metal/impl/device/device_pool.cpp 
b/tt_metal/impl/device/device_pool.cpp index 0cf531b11c6f..392971bf061f 100644 --- a/tt_metal/impl/device/device_pool.cpp +++ b/tt_metal/impl/device/device_pool.cpp @@ -243,7 +243,7 @@ void DevicePool::initialize_device(v1::DeviceHandle handle) const { watcher_init(dev); // TODO: as optimization, investigate removing all this call for already initialized devivces - if (!llrt::OptionsG.get_skip_reset_cores_on_init()) { + if (!llrt::RunTimeOptions::get_instance().get_skip_reset_cores_on_init()) { dev->reset_cores(); } dev->initialize_and_launch_firmware(); diff --git a/tt_metal/impl/dispatch/command_queue.cpp b/tt_metal/impl/dispatch/command_queue.cpp index d677a362cd72..afc6b344740b 100644 --- a/tt_metal/impl/dispatch/command_queue.cpp +++ b/tt_metal/impl/dispatch/command_queue.cpp @@ -2490,7 +2490,7 @@ void HWCommandQueue::enqueue_program(Program& program, bool blocking) { program.set_last_used_command_queue_for_testing(this); #ifdef DEBUG - if (tt::llrt::OptionsG.get_validate_kernel_binaries()) { + if (tt::llrt::RunTimeOptions::get_instance().get_validate_kernel_binaries()) { TT_FATAL(!this->manager.get_bypass_mode(), "Tracing cannot be used while validating program binaries"); if (const auto &buffer = program.get_kernels_buffer()) { std::vector read_data(buffer->page_size() * buffer->num_pages() / sizeof(uint32_t)); @@ -2549,7 +2549,7 @@ void HWCommandQueue::enqueue_program(Program& program, bool blocking) { this->enqueue_command(command, blocking, sub_device_ids); #ifdef DEBUG - if (tt::llrt::OptionsG.get_validate_kernel_binaries()) { + if (tt::llrt::RunTimeOptions::get_instance().get_validate_kernel_binaries()) { TT_FATAL(!this->manager.get_bypass_mode(), "Tracing cannot be used while validating program binaries"); if (const auto& buffer = program.get_kernels_buffer()) { std::vector read_data(buffer->page_size() * buffer->num_pages() / sizeof(uint32_t)); @@ -2915,7 +2915,7 @@ void HWCommandQueue::finish(tt::stl::Span sub_device_ids) { 
tt::log_debug(tt::LogDispatch, "Finish for command queue {}", this->id); std::shared_ptr event = std::make_shared(); this->enqueue_record_event(event, false, sub_device_ids); - if (tt::llrt::OptionsG.get_test_mode_enabled()) { + if (tt::llrt::RunTimeOptions::get_instance().get_test_mode_enabled()) { while (this->num_entries_in_completion_q > this->num_completed_completion_q_reads) { if (DPrintServerHangDetected()) { // DPrint Server hang. Mark state and early exit. Assert in main thread. @@ -3189,7 +3189,7 @@ void EventSynchronize(const std::shared_ptr& event) { event->event_id); while (event->device->sysmem_manager().get_last_completed_event(event->cq_id) < event->event_id) { - if (tt::llrt::OptionsG.get_test_mode_enabled() && tt::watcher_server_killed_due_to_error()) { + if (tt::llrt::RunTimeOptions::get_instance().get_test_mode_enabled() && tt::watcher_server_killed_due_to_error()) { TT_FATAL( false, "Command Queue could not complete EventSynchronize. See {} for details.", diff --git a/tt_metal/impl/dispatch/data_collection.cpp b/tt_metal/impl/dispatch/data_collection.cpp index baee439a3923..3b04ce19a878 100644 --- a/tt_metal/impl/dispatch/data_collection.cpp +++ b/tt_metal/impl/dispatch/data_collection.cpp @@ -255,7 +255,7 @@ namespace tt { void RecordDispatchData(Program& program, data_collector_t type, uint32_t transaction_size, RISCV riscv) { // Do nothing if we're not enabling data collection. - if (!tt::llrt::OptionsG.get_dispatch_data_collection_enabled()) { + if (!tt::llrt::RunTimeOptions::get_instance().get_dispatch_data_collection_enabled()) { return; } @@ -265,7 +265,7 @@ void RecordDispatchData(Program& program, data_collector_t type, uint32_t transa void RecordKernelGroups(Program& program, CoreType core_type, std::vector& kernel_groups) { // Do nothing if we're not enabling data collection. 
- if (!tt::llrt::OptionsG.get_dispatch_data_collection_enabled()) { + if (!tt::llrt::RunTimeOptions::get_instance().get_dispatch_data_collection_enabled()) { return; } @@ -275,7 +275,7 @@ void RecordKernelGroups(Program& program, CoreType core_type, std::vector& kernel, JitBuildOptions std::to_string(std::hash{}(build_options.hlk_desc)), kernel->compute_hash(), device_kernel_defines_hash, - tt::llrt::OptionsG.get_watcher_enabled()); + tt::llrt::RunTimeOptions::get_instance().get_watcher_enabled()); for (int i = 0; i < llrt::RunTimeDebugFeatureCount; i++) { compile_hash_str += "_"; - compile_hash_str += tt::llrt::OptionsG.get_feature_hash_string((llrt::RunTimeDebugFeatures)i); + compile_hash_str += tt::llrt::RunTimeOptions::get_instance().get_feature_hash_string((llrt::RunTimeDebugFeatures)i); } size_t compile_hash = std::hash{}(compile_hash_str); diff --git a/tt_metal/jit_build/build.cpp b/tt_metal/jit_build/build.cpp index 927155088612..d336abaca3ac 100644 --- a/tt_metal/jit_build/build.cpp +++ b/tt_metal/jit_build/build.cpp @@ -44,7 +44,7 @@ JitBuildEnv::JitBuildEnv() {} void JitBuildEnv::init( uint32_t build_key, tt::ARCH arch, const std::map& device_kernel_defines) { // Paths - this->root_ = llrt::OptionsG.get_root_dir(); + this->root_ = llrt::RunTimeOptions::get_instance().get_root_dir(); this->out_root_ = this->root_ + "built/"; this->arch_ = arch; this->arch_name_ = get_string_lowercase(arch); @@ -66,7 +66,7 @@ void JitBuildEnv::init( } common_flags += "-std=c++17 -flto -ffast-math "; - if (tt::llrt::OptionsG.get_riscv_debug_info_enabled()) { + if (tt::llrt::RunTimeOptions::get_instance().get_riscv_debug_info_enabled()) { common_flags += "-g "; } @@ -92,7 +92,7 @@ void JitBuildEnv::init( this->defines_ += "-DTENSIX_FIRMWARE -DLOCAL_MEM_EN=0 "; if (tt::tt_metal::getDeviceProfilerState()) { - if (tt::llrt::OptionsG.get_profiler_do_dispatch_cores()) { + if (tt::llrt::RunTimeOptions::get_instance().get_profiler_do_dispatch_cores()) { // TODO(MO): Standard bit 
mask for device side profiler options this->defines_ += "-DPROFILE_KERNEL=2 "; } else { @@ -100,32 +100,34 @@ void JitBuildEnv::init( } } - if (tt::llrt::OptionsG.get_watcher_enabled()) { + if (tt::llrt::RunTimeOptions::get_instance().get_watcher_enabled()) { this->defines_ += "-DWATCHER_ENABLED "; } - if (tt::llrt::OptionsG.get_watcher_noinline()) { + if (tt::llrt::RunTimeOptions::get_instance().get_watcher_noinline()) { this->defines_ += "-DWATCHER_NOINLINE "; } - for (auto& feature : tt::llrt::OptionsG.get_watcher_disabled_features()) { + for (auto& feature : tt::llrt::RunTimeOptions::get_instance().get_watcher_disabled_features()) { this->defines_ += "-DWATCHER_DISABLE_" + feature + " "; } - if (tt::llrt::OptionsG.get_feature_enabled(tt::llrt::RunTimeDebugFeatureDprint)) { + if (tt::llrt::RunTimeOptions::get_instance().get_feature_enabled(tt::llrt::RunTimeDebugFeatureDprint)) { this->defines_ += "-DDEBUG_PRINT_ENABLED -DL1_UNRESERVED_BASE=" + to_string(hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::UNRESERVED)) + " "; } - if (tt::llrt::OptionsG.get_record_noc_transfers()) { + if (tt::llrt::RunTimeOptions::get_instance().get_record_noc_transfers()) { this->defines_ += "-DNOC_LOGGING_ENABLED "; } - if (tt::llrt::OptionsG.get_kernels_nullified()) { + if (tt::llrt::RunTimeOptions::get_instance().get_kernels_nullified()) { this->defines_ += "-DDEBUG_NULL_KERNELS "; } - if (tt::llrt::OptionsG.get_watcher_debug_delay()) { - this->defines_ += "-DWATCHER_DEBUG_DELAY=" + to_string(tt::llrt::OptionsG.get_watcher_debug_delay()) + " "; + if (tt::llrt::RunTimeOptions::get_instance().get_watcher_debug_delay()) { + this->defines_ += + "-DWATCHER_DEBUG_DELAY=" + to_string(tt::llrt::RunTimeOptions::get_instance().get_watcher_debug_delay()) + + " "; } // Includes @@ -183,7 +185,8 @@ void JitBuildState::finish_init() { } // Append hw build objects compiled offline - std::string build_dir = llrt::OptionsG.get_root_dir() + "runtime/hw/lib/" + 
get_alias(env_.arch_) + "/"; + std::string build_dir = + llrt::RunTimeOptions::get_instance().get_root_dir() + "runtime/hw/lib/" + get_alias(env_.arch_) + "/"; if (this->is_fw_) { if (this->target_name_ == "brisc") { this->link_objs_ += build_dir + "tdma_xmov.o "; @@ -228,8 +231,8 @@ JitBuildDataMovement::JitBuildDataMovement(const JitBuildEnv& env, const JitBuil this->defines_ = env_.defines_; - uint32_t l1_cache_disable_mask = - tt::llrt::OptionsG.get_feature_riscv_mask(tt::llrt::RunTimeDebugFeatureDisableL1DataCache); + uint32_t l1_cache_disable_mask = tt::llrt::RunTimeOptions::get_instance().get_feature_riscv_mask( + tt::llrt::RunTimeDebugFeatureDisableL1DataCache); this->lflags_ = env_.lflags_ + "-Os "; @@ -296,8 +299,8 @@ JitBuildCompute::JitBuildCompute(const JitBuildEnv& env, const JitBuiltStateConf this->cflags_ = env_.cflags_ + "-O3 "; this->defines_ = env_.defines_; - uint32_t l1_cache_disable_mask = - tt::llrt::OptionsG.get_feature_riscv_mask(tt::llrt::RunTimeDebugFeatureDisableL1DataCache); + uint32_t l1_cache_disable_mask = tt::llrt::RunTimeOptions::get_instance().get_feature_riscv_mask( + tt::llrt::RunTimeDebugFeatureDisableL1DataCache); uint32_t debug_compute_mask = (tt::llrt::DebugHartFlags::RISCV_TR0 | tt::llrt::DebugHartFlags::RISCV_TR1 | tt::llrt::DebugHartFlags::RISCV_TR2); @@ -389,8 +392,8 @@ JitBuildActiveEthernet::JitBuildActiveEthernet(const JitBuildEnv& env, const Jit "/metal/llk_io "; this->defines_ = env_.defines_; - uint32_t l1_cache_disable_mask = - tt::llrt::OptionsG.get_feature_riscv_mask(tt::llrt::RunTimeDebugFeatureDisableL1DataCache); + uint32_t l1_cache_disable_mask = tt::llrt::RunTimeOptions::get_instance().get_feature_riscv_mask( + tt::llrt::RunTimeDebugFeatureDisableL1DataCache); if ((l1_cache_disable_mask & tt::llrt::DebugHartFlags::RISCV_ER) == tt::llrt::DebugHartFlags::RISCV_ER) { this->defines_ += "-DDISABLE_L1_DATA_CACHE "; } @@ -449,8 +452,8 @@ JitBuildIdleEthernet::JitBuildIdleEthernet(const JitBuildEnv& env, const 
JitBuil "/metal/llk_io "; this->defines_ = env_.defines_; - uint32_t l1_cache_disable_mask = - tt::llrt::OptionsG.get_feature_riscv_mask(tt::llrt::RunTimeDebugFeatureDisableL1DataCache); + uint32_t l1_cache_disable_mask = tt::llrt::RunTimeOptions::get_instance().get_feature_riscv_mask( + tt::llrt::RunTimeDebugFeatureDisableL1DataCache); if ((l1_cache_disable_mask & tt::llrt::DebugHartFlags::RISCV_ER) == tt::llrt::DebugHartFlags::RISCV_ER) { this->defines_ += "-DDISABLE_L1_DATA_CACHE "; } @@ -561,7 +564,7 @@ void JitBuildState::compile_one( log_debug(tt::LogBuildKernels, " g++ compile cmd: {}", cmd); - if (tt::llrt::OptionsG.get_watcher_enabled() && settings) { + if (tt::llrt::RunTimeOptions::get_instance().get_watcher_enabled() && settings) { log_kernel_defines_and_args(out_dir, settings->get_full_kernel_name(), defines); } @@ -582,7 +585,7 @@ void JitBuildState::compile(const string& log_file, const string& out_dir, const } sync_build_step(events); - if (tt::llrt::OptionsG.get_watcher_enabled()) { + if (tt::llrt::RunTimeOptions::get_instance().get_watcher_enabled()) { dump_kernel_defines_and_args(env_.get_out_kernel_root_path()); } } @@ -590,7 +593,7 @@ void JitBuildState::compile(const string& log_file, const string& out_dir, const void JitBuildState::link(const string& log_file, const string& out_dir) const { // ZoneScoped; string lflags = this->lflags_; - if (tt::llrt::OptionsG.get_build_map_enabled()) { + if (tt::llrt::RunTimeOptions::get_instance().get_build_map_enabled()) { lflags += "-Wl,-Map=" + out_dir + "linker.map "; } diff --git a/tt_metal/jit_build/build.hpp b/tt_metal/jit_build/build.hpp index ccd4a7860d25..9d3d481836d3 100644 --- a/tt_metal/jit_build/build.hpp +++ b/tt_metal/jit_build/build.hpp @@ -182,7 +182,7 @@ inline const string jit_build_get_kernel_compile_outpath(int build_key) { // TODO(pgk), get rid of this // The test infra needs the output dir. 
Could put this in the device, but we plan // to remove the device dependence in the future, so putting this here for now - return llrt::OptionsG.get_root_dir() + "/built/" + std::to_string(build_key) + "/kernels/"; + return llrt::RunTimeOptions::get_instance().get_root_dir() + "/built/" + std::to_string(build_key) + "/kernels/"; } inline void launch_build_step(const std::function build_func, std::vector>& events) { diff --git a/tt_metal/jit_build/genfiles.cpp b/tt_metal/jit_build/genfiles.cpp index ab920c1d1b05..2438180c4549 100644 --- a/tt_metal/jit_build/genfiles.cpp +++ b/tt_metal/jit_build/genfiles.cpp @@ -50,12 +50,12 @@ static fs::path get_file_path_relative_to_dir(const string& dir, const fs::path& static fs::path get_relative_file_path_from_config(const fs::path& file_path) { fs::path file_path_relative_to_dir; - if (llrt::OptionsG.is_root_dir_specified()) { - file_path_relative_to_dir = get_file_path_relative_to_dir(llrt::OptionsG.get_root_dir(), file_path); + if (llrt::RunTimeOptions::get_instance().is_root_dir_specified()) { + file_path_relative_to_dir = get_file_path_relative_to_dir(llrt::RunTimeOptions::get_instance().get_root_dir(), file_path); } - if (!fs::exists(file_path_relative_to_dir) && llrt::OptionsG.is_kernel_dir_specified()) { - file_path_relative_to_dir = get_file_path_relative_to_dir(llrt::OptionsG.get_kernel_dir(), file_path); + if (!fs::exists(file_path_relative_to_dir) && llrt::RunTimeOptions::get_instance().is_kernel_dir_specified()) { + file_path_relative_to_dir = get_file_path_relative_to_dir(llrt::RunTimeOptions::get_instance().get_kernel_dir(), file_path); } return file_path_relative_to_dir; diff --git a/tt_metal/llrt/llrt.cpp b/tt_metal/llrt/llrt.cpp index ccea2588f93f..71c22599113a 100644 --- a/tt_metal/llrt/llrt.cpp +++ b/tt_metal/llrt/llrt.cpp @@ -307,8 +307,8 @@ void wait_until_cores_done( // Continuously polling cores here can cause other host-driven noc transactions (dprint, watcher) to drastically // slow down for remote 
devices. So when debugging with these features, add a small delay to allow other // host-driven transactions through. - if (llrt::OptionsG.get_watcher_enabled() || - llrt::OptionsG.get_feature_enabled(tt::llrt::RunTimeDebugFeatureDprint)) + if (llrt::RunTimeOptions::get_instance().get_watcher_enabled() || + llrt::RunTimeOptions::get_instance().get_feature_enabled(tt::llrt::RunTimeDebugFeatureDprint)) std::this_thread::sleep_for(std::chrono::milliseconds(5)); } } diff --git a/tt_metal/llrt/rtoptions.cpp b/tt_metal/llrt/rtoptions.cpp index ba0783f94dca..174bd2cdbce4 100644 --- a/tt_metal/llrt/rtoptions.cpp +++ b/tt_metal/llrt/rtoptions.cpp @@ -19,7 +19,7 @@ namespace tt { namespace llrt { -const char *RunTimeDebugFeatureNames[RunTimeDebugFeatureCount] = { +const char* RunTimeDebugFeatureNames[RunTimeDebugFeatureCount] = { "DPRINT", "READ_DEBUG_DELAY", "WRITE_DEBUG_DELAY", @@ -27,24 +27,19 @@ const char *RunTimeDebugFeatureNames[RunTimeDebugFeatureCount] = { "DISABLE_L1_DATA_CACHE", }; -const char *RunTimeDebugClassNames[RunTimeDebugClassCount] = {"N/A", "worker", "dispatch", "all"}; +const char* RunTimeDebugClassNames[RunTimeDebugClassCount] = {"N/A", "worker", "dispatch", "all"}; -static const char *TT_METAL_HOME_ENV_VAR = "TT_METAL_HOME"; -static const char *TT_METAL_KERNEL_PATH_ENV_VAR = "TT_METAL_KERNEL_PATH"; - -// Note: global initialization order is non-deterministic -// This is ok so long as this gets initialized before decisions are based on -// env state -RunTimeOptions OptionsG; +static const char* TT_METAL_HOME_ENV_VAR = "TT_METAL_HOME"; +static const char* TT_METAL_KERNEL_PATH_ENV_VAR = "TT_METAL_KERNEL_PATH"; RunTimeOptions::RunTimeOptions() { - const char *root_dir_str = std::getenv(TT_METAL_HOME_ENV_VAR); + const char* root_dir_str = std::getenv(TT_METAL_HOME_ENV_VAR); if (root_dir_str != nullptr) { this->is_root_dir_env_var_set = true; this->root_dir = std::string(root_dir_str) + "/"; } - const char *kernel_dir_str = 
std::getenv(TT_METAL_KERNEL_PATH_ENV_VAR); + const char* kernel_dir_str = std::getenv(TT_METAL_KERNEL_PATH_ENV_VAR); if (kernel_dir_str != nullptr) { this->is_kernel_dir_env_var_set = true; this->kernel_dir = std::string(kernel_dir_str) + "/"; @@ -66,19 +61,19 @@ RunTimeOptions::RunTimeOptions() { profiler_sync_enabled = false; profiler_buffer_usage_enabled = false; #if defined(TRACY_ENABLE) - const char *profiler_enabled_str = std::getenv("TT_METAL_DEVICE_PROFILER"); + const char* profiler_enabled_str = std::getenv("TT_METAL_DEVICE_PROFILER"); if (profiler_enabled_str != nullptr && profiler_enabled_str[0] == '1') { profiler_enabled = true; - const char *profile_dispatch_str = std::getenv("TT_METAL_DEVICE_PROFILER_DISPATCH"); + const char* profile_dispatch_str = std::getenv("TT_METAL_DEVICE_PROFILER_DISPATCH"); if (profile_dispatch_str != nullptr && profile_dispatch_str[0] == '1') { profile_dispatch_cores = true; } - const char *profiler_sync_enabled_str = std::getenv("TT_METAL_PROFILER_SYNC"); + const char* profiler_sync_enabled_str = std::getenv("TT_METAL_PROFILER_SYNC"); if (profiler_enabled && profiler_sync_enabled_str != nullptr && profiler_sync_enabled_str[0] == '1') { profiler_sync_enabled = true; } } - const char *profile_buffer_usage_str = std::getenv("TT_METAL_MEM_PROFILER"); + const char* profile_buffer_usage_str = std::getenv("TT_METAL_MEM_PROFILER"); if (profile_buffer_usage_str != nullptr && profile_buffer_usage_str[0] == '1') { profiler_buffer_usage_enabled = true; } @@ -90,30 +85,32 @@ RunTimeOptions::RunTimeOptions() { null_kernels = (std::getenv("TT_METAL_NULL_KERNELS") != nullptr); clear_l1 = false; - const char *clear_l1_enabled_str = std::getenv("TT_METAL_CLEAR_L1"); + const char* clear_l1_enabled_str = std::getenv("TT_METAL_CLEAR_L1"); if (clear_l1_enabled_str != nullptr) { - if (clear_l1_enabled_str[0] == '0') + if (clear_l1_enabled_str[0] == '0') { clear_l1 = false; - if (clear_l1_enabled_str[0] == '1') + } + if (clear_l1_enabled_str[0] == 
'1') { clear_l1 = true; + } } - const char *riscv_debug_info_enabled_str = std::getenv("TT_METAL_RISCV_DEBUG_INFO"); + const char* riscv_debug_info_enabled_str = std::getenv("TT_METAL_RISCV_DEBUG_INFO"); set_riscv_debug_info_enabled(riscv_debug_info_enabled_str != nullptr); - const char *validate_kernel_binaries = std::getenv("TT_METAL_VALIDATE_PROGRAM_BINARIES"); + const char* validate_kernel_binaries = std::getenv("TT_METAL_VALIDATE_PROGRAM_BINARIES"); set_validate_kernel_binaries(validate_kernel_binaries != nullptr && validate_kernel_binaries[0] == '1'); - const char *num_cqs = getenv("TT_METAL_GTEST_NUM_HW_CQS"); + const char* num_cqs = getenv("TT_METAL_GTEST_NUM_HW_CQS"); if (num_cqs != nullptr) { try { set_num_hw_cqs(std::stoi(num_cqs)); - } catch (const std::invalid_argument &ia) { + } catch (const std::invalid_argument& ia) { TT_THROW("Invalid TT_METAL_GTEST_NUM_HW_CQS: {}", num_cqs); } } - const char *dispatch_data_collection_str = std::getenv("TT_METAL_DISPATCH_DATA_COLLECTION"); + const char* dispatch_data_collection_str = std::getenv("TT_METAL_DISPATCH_DATA_COLLECTION"); if (dispatch_data_collection_str != nullptr) { enable_dispatch_data_collection = true; } @@ -127,7 +124,7 @@ RunTimeOptions::RunTimeOptions() { } } -const std::string &RunTimeOptions::get_root_dir() { +const std::string& RunTimeOptions::get_root_dir() { if (!this->is_root_dir_specified()) { TT_THROW("Env var {} is not set.", TT_METAL_HOME_ENV_VAR); } @@ -135,7 +132,7 @@ const std::string &RunTimeOptions::get_root_dir() { return root_dir; } -const std::string &RunTimeOptions::get_kernel_dir() const { +const std::string& RunTimeOptions::get_kernel_dir() const { if (!this->is_kernel_dir_specified()) { TT_THROW("Env var {} is not set.", TT_METAL_KERNEL_PATH_ENV_VAR); } @@ -145,7 +142,7 @@ const std::string &RunTimeOptions::get_kernel_dir() const { void RunTimeOptions::ParseWatcherEnv() { watcher_interval_ms = 0; - const char *watcher_enable_str = getenv("TT_METAL_WATCHER"); + const char* 
watcher_enable_str = getenv("TT_METAL_WATCHER"); watcher_enabled = (watcher_enable_str != nullptr); if (watcher_enabled) { int sleep_val = 0; @@ -156,13 +153,13 @@ void RunTimeOptions::ParseWatcherEnv() { watcher_interval_ms = sleep_val; } - const char *watcher_dump_all_str = getenv("TT_METAL_WATCHER_DUMP_ALL"); + const char* watcher_dump_all_str = getenv("TT_METAL_WATCHER_DUMP_ALL"); watcher_dump_all = (watcher_dump_all_str != nullptr); - const char *watcher_append_str = getenv("TT_METAL_WATCHER_APPEND"); + const char* watcher_append_str = getenv("TT_METAL_WATCHER_APPEND"); watcher_append = (watcher_append_str != nullptr); - const char *watcher_noinline_str = getenv("TT_METAL_WATCHER_NOINLINE"); + const char* watcher_noinline_str = getenv("TT_METAL_WATCHER_NOINLINE"); watcher_noinline = (watcher_noinline_str != nullptr); // Auto unpause is for testing only, no env var. @@ -185,7 +182,7 @@ void RunTimeOptions::ParseWatcherEnv() { } } - const char *watcher_debug_delay_str = getenv("TT_METAL_WATCHER_DEBUG_DELAY"); + const char* watcher_debug_delay_str = getenv("TT_METAL_WATCHER_DEBUG_DELAY"); if (watcher_debug_delay_str != nullptr) { sscanf(watcher_debug_delay_str, "%u", &watcher_debug_delay); // Assert watcher is also enabled (TT_METAL_WATCHER=1) @@ -210,21 +207,26 @@ void RunTimeOptions::ParseFeatureEnv(RunTimeDebugFeatures feature) { // Set feature enabled if the user asked for any feature cores feature_targets[feature].enabled = false; - for (auto &core_type_and_all_flag : feature_targets[feature].all_cores) - if (core_type_and_all_flag.second != RunTimeDebugClassNoneSpecified) + for (auto& core_type_and_all_flag : feature_targets[feature].all_cores) { + if (core_type_and_all_flag.second != RunTimeDebugClassNoneSpecified) { feature_targets[feature].enabled = true; - for (auto &core_type_and_cores : feature_targets[feature].cores) - if (core_type_and_cores.second.size() > 0) + } + } + for (auto& core_type_and_cores : feature_targets[feature].cores) { + if 
(core_type_and_cores.second.size() > 0) { feature_targets[feature].enabled = true; + } + } - const char *print_noc_xfers = std::getenv("TT_METAL_RECORD_NOC_TRANSFER_DATA"); - if (print_noc_xfers != nullptr) + const char* print_noc_xfers = std::getenv("TT_METAL_RECORD_NOC_TRANSFER_DATA"); + if (print_noc_xfers != nullptr) { record_noc_transfer_data = true; + } }; void RunTimeOptions::ParseFeatureCoreRange( - RunTimeDebugFeatures feature, const std::string &env_var, CoreType core_type) { - char *str = std::getenv(env_var.c_str()); + RunTimeDebugFeatures feature, const std::string& env_var, CoreType core_type) { + char* str = std::getenv(env_var.c_str()); std::vector cores; // Check if "all" is specified, rather than a range of cores. @@ -271,8 +273,9 @@ void RunTimeOptions::ParseFeatureCoreRange( cores.push_back({x, y}); str = strchr(str, ','); str = strchr(str + 1, ','); - if (str != nullptr) + if (str != nullptr) { str++; + } } } } else { @@ -284,9 +287,9 @@ void RunTimeOptions::ParseFeatureCoreRange( feature_targets[feature].cores[core_type] = cores; } -void RunTimeOptions::ParseFeatureChipIds(RunTimeDebugFeatures feature, const std::string &env_var) { +void RunTimeOptions::ParseFeatureChipIds(RunTimeDebugFeatures feature, const std::string& env_var) { std::vector chips; - char *env_var_str = std::getenv(env_var.c_str()); + char* env_var_str = std::getenv(env_var.c_str()); // If the environment variable is not empty, parse it. while (env_var_str != nullptr) { @@ -296,19 +299,21 @@ void RunTimeOptions::ParseFeatureChipIds(RunTimeDebugFeatures feature, const std } chips.push_back(chip); env_var_str = strchr(env_var_str, ','); - if (env_var_str != nullptr) + if (env_var_str != nullptr) { env_var_str++; + } } // Default is no chips are specified is chip 0. 
- if (chips.size() == 0) + if (chips.size() == 0) { chips.push_back(0); + } feature_targets[feature].chip_ids = chips; } -void RunTimeOptions::ParseFeatureRiscvMask(RunTimeDebugFeatures feature, const std::string &env_var) { +void RunTimeOptions::ParseFeatureRiscvMask(RunTimeDebugFeatures feature, const std::string& env_var) { uint32_t riscv_mask = 0; - char *env_var_str = std::getenv(env_var.c_str()); + char* env_var_str = std::getenv(env_var.c_str()); if (env_var_str != nullptr) { if (strstr(env_var_str, "BR")) { @@ -341,13 +346,13 @@ void RunTimeOptions::ParseFeatureRiscvMask(RunTimeDebugFeatures feature, const s feature_targets[feature].riscv_mask = riscv_mask; } -void RunTimeOptions::ParseFeatureFileName(RunTimeDebugFeatures feature, const std::string &env_var) { - char *env_var_str = std::getenv(env_var.c_str()); +void RunTimeOptions::ParseFeatureFileName(RunTimeDebugFeatures feature, const std::string& env_var) { + char* env_var_str = std::getenv(env_var.c_str()); feature_targets[feature].file_name = (env_var_str != nullptr) ? 
std::string(env_var_str) : ""; } -void RunTimeOptions::ParseFeatureOneFilePerRisc(RunTimeDebugFeatures feature, const std::string &env_var) { - char *env_var_str = std::getenv(env_var.c_str()); +void RunTimeOptions::ParseFeatureOneFilePerRisc(RunTimeDebugFeatures feature, const std::string& env_var) { + char* env_var_str = std::getenv(env_var.c_str()); feature_targets[feature].one_file_per_risc = (env_var_str != nullptr); } diff --git a/tt_metal/llrt/rtoptions.hpp b/tt_metal/llrt/rtoptions.hpp index 44dc03acaa9d..b7723f0609c5 100644 --- a/tt_metal/llrt/rtoptions.hpp +++ b/tt_metal/llrt/rtoptions.hpp @@ -23,7 +23,7 @@ namespace tt { namespace llrt { -static inline const char *get_core_type_name(CoreType ct) { +static inline const char* get_core_type_name(CoreType ct) { switch (ct) { case CoreType::ARC: return "ARC"; case CoreType::DRAM: return "DRAM"; @@ -67,8 +67,8 @@ enum RunTimeDebugClass { RunTimeDebugClassCount }; -extern const char *RunTimeDebugFeatureNames[RunTimeDebugFeatureCount]; -extern const char *RunTimeDebugClassNames[RunTimeDebugClassCount]; +extern const char* RunTimeDebugFeatureNames[RunTimeDebugFeatureCount]; +extern const char* RunTimeDebugClassNames[RunTimeDebugClassCount]; // TargetSelection stores the targets for a given debug feature. I.e. for which chips, cores, harts // to enable the feature. 
@@ -126,14 +126,22 @@ class RunTimeOptions { tt_metal::DispatchCoreConfig dispatch_core_config = tt_metal::DispatchCoreConfig{}; - public: RunTimeOptions(); +public: + static RunTimeOptions& get_instance() { + static RunTimeOptions instance; + return instance; + } + + RunTimeOptions(const RunTimeOptions&) = delete; + RunTimeOptions& operator=(const RunTimeOptions&) = delete; + inline bool is_root_dir_specified() const { return this->is_root_dir_env_var_set; } - const std::string &get_root_dir(); + const std::string& get_root_dir(); inline bool is_kernel_dir_specified() const { return this->is_kernel_dir_env_var_set; } - const std::string &get_kernel_dir() const; + const std::string& get_kernel_dir() const; inline bool get_build_map_enabled() { return build_map_enabled; } @@ -151,7 +159,7 @@ class RunTimeOptions { inline void set_watcher_auto_unpause(bool auto_unpause) { watcher_auto_unpause = auto_unpause; } inline int get_watcher_noinline() { return watcher_noinline; } inline void set_watcher_noinline(bool noinline) { watcher_noinline = noinline; } - inline std::set &get_watcher_disabled_features() { return watcher_disabled_features; } + inline std::set& get_watcher_disabled_features() { return watcher_disabled_features; } inline bool watcher_status_disabled() { return watcher_feature_disabled(watcher_waypoint_str); } inline bool watcher_noc_sanitize_disabled() { return watcher_feature_disabled(watcher_noc_sanitize_str); } inline bool watcher_assert_disabled() { return watcher_feature_disabled(watcher_assert_str); } @@ -167,7 +175,7 @@ class RunTimeOptions { feature_targets[feature].enabled = enabled; } // Note: dprint cores are logical - inline std::map> &get_feature_cores(RunTimeDebugFeatures feature) { + inline std::map>& get_feature_cores(RunTimeDebugFeatures feature) { return feature_targets[feature].cores; } inline void set_feature_cores(RunTimeDebugFeatures feature, std::map> cores) { @@ -190,7 +198,7 @@ class RunTimeOptions { } } } - inline std::vector 
&get_feature_chip_ids(RunTimeDebugFeatures feature) { + inline std::vector& get_feature_chip_ids(RunTimeDebugFeatures feature) { return feature_targets[feature].chip_ids; } inline void set_feature_chip_ids(RunTimeDebugFeatures feature, std::vector chip_ids) { @@ -282,14 +290,14 @@ class RunTimeOptions { inline tt_metal::DispatchCoreConfig get_dispatch_core_config() { return dispatch_core_config; } - private: +private: // Helper functions to parse feature-specific environment vaiables. void ParseFeatureEnv(RunTimeDebugFeatures feature); - void ParseFeatureCoreRange(RunTimeDebugFeatures feature, const std::string &env_var, CoreType core_type); - void ParseFeatureChipIds(RunTimeDebugFeatures feature, const std::string &env_var); - void ParseFeatureRiscvMask(RunTimeDebugFeatures feature, const std::string &env_var); - void ParseFeatureFileName(RunTimeDebugFeatures feature, const std::string &env_var); - void ParseFeatureOneFilePerRisc(RunTimeDebugFeatures feature, const std::string &env_var); + void ParseFeatureCoreRange(RunTimeDebugFeatures feature, const std::string& env_var, CoreType core_type); + void ParseFeatureChipIds(RunTimeDebugFeatures feature, const std::string& env_var); + void ParseFeatureRiscvMask(RunTimeDebugFeatures feature, const std::string& env_var); + void ParseFeatureFileName(RunTimeDebugFeatures feature, const std::string& env_var); + void ParseFeatureOneFilePerRisc(RunTimeDebugFeatures feature, const std::string& env_var); // Helper function to parse watcher-specific environment variables. 
void ParseWatcherEnv(); @@ -304,16 +312,14 @@ class RunTimeOptions { const std::string watcher_stack_usage_str = "STACK_USAGE"; const std::string watcher_dispatch_str = "DISPATCH"; std::set watcher_disabled_features; - bool watcher_feature_disabled(const std::string &name) { + bool watcher_feature_disabled(const std::string& name) { return watcher_disabled_features.find(name) != watcher_disabled_features.end(); } // Helper function to generate a message string when an environment variable has not been set - std::string generate_env_var_not_set_message(const std::string &env_var) const; + std::string generate_env_var_not_set_message(const std::string& env_var) const; }; -extern RunTimeOptions OptionsG; - } // namespace llrt } // namespace tt diff --git a/tt_metal/llrt/tt_cluster.cpp b/tt_metal/llrt/tt_cluster.cpp index e26404d9457c..05e8534af42c 100644 --- a/tt_metal/llrt/tt_cluster.cpp +++ b/tt_metal/llrt/tt_cluster.cpp @@ -516,8 +516,9 @@ void Cluster::write_core( const void *mem_ptr, uint32_t sz_in_bytes, tt_cxy_pair core, uint64_t addr, bool small_access) const { chip_id_t chip_id = core.chip; const metal_SocDescriptor &soc_desc = this->get_soc_desc(chip_id); - if (tt::llrt::OptionsG.get_watcher_enabled()) { + if (tt::llrt::RunTimeOptions::get_instance().get_watcher_enabled()) { tt::watcher_sanitize_host_noc_write(soc_desc, this->virtual_worker_cores_.at(chip_id), this->virtual_eth_cores_.at(chip_id), {core.x, core.y}, addr, sz_in_bytes); + } tt_cxy_pair umd_core = this->virtual_to_umd_coord_mapping_.at(core); @@ -532,7 +533,7 @@ void Cluster::read_core( int chip_id = core.chip; const metal_SocDescriptor &soc_desc = this->get_soc_desc(chip_id); - if (tt::llrt::OptionsG.get_watcher_enabled()) { + if (tt::llrt::RunTimeOptions::get_instance().get_watcher_enabled()) { tt::watcher_sanitize_host_noc_read(soc_desc, this->virtual_worker_cores_.at(chip_id), this->virtual_eth_cores_.at(chip_id), {core.x, core.y}, addr, size_in_bytes); } @@ -551,7 +552,7 @@ void 
Cluster::write_reg(const std::uint32_t *mem_ptr, tt_cxy_pair target, uint64 int chip_id = target.chip; const metal_SocDescriptor &soc_desc = this->get_soc_desc(chip_id); - if (tt::llrt::OptionsG.get_watcher_enabled()) { + if (tt::llrt::RunTimeOptions::get_instance().get_watcher_enabled()) { tt::watcher_sanitize_host_noc_write(soc_desc, this->virtual_worker_cores_.at(chip_id), this->virtual_eth_cores_.at(chip_id), {target.x, target.y}, addr, size_in_bytes); } tt_cxy_pair umd_target = this->virtual_to_umd_coord_mapping_.at(target); @@ -566,7 +567,7 @@ void Cluster::read_reg(std::uint32_t *mem_ptr, tt_cxy_pair target, uint64_t addr int chip_id = target.chip; const metal_SocDescriptor &soc_desc = this->get_soc_desc(chip_id); - if (tt::llrt::OptionsG.get_watcher_enabled()) { + if (tt::llrt::RunTimeOptions::get_instance().get_watcher_enabled()) { tt::watcher_sanitize_host_noc_read(soc_desc, this->virtual_worker_cores_.at(chip_id), this->virtual_eth_cores_.at(chip_id), {target.x, target.y}, addr, size_in_bytes); } tt_cxy_pair umd_target = this->virtual_to_umd_coord_mapping_.at(target); diff --git a/tt_metal/tools/profiler/profiler.cpp b/tt_metal/tools/profiler/profiler.cpp index 6c8b9ea18470..0fe0dc5767d4 100644 --- a/tt_metal/tools/profiler/profiler.cpp +++ b/tt_metal/tools/profiler/profiler.cpp @@ -399,7 +399,7 @@ void DeviceProfiler::dumpResults(Device* device, const std::vector& w const auto USE_FAST_DISPATCH = std::getenv("TT_METAL_SLOW_DISPATCH_MODE") == nullptr; if (USE_FAST_DISPATCH) { if (lastDump) { - if (tt::llrt::OptionsG.get_profiler_do_dispatch_cores()) { + if (tt::llrt::RunTimeOptions::get_instance().get_profiler_do_dispatch_cores()) { tt_metal::detail::ReadFromBuffer(output_dram_buffer, profile_buffer); } } else { @@ -494,7 +494,7 @@ void DeviceProfiler::pushTracyDeviceResults() { #endif } -bool getDeviceProfilerState() { return tt::llrt::OptionsG.get_profiler_enabled(); } +bool getDeviceProfilerState() { return 
tt::llrt::RunTimeOptions::get_instance().get_profiler_enabled(); } } // namespace tt_metal diff --git a/tt_metal/tools/profiler/tt_metal_profiler.cpp b/tt_metal/tools/profiler/tt_metal_profiler.cpp index 5c9df9b95265..60e1bb8869b4 100644 --- a/tt_metal/tools/profiler/tt_metal_profiler.cpp +++ b/tt_metal/tools/profiler/tt_metal_profiler.cpp @@ -85,7 +85,7 @@ void setControlBuffer(uint32_t device_id, std::vector& control_buffer) void syncDeviceHost( Device* device, CoreCoord logical_core, std::shared_ptr& sync_program, bool doHeader) { - if (!tt::llrt::OptionsG.get_profiler_sync_enabled()) { + if (!tt::llrt::RunTimeOptions::get_instance().get_profiler_sync_enabled()) { return; } ZoneScopedC(tracy::Color::Tomato3); @@ -347,7 +347,7 @@ void DumpDeviceProfileResults(Device* device, std::vector& worker_cor std::scoped_lock lock(device_mutex); const auto& dispatch_core_config = dispatch_core_manager::instance().get_dispatch_core_config(device->id()); auto dispatch_core_type = dispatch_core_config.get_core_type(); - if (tt::llrt::OptionsG.get_profiler_do_dispatch_cores()) { + if (tt::llrt::RunTimeOptions::get_instance().get_profiler_do_dispatch_cores()) { auto device_id = device->id(); auto device_num_hw_cqs = device->num_hw_cqs(); for (const CoreCoord& core : @@ -366,7 +366,7 @@ void DumpDeviceProfileResults(Device* device, std::vector& worker_cor Finish(device->command_queue()); } } else { - if (tt::llrt::OptionsG.get_profiler_do_dispatch_cores()) { + if (tt::llrt::RunTimeOptions::get_instance().get_profiler_do_dispatch_cores()) { bool waitForDispatch = true; uint8_t loopCount = 0; CoreCoord unfinishedCore = {0, 0}; diff --git a/tt_metal/tools/watcher_dump/watcher_dump.cpp b/tt_metal/tools/watcher_dump/watcher_dump.cpp index 7839e7eccad7..01a7870b8e32 100644 --- a/tt_metal/tools/watcher_dump/watcher_dump.cpp +++ b/tt_metal/tools/watcher_dump/watcher_dump.cpp @@ -26,13 +26,13 @@ void dump_data( bool eth_dispatch, int num_hw_cqs) { // Don't clear L1, this way we can dump 
the state. - llrt::OptionsG.set_clear_l1(false); + llrt::RunTimeOptions::get_instance().set_clear_l1(false); // Watcher should be disabled for this, so we don't (1) overwrite the kernel_names.txt and (2) do any other dumping // than the one we want. - llrt::OptionsG.set_watcher_enabled(false); + llrt::RunTimeOptions::get_instance().set_watcher_enabled(false); - std::filesystem::path parent_dir(tt::llrt::OptionsG.get_root_dir() + output_dir_name); + std::filesystem::path parent_dir(tt::llrt::RunTimeOptions::get_instance().get_root_dir() + output_dir_name); std::filesystem::path cq_dir(parent_dir.string() + "command_queue_dump/"); std::filesystem::create_directories(cq_dir); @@ -142,7 +142,7 @@ int main(int argc, char* argv[]) { } else if (s == "--dump-cqs-data") { dump_cqs_raw_data = true; } else if (s == "--dump-noc-transfer-data") { - tt::llrt::OptionsG.set_record_noc_transfers(true); + tt::llrt::RunTimeOptions::get_instance().set_record_noc_transfers(true); dump_noc_xfers = true; } else if (s == "--eth-dispatch") { eth_dispatch = true; diff --git a/tt_metal/tt_metal.cpp b/tt_metal/tt_metal.cpp index c3e1b8b71bc8..e55dc27384fe 100644 --- a/tt_metal/tt_metal.cpp +++ b/tt_metal/tt_metal.cpp @@ -885,7 +885,7 @@ DeviceAddr AllocateBuffer(Buffer* buffer) { GraphTracker::instance().track_allocate(buffer); #if defined(TRACY_ENABLE) - if (tt::llrt::OptionsG.get_profiler_buffer_usage_enabled()) { + if (tt::llrt::RunTimeOptions::get_instance().get_profiler_buffer_usage_enabled()) { TracyAllocN( reinterpret_cast(allocated_addr), buffer->size(), @@ -902,7 +902,7 @@ void DeallocateBuffer(Buffer* buffer) { } #if defined(TRACY_ENABLE) - if (tt::llrt::OptionsG.get_profiler_buffer_usage_enabled()) { + if (tt::llrt::RunTimeOptions::get_instance().get_profiler_buffer_usage_enabled()) { TracyFreeN( reinterpret_cast(buffer->address()), get_buffer_location_name(buffer->buffer_type(), buffer->device()->id())); From 52dec89969b70ca6de776f7e42e2c7d61f3cc42c Mon Sep 17 00:00:00 2001 From: 
Andrew Fuller Date: Tue, 10 Dec 2024 17:12:25 -0500 Subject: [PATCH 41/59] Dockerize BH pipeline (#15523) ### Ticket Progress towards #14393 ### Problem description To run our pipeline on 22.04, we need to first dockerize the steps (else we need a whole new fleet of build runners). ### What's changed Run the Blackhole post-commit within Docker. Still using 20.04, but sets the stage for 22.04. ### Checklist - [ ] Post commit CI passes https://github.com/tenstorrent/tt-metal/actions/runs/12263532830 - [ ] Blackhole Post commit (if applicable) https://github.com/tenstorrent/tt-metal/actions/runs/12263536331 - [ ] Model regression CI testing passes (if applicable) - [ ] Device performance regression CI testing passes (if applicable) - [ ] New/Existing tests provide coverage for changes --- .../workflows/all-post-commit-workflows.yaml | 1 + .github/workflows/blackhole-post-commit.yaml | 4 +++ .github/workflows/build-and-unit-tests.yaml | 33 +++++++++++++------ .github/workflows/cpp-post-commit.yaml | 25 +++++++++++--- .../fast-dispatch-build-and-unit-tests.yaml | 11 ++++++- 5 files changed, 58 insertions(+), 16 deletions(-) diff --git a/.github/workflows/all-post-commit-workflows.yaml b/.github/workflows/all-post-commit-workflows.yaml index 57bae427f2fe..82b5feb14aa3 100644 --- a/.github/workflows/all-post-commit-workflows.yaml +++ b/.github/workflows/all-post-commit-workflows.yaml @@ -108,6 +108,7 @@ jobs: ] uses: ./.github/workflows/fast-dispatch-build-and-unit-tests.yaml with: + os: ubuntu-20.04 arch: ${{ matrix.test-group.arch }} runner-label: ${{ matrix.test-group.runner-label }} # TTNN FD Unit tests diff --git a/.github/workflows/blackhole-post-commit.yaml b/.github/workflows/blackhole-post-commit.yaml index 8a09f14ecb2f..ba479a8b63cb 100644 --- a/.github/workflows/blackhole-post-commit.yaml +++ b/.github/workflows/blackhole-post-commit.yaml @@ -29,6 +29,7 @@ jobs: uses: ./.github/workflows/build-artifact.yaml secrets: inherit with: + os: "ubuntu-20.04-amd64" 
arch: '["blackhole"]' build-docker: false build-wheels: @@ -57,6 +58,7 @@ jobs: arch: blackhole runner-label: BH timeout: 30 + os: "ubuntu-20.04" fd-unit-tests: needs: build-wheels uses: ./.github/workflows/fast-dispatch-build-and-unit-tests.yaml @@ -64,6 +66,7 @@ jobs: with: arch: blackhole runner-label: BH + os: "ubuntu-20.04" # FD C++ Unit Tests cpp-unit-tests: needs: build-artifact @@ -73,6 +76,7 @@ jobs: arch: blackhole runner-label: BH timeout: 60 + os: "ubuntu-20.04" # profiler-regression: # needs: build-artifact-profiler diff --git a/.github/workflows/build-and-unit-tests.yaml b/.github/workflows/build-and-unit-tests.yaml index c17c38a4eb1c..e51dced1890b 100644 --- a/.github/workflows/build-and-unit-tests.yaml +++ b/.github/workflows/build-and-unit-tests.yaml @@ -13,6 +13,10 @@ on: required: false type: number default: 35 + os: + required: false + type: string + default: "ubuntu-20.04" workflow_dispatch: inputs: arch: @@ -34,6 +38,11 @@ on: required: false type: number default: 35 + os: + required: false + type: string + default: "ubuntu-20.04" + jobs: unit-tests-slow-dispatch: name: ${{ inputs.arch }} ${{ inputs.runner-label }} @@ -42,24 +51,28 @@ jobs: - cloud-virtual-machine - in-service env: - TT_METAL_ENV: ${{ vars.TT_METAL_ENV }} - ARCH_NAME: ${{ inputs.arch}} - TT_METAL_SLOW_DISPATCH_MODE: 1 + ARCH_NAME: ${{ inputs.arch }} LOGURU_LEVEL: INFO - LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib steps: - uses: tenstorrent/tt-metal/.github/actions/checkout-with-submodule-lfs@main - uses: ./.github/actions/prepare-metal-run with: arch: ${{ inputs.arch }} - - name: Set up dynamic env vars for build - run: | - echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV - name: Run pre/post regression tests timeout-minutes: ${{ inputs.timeout }} - run: | - source ${{ github.workspace }}/python_env/bin/activate - ./tests/scripts/run_tests.sh --tt-arch $ARCH_NAME --pipeline-type post_commit --dispatch-mode slow + uses: ./.github/actions/docker-run + with: + docker_os_arch: 
tt-metalium/${{ inputs.os }}-amd64 + docker_password: ${{ secrets.GITHUB_TOKEN }} + docker_opts: | + -e ARCH_NAME=${{ inputs.arch}} + -e TT_METAL_HOME=${{ github.workspace }} + -e TT_METAL_SLOW_DISPATCH_MODE=1 + -e LD_LIBRARY_PATH=${{ github.workspace }}/build/lib + run_args: | + python3 -m pip install -r $(pwd)/tt_metal/python_env/requirements-dev.txt + pip install -e . + ./tests/scripts/run_tests.sh --tt-arch ${{ inputs.arch }} --pipeline-type post_commit --dispatch-mode slow - uses: ./.github/actions/slack-report if: ${{ failure() }} with: diff --git a/.github/workflows/cpp-post-commit.yaml b/.github/workflows/cpp-post-commit.yaml index ef34c1422349..c90c623cb766 100644 --- a/.github/workflows/cpp-post-commit.yaml +++ b/.github/workflows/cpp-post-commit.yaml @@ -13,6 +13,10 @@ on: required: false type: number default: 80 + os: + required: false + type: string + default: "ubuntu-20.04" workflow_dispatch: inputs: arch: @@ -34,6 +38,10 @@ on: required: false type: number default: 60 + os: + required: false + type: string + default: "ubuntu-20.04" jobs: models: @@ -67,11 +75,18 @@ jobs: arch: ${{ inputs.arch }} - name: ${{ matrix.test-group.name }} tests timeout-minutes: ${{ inputs.timeout }} - run: | - source ${{ github.workspace }}/python_env/bin/activate - cd $TT_METAL_HOME - export PYTHONPATH=$TT_METAL_HOME - ${{ matrix.test-group.cmd }} + uses: ./.github/actions/docker-run + with: + docker_os_arch: tt-metalium/${{ inputs.os }}-amd64 + docker_password: ${{ secrets.GITHUB_TOKEN }} + docker_opts: | + -e TT_METAL_HOME=${{ github.workspace }} + -e ARCH_NAME=${{ inputs.arch }} + -e LD_LIBRARY_PATH=${{ github.workspace }}/build/lib + run_args: | + python3 -m pip install -r $(pwd)/tt_metal/python_env/requirements-dev.txt + python3 -m pip install -e . 
+ ${{ matrix.test-group.cmd }} - uses: ./.github/actions/slack-report if: ${{ failure() }} with: diff --git a/.github/workflows/fast-dispatch-build-and-unit-tests.yaml b/.github/workflows/fast-dispatch-build-and-unit-tests.yaml index 8291c3dee520..2c55a9400341 100644 --- a/.github/workflows/fast-dispatch-build-and-unit-tests.yaml +++ b/.github/workflows/fast-dispatch-build-and-unit-tests.yaml @@ -13,6 +13,10 @@ on: required: false type: number default: 45 + os: + required: false + type: string + default: "ubuntu-20.04" workflow_dispatch: inputs: arch: @@ -34,6 +38,10 @@ on: required: false type: number default: 45 + os: + required: false + type: string + default: "ubuntu-20.04" jobs: fd-tests: @@ -42,7 +50,7 @@ jobs: # so we try not to get hanging machines fail-fast: false matrix: - os: ["ubuntu-20.04"] + os: ["${{ inputs.os }}"] test-group: [ {name: eager unit tests 1, cmd: pytest tests/tt_eager/python_api_testing/unit_testing/ -xvvv --splits 7 --group 1 }, {name: eager unit tests 2, cmd: pytest tests/tt_eager/python_api_testing/unit_testing/ -xvvv --splits 7 --group 2 }, @@ -70,6 +78,7 @@ jobs: timeout-minutes: ${{ inputs.timeout }} uses: ./.github/actions/docker-run with: + docker_os_arch: tt-metalium/${{ inputs.os }}-amd64 install_wheel: true docker_password: ${{ secrets.GITHUB_TOKEN }} run_args: | From a16bdf4c346cd033e7a2652e6ab20cbf86df4a2e Mon Sep 17 00:00:00 2001 From: Salar Hosseini Date: Tue, 10 Dec 2024 21:02:10 +0000 Subject: [PATCH 42/59] [Llama3-70b] Separate vllm generator class and add prompt length validation in input processor Signed-off-by: Salar Hosseini --- .../t3000/llama2_70b/tt/generator_vllm.py | 82 +++++++++++++++++++ .../t3000/llama2_70b/tt/llama_generation.py | 59 +------------ 2 files changed, 83 insertions(+), 58 deletions(-) create mode 100644 models/demos/t3000/llama2_70b/tt/generator_vllm.py diff --git a/models/demos/t3000/llama2_70b/tt/generator_vllm.py b/models/demos/t3000/llama2_70b/tt/generator_vllm.py new file mode 100644 
index 000000000000..a65850844c8f --- /dev/null +++ b/models/demos/t3000/llama2_70b/tt/generator_vllm.py @@ -0,0 +1,82 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. + +# SPDX-License-Identifier: Apache-2.0 + +from typing import Union +from dataclasses import dataclass +from pathlib import Path +import json + +from models.demos.t3000.llama2_70b.tt.llama_generation import TtLlamaModelForGeneration +from models.demos.t3000.llama2_70b.tt.llama_common import ( + load_llama_state_dict, + setup_llama_env, + check_mesh_device, +) +from models.demos.t3000.llama2_70b.reference.llama.llama.model import ModelArgs as ReferenceModelArgs + +from vllm.inputs import INPUT_REGISTRY, DecoderOnlyInputs, EncoderDecoderInputs, InputContext + + +def input_processor_for_llama70b(ctx: InputContext, inputs: Union[DecoderOnlyInputs, EncoderDecoderInputs]): + prompt_len = len(inputs.get("prompt_token_ids")) + if prompt_len >= 32768: + raise ValueError( + f"TT LLama70b does not yet support prompts longer than 32768 tokens (received prompt with {prompt_len} tokens)" + ) + + return inputs + + +@INPUT_REGISTRY.register_input_processor(input_processor_for_llama70b) +class TtLlamaForCausalLM(TtLlamaModelForGeneration): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + @classmethod + def initialize_vllm_model(cls, hf_config, t3k_mesh_device, max_batch_size): + # TODO: pass in model args and tt args as parameters from vllm + @dataclass + class ModelArgs: + llama_version: str = None + ckpt_dir: str = None + max_batch_size: int = 32 # overwritten by max_num_seqs from vllm + num_layers: int = 80 + max_kv_context_len: int = 131072 + + @dataclass + class TTArgs: + mesh_device: object = None + cache_path: str = None + + # setup configs + llama_version = "llama3" + model_config, ckpt_dir, _, cache_path = setup_llama_env( + llama_version=llama_version, + ) + check_mesh_device(t3k_mesh_device, model_config) + + # initialize arg classes + model_args = 
ModelArgs(llama_version=llama_version, ckpt_dir=ckpt_dir, max_batch_size=max_batch_size) + tt_args = TTArgs(mesh_device=t3k_mesh_device, cache_path=cache_path) + + # load state dict + state_dict = load_llama_state_dict(model_args.ckpt_dir, n_layers=model_args.num_layers) + + # TODO: delete this configuration setup once llama can directly accept hf_config + + with open(Path(ckpt_dir) / "params.json", "r") as f: + params = json.loads(f.read()) + configuration = ReferenceModelArgs( + max_seq_len=model_args.max_kv_context_len, + max_batch_size=model_args.max_batch_size, + **params, + ) + + return cls( + configuration=configuration, state_dict=state_dict, model_args=model_args, tt_args=tt_args, vllm=True + ) + + @property + def cache_path(self): + return self.tt_model.cache_path diff --git a/models/demos/t3000/llama2_70b/tt/llama_generation.py b/models/demos/t3000/llama2_70b/tt/llama_generation.py index b4a038fc9c69..c74423249678 100644 --- a/models/demos/t3000/llama2_70b/tt/llama_generation.py +++ b/models/demos/t3000/llama2_70b/tt/llama_generation.py @@ -7,17 +7,11 @@ import ttnn from ttnn import ConcatMeshToTensor, ReplicateTensorToMesh -from dataclasses import dataclass from loguru import logger import copy from models.demos.t3000.llama2_70b.tt.llama_model_optimized import TtLlamaModel_optimized as TtLlamaModel -from models.demos.t3000.llama2_70b.tt.llama_common import ( - BASE_URL, - load_llama_state_dict, - setup_llama_env, - check_mesh_device, -) +from models.demos.t3000.llama2_70b.tt.llama_common import BASE_URL from models.demos.t3000.llama2_70b.tt.model_config import ( get_model_config, ) @@ -61,57 +55,6 @@ def __init__(self, configuration, state_dict, model_args, tt_args, paged_attenti del state_dict - @classmethod - def initialize_vllm_model(cls, hf_config, t3k_mesh_device, max_batch_size): - # TODO: pass in model args and tt args as parameters from vllm - @dataclass - class ModelArgs: - llama_version: str = None - ckpt_dir: str = None - max_batch_size: int 
= 32 # overwritten by max_num_seqs from vllm - num_layers: int = 80 - max_kv_context_len: int = 131072 - - @dataclass - class TTArgs: - mesh_device: object = None - cache_path: str = None - - # setup configs - llama_version = "llama3" - model_config, ckpt_dir, _, cache_path = setup_llama_env( - llama_version=llama_version, - ) - check_mesh_device(t3k_mesh_device, model_config) - - # initialize arg classes - model_args = ModelArgs(llama_version=llama_version, ckpt_dir=ckpt_dir, max_batch_size=max_batch_size) - tt_args = TTArgs(mesh_device=t3k_mesh_device, cache_path=cache_path) - - # load state dict - state_dict = load_llama_state_dict(model_args.ckpt_dir, n_layers=model_args.num_layers) - - # TODO: delete this configuration setup once llama can directly accept hf_config - from models.demos.t3000.llama2_70b.reference.llama.llama.model import ModelArgs as ReferenceModelArgs - from pathlib import Path - import json - - with open(Path(ckpt_dir) / "params.json", "r") as f: - params = json.loads(f.read()) - configuration = ReferenceModelArgs( - max_seq_len=model_args.max_kv_context_len, - max_batch_size=model_args.max_batch_size, - **params, - ) - - return cls( - configuration=configuration, state_dict=state_dict, model_args=model_args, tt_args=tt_args, vllm=True - ) - - @property - def cache_path(self): - return self.tt_model.cache_path - def forward(self, tokens: torch.Tensor, start_pos: int, page_table=None, kv_cache=None, prompt_lens=None): _, seq_len = tokens.shape if seq_len == 1: From eb9f4c342878c21234655bdebc4623f3c0b96b63 Mon Sep 17 00:00:00 2001 From: Juan Camilo Vega Date: Tue, 10 Dec 2024 17:22:19 -0500 Subject: [PATCH 43/59] #15863: Implementing the view operation (#15865) ### Ticket https://github.com/tenstorrent/tt-metal/issues/15863 ### Problem description Models team has asked for a 0-cost view op which asserts out if a 0 cost view is not possible for that request ### What's changed Implemented 0-cost view that provides the proper input validation and 
then calls the perform_view function in reshape ### Checklist - [ X ] Post commit CI passes https://github.com/tenstorrent/tt-metal/actions/runs/12259122634 https://github.com/tenstorrent/tt-metal/actions/runs/12263562979 - [ ] Blackhole Post commit (if applicable) - [ ] Model regression CI testing passes (if applicable) - [ ] Device performance regression CI testing passes (if applicable) - [ ] **(For models and ops writers)** Full [new models](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) tests passes - [ ] New/Existing tests provide coverage for changes --- tests/ttnn/unit_tests/test_view.py | 71 +++++++++++++++++++ ttnn/CMakeLists.txt | 2 + .../data_movement/data_movement_pybind.hpp | 2 + .../data_movement/reshape_view/reshape.hpp | 1 + .../operations/data_movement/view/view.cpp | 52 ++++++++++++++ .../operations/data_movement/view/view.hpp | 19 +++++ .../data_movement/view/view_pybind.cpp | 69 ++++++++++++++++++ .../data_movement/view/view_pybind.hpp | 13 ++++ 8 files changed, 229 insertions(+) create mode 100644 tests/ttnn/unit_tests/test_view.py create mode 100644 ttnn/cpp/ttnn/operations/data_movement/view/view.cpp create mode 100644 ttnn/cpp/ttnn/operations/data_movement/view/view.hpp create mode 100644 ttnn/cpp/ttnn/operations/data_movement/view/view_pybind.cpp create mode 100644 ttnn/cpp/ttnn/operations/data_movement/view/view_pybind.hpp diff --git a/tests/ttnn/unit_tests/test_view.py b/tests/ttnn/unit_tests/test_view.py new file mode 100644 index 000000000000..d73333565267 --- /dev/null +++ b/tests/ttnn/unit_tests/test_view.py @@ -0,0 +1,71 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+ +# SPDX-License-Identifier: Apache-2.0 + +import pytest + +import torch + +import ttnn + +from tests.ttnn.utils_for_testing import assert_with_pcc + + +# Reshape in Tile layout with shapes that are not divisible by 32 +@pytest.mark.parametrize( + "input_shape, output_shape, layout", + [ + ((1, 15), (15,), ttnn.ROW_MAJOR_LAYOUT), # RM_last dimension matches, 1D output + ((2, 1, 1, 1, 15), (2, 15), ttnn.ROW_MAJOR_LAYOUT), # RM_last dimension matches + ((16, 1, 1, 247, 13), (1, 16, 247, 13), ttnn.TILE_LAYOUT), # last two dimensions match + ( + (16, 1, 1, 256, 16), + (8, 16, 32, 16), + ttnn.TILE_LAYOUT, + ), # last dimension match but second last multiple of 32 but does not match + ((32, 32, 32, 15), (32768, 15), ttnn.TILE_LAYOUT), # Very large tensor + ], +) +def test_view(input_shape, output_shape, layout, device): + torch_input_tensor = torch.randn(input_shape, dtype=torch.bfloat16) + torch_result = torch_input_tensor.reshape(output_shape) + + input_tensor = ttnn.from_torch(torch_input_tensor, layout=layout, dtype=ttnn.bfloat16, device=device) + ttnn_output = ttnn.view(input_tensor, output_shape) + assert layout == ttnn_output.layout + output = ttnn.to_torch(ttnn_output) + assert_with_pcc(torch_result, output, 0.9999) + + +@pytest.mark.parametrize( + "input_shape, output_shape, layout", + [ + ((2, 1, 1, 1, 15), (1, 30), ttnn.ROW_MAJOR_LAYOUT), # RM last dimension doesn't match + ( + (16, 1, 256, 1, 16), + (8, 16, 32, 16), + ttnn.TILE_LAYOUT, + ), # TILE last dimension match but second last does not match, shape mult of 32 only + ( + (16, 1, 1, 256, 16), + (8, 16, 32, 1, 16), + ttnn.TILE_LAYOUT, + ), # TILE last dimension match but second last does not match, tensor mult of 32 only + ( + (256, 1, 1, 16, 16), + (8, 16, 32, 1, 16), + ttnn.TILE_LAYOUT, + ), # TILE last dimension match but second last does not match, none mult of 32 + ( + (16, 8, 1, 32, 16), + (8, 16, 31, 16), + ttnn.TILE_LAYOUT, + ), # Volume doesn't match but padded volume does + ], +) +def 
test_invalid_cases(input_shape, output_shape, layout, device): + # Verifies invalid cases do cause an assertion + torch_input_tensor = torch.rand(input_shape, dtype=torch.bfloat16) + input_tensor = ttnn.from_torch(torch_input_tensor, dtype=ttnn.bfloat16, layout=layout, device=device) + with pytest.raises(RuntimeError): + ttnn.view(input_tensor, output_shape) diff --git a/ttnn/CMakeLists.txt b/ttnn/CMakeLists.txt index 1da988236ab1..42dbd511632a 100644 --- a/ttnn/CMakeLists.txt +++ b/ttnn/CMakeLists.txt @@ -76,6 +76,8 @@ set(ALL_TTNN_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/reshape_on_device/reshape_pybind.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/reshape_on_device/device/reshape_op.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/reshape_on_device/device/reshape_program_factory.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/view/view.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/view/view_pybind.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/reshape_view/reshape.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/reshape_view/reshape_pybind.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/reshape_view/device/reshape_rm_op.cpp diff --git a/ttnn/cpp/ttnn/operations/data_movement/data_movement_pybind.hpp b/ttnn/cpp/ttnn/operations/data_movement/data_movement_pybind.hpp index 00618e0fb6c9..af20dc2aa510 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/data_movement_pybind.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/data_movement_pybind.hpp @@ -27,6 +27,7 @@ #include "ttnn/operations/data_movement/repeat_interleave/repeat_interleave_pybind.hpp" #include "ttnn/operations/data_movement/reshape_on_device/reshape_pybind.hpp" #include "ttnn/operations/data_movement/reshape_view/reshape_pybind.hpp" +#include "ttnn/operations/data_movement/view/view_pybind.hpp" #include 
"ttnn/operations/data_movement/sharded_partial/interleaved_to_sharded_partial/interleaved_to_sharded_partial_pybind.hpp" #include "ttnn/operations/data_movement/sharded_partial/sharded_to_interleaved_partial/sharded_to_interleaved_partial_pybind.hpp" #include "ttnn/operations/data_movement/slice/slice_pybind.hpp" @@ -75,6 +76,7 @@ void py_module(py::module& module) { py_bind_repeat(module); py_bind_reshape(module); py_bind_reshape_view(module); + py_bind_view(module); py_bind_reshard(module); py_bind_sharded_to_interleaved(module); py_bind_sharded_to_interleaved_partial(module); diff --git a/ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape.hpp b/ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape.hpp index 566a0d1250c4..69badec0c22a 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape.hpp @@ -19,6 +19,7 @@ namespace detail { ttnn::Tensor convert_tile_to_rm(const ttnn::Tensor& tensor, const ttnn::Shape& shape, const uint32_t tile_first_dim, const uint32_t tile_second_dim, const MemoryConfig &memory_config, const uint8_t queue_id, const PadValue &pad_value); } +ttnn::Shape shape_corrector(const ttnn::Tensor& tensor, const ttnn::Shape& shape); ttnn::Shape tiling_reshape_corrector(const ttnn::Shape& shape); ttnn::Tensor PerformView(const ttnn::Tensor& tensor, const ttnn::Shape& shapeconst, uint32_t tile_first_dim, const uint32_t tile_second_dim); void Validate_transform (const ttnn::Shape& input_shape, const ttnn::Shape& output_shape); diff --git a/ttnn/cpp/ttnn/operations/data_movement/view/view.cpp b/ttnn/cpp/ttnn/operations/data_movement/view/view.cpp new file mode 100644 index 000000000000..22d809822661 --- /dev/null +++ b/ttnn/cpp/ttnn/operations/data_movement/view/view.cpp @@ -0,0 +1,52 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include "view.hpp" +#include "ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape.hpp" + +namespace ttnn::operations::data_movement { + +ttnn::Tensor ViewOperation::invoke(const ttnn::Tensor& tensor, const ttnn::SimpleShape& shape) { + auto layout = tensor.get_layout(); + auto tensor_shape = tensor.get_shape(); + // First Case, No reshape Required + if (tensor_shape == shape) { + return tensor; + } + + const uint32_t tile_first_dim = tensor.get_tensor_spec().tile().get_width(); + const uint32_t tile_second_dim = tensor.get_tensor_spec().tile().get_height(); + const uint32_t shape_second_last_dim = shape.rank() >= 2 ? shape[-2] : 1; + const uint32_t tensor_shape_second_last_dim = tensor_shape.rank() >= 2 ? tensor_shape[-2] : 1; + // Validate the operation + TT_FATAL( + shape.volume() == tensor.get_logical_volume(), + "Invalid view, logical volumes are changing from {} to {}", + tensor.get_logical_volume(), + shape.volume()); + TT_FATAL( + ttnn::has_storage_type_of(tensor, ttnn::StorageType::DEVICE), + "View requires the tensor be stored on device, use reshape instead"); + TT_FATAL( + (tensor_shape[-1] == shape[-1]), + "The last dimension can not change in view, attempting to change last dimension from {} to {}, use reshape " + "instead", + tensor_shape[-1], + shape[-1]); + TT_FATAL( + (tensor.get_layout() == ttnn::ROW_MAJOR_LAYOUT) || // Its row major + (tensor_shape_second_last_dim == shape_second_last_dim) || // Second last dimension is the same + ((shape_second_last_dim % tile_second_dim == 0) && (tensor_shape_second_last_dim % tile_second_dim == 0)), + "Invalid second last dims for TILED reshape, from {} to {}, use reshape instead\n", + tensor_shape_second_last_dim, + shape_second_last_dim); + // Perform the View + return PerformView(tensor, shape, tile_first_dim, tile_second_dim); +} + +ttnn::Tensor ViewOperation::invoke(const ttnn::Tensor& tensor, tt::stl::Span shape_vector) { + return invoke(tensor, 
tt::tt_metal::infer_dims_for_reshape(tensor, shape_vector)); +} + +} // namespace ttnn::operations::data_movement diff --git a/ttnn/cpp/ttnn/operations/data_movement/view/view.hpp b/ttnn/cpp/ttnn/operations/data_movement/view/view.hpp new file mode 100644 index 000000000000..29057991ec57 --- /dev/null +++ b/ttnn/cpp/ttnn/operations/data_movement/view/view.hpp @@ -0,0 +1,19 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "ttnn/decorators.hpp" + +namespace ttnn { +namespace operations::data_movement { + +struct ViewOperation { + static ttnn::Tensor invoke(const ttnn::Tensor& input_tensor, const ttnn::SimpleShape& logical_shape); + static ttnn::Tensor invoke(const ttnn::Tensor& input_tensor, tt::stl::Span shape_vector); +}; + +} // namespace operations::data_movement +constexpr auto view = ttnn::register_operation<"ttnn::view", ttnn::operations::data_movement::ViewOperation>(); +} // namespace ttnn diff --git a/ttnn/cpp/ttnn/operations/data_movement/view/view_pybind.cpp b/ttnn/cpp/ttnn/operations/data_movement/view/view_pybind.cpp new file mode 100644 index 000000000000..5e5323342d16 --- /dev/null +++ b/ttnn/cpp/ttnn/operations/data_movement/view/view_pybind.cpp @@ -0,0 +1,69 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include "view_pybind.hpp" + +#include +#include + +#include "ttnn/cpp/pybind11/decorators.hpp" +#include "ttnn/operations/data_movement/view/view.hpp" +#include "ttnn/types.hpp" + +namespace ttnn::operations::data_movement { + +namespace detail { + +template +void bind_view(pybind11::module& module, const data_movement_operation_t& operation, const char* doc) { + bind_registered_operation( + module, + operation, + doc, + ttnn::pybind_overload_t{ + [](const data_movement_operation_t& self, const ttnn::Tensor& input_tensor, const ttnn::SimpleShape& shape) + -> ttnn::Tensor { return self(input_tensor, shape); }, + py::arg("input_tensor"), + py::arg("shape"), + }, + ttnn::pybind_overload_t{ + [](const data_movement_operation_t& self, + const ttnn::Tensor& input_tensor, + const ttnn::SmallVector shape) -> ttnn::Tensor { return self(input_tensor, shape); }, + py::arg("input_tensor"), + py::arg("shape"), + }); +} + +} // namespace detail + +void py_bind_view(pybind11::module& module) { + detail::bind_view( + module, + ttnn::view, + R"doc( + + This is a 0 cost view operation that returns the same tensor that was passed to it but with a new shape + + Note: The following conditions must be met: + * the memory is stored on the device + * the last dimension must not change + * In Tiled the second last two dimensions must not change OR there is no padding on the second last dimension + + Args: + * input_tensor: Input Tensor. + * new_shape: New shape of tensor. + + Returns: + ttnn.Tensor: a reference to the input tensor but with the new shape. 
+ + Example: + + >>> tensor = ttnn.from_torch(torch.tensor((2, 1, 1, 1, 4), dtype=torch.bfloat16), device=device) + >>> output = ttnn.view(tensor, (2, 1, 4)) + + )doc"); + +} // namespace ttnn::operations::data_movement +} // namespace ttnn::operations::data_movement diff --git a/ttnn/cpp/ttnn/operations/data_movement/view/view_pybind.hpp b/ttnn/cpp/ttnn/operations/data_movement/view/view_pybind.hpp new file mode 100644 index 000000000000..ff70a4e5437f --- /dev/null +++ b/ttnn/cpp/ttnn/operations/data_movement/view/view_pybind.hpp @@ -0,0 +1,13 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "pybind11/pybind_fwd.hpp" + +namespace ttnn::operations::data_movement { + +void py_bind_view(pybind11::module& module); + +} // namespace ttnn::operations::data_movement From 23ce00c668d7bff14c98657d1245f58a65fe0cef Mon Sep 17 00:00:00 2001 From: asaigal Date: Tue, 10 Dec 2024 22:45:10 +0000 Subject: [PATCH 44/59] #0: Resolve BH failure after Virtual Coord Changes --- .../debug_tools/watcher/test_noc_sanitize.cpp | 2 +- .../perf_microbenchmark/dispatch/common.h | 6 +-- tt_metal/impl/device/device.cpp | 43 +++++++++++++------ tt_metal/impl/device/device.hpp | 5 ++- 4 files changed, 38 insertions(+), 18 deletions(-) diff --git a/tests/tt_metal/tt_metal/debug_tools/watcher/test_noc_sanitize.cpp b/tests/tt_metal/tt_metal/debug_tools/watcher/test_noc_sanitize.cpp index 0ac4f6ce2670..dc3624789cf9 100644 --- a/tests/tt_metal/tt_metal/debug_tools/watcher/test_noc_sanitize.cpp +++ b/tests/tt_metal/tt_metal/debug_tools/watcher/test_noc_sanitize.cpp @@ -130,7 +130,7 @@ void RunTestOnCore(WatcherFixture* fixture, Device* device, CoreCoord &core, boo // We should be able to find the expected watcher error in the log as well. string expected; int noc = (use_ncrisc) ? 
1 : 0; - CoreCoord target_core = device->virtual_noc_coordinate(noc, input_buf_noc_xy); + CoreCoord target_core = device->virtual_noc0_coordinate(noc, input_buf_noc_xy); string risc_name = (is_eth_core) ? "erisc" : "brisc"; if (use_ncrisc) { risc_name = "ncrisc"; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h index 08661f7d616c..00868b341a8c 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h @@ -534,9 +534,9 @@ void configure_kernel_variant( NOC my_noc_index, NOC upstream_noc_index, NOC downstream_noc_index) { - auto my_virtual_noc_coords = device->virtual_noc_coordinate(my_noc_index, phys_my_core); - auto upstream_virtual_noc_coords = device->virtual_noc_coordinate(upstream_noc_index, phys_upstream_core); - auto downstream_virtual_noc_coords = device->virtual_noc_coordinate(downstream_noc_index, phys_downstream_core); + auto my_virtual_noc_coords = device->virtual_noc0_coordinate(my_noc_index, phys_my_core); + auto upstream_virtual_noc_coords = device->virtual_noc0_coordinate(upstream_noc_index, phys_upstream_core); + auto downstream_virtual_noc_coords = device->virtual_noc0_coordinate(downstream_noc_index, phys_downstream_core); std::map defines = { {"DISPATCH_KERNEL", "1"}, diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp index e2aaf7fec52b..5c2875b48855 100644 --- a/tt_metal/impl/device/device.cpp +++ b/tt_metal/impl/device/device.cpp @@ -837,10 +837,10 @@ void Device::configure_kernel_variant( is_active_eth_core ? 
hal.get_programmable_core_type_index(HalProgrammableCoreType::ACTIVE_ETH) : hal.get_programmable_core_type_index(HalProgrammableCoreType::IDLE_ETH); - auto my_virtual_noc_coords = this->virtual_noc_coordinate(my_noc_index, kernel_virtual_core); - auto upstream_virtual_noc_coords = this->virtual_noc_coordinate(upstream_noc_index, upstream_virtual_core); - auto downstream_virtual_noc_coords = this->virtual_noc_coordinate(downstream_noc_index, downstream_virtual_core); - auto downstream_slave_virtual_noc_coords = this->virtual_noc_coordinate(downstream_noc_index, downstream_slave_virtual_core); + auto my_virtual_noc_coords = this->virtual_noc0_coordinate(my_noc_index, kernel_virtual_core); + auto upstream_virtual_noc_coords = this->virtual_noc0_coordinate(upstream_noc_index, upstream_virtual_core); + auto downstream_virtual_noc_coords = this->virtual_noc0_coordinate(downstream_noc_index, downstream_virtual_core); + auto downstream_slave_virtual_noc_coords = this->virtual_noc0_coordinate(downstream_noc_index, downstream_slave_virtual_core); std::map defines = { {"DISPATCH_KERNEL", "1"}, @@ -3142,18 +3142,37 @@ CoreType Device::core_type_from_virtual_core(const CoreCoord &virtual_coord) con } -CoreCoord Device::virtual_noc_coordinate(uint8_t noc_index, CoreCoord coord) const { +CoreCoord Device::virtual_noc0_coordinate(uint8_t noc_index, CoreCoord coord) const { if (coord.x >= this->grid_size().x || coord.y >= this->grid_size().y) { // Coordinate already in virtual space: NOC0 and NOC1 are the same return coord; } else { const auto& grid_size = this->grid_size(); - // Coordinate in Physical Space. Convert to Virtual. - CoreCoord phys_coord = { + // Coordinate in Physical NOC0 Space. Convert to Virtual. + coord = this->virtual_core_from_physical_core(coord, this->core_type_from_physical_core(coord)); + // Derive virtual coord in noc_index space. 
+ CoreCoord virtual_coord = { + hal.noc_coordinate(noc_index, grid_size.x, coord.x), + hal.noc_coordinate(noc_index, grid_size.y, coord.y) + }; + return virtual_coord; + } +} + +CoreCoord Device::virtual_noc_coordinate(uint8_t noc_index, CoreCoord coord) const { + if (coord.x >= this->grid_size().x || coord.y >= this->grid_size().y) { + // Coordinate already in virtual space: NOC0 and NOC1 are the same + return coord; + } else { + const auto& grid_size = this->grid_size(); + // Coordinate passed in can be NOC0 or NOC1. The noc_index corresponds to + // the system this coordinate belongs to. + // Use this to convert to NOC0 coordinates and then derive Virtual Coords from it. + CoreCoord physical_coord = { hal.noc_coordinate(noc_index, grid_size.x, coord.x), hal.noc_coordinate(noc_index, grid_size.y, coord.y) }; - return this->virtual_core_from_physical_core(phys_coord, this->core_type_from_physical_core(phys_coord)); + return this->virtual_core_from_physical_core(physical_coord, this->core_type_from_physical_core(physical_coord)); } } @@ -3198,7 +3217,7 @@ CoreCoord Device::logical_core_from_ethernet_core(const CoreCoord ðernet_core } uint32_t Device::get_noc_unicast_encoding(uint8_t noc_index, const CoreCoord& core) const { - auto virtual_noc_coord = this->virtual_noc_coordinate(noc_index, core); + auto virtual_noc_coord = this->virtual_noc0_coordinate(noc_index, core); return tt::tt_metal::hal.noc_xy_encoding( virtual_noc_coord.x, virtual_noc_coord.y @@ -3206,8 +3225,8 @@ uint32_t Device::get_noc_unicast_encoding(uint8_t noc_index, const CoreCoord& co } uint32_t Device::get_noc_multicast_encoding(uint8_t noc_index, const CoreRange& cores) const { - auto virtual_noc_start = this->virtual_noc_coordinate(noc_index, cores.start_coord); - auto virtual_noc_end = this->virtual_noc_coordinate(noc_index, cores.end_coord); + auto virtual_noc_start = this->virtual_noc0_coordinate(noc_index, cores.start_coord); + auto virtual_noc_end = this->virtual_noc0_coordinate(noc_index, 
cores.end_coord); // NOC 1 mcasts from bottom left to top right, so we need to reverse the coords if (noc_index == 0) { @@ -3649,7 +3668,7 @@ void Device::generate_device_bank_to_noc_tables() l1_bank_to_noc_xy_.reserve(tt::tt_metal::hal.get_num_nocs() * l1_noc_coord_per_bank.size()); for (unsigned int noc = 0; noc < tt::tt_metal::hal.get_num_nocs(); noc++) { for (unsigned int bank_id = 0; bank_id < l1_noc_coord_per_bank.size(); bank_id++) { - auto l1_noc_coords = this->virtual_noc_coordinate(noc, l1_noc_coord_per_bank[bank_id]); + auto l1_noc_coords = this->virtual_noc0_coordinate(noc, l1_noc_coord_per_bank[bank_id]); uint16_t noc_x = l1_noc_coords.x; uint16_t noc_y = l1_noc_coords.y; uint16_t xy = ((noc_y << NOC_ADDR_NODE_ID_BITS) | noc_x) << NOC_COORD_REG_OFFSET; diff --git a/tt_metal/impl/device/device.hpp b/tt_metal/impl/device/device.hpp index f2ef56a3a830..a8cdb1f23b0a 100644 --- a/tt_metal/impl/device/device.hpp +++ b/tt_metal/impl/device/device.hpp @@ -116,9 +116,10 @@ class Device { CoreCoord dram_grid_size() const; CoreType core_type_from_virtual_core(const CoreCoord& virtual_coord) const; - + // Given a Virtual coordinate in noc_index space, get the equivalent coordinate in Virtual NOC0 space CoreCoord virtual_noc_coordinate(uint8_t noc_index, CoreCoord coord) const; - + // Given a coordinate in Virtual NOC0 Space, get the equivalent coordinate in Virtual noc_index space + CoreCoord virtual_noc0_coordinate(uint8_t noc_index, CoreCoord coord) const; std::vector worker_cores_from_logical_cores(const std::vector &logical_cores) const; std::vector ethernet_cores_from_logical_cores(const std::vector &logical_cores) const; std::vector get_optimal_dram_bank_to_logical_worker_assignment(); From df84ba5d1f33460f24d15b93b9f321a12758f00d Mon Sep 17 00:00:00 2001 From: asaigal Date: Wed, 11 Dec 2024 00:49:58 +0000 Subject: [PATCH 45/59] #0: Resolve Versim failure after Virtual Coordinates commit --- tt_metal/llrt/tt_cluster.cpp | 2 +- 1 file changed, 1 insertion(+), 
1 deletion(-) diff --git a/tt_metal/llrt/tt_cluster.cpp b/tt_metal/llrt/tt_cluster.cpp index 05e8534af42c..bf056fd446a6 100644 --- a/tt_metal/llrt/tt_cluster.cpp +++ b/tt_metal/llrt/tt_cluster.cpp @@ -425,7 +425,7 @@ tt_cxy_pair Cluster::get_virtual_coordinate_from_logical_coordinates(tt_cxy_pair } CoreCoord Cluster::get_virtual_coordinate_from_physical_coordinates(chip_id_t chip_id, CoreCoord physical_coord, const CoreType& core_type) const { auto& soc_desc = this->get_soc_desc(chip_id); - if (not (core_type == CoreType::WORKER or core_type == CoreType::ETH)) { + if ((not (core_type == CoreType::WORKER or core_type == CoreType::ETH)) or this->target_type_ == TargetDevice::Simulator) { return physical_coord; } tt_cxy_pair virtual_chip_coord = soc_desc.convert_to_umd_coordinates(tt_cxy_pair(chip_id, physical_coord.x, physical_coord.y)); From 2f59d5e02ef0da955b99cf0f49da1759c772a67a Mon Sep 17 00:00:00 2001 From: Andrew Fuller Date: Tue, 10 Dec 2024 23:34:06 -0500 Subject: [PATCH 46/59] Clang-tidy incremental scan for branches (#15888) ### Ticket closes #15730 ### Problem description We're really tight on resources, try to be faster, but without sacrificing correctness. ### What's changed When running on a branch (not `main`), and not expressly asked to do a "full scan" (full scan is useful when updating `.clang-tidy` config file), then do the following: 1. Build everything on merge-base sans clang-tidy 2. Checkout the branch in question 3. Build everything with clang-tidy. Ninja will only act on the delta. 
### Checklist - [ ] Post commit CI passes - [ ] Blackhole Post commit (if applicable) - [ ] Model regression CI testing passes (if applicable) - [ ] Device performance regression CI testing passes (if applicable) - [ ] **(For models and ops writers)** Full [new models](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) tests passes - [ ] New/Existing tests provide coverage for changes --- .github/workflows/code-analysis.yaml | 99 ++++++++++++++++++++++++++-- 1 file changed, 92 insertions(+), 7 deletions(-) diff --git a/.github/workflows/code-analysis.yaml b/.github/workflows/code-analysis.yaml index 462f250853f2..9e4d695877d3 100644 --- a/.github/workflows/code-analysis.yaml +++ b/.github/workflows/code-analysis.yaml @@ -7,12 +7,20 @@ on: required: false type: string default: "ubuntu-22.04-amd64" + full-scan: + required: false + type: boolean + default: false workflow_dispatch: inputs: os: required: false type: string default: "ubuntu-22.04-amd64" + full-scan: + required: false + type: boolean + default: false jobs: build-docker-image: @@ -41,15 +49,11 @@ jobs: echo "::error title=ccache-not-provisioned::Ccache is not properly provisioned." 
exit 1 fi - - uses: tenstorrent/tt-metal/.github/actions/checkout-with-submodule-lfs@main - name: Set up dynamic env vars for build run: | echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV echo "RUNNER_UID=$(id -u)" >> $GITHUB_ENV echo "RUNNER_GID=$(id -g)" >> $GITHUB_ENV - - name: Update submodules - run: | - git submodule update --init --recursive - name: Generate docker tag id: generate-docker-tag uses: ./.github/actions/generate-docker-tag @@ -63,6 +67,84 @@ jobs: password: ${{ secrets.GITHUB_TOKEN }} - name: Pull docker image run: docker pull ${{ env.TT_METAL_DOCKER_IMAGE_TAG }} + + - name: Check out repo + uses: actions/checkout@v4 + with: + fetch-depth: 0 + submodules: recursive + clean: true + + - name: Determine merge base + if: github.ref_name != 'main' && !inputs.full-scan + run: | + echo "Current branch: ${{ github.ref_name }}" + MERGE_BASE=$(git merge-base ${{ github.ref_name }} origin/main) + echo "Merge base between ${{ github.ref_name }} and main: $MERGE_BASE" + echo "MERGE_BASE=$MERGE_BASE" >> $GITHUB_ENV + + - name: Check out baseline + if: github.ref_name != 'main' && !inputs.full-scan + uses: actions/checkout@v4 + with: + ref: ${{ env.MERGE_BASE }} + fetch-depth: 0 + submodules: recursive + clean: true + + - name: Create baseline + if: github.ref_name != 'main' && !inputs.full-scan + uses: addnab/docker-run-action@v3 + with: + image: ${{ env.TT_METAL_DOCKER_IMAGE_TAG }} + options: | + --rm + --tmpfs /tmp + -u ${{ env.RUNNER_UID }}:${{ env.RUNNER_GID }} + --group-add 1457 + -v ${{ github.workspace }}:${{ github.workspace }} + -v /etc/passwd:/etc/passwd:ro + -v /etc/shadow:/etc/shadow:ro + -v /etc/bashrc:/etc/bashrc:ro + -v /home/ubuntu/.ccache-ci:/home/ubuntu/.ccache + -v /mnt/MLPerf/ccache:/mnt/MLPerf/ccache + -e ARCH_NAME=${{ env.ARCH_NAME }} + -e CARGO_HOME=${{ github.workspace }}/.cargo + -w ${{ github.workspace }} + run: | + set -eu # basic shell hygiene + + # /tmp is a tmpfs; more efficient than persisted storage + mkdir -p /tmp/ccache + export 
CCACHE_TEMPDIR=/tmp/ccache + + # Zero out the stats so we can see how we did this build + # NOTE: may be inaccurate if we have >1 build runner on the same machine, using the same local cache + ccache -z + + # Suppress clang-tidy to first get an up-to-date build tree + ln -sf /usr/bin/true ./clang-tidy-shim + + cmake --preset clang-tidy -DCMAKE_CXX_CLANG_TIDY=$(pwd)/clang-tidy-shim -DCMAKE_C_CLANG_TIDY=$(pwd)/clang-tidy-shim + nice -n 19 cmake --build --preset clang-tidy + + mkdir -p out + ccache -s > out/ccache.stats + + - name: Publish Ccache summary + if: github.ref_name != 'main' && !inputs.full-scan + run: | + echo '## CCache Summary (baseline)' >> $GITHUB_STEP_SUMMARY + echo '```' >> $GITHUB_STEP_SUMMARY + cat out/ccache.stats >> $GITHUB_STEP_SUMMARY + echo '```' >> $GITHUB_STEP_SUMMARY + + - name: Checkout repo + uses: actions/checkout@v4 + with: + submodules: recursive + clean: false + - name: Analyze code with clang-tidy uses: addnab/docker-run-action@v3 with: @@ -92,10 +174,13 @@ jobs: # NOTE: may be inaccurate if we have >1 build runner on the same machine, using the same local cache ccache -z - cmake --preset clang-tidy - # cmake -B .build/clang-tidy -G Ninja -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_CXX_CLANG_TIDY=clang-tidy-17 -DTT_UNITY_BUILDS=FALSE -DCMAKE_DISABLE_PRECOMPILE_HEADERS=TRUE -DENABLE_CCACHE=TRUE -DTT_METAL_BUILD_TESTS=TRUE -DTTNN_BUILD_TESTS=TRUE -DBUILD_PROGRAMMING_EXAMPLES=TRUE -DBUILD_TT_TRAIN=TRUE + # Restore shim to legit clang-tidy + # Symlink tomfoolery here so that Ninja believes the build command has not changed from the previous run + ln -sf $(which clang-tidy-17) ./clang-tidy-shim + + cmake --preset clang-tidy -DCMAKE_CXX_CLANG_TIDY=$(pwd)/clang-tidy-shim -DCMAKE_C_CLANG_TIDY=$(pwd)/clang-tidy-shim nice -n 19 cmake --build --preset clang-tidy - mkdir out + mkdir -p out ccache -s > out/ccache.stats - name: Publish Ccache summary run: | From b959916b5ae1f45603d79c6b05776c632ea8aa6f Mon Sep 17 00:00:00 2001 From: Kalaivani Baskar 
<156762498+KalaivaniMCW@users.noreply.github.com> Date: Wed, 11 Dec 2024 13:39:12 +0530 Subject: [PATCH 47/59] #15483: Binary sfpu ops for Float32 precision (#15805) ### Ticket Link to Github Issue #15483 #14825 #13857 #13582 #13754 ### Problem description To enable Float32 precision support for elementwise binary ops ### What's changed This is PR implements the new LLK functionalities from https://github.com/tenstorrent/tt-metal/pull/15782 for TTNN apis For element-wise binary ops, FP32 precision is not possible during the current FPU operations, hence we introduce binary_sfpu_ops which gives Float32 support for binary operation by running the operation in SFPU. 1. enabled Float32 precision support for elementwise binary ops 2. added support for binary pow op 3. added support for 3 binary bitwise ops 4. added support for binary rsub op 5. Pow - pytorch2 sweep test : image ### Checklist - [x] Post commit CI passes https://github.com/tenstorrent/tt-metal/actions/runs/12238579964 https://github.com/tenstorrent/tt-metal/actions/runs/12270807620 - [x] Blackhole Post commit (if applicable) https://github.com/tenstorrent/tt-metal/actions/runs/12238601878 - [ ] Nightly FD - https://github.com/tenstorrent/tt-metal/actions/runs/12235128037/attempts/1 https://github.com/tenstorrent/tt-metal/actions/runs/12238605626 - [x] SC Model Perf testing passes (if applicable) https://github.com/tenstorrent/tt-metal/actions/runs/12235130984 - [x] SC Device performance regression CI testing passes (if applicable) https://github.com/tenstorrent/tt-metal/actions/runs/12235124425 - [x] SC Demo tests - https://github.com/tenstorrent/tt-metal/actions/runs/12256052513 - [ ] Test for new models - https://github.com/tenstorrent/tt-metal/actions/runs/12256106557 - [x] New/Existing tests provide coverage for changes --- .../binary/pow/pow_scalar_pytorch2.py | 12 +- .../sweep_tests/pytests/tt_dnn/test_div.py | 1 + .../sweep_tests/pytests/tt_dnn/test_fmod.py | 2 + 
.../eltwise/test_binary_composite.py | 7 +- .../operations/eltwise/test_binary_fp32.py | 543 ++++++++++++++++++ .../unit_tests/operations/eltwise/test_pow.py | 129 +++++ .../unit_tests/operations/eltwise/test_sub.py | 33 ++ ttnn/CMakeLists.txt | 1 + .../ttnn/operations/eltwise/binary/binary.cpp | 93 ++- .../ttnn/operations/eltwise/binary/binary.hpp | 68 ++- .../eltwise/binary/binary_composite.hpp | 122 ++++ .../eltwise/binary/binary_pybind.hpp | 245 ++++++++ .../eltwise/binary/common/binary_op_types.hpp | 7 +- .../eltwise/binary/common/binary_op_utils.cpp | 130 +++++ .../eltwise/binary/common/binary_op_utils.hpp | 6 + .../binary/device/binary_composite_op.cpp | 256 ++++++++- .../binary/device/binary_device_operation.cpp | 46 +- .../binary/device/binary_device_operation.hpp | 27 + ...ement_wise_multi_core_sfpu_pgm_factory.cpp | 521 +++++++++++++++++ .../compute/eltwise_binary_sfpu_kernel.cpp | 145 +++++ .../ternary_backward/ternary_backward.cpp | 2 +- .../unary/device/unary_composite_op.cpp | 61 +- .../unary/device/unary_composite_op.hpp | 22 - .../ttnn/operations/eltwise/unary/unary.hpp | 8 +- .../eltwise/unary/unary_composite.hpp | 71 +-- .../operations/eltwise/unary/unary_pybind.hpp | 49 +- .../eltwise/unary_backward/unary_backward.cpp | 1 + 27 files changed, 2489 insertions(+), 119 deletions(-) create mode 100644 tests/ttnn/unit_tests/operations/eltwise/test_binary_fp32.py create mode 100644 ttnn/cpp/ttnn/operations/eltwise/binary/device/element_wise_multi_core_sfpu_pgm_factory.cpp create mode 100644 ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_sfpu_kernel.cpp diff --git a/tests/sweep_framework/sweeps/eltwise/composite/binary/pow/pow_scalar_pytorch2.py b/tests/sweep_framework/sweeps/eltwise/composite/binary/pow/pow_scalar_pytorch2.py index 6ca277010cb0..6c4851e783df 100644 --- a/tests/sweep_framework/sweeps/eltwise/composite/binary/pow/pow_scalar_pytorch2.py +++ 
b/tests/sweep_framework/sweeps/eltwise/composite/binary/pow/pow_scalar_pytorch2.py @@ -25,8 +25,8 @@ "input_shape": [ {"value": 10000, "shape": [128]}, ], - "input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b], - "input_b_dtype": [ttnn.bfloat16, ttnn.bfloat8_b], + "input_a_dtype": [ttnn.bfloat16], + "input_b_dtype": [ttnn.bfloat16], "input_a_layout": [ttnn.TILE_LAYOUT], "input_b_layout": [ttnn.TILE_LAYOUT], "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG], @@ -54,12 +54,12 @@ def run( ) -> list: torch.manual_seed(0) torch_input_tensor_a = gen_func_with_cast_tt( - partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype + partial(torch_random, low=-10, high=10, dtype=torch.bfloat16), input_a_dtype )(input_shape["shape"]) value = input_shape["value"] golden_function = ttnn.get_golden_function(ttnn.pow) - torch_output_tensor = golden_function(torch_input_tensor_a, value) + torch_output_tensor = golden_function(value, torch_input_tensor_a) input_tensor_a = ttnn.from_torch( torch_input_tensor_a, @@ -70,8 +70,8 @@ def run( ) start_time = start_measuring_time() - output_tensor = ttnn.pow(value, exponent=input_tensor_a, memory_config=output_memory_config) - output_tensor = ttnn.to_torch(output_tensor) + output_tensor = ttnn.pow(value, input_tensor_a, memory_config=output_memory_config) + output_tensor = ttnn.to_torch(output_tensor, torch_rank=len(torch_input_tensor_a.shape)) e2e_perf = stop_measuring_time(start_time) return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf] diff --git a/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_div.py b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_div.py index 0152a6d9c5a4..834d9733b471 100644 --- a/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_div.py +++ b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_div.py @@ -51,6 +51,7 @@ def test_run_div( if round_mode in ["trunc", "floor"]: 
pytest.skip("does not work for Grayskull -skipping") if accurate_mode == False: # If input_b is non-zero tensor + pytest.skip("will be enabled after #15780 is resolved") datagen_func = [ generation_funcs.gen_func_with_cast( partial(generation_funcs.gen_rand, low=-1e6, high=1e6), torch.bfloat16 diff --git a/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_fmod.py b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_fmod.py index 65b45a5ba6b3..d95b107a3662 100644 --- a/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_fmod.py +++ b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_fmod.py @@ -24,6 +24,7 @@ ] +@pytest.mark.skip(reason="This test will be enabled after #15780 is resolved") @pytest.mark.parametrize( "input_shapes", [ @@ -44,6 +45,7 @@ def test_run_fmod( dst_mem_config, device, ): + # The ranges need to be retested and updated for respective dtypes in issue #15780 datagen_func = [ generation_funcs.gen_func_with_cast(partial(generation_funcs.gen_rand, low=-100, high=100), torch.bfloat16) ] + [ diff --git a/tests/ttnn/unit_tests/operations/eltwise/test_binary_composite.py b/tests/ttnn/unit_tests/operations/eltwise/test_binary_composite.py index 831944b77a0d..7214ba8b74c5 100644 --- a/tests/ttnn/unit_tests/operations/eltwise/test_binary_composite.py +++ b/tests/ttnn/unit_tests/operations/eltwise/test_binary_composite.py @@ -986,8 +986,11 @@ def test_binary_gcd_ttnn(input_shapes, device): ) @skip_for_grayskull("#ToDo: GS implementation needs to be done for remainder") def test_binary_lcm_ttnn(input_shapes, device): - in_data1, input_tensor1 = data_gen_with_range_int(input_shapes, -1024, 1024, device) - in_data2, input_tensor2 = data_gen_with_range_int(input_shapes, -1024, 1024, device) + torch.manual_seed(213919) + in_data1 = torch.randint(-100, 100, input_shapes, dtype=torch.int32) + in_data2 = torch.randint(-80, 180, input_shapes, dtype=torch.int32) + input_tensor1 = ttnn.from_torch(in_data1, 
dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device) + input_tensor2 = ttnn.from_torch(in_data2, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device) output_tensor = ttnn.lcm(input_tensor1, input_tensor2) golden_function = ttnn.get_golden_function(ttnn.lcm) golden_tensor = golden_function(in_data1, in_data2) diff --git a/tests/ttnn/unit_tests/operations/eltwise/test_binary_fp32.py b/tests/ttnn/unit_tests/operations/eltwise/test_binary_fp32.py new file mode 100644 index 000000000000..c0b569d4cd0a --- /dev/null +++ b/tests/ttnn/unit_tests/operations/eltwise/test_binary_fp32.py @@ -0,0 +1,543 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. + +# SPDX-License-Identifier: Apache-2.0 + +import torch +import ttnn + +import pytest +from models.utility_functions import skip_for_grayskull + + +@skip_for_grayskull("Unsupported dtype for Grayskull") +@pytest.mark.parametrize( + "ttnn_function", + [ + ttnn.sub, + ], +) +def test_sub_fp32(device, ttnn_function): + x_torch = torch.tensor([[1]], dtype=torch.float32) + y_torch = torch.tensor([[0.00030171126]], dtype=torch.float32) + golden_fn = ttnn.get_golden_function(ttnn_function) + z_torch = golden_fn(x_torch, y_torch) + x_tt = ttnn.from_torch(x_torch, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) + y_tt = ttnn.from_torch(y_torch, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) + z_tt = ttnn.from_torch(z_torch, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) + z_tt_sub = ttnn.subtract(x_tt, y_tt) + tt_out = ttnn.to_torch(z_tt_sub) + + status = torch.allclose(z_torch, tt_out, atol=1e-10, rtol=1e-5, equal_nan=False) + assert status + + +@skip_for_grayskull("Unsupported dtype for Grayskull") +@pytest.mark.parametrize( + "ttnn_function", + [ + ttnn.rsub, + ], +) +def test_rsub_fp32(device, ttnn_function): + x_torch = torch.tensor([[1]], dtype=torch.float32) + y_torch = torch.tensor([[0.00030171126]], dtype=torch.float32) + golden_fn = ttnn.get_golden_function(ttnn_function) 
+ z_torch = golden_fn(x_torch, y_torch) + x_tt = ttnn.from_torch(x_torch, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) + y_tt = ttnn.from_torch(y_torch, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) + z_tt = ttnn.from_torch(z_torch, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) + z_tt_sub = ttnn.rsub(x_tt, y_tt) + tt_out = ttnn.to_torch(z_tt_sub) + + status = torch.allclose(z_torch, tt_out, atol=1e-10, rtol=1e-5, equal_nan=False) + assert status + + +@skip_for_grayskull("Unsupported dtype for Grayskull") +@pytest.mark.parametrize( + "ttnn_function", + [ + ttnn.add, + ], +) +def test_add_fp32(device, ttnn_function): + x_torch = torch.tensor([[1]], dtype=torch.float32) + y_torch = torch.tensor([[0.00030171126]], dtype=torch.float32) + golden_fn = ttnn.get_golden_function(ttnn_function) + z_torch = golden_fn(x_torch, y_torch) + x_tt = ttnn.from_torch(x_torch, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) + y_tt = ttnn.from_torch(y_torch, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) + z_tt = ttnn.from_torch(z_torch, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) + z_tt_add = ttnn.add(x_tt, y_tt) + tt_out = ttnn.to_torch(z_tt_add) + + status = torch.allclose(z_torch, tt_out, atol=1e-10, rtol=1e-5, equal_nan=False) + assert status + + +@skip_for_grayskull("Unsupported dtype for Grayskull") +@pytest.mark.parametrize( + "ttnn_function", + [ + ttnn.add, + ], +) +def test_add_int32(device, ttnn_function): + x_torch = torch.tensor([[11, 23, 0, -23, -1, -100]], dtype=torch.int32) + y_torch = torch.tensor([[78, 99, 34, -33, -1, 100]], dtype=torch.int32) + golden_fn = ttnn.get_golden_function(ttnn_function) + z_torch = golden_fn(x_torch, y_torch) + x_tt = ttnn.from_torch(x_torch, dtype=ttnn.int32, layout=ttnn.TILE_LAYOUT, device=device) + y_tt = ttnn.from_torch(y_torch, dtype=ttnn.int32, layout=ttnn.TILE_LAYOUT, device=device) + z_tt = ttnn.from_torch(z_torch, dtype=ttnn.int32, 
layout=ttnn.TILE_LAYOUT, device=device) + z_tt_add = ttnn.add(x_tt, y_tt) + tt_out = ttnn.to_torch(z_tt_add) + + status = torch.allclose(z_torch, tt_out, atol=1e-10, rtol=1e-5, equal_nan=False) + assert status + + +@skip_for_grayskull("Unsupported dtype for Grayskull") +@pytest.mark.parametrize( + "ttnn_function", + [ + ttnn.mul, + ], +) +def test_mul_fp32(device, ttnn_function): + x_torch = torch.tensor([[2]], dtype=torch.float32) + y_torch = torch.tensor([[0.00030171126]], dtype=torch.float32) + golden_fn = ttnn.get_golden_function(ttnn_function) + z_torch = golden_fn(x_torch, y_torch) + x_tt = ttnn.from_torch(x_torch, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) + y_tt = ttnn.from_torch(y_torch, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) + z_tt = ttnn.from_torch(z_torch, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) + z_tt_out = ttnn.mul(x_tt, y_tt) + tt_out = ttnn.to_torch(z_tt_out) + + status = torch.allclose(z_torch, tt_out, atol=1e-10, rtol=1e-5, equal_nan=False) + assert status + + +@pytest.mark.skip(reason="This test will be enabled after #15780 is resolved") +@skip_for_grayskull("Unsupported dtype for Grayskull") +@pytest.mark.parametrize( + "ttnn_function", + [ + ttnn.div, + ], +) +# Torch num/ 0 = inf and 0/0 nan; TT num/ 0 = inf and 0/0=nan; in fp32 tile +# Torch num/ 0 = inf and 0/0 nan; TT num/ 0 = inf and 0/0=0; in chained (mul * recip) div op +def test_div_fp32(device, ttnn_function): + x_torch = torch.tensor([[1.00030171126, -3, 16, -5, 14, -12, 0, 0, 1]], dtype=torch.float32) + y_torch = torch.tensor([[2, 3, -4, -5, 0, 0, 0, 1, 0]], dtype=torch.float32) + golden_fn = ttnn.get_golden_function(ttnn_function) + z_torch = golden_fn(x_torch, y_torch) + x_tt = ttnn.from_torch(x_torch, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) + y_tt = ttnn.from_torch(y_torch, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) + z_tt = ttnn.from_torch(z_torch, dtype=ttnn.float32, 
layout=ttnn.TILE_LAYOUT, device=device) + z_tt_div = ttnn.divide(x_tt, y_tt) + tt_out = ttnn.to_torch(z_tt_div) + + status = ttnn.pearson_correlation_coefficient(z_torch, tt_out) >= 0.999 + assert status + + +@skip_for_grayskull("Unsupported dtype for Grayskull") +@pytest.mark.parametrize( + "ttnn_function", + [ + ttnn.divide, + ], +) +# Torch: num/ 0 = inf and 0/0 nan; +# TT: num/ 0 = inf but 0/0= 0 not nan and 1/0 is not inf; input_b must be non-zero +def test_div_bf16(device, ttnn_function): + x_torch = torch.tensor( + [ + [ + 1.00030171126, + -3, + 16, + -5, + 14, + -12, + 0, + 0, + ] + ], + dtype=torch.bfloat16, + ) + y_torch = torch.tensor( + [ + [ + 2, + 3, + -4, + -5, + 0, + 0, + 0, + 1, + ] + ], + dtype=torch.bfloat16, + ) + golden_fn = ttnn.get_golden_function(ttnn_function) + z_torch = golden_fn(x_torch, y_torch) + x_tt = ttnn.from_torch(x_torch, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device) + y_tt = ttnn.from_torch(y_torch, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device) + z_tt = ttnn.from_torch(z_torch, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device) + z_tt_div = ttnn.divide(x_tt, y_tt) # bf16 runs FPU + tt_out = ttnn.to_torch(z_tt_div) + + status = ttnn.pearson_correlation_coefficient(z_torch, tt_out) >= 0.999 + assert status + + +@skip_for_grayskull("Unsupported dtype for Grayskull") +@pytest.mark.parametrize( + "ttnn_function", + [ + ttnn.pow, + ], +) +def test_pow_fp32(device, ttnn_function): + x_torch = torch.tensor([[1.55, 2.25, -3.6]], dtype=torch.float32) + y_torch = torch.tensor([[2, 3, -2.2]], dtype=torch.float32) + golden_fn = ttnn.get_golden_function(ttnn_function) + z_torch = golden_fn(x_torch, y_torch) + x_tt = ttnn.from_torch(x_torch, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) + y_tt = ttnn.from_torch(y_torch, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) + z_tt = ttnn.from_torch(z_torch, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) + z_tt_pow = 
ttnn.pow(x_tt, y_tt) + tt_out = ttnn.to_torch(z_tt_pow) + + status = ttnn.pearson_correlation_coefficient(z_torch, tt_out) >= 0.99 + assert status + + +@skip_for_grayskull("Unsupported dtype for Grayskull") +@pytest.mark.parametrize( + "ttnn_function", + [ + ttnn.add, + ], +) +def test_add_fp32_activ(device, ttnn_function): + x_torch = torch.ones([1, 1, 64, 64], dtype=torch.float32) + y_torch = torch.ones([1, 1, 64, 64], dtype=torch.float32) * 4 + z_torch = torch.square(x_torch + y_torch) + x_tt = ttnn.from_torch(x_torch, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) + y_tt = ttnn.from_torch(y_torch, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) + z_tt = ttnn.from_torch(z_torch, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) + z_tt_add = ttnn.add(x_tt, y_tt, activations=[ttnn.UnaryWithParam(ttnn.UnaryOpType.POWER, 2)]) + tt_out = ttnn.to_torch(z_tt_add) + + status = torch.allclose(z_torch, tt_out, atol=1e-10, rtol=1e-5, equal_nan=False) + assert status + + +@skip_for_grayskull("Unsupported dtype for Grayskull") +@pytest.mark.parametrize( + "ttnn_function", + [ + ttnn.add, + ], +) +@pytest.mark.parametrize( + "shape", + [ + [1, 1, 16, 16], + [1, 1, 80, 80], + [1, 1, 320, 384], + [1, 3, 320, 384], + ], +) +def test_add_fp32_input_activ(device, ttnn_function, shape): + x_torch = torch.ones(shape, dtype=torch.float32) * 2 + y_torch = torch.ones(shape, dtype=torch.float32) * 4 + z_torch = torch.square(torch.nn.functional.silu(x_torch) + y_torch) + x_tt = ttnn.from_torch(x_torch, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) + y_tt = ttnn.from_torch(y_torch, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) + z_tt = ttnn.from_torch(z_torch, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) + z_tt_add = ttnn.add( + x_tt, + y_tt, + activations=[ttnn.UnaryWithParam(ttnn.UnaryOpType.POWER, 2)], + input_tensor_a_activation=ttnn.UnaryOpType.SILU, + ) + tt_out = ttnn.to_torch(z_tt_add) + + status = 
ttnn.pearson_correlation_coefficient(z_torch, tt_out) >= 0.9999 + assert status + + +@skip_for_grayskull("Unsupported dtype for Grayskull") +@pytest.mark.parametrize( + "ttnn_function", + [ + ttnn.logaddexp, + ], +) +def test_logaddexp_fp32(device, ttnn_function): + x_torch = torch.tensor([[1, 2, 3, 4]], dtype=torch.float32) + y_torch = torch.tensor([[1, 2, 3, 4]], dtype=torch.float32) + golden_fn = ttnn.get_golden_function(ttnn_function) + z_torch = golden_fn(x_torch, y_torch) + x_tt = ttnn.from_torch(x_torch, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) + y_tt = ttnn.from_torch(y_torch, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) + z_tt = ttnn.from_torch(z_torch, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) + z_tt_out = ttnn.logaddexp(x_tt, y_tt) + tt_out = ttnn.to_torch(z_tt_out) + + status = ttnn.pearson_correlation_coefficient(z_torch, tt_out) >= 0.999 + assert status + + +@skip_for_grayskull("Unsupported dtype for Grayskull") +@pytest.mark.parametrize( + "ttnn_function", + [ + ttnn.logaddexp2, + ], +) +def test_logaddexp2_fp32(device, ttnn_function): + x_torch = torch.tensor([[1, 2, 3, 4]], dtype=torch.float32) + y_torch = torch.tensor([[2, 3, 4, 5]], dtype=torch.float32) + golden_fn = ttnn.get_golden_function(ttnn_function) + z_torch = golden_fn(x_torch, y_torch) + x_tt = ttnn.from_torch(x_torch, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) + y_tt = ttnn.from_torch(y_torch, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) + z_tt = ttnn.from_torch(z_torch, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) + z_tt_out = ttnn.logaddexp2(x_tt, y_tt) + tt_out = ttnn.to_torch(z_tt_out) + + status = ttnn.pearson_correlation_coefficient(z_torch, tt_out) >= 0.999 + assert status + + +@skip_for_grayskull("Unsupported dtype for Grayskull") +@pytest.mark.parametrize( + "ttnn_function", + [ + ttnn.ldexp, + ], +) +def test_ldexp_fp32(device, ttnn_function): + x_torch = torch.tensor([[1.5, 2, 
3.33, 4]], dtype=torch.float32) + y_torch = torch.tensor([[2, 3, 4, 5]], dtype=torch.float32) + golden_fn = ttnn.get_golden_function(ttnn_function) + z_torch = golden_fn(x_torch, y_torch) + x_tt = ttnn.from_torch(x_torch, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) + y_tt = ttnn.from_torch(y_torch, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) + z_tt = ttnn.from_torch(z_torch, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) + z_tt_out = ttnn.ldexp(x_tt, y_tt) + tt_out = ttnn.to_torch(z_tt_out) + + status = ttnn.pearson_correlation_coefficient(z_torch, tt_out) >= 0.999 + assert status + + +@skip_for_grayskull("Unsupported dtype for Grayskull") +@pytest.mark.parametrize( + "ttnn_function", + [ + ttnn.bias_gelu, + ], +) +def test_bias_gelu_fp32(device, ttnn_function): + x_torch = torch.tensor([[1.5, 2, 3.33, 4]], dtype=torch.float32) + y_torch = torch.tensor([[2, 3, 4, 5]], dtype=torch.float32) + golden_fn = ttnn.get_golden_function(ttnn_function) + z_torch = golden_fn(x_torch, y_torch) + x_tt = ttnn.from_torch(x_torch, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) + y_tt = ttnn.from_torch(y_torch, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) + z_tt = ttnn.from_torch(z_torch, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) + z_tt_out = ttnn.bias_gelu(x_tt, y_tt) + tt_out = ttnn.to_torch(z_tt_out) + + status = ttnn.pearson_correlation_coefficient(z_torch, tt_out) >= 0.999 + assert status + + +@skip_for_grayskull("Unsupported dtype for Grayskull") +@pytest.mark.parametrize( + "ttnn_function", + [ + ttnn.squared_difference, + ], +) +def test_squared_difference_fp32(device, ttnn_function): + x_torch = torch.tensor([[1.5, 2, 3.33, 4]], dtype=torch.float32) + y_torch = torch.tensor([[2.009, 3.11, 4.22, 5]], dtype=torch.float32) + golden_fn = ttnn.get_golden_function(ttnn_function) + z_torch = golden_fn(x_torch, y_torch) + x_tt = ttnn.from_torch(x_torch, dtype=ttnn.float32, 
layout=ttnn.TILE_LAYOUT, device=device) + y_tt = ttnn.from_torch(y_torch, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) + z_tt = ttnn.from_torch(z_torch, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) + z_tt_out = ttnn.squared_difference(x_tt, y_tt) + tt_out = ttnn.to_torch(z_tt_out) + + status = ttnn.pearson_correlation_coefficient(z_torch, tt_out) >= 0.999 + assert status + + +@skip_for_grayskull("Unsupported dtype for Grayskull") +@pytest.mark.parametrize( + "ttnn_function", + [ + ttnn.logical_or, + ], +) +def test_logical_or_fp32(device, ttnn_function): + x_torch = torch.tensor([[1.509009, 2, 3.33, 4, 0, -11]], dtype=torch.float32) + y_torch = torch.tensor([[0, 3, 4, 5, 0, -9999]], dtype=torch.float32) + golden_fn = ttnn.get_golden_function(ttnn_function) + z_torch = golden_fn(x_torch, y_torch) + x_tt = ttnn.from_torch(x_torch, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) + y_tt = ttnn.from_torch(y_torch, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) + z_tt = ttnn.from_torch(z_torch, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) + z_tt_out = ttnn.logical_or(x_tt, y_tt) + tt_out = ttnn.to_torch(z_tt_out) + + status = ttnn.pearson_correlation_coefficient(z_torch, tt_out) >= 0.999 + assert status + + +@skip_for_grayskull("Unsupported dtype for Grayskull") +@pytest.mark.parametrize( + "ttnn_function", + [ + ttnn.logical_xor, + ], +) +def test_logical_xor_fp32(device, ttnn_function): + x_torch = torch.tensor([[1.509009, 2, 3.33, 4, 0, -11]], dtype=torch.float32) + y_torch = torch.tensor([[0, 3, 4, 5, 0, -9999]], dtype=torch.float32) + golden_fn = ttnn.get_golden_function(ttnn_function) + z_torch = golden_fn(x_torch, y_torch) + x_tt = ttnn.from_torch(x_torch, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) + y_tt = ttnn.from_torch(y_torch, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) + z_tt = ttnn.from_torch(z_torch, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, 
device=device) + z_tt_out = ttnn.logical_xor(x_tt, y_tt) + tt_out = ttnn.to_torch(z_tt_out) + + status = ttnn.pearson_correlation_coefficient(z_torch, tt_out) >= 0.999 + assert status + + +@skip_for_grayskull("Unsupported dtype for Grayskull") +@pytest.mark.parametrize( + "ttnn_function", + [ + ttnn.logical_and, + ], +) +def test_logical_and_fp32(device, ttnn_function): + x_torch = torch.tensor([[1.509009, 2, 3.33, 4, 0, -11]], dtype=torch.float32) + y_torch = torch.tensor([[0, 3, 4, 5, 0, -9999]], dtype=torch.float32) + golden_fn = ttnn.get_golden_function(ttnn_function) + z_torch = golden_fn(x_torch, y_torch) + x_tt = ttnn.from_torch(x_torch, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) + y_tt = ttnn.from_torch(y_torch, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) + z_tt = ttnn.from_torch(z_torch, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) + z_tt_out = ttnn.logical_and(x_tt, y_tt) + tt_out = ttnn.to_torch(z_tt_out) + + status = ttnn.pearson_correlation_coefficient(z_torch, tt_out) >= 0.999 + assert status + + +@skip_for_grayskull("Unsupported dtype for Grayskull") +@pytest.mark.parametrize( + "ttnn_function", + [ + ttnn.eq, + ttnn.ne, + ttnn.gt, + ttnn.ge, + ttnn.lt, + ttnn.le, + ], +) +def test_relational_fp32(device, ttnn_function): + x_torch = torch.tensor([[1.99999999991, 0, 345.1234568999130, -1]], dtype=torch.float32) + y_torch = torch.tensor([[1.99999999990, 0, 345.1234568999131, -1]], dtype=torch.float32) + golden_fn = ttnn.get_golden_function(ttnn_function) + z_torch = golden_fn(x_torch, y_torch) + x_tt = ttnn.from_torch(x_torch, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) + y_tt = ttnn.from_torch(y_torch, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) + z_tt = ttnn.from_torch(z_torch, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) + z_tt_out = ttnn_function(x_tt, y_tt) + tt_out = ttnn.to_torch(z_tt_out) + + status = ttnn.pearson_correlation_coefficient(z_torch, 
tt_out) >= 0.999 + assert status + + +@skip_for_grayskull("Unsupported dtype for Grayskull") +@pytest.mark.parametrize( + "ttnn_function", + [ + ttnn.bitwise_and, + ], +) +def test_bitwise_and(device, ttnn_function): + x_torch = torch.tensor([[1, 2, 3, 4, 5]], dtype=torch.int32) + y_torch = torch.tensor([[9, 3, 0, 1, 7]], dtype=torch.int32) + golden_fn = ttnn.get_golden_function(ttnn_function) + z_torch = golden_fn(x_torch, y_torch) + x_tt = ttnn.from_torch(x_torch, dtype=ttnn.int32, layout=ttnn.TILE_LAYOUT, device=device) + y_tt = ttnn.from_torch(y_torch, dtype=ttnn.int32, layout=ttnn.TILE_LAYOUT, device=device) + z_tt = ttnn.from_torch(z_torch, dtype=ttnn.int32, layout=ttnn.TILE_LAYOUT, device=device) + z_tt_out = ttnn.bitwise_and(x_tt, y_tt) + tt_out = ttnn.to_torch(z_tt_out) + + status = ttnn.pearson_correlation_coefficient(z_torch, tt_out) >= 0.9999 + assert status + + +@skip_for_grayskull("Unsupported dtype for Grayskull") +@pytest.mark.parametrize( + "ttnn_function", + [ + ttnn.bitwise_or, + ], +) +def test_bitwise_or(device, ttnn_function): + x_torch = torch.tensor([[1, 2, 3, 4, 5, 0]], dtype=torch.int32) + y_torch = torch.tensor([[9, 3, 0, 1, 7, 0]], dtype=torch.int32) + golden_fn = ttnn.get_golden_function(ttnn_function) + z_torch = golden_fn(x_torch, y_torch) + x_tt = ttnn.from_torch(x_torch, dtype=ttnn.int32, layout=ttnn.TILE_LAYOUT, device=device) + y_tt = ttnn.from_torch(y_torch, dtype=ttnn.int32, layout=ttnn.TILE_LAYOUT, device=device) + z_tt = ttnn.from_torch(z_torch, dtype=ttnn.int32, layout=ttnn.TILE_LAYOUT, device=device) + z_tt_out = ttnn.bitwise_or(x_tt, y_tt) + tt_out = ttnn.to_torch(z_tt_out) + + status = ttnn.pearson_correlation_coefficient(z_torch, tt_out) >= 0.9999 + assert status + + +@skip_for_grayskull("Unsupported dtype for Grayskull") +@pytest.mark.parametrize( + "ttnn_function", + [ + ttnn.bitwise_xor, + ], +) +def test_bitwise_xor(device, ttnn_function): + x_torch = torch.tensor([[1, 2, 3, 4, 5, 0]], dtype=torch.int32) + y_torch = 
torch.tensor([[9, 3, 0, 1, 7, 0]], dtype=torch.int32) + golden_fn = ttnn.get_golden_function(ttnn_function) + z_torch = golden_fn(x_torch, y_torch) + x_tt = ttnn.from_torch(x_torch, dtype=ttnn.int32, layout=ttnn.TILE_LAYOUT, device=device) + y_tt = ttnn.from_torch(y_torch, dtype=ttnn.int32, layout=ttnn.TILE_LAYOUT, device=device) + z_tt = ttnn.from_torch(z_torch, dtype=ttnn.int32, layout=ttnn.TILE_LAYOUT, device=device) + z_tt_out = ttnn.bitwise_xor(x_tt, y_tt) + tt_out = ttnn.to_torch(z_tt_out) + + status = ttnn.pearson_correlation_coefficient(z_torch, tt_out) >= 0.999 + assert status diff --git a/tests/ttnn/unit_tests/operations/eltwise/test_pow.py b/tests/ttnn/unit_tests/operations/eltwise/test_pow.py index 51296079a5ef..547dfeeba73f 100644 --- a/tests/ttnn/unit_tests/operations/eltwise/test_pow.py +++ b/tests/ttnn/unit_tests/operations/eltwise/test_pow.py @@ -6,6 +6,8 @@ import pytest import ttnn from tests.ttnn.unit_tests.operations.eltwise.backward.utility_funcs import data_gen_with_range, compare_pcc +from tests.ttnn.utils_for_testing import assert_with_pcc +from models.utility_functions import skip_for_grayskull @pytest.mark.parametrize( @@ -28,3 +30,130 @@ def test_unary_pow_ttnn(input_shapes, exponent, device): comp_pass = compare_pcc([output_tensor], [golden_tensor], pcc=0.9) assert comp_pass + + +@skip_for_grayskull() +@pytest.mark.parametrize( + "input_shapes", + ([20, 20], [2, 32, 320], [1, 1, 32, 32], [1, 3, 320, 384], [1, 2, 32, 64, 64]), +) +@pytest.mark.parametrize("input", [10.0, 5.5, -5.0, -2.5, -10, -3, 9.5, -7.25, -6.15]) +@pytest.mark.parametrize("exponent", [2.75, 2.5, 1.5, 4, 5.75, 0, -1.5, -2.25, -3, -4.25, -5.5]) +# Both input and exponent are -ve and exponent is a non-integer, TT and Torch output = nan +# input = non-zero and exponent = 0, TT and Torch output = 1 +# Both input and exponent are 0, TT = 1 and Torch output = 0 +def test_binary_pow_scalar_input(input_shapes, input, exponent, device): + torch_input_tensor_b = 
torch.full(input_shapes, exponent, dtype=torch.float32) + torch_output_tensor = torch.pow(input, torch_input_tensor_b) + + golden_fn = ttnn.get_golden_function(ttnn.pow) + golden_tensor = golden_fn(input, torch_input_tensor_b) + + cq_id = 0 + input_tensor_b = ttnn.from_torch(torch_input_tensor_b, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) + output = ttnn.pow(input, input_tensor_b, queue_id=cq_id) + output = ttnn.to_torch(output) + + assert_with_pcc(torch_output_tensor, output, 0.999) + + +def generate_torch_tensor(shape, low, high, step=0.0025, dtype=torch.float32): + num_elements = torch.prod(torch.tensor(shape)) + values = torch.arange(low, high + step, step, dtype=dtype) + + if values.numel() < num_elements: + values = values.repeat((num_elements // values.numel()) + 1) + values = values[:num_elements] + return values.reshape(shape) + + +@skip_for_grayskull() +@pytest.mark.parametrize( + "input_shapes", + [[64, 640], [2, 32, 320], [2, 1, 32, 1024], [1, 1, 32, 32], [1, 3, 320, 384], [1, 2, 32, 64, 128]], +) +def test_binary_sfpu_pow( + device, + input_shapes, +): + torch_input_tensor_a = generate_torch_tensor(input_shapes, -30, 30, step=0.0022) + torch_input_tensor_b = generate_torch_tensor(input_shapes, -20, 20) + torch_output_tensor = torch.pow(torch_input_tensor_a, torch_input_tensor_b) + + input_tensor_a = ttnn.from_torch(torch_input_tensor_a, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) + input_tensor_b = ttnn.from_torch(torch_input_tensor_b, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) + + output = ttnn.pow(input_tensor_a, input_tensor_b) + output = ttnn.to_torch(output) + + pcc = ttnn.pearson_correlation_coefficient(torch_output_tensor, output) + assert pcc >= 0.99 + + +@skip_for_grayskull() +@pytest.mark.parametrize( + "input_shapes", + [[64, 640], [2, 32, 320], [2, 1, 1024, 1024], [1, 1, 32, 32], [1, 3, 320, 384], [1, 2, 32, 64, 64]], +) +def test_binary_sfpu_pow_bf16( + device, + input_shapes, +): + 
torch_input_tensor_a = generate_torch_tensor(input_shapes, -30, 30, step=0.0021, dtype=torch.bfloat16) + torch_input_tensor_b = generate_torch_tensor(input_shapes, -20, 20, dtype=torch.bfloat16) + torch_output_tensor = torch.pow(torch_input_tensor_a, torch_input_tensor_b) + + input_tensor_a = ttnn.from_torch(torch_input_tensor_a, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device) + input_tensor_b = ttnn.from_torch(torch_input_tensor_b, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device) + + output = ttnn.pow(input_tensor_a, input_tensor_b) + output = ttnn.to_torch(output) + + pcc = ttnn.pearson_correlation_coefficient(torch_output_tensor, output) + assert pcc >= 0.99 + + +@skip_for_grayskull() +@pytest.mark.parametrize( + "input_shapes", + [[2, 1, 32, 1024], [1, 3, 320, 384], [1, 2, 32, 64, 128]], +) +def test_binary_sfpu_pow_pos( + device, + input_shapes, +): + torch_input_tensor_a = generate_torch_tensor(input_shapes, 0, 30, step=0.0111) + torch_input_tensor_b = generate_torch_tensor(input_shapes, -20, 20) + torch_output_tensor = torch.pow(torch_input_tensor_a, torch_input_tensor_b) + + input_tensor_a = ttnn.from_torch(torch_input_tensor_a, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) + input_tensor_b = ttnn.from_torch(torch_input_tensor_b, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) + + output = ttnn.pow(input_tensor_a, input_tensor_b) + output = ttnn.to_torch(output) + + pcc = ttnn.pearson_correlation_coefficient(torch_output_tensor, output) + assert pcc >= 0.99 + + +@skip_for_grayskull() +@pytest.mark.parametrize( + "input_shapes", + [[2, 1, 32, 1024], [1, 3, 320, 384], [1, 2, 32, 64, 128]], +) +def test_binary_sfpu_pow_neg( + device, + input_shapes, +): + torch_input_tensor_a = generate_torch_tensor(input_shapes, -30, 0, step=0.0111) + torch_input_tensor_b = generate_torch_tensor(input_shapes, 0, 10) + torch_output_tensor = torch.pow(torch_input_tensor_a, torch_input_tensor_b) + + input_tensor_a = 
ttnn.from_torch(torch_input_tensor_a, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) + input_tensor_b = ttnn.from_torch(torch_input_tensor_b, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device) + + output = ttnn.pow(input_tensor_a, input_tensor_b) + output = ttnn.to_torch(output) + + pcc = ttnn.pearson_correlation_coefficient(torch_output_tensor, output) + assert pcc >= 0.99 diff --git a/tests/ttnn/unit_tests/operations/eltwise/test_sub.py b/tests/ttnn/unit_tests/operations/eltwise/test_sub.py index dc2c47c9070c..2539c56c7ccd 100644 --- a/tests/ttnn/unit_tests/operations/eltwise/test_sub.py +++ b/tests/ttnn/unit_tests/operations/eltwise/test_sub.py @@ -72,6 +72,21 @@ def test_sub(device, h, w): assert_with_pcc(torch_output_tensor, output, 0.9999) +@pytest.mark.parametrize("h", [32]) +@pytest.mark.parametrize("w", [64]) +def test_rsub(device, h, w): + torch_input_tensor_a = torch.rand((h, w), dtype=torch.bfloat16) + torch_input_tensor_b = torch.rand((h, w), dtype=torch.bfloat16) + torch_output_tensor = torch.sub(torch_input_tensor_b, torch_input_tensor_a) + + input_tensor_a = ttnn.from_torch(torch_input_tensor_a, layout=ttnn.TILE_LAYOUT, device=device) + input_tensor_b = ttnn.from_torch(torch_input_tensor_b, layout=ttnn.TILE_LAYOUT, device=device) + output = ttnn.rsub(input_tensor_a, input_tensor_b) + output = ttnn.to_torch(output) + + assert_with_pcc(torch_output_tensor, output, 0.9999) + + @pytest.mark.parametrize("n", [2]) @pytest.mark.parametrize("c", [3]) @pytest.mark.parametrize("h", [128]) @@ -88,3 +103,21 @@ def test_sub_4D(device, n, c, h, w): output = ttnn.to_torch(output) assert_with_pcc(torch_output_tensor, output, 0.9999) + + +@pytest.mark.parametrize("n", [2]) +@pytest.mark.parametrize("c", [3]) +@pytest.mark.parametrize("h", [128]) +@pytest.mark.parametrize("w", [128]) +def test_rsub_4D(device, n, c, h, w): + torch_input_tensor_a = torch.rand((n, c, h, w), dtype=torch.bfloat16) + torch_input_tensor_b = torch.rand((n, c, h, w), 
dtype=torch.bfloat16) + torch_output_tensor = torch.sub(torch_input_tensor_b, torch_input_tensor_a) + + input_tensor_a = ttnn.from_torch(torch_input_tensor_a, layout=ttnn.TILE_LAYOUT, device=device) + input_tensor_b = ttnn.from_torch(torch_input_tensor_b, layout=ttnn.TILE_LAYOUT, device=device) + + output = ttnn.rsub(input_tensor_a, input_tensor_b) + output = ttnn.to_torch(output) + + assert_with_pcc(torch_output_tensor, output, 0.9999) diff --git a/ttnn/CMakeLists.txt b/ttnn/CMakeLists.txt index 42dbd511632a..0143fd2d24c1 100644 --- a/ttnn/CMakeLists.txt +++ b/ttnn/CMakeLists.txt @@ -129,6 +129,7 @@ set(ALL_TTNN_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/eltwise/binary/device/broadcast_height_multi_core_program_factory.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/eltwise/binary/device/broadcast_width_multi_core_program_factory.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/eltwise/binary/device/element_wise_multi_core_program_factory.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/eltwise/binary/device/element_wise_multi_core_sfpu_pgm_factory.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/eltwise/binary/device/broadcast_height_multi_core_sharded_optimized_program_factory.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/eltwise/binary/device/broadcast_height_multi_core_sharded_program_factory.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/eltwise/binary_backward/binary_backward.cpp diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/binary.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary/binary.cpp index 6f84ac75e1d5..7ed428e147c9 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary/binary.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/binary.cpp @@ -176,9 +176,9 @@ Tensor BinaryOperation::invoke( const Tensor& input_tensor_b_arg, const std::optional& output_dtype, const std::optional& memory_config, - std::optional optional_output_tensor, - std::optional activations, - std::optional input_tensor_a_activation) { + const 
std::optional& optional_output_tensor, + const std::optional& activations, + const std::optional& input_tensor_a_activation) { return invoke( DefaultQueueId, input_tensor_a_arg, @@ -196,10 +196,10 @@ Tensor BinaryOperation::invoke( const ttnn::Tensor& input_tensor_a, float scalar, const std::optional& output_dtype, - const std::optional& memory_config, + const std::optional& memory_config, const std::optional& optional_output_tensor, - std::optional activations, - std::optional input_tensor_a_activation) { + const std::optional& activations, + const std::optional& input_tensor_a_activation) { return ttnn::prim::binary( queue_id, input_tensor_a, @@ -219,10 +219,10 @@ Tensor BinaryOperation::invoke( const ttnn::Tensor& input_tensor_a, float scalar, const std::optional& output_dtype, - const std::optional& memory_config, + const std::optional& memory_config, const std::optional& optional_output_tensor, - std::optional activations, - std::optional input_tensor_a_activation) { + const std::optional& activations, + const std::optional& input_tensor_a_activation) { return BinaryOperation::invoke( DefaultQueueId, input_tensor_a, @@ -241,7 +241,7 @@ Tensor RelationalBinary::invoke( const Tensor& input_tensor_b_arg, const std::optional& output_dtype, const std::optional& memory_config, - std::optional optional_output_tensor, + const std::optional& optional_output_tensor, const std::optional& activations, const std::optional& input_tensor_a_activation) { if (output_dtype.has_value() && optional_output_tensor.has_value()) { @@ -265,7 +265,7 @@ Tensor RelationalBinary::invoke( input_tensor_b, binary_op_type, dtype, - output_memory_config, + memory_config, optional_output_tensor, activations, input_tensor_a_activation); @@ -277,9 +277,9 @@ Tensor RelationalBinary::invoke( const Tensor& input_tensor_b_arg, const std::optional& output_dtype, const std::optional& memory_config, - std::optional optional_output_tensor, - std::optional activations, - std::optional 
input_tensor_a_activation) { + const std::optional& optional_output_tensor, + const std::optional& activations, + const std::optional& input_tensor_a_activation) { return invoke( DefaultQueueId, input_tensor_a_arg, @@ -366,8 +366,8 @@ template Tensor InplaceBinaryOperation::invoke( const Tensor& input_tensor_a_arg, const Tensor& input_tensor_b_arg, - std::optional activations, - std::optional input_tensor_a_activation) { + const std::optional& activations, + const std::optional& input_tensor_a_activation) { return BinaryOperation::invoke( input_tensor_a_arg, input_tensor_b_arg, @@ -382,12 +382,63 @@ template Tensor InplaceBinaryOperation::invoke( const ttnn::Tensor& input_tensor_a, const float scalar, - std::optional activations, - std::optional input_tensor_a_activation) { + const std::optional& activations, + const std::optional& input_tensor_a_activation) { return BinaryOperation::invoke( input_tensor_a, scalar, std::nullopt, std::nullopt, input_tensor_a, activations, input_tensor_a_activation); } +template +Tensor BinaryOperationSfpu::invoke( + uint8_t queue_id, + const Tensor& input_tensor_a_arg, + const Tensor& input_tensor_b_arg, + const std::optional& output_dtype, + const std::optional& memory_config, + const std::optional& optional_output_tensor, + const std::optional& activations, + const std::optional& input_tensor_a_activation) { + auto [input_tensor_a, input_tensor_b] = + detail::preprocess_inputs(input_tensor_a_arg, input_tensor_b_arg); + + auto output_memory_config = memory_config.value_or(input_tensor_a.memory_config()); + DataType dtype = output_dtype.value_or(input_tensor_a.get_dtype()); + if (optional_output_tensor.has_value()) { + dtype = optional_output_tensor.value().get_dtype(); + } + + return ttnn::prim::binary( + queue_id, + input_tensor_a, + input_tensor_b, + binary_op_type, + output_dtype, + memory_config, + optional_output_tensor, + activations, + input_tensor_a_activation); +} + +template +Tensor BinaryOperationSfpu::invoke( + const 
Tensor& input_tensor_a_arg, + const Tensor& input_tensor_b_arg, + const std::optional& output_dtype, + const std::optional& memory_config, + const std::optional& optional_output_tensor, + const std::optional& activations, + const std::optional& input_tensor_a_activation) { + return invoke( + DefaultQueueId, + input_tensor_a_arg, + input_tensor_b_arg, + output_dtype, + memory_config, + optional_output_tensor, + activations, + input_tensor_a_activation); +} + template struct BinaryOperation; template struct InplaceBinaryOperation; template struct BinaryOperation; @@ -403,6 +454,7 @@ template struct BinaryOperation; template struct BinaryOperation; template struct BinaryOperation; template struct BinaryOperation; +template struct BinaryOperation; template struct RelationalBinary; template struct RelationalBinary; @@ -422,4 +474,9 @@ template struct InplaceLogicalBinary; template struct InplaceLogicalBinary; template struct InplaceLogicalBinary; +template struct BinaryOperationSfpu; +template struct BinaryOperationSfpu; +template struct BinaryOperationSfpu; +template struct BinaryOperationSfpu; + } // namespace ttnn::operations::binary diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/binary.hpp b/ttnn/cpp/ttnn/operations/eltwise/binary/binary.hpp index 1abf49af3207..67e00b2a3115 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary/binary.hpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/binary.hpp @@ -35,9 +35,9 @@ struct BinaryOperation { const Tensor& input_tensor_b_arg, const std::optional& output_dtype = std::nullopt, const std::optional& memory_config = std::nullopt, - std::optional optional_output_tensor = std::nullopt, - std::optional activations = std::nullopt, - std::optional input_tensor_a_activation = std::nullopt); + const std::optional& optional_output_tensor = std::nullopt, + const std::optional& activations = std::nullopt, + const std::optional& input_tensor_a_activation = std::nullopt); static Tensor invoke( uint8_t queue_id, @@ -46,8 +46,8 @@ 
struct BinaryOperation { const std::optional& output_dtype = std::nullopt, const std::optional& memory_config = std::nullopt, const std::optional& optional_output_tensor = std::nullopt, - std::optional activations = std::nullopt, - std::optional input_tensor_a_activation = std::nullopt); + const std::optional& activations = std::nullopt, + const std::optional& input_tensor_a_activation = std::nullopt); static Tensor invoke( const ttnn::Tensor& input_tensor_a, @@ -55,8 +55,8 @@ struct BinaryOperation { const std::optional& output_dtype = std::nullopt, const std::optional& memory_config = std::nullopt, const std::optional& optional_output_tensor = std::nullopt, - std::optional activations = std::nullopt, - std::optional input_tensor_a_activation = std::nullopt); + const std::optional& activations = std::nullopt, + const std::optional& input_tensor_a_activation = std::nullopt); }; template @@ -67,7 +67,7 @@ struct RelationalBinary { const Tensor& input_tensor_b_arg, const std::optional& output_dtype = std::nullopt, const std::optional& memory_config = std::nullopt, - std::optional optional_output_tensor = std::nullopt, + const std::optional& optional_output_tensor = std::nullopt, const std::optional& activations = std::nullopt, const std::optional& input_tensor_a_activation = std::nullopt); @@ -76,9 +76,9 @@ struct RelationalBinary { const Tensor& input_tensor_b_arg, const std::optional& output_dtype = std::nullopt, const std::optional& memory_config = std::nullopt, - std::optional optional_output_tensor = std::nullopt, - std::optional activations = std::nullopt, - std::optional input_tensor_a_activation = std::nullopt); + const std::optional& optional_output_tensor = std::nullopt, + const std::optional& activations = std::nullopt, + const std::optional& input_tensor_a_activation = std::nullopt); static Tensor invoke( const ttnn::Tensor& input_tensor_a, @@ -126,14 +126,36 @@ struct InplaceBinaryOperation { static Tensor invoke( const Tensor& input_tensor_a, const 
Tensor& input_tensor_b, - std::optional activations = std::nullopt, - std::optional input_tensor_a_activation = std::nullopt); + const std::optional& activations = std::nullopt, + const std::optional& input_tensor_a_activation = std::nullopt); static Tensor invoke( const Tensor& input_tensor, const float scalar, - std::optional activations = std::nullopt, - std::optional input_tensor_a_activation = std::nullopt); + const std::optional& activations = std::nullopt, + const std::optional& input_tensor_a_activation = std::nullopt); +}; + +template +struct BinaryOperationSfpu { + static Tensor invoke( + uint8_t queue_id, + const Tensor& input_tensor_a_arg, + const Tensor& input_tensor_b_arg, + const std::optional& output_dtype = std::nullopt, + const std::optional& memory_config = std::nullopt, + const std::optional& optional_output_tensor = std::nullopt, + const std::optional& activations = std::nullopt, + const std::optional& input_tensor_a_activation = std::nullopt); + + static Tensor invoke( + const Tensor& input_tensor_a_arg, + const Tensor& input_tensor_b_arg, + const std::optional& output_dtype = std::nullopt, + const std::optional& memory_config = std::nullopt, + const std::optional& optional_output_tensor = std::nullopt, + const std::optional& activations = std::nullopt, + const std::optional& input_tensor_a_activation = std::nullopt); }; } // namespace binary @@ -227,6 +249,22 @@ constexpr auto ne_ = ttnn::register_operation_with_auto_launch_op< "ttnn::ne_", operations::binary::InplaceRelationalBinary>(); +constexpr auto rsub_binary = ttnn::register_operation_with_auto_launch_op< + "ttnn::rsub_binary", + operations::binary::BinaryOperation>(); +constexpr auto power_binary = ttnn::register_operation_with_auto_launch_op< + "ttnn::power_binary", + operations::binary::BinaryOperationSfpu>(); +constexpr auto bitwise_and_binary = ttnn::register_operation_with_auto_launch_op< + "ttnn::bitwise_and_binary", + operations::binary::BinaryOperationSfpu>(); +constexpr auto 
bitwise_or_binary = ttnn::register_operation_with_auto_launch_op< + "ttnn::bitwise_or_binary", + operations::binary::BinaryOperationSfpu>(); +constexpr auto bitwise_xor_binary = ttnn::register_operation_with_auto_launch_op< + "ttnn::bitwise_xor_binary", + operations::binary::BinaryOperationSfpu>(); + template ttnn::Tensor operator+(const ttnn::Tensor& input_tensor_a, InputBType scalar) { return add(input_tensor_a, scalar); diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/binary_composite.hpp b/ttnn/cpp/ttnn/operations/eltwise/binary/binary_composite.hpp index 7a35ccbc0655..f366c1104f1a 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary/binary_composite.hpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/binary_composite.hpp @@ -260,6 +260,124 @@ struct ExecutePrelu { const Tensor& input_tensor, float scalar, const std::optional& memory_config = std::nullopt); }; +struct ExecuteRsub { + static Tensor invoke( + uint8_t queue_id, + const Tensor& input_tensor_a_arg, + const Tensor& input_tensor_b_arg, + const std::optional& output_dtype = std::nullopt, + const std::optional& memory_config = std::nullopt, + const std::optional& optional_output_tensor = std::nullopt, + const std::optional& activations = std::nullopt, + const std::optional& input_tensor_a_activation = std::nullopt); + + static Tensor invoke( + const Tensor& input_tensor_a_arg, + const Tensor& input_tensor_b_arg, + const std::optional& output_dtype = std::nullopt, + const std::optional& memory_config = std::nullopt, + const std::optional& optional_output_tensor = std::nullopt, + const std::optional& activations = std::nullopt, + const std::optional& input_tensor_a_activation = std::nullopt); + + static Tensor invoke( + uint8_t queue_id, + const Tensor& input_tensor, + float input_b, + const std::optional& memory_config = std::nullopt, + const std::optional& optional_output_tensor = std::nullopt); + + static Tensor invoke( + const Tensor& input_tensor, + float input_b, + const std::optional& 
memory_config = std::nullopt, + const std::optional& optional_output_tensor = std::nullopt); +}; + +struct ExecuteBitwiseAnd { + static Tensor invoke( + uint8_t queue_id, + const Tensor& input_tensor_a_arg, + const Tensor& input_tensor_b_arg, + const std::optional& memory_config = std::nullopt, + const std::optional& optional_output_tensor = std::nullopt); + + static Tensor invoke( + const Tensor& input_tensor_a_arg, + const Tensor& input_tensor_b_arg, + const std::optional& memory_config = std::nullopt, + const std::optional& optional_output_tensor = std::nullopt); + + static Tensor invoke( + uint8_t queue_id, + const Tensor& input_tensor, + int32_t input_b, + const std::optional& memory_config = std::nullopt, + const std::optional& optional_output_tensor = std::nullopt); + + static Tensor invoke( + const Tensor& input_tensor, + int32_t input_b, + const std::optional& memory_config = std::nullopt, + const std::optional& optional_output_tensor = std::nullopt); +}; + +struct ExecuteBitwiseOr { + static Tensor invoke( + uint8_t queue_id, + const Tensor& input_tensor_a_arg, + const Tensor& input_tensor_b_arg, + const std::optional& memory_config = std::nullopt, + const std::optional& optional_output_tensor = std::nullopt); + + static Tensor invoke( + const Tensor& input_tensor_a_arg, + const Tensor& input_tensor_b_arg, + const std::optional& memory_config = std::nullopt, + const std::optional& optional_output_tensor = std::nullopt); + + static Tensor invoke( + uint8_t queue_id, + const Tensor& input_tensor, + int32_t input_b, + const std::optional& memory_config = std::nullopt, + const std::optional& optional_output_tensor = std::nullopt); + + static Tensor invoke( + const Tensor& input_tensor, + int32_t input_b, + const std::optional& memory_config = std::nullopt, + const std::optional& optional_output_tensor = std::nullopt); +}; + +struct ExecuteBitwiseXor { + static Tensor invoke( + uint8_t queue_id, + const Tensor& input_tensor_a_arg, + const Tensor& 
input_tensor_b_arg, + const std::optional& memory_config = std::nullopt, + const std::optional& optional_output_tensor = std::nullopt); + + static Tensor invoke( + const Tensor& input_tensor_a_arg, + const Tensor& input_tensor_b_arg, + const std::optional& memory_config = std::nullopt, + const std::optional& optional_output_tensor = std::nullopt); + + static Tensor invoke( + uint8_t queue_id, + const Tensor& input_tensor, + int32_t input_b, + const std::optional& memory_config = std::nullopt, + const std::optional& optional_output_tensor = std::nullopt); + + static Tensor invoke( + const Tensor& input_tensor, + int32_t input_b, + const std::optional& memory_config = std::nullopt, + const std::optional& optional_output_tensor = std::nullopt); +}; + } // namespace binary } // namespace operations @@ -314,5 +432,9 @@ constexpr auto polyval = ttnn::register_operation_with_auto_launch_op< constexpr auto gcd = ttnn::register_operation_with_auto_launch_op<"ttnn::gcd", operations::binary::ExecuteGCD>(); constexpr auto lcm = ttnn::register_operation_with_auto_launch_op<"ttnn::lcm", operations::binary::ExecuteLCM>(); constexpr auto prelu = ttnn::register_operation_with_auto_launch_op<"ttnn::prelu", operations::binary::ExecutePrelu>(); +constexpr auto rsub = ttnn::register_operation_with_auto_launch_op<"ttnn::rsub", operations::binary::ExecuteRsub>(); +constexpr auto bitwise_and = ttnn::register_operation_with_auto_launch_op<"ttnn::bitwise_and", operations::binary::ExecuteBitwiseAnd>(); +constexpr auto bitwise_or = ttnn::register_operation_with_auto_launch_op<"ttnn::bitwise_or", operations::binary::ExecuteBitwiseOr>(); +constexpr auto bitwise_xor = ttnn::register_operation_with_auto_launch_op<"ttnn::bitwise_xor", operations::binary::ExecuteBitwiseXor>(); } // namespace ttnn diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/binary_pybind.hpp b/ttnn/cpp/ttnn/operations/eltwise/binary/binary_pybind.hpp index 0d73d503470e..d244c63a83fe 100644 --- 
a/ttnn/cpp/ttnn/operations/eltwise/binary/binary_pybind.hpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/binary_pybind.hpp @@ -213,6 +213,219 @@ void bind_binary_operation( py::arg("queue_id") = 0}); } +template +void bind_binary_unary_operation( + py::module& module, + const binary_operation_t& operation, + const std::string& description, + const std::string& math, + const std::string& info = ". ", + const std::string& supported_dtype = "BFLOAT16", + const std::string& note = " ") { + auto doc = fmt::format( + R"doc( + {2} + + .. math:: + {3} + + Args: + input_tensor_a (ttnn.Tensor): the input tensor. + input_tensor_b (ttnn.Tensor or Number): the input tensor. + + Keyword args: + memory_config (ttnn.MemoryConfig, optional): memory configuration for the operation. Defaults to `None`. + dtype (ttnn.DataType, optional): data type for the output tensor. Defaults to `None`. + output_tensor (ttnn.Tensor, optional): preallocated output tensor. Defaults to `None`. + activations (List[str], optional): list of activation functions to apply to the output tensor{4}Defaults to `None`. + queue_id (int, optional): command queue id. Defaults to `0`. + + Returns: + ttnn.Tensor: the output tensor. + + + Note: + Supported dtypes, layouts, and ranks: + + .. 
list-table:: + :header-rows: 1 + + * - Dtypes + - Layouts + - Ranks + * - {5} + - TILE + - 2, 3, 4 + + {6} + + Example: + >>> tensor1 = ttnn.from_torch(torch.tensor([[1, 2], [3, 4]], dtype=torch.bfloat16), layout=ttnn.TILE_LAYOUT, device=device) + >>> tensor2 = ttnn.from_torch(torch.tensor([[1, 2], [3, 4]], dtype=torch.bfloat16), layout=ttnn.TILE_LAYOUT, device=device) + >>> output = {1}(tensor1, tensor2/scalar) + )doc", + operation.base_name(), + operation.python_fully_qualified_name(), + description, + math, + info, + supported_dtype, + note); + + bind_registered_operation( + module, + operation, + doc, + // tensor and scalar + ttnn::pybind_overload_t{ + [](const binary_operation_t& self, + const ttnn::Tensor& input_tensor_a, + const float scalar, + const std::optional& memory_config, + const std::optional& output_tensor, + const uint8_t& queue_id) -> ttnn::Tensor { + return self(queue_id, input_tensor_a, scalar, memory_config, output_tensor); + }, + py::arg("input_tensor_a"), + py::arg("input_b"), + py::kw_only(), + py::arg("memory_config") = std::nullopt, + py::arg("output_tensor") = std::nullopt, + py::arg("queue_id") = 0}, + + // tensor and tensor + ttnn::pybind_overload_t{ + [](const binary_operation_t& self, + const ttnn::Tensor& input_tensor_a, + const ttnn::Tensor& input_tensor_b, + const std::optional& dtype, + const std::optional& memory_config, + const std::optional& output_tensor, + const std::optional& activations, + const std::optional& input_tensor_a_activation, + uint8_t queue_id) -> ttnn::Tensor { + return self( + queue_id, + input_tensor_a, + input_tensor_b, + dtype, + memory_config, + output_tensor, + activations, + input_tensor_a_activation); + }, + py::arg("input_tensor_a"), + py::arg("input_tensor_b"), + py::kw_only(), + py::arg("dtype") = std::nullopt, + py::arg("memory_config") = std::nullopt, + py::arg("output_tensor") = std::nullopt, + py::arg("activations") = std::nullopt, + py::arg("input_tensor_a_activation") = std::nullopt, + 
py::arg("queue_id") = 0}); +} + +template +void bind_bitwise_binary_ops_operation( + py::module& module, + const binary_operation_t& operation, + const std::string& description, + const std::string& math, + const std::string& info = ". ", + const std::string& supported_dtype = "BFLOAT16", + const std::string& note = " ") { + auto doc = fmt::format( + R"doc( + {2} + + .. math:: + {3} + + Args: + input_tensor_a (ttnn.Tensor): the input tensor. + input_tensor_b (ttnn.Tensor or Integer): the input tensor. + + Keyword args: + memory_config (ttnn.MemoryConfig, optional): memory configuration for the operation. Defaults to `None`. + output_tensor (ttnn.Tensor, optional): preallocated output tensor. Defaults to `None`. + queue_id (int, optional): command queue id. Defaults to `0`. + + Returns: + ttnn.Tensor: the output tensor. + + + Note: + Supported dtypes, layouts, and ranks: + + .. list-table:: + :header-rows: 1 + + * - Dtypes + - Layouts + - Ranks + * - {5} + - TILE + - 2, 3, 4 + + {6} + + Example: + >>> tensor1 = ttnn.from_torch(torch.tensor([[1, 2], [3, 4]], dtype=torch.bfloat16), layout=ttnn.TILE_LAYOUT, device=device) + >>> tensor2 = ttnn.from_torch(torch.tensor([[1, 2], [3, 4]], dtype=torch.bfloat16), layout=ttnn.TILE_LAYOUT, device=device) + >>> output = {1}(tensor1, tensor2/scalar) + )doc", + operation.base_name(), + operation.python_fully_qualified_name(), + description, + math, + info, + supported_dtype, + note); + + bind_registered_operation( + module, + operation, + doc, + // tensor and scalar + ttnn::pybind_overload_t{ + [](const binary_operation_t& self, + const ttnn::Tensor& input_tensor_a, + const int32_t scalar, + const std::optional& memory_config, + const std::optional& output_tensor, + const uint8_t& queue_id) -> ttnn::Tensor { + return self(queue_id, input_tensor_a, scalar, memory_config, output_tensor); + }, + py::arg("input_tensor_a"), + py::arg("input_b"), + py::kw_only(), + py::arg("memory_config") = std::nullopt, + py::arg("output_tensor") = 
std::nullopt, + py::arg("queue_id") = 0}, + + // tensor and tensor + ttnn::pybind_overload_t{ + [](const binary_operation_t& self, + const ttnn::Tensor& input_tensor_a, + const ttnn::Tensor& input_tensor_b, + const std::optional& memory_config, + const std::optional& output_tensor, + uint8_t queue_id) -> ttnn::Tensor { + return self( + queue_id, + input_tensor_a, + input_tensor_b, + memory_config, + output_tensor); + }, + py::arg("input_tensor_a"), + py::arg("input_tensor_b"), + py::kw_only(), + py::arg("memory_config") = std::nullopt, + py::arg("output_tensor") = std::nullopt, + py::arg("queue_id") = 0}); +} + template void bind_binary_composite( py::module& module, @@ -1209,6 +1422,38 @@ void py_module(py::module& module) { R"doc(Divides :attr:`input_tensor_a` and :attr:`input_tensor_b` and returns the tensor with the same layout as :attr:`input_tensor_a`)doc", R"doc(\mathrm{{output\_tensor}}_i = (\mathrm{{input\_tensor\_a}}_i / \mathrm{{input\_tensor\_b}}_i))doc"); + detail::bind_binary_unary_operation( + module, + ttnn::rsub, + R"doc(Subtracts :attr:`input_tensor_a` from :attr:`input_tensor_b` and returns the tensor with the same layout as :attr:`input_tensor_a`)doc", + R"doc(\mathrm{{output\_tensor}}_i = \mathrm{{input\_tensor\_b}}_i - \mathrm{{input\_tensor\_a}}_i)doc", + ". ", + R"doc(BFLOAT16, BFLOAT8_B)doc"); + + detail::bind_bitwise_binary_ops_operation( + module, + ttnn::bitwise_and, + R"doc(Perform bitwise_and operation on :attr:`input_tensor_a` and :attr:`input_tensor_b` and returns the tensor with the same layout as :attr:`input_tensor_a`)doc", + R"doc(\mathrm{{output\_tensor}}_i = \mathrm{{input\_tensor\_b}}_i \verb|bitwise_and| \mathrm{{input\_tensor\_a}}_i)doc", + ". 
", + R"doc(INT32)doc"); + + detail::bind_bitwise_binary_ops_operation( + module, + ttnn::bitwise_or, + R"doc(Perform bitwise_or operation on :attr:`input_tensor_a` and :attr:`input_tensor_b` and returns the tensor with the same layout as :attr:`input_tensor_a`)doc", + R"doc(\mathrm{{output\_tensor}}_i = \mathrm{{input\_tensor\_b}}_i \verb|bitwise_or| \mathrm{{input\_tensor\_a}}_i)doc", + ". ", + R"doc(INT32)doc"); + + detail::bind_bitwise_binary_ops_operation( + module, + ttnn::bitwise_xor, + R"doc(Perform bitwise_xor operation on :attr:`input_tensor_a` and :attr:`input_tensor_b` and returns the tensor with the same layout as :attr:`input_tensor_a`)doc", + R"doc(\mathrm{{output\_tensor}}_i = \mathrm{{input\_tensor\_b}}_i \verb|bitwise_xor| \mathrm{{input\_tensor\_a}}_i)doc", + ". ", + R"doc(INT32)doc"); + auto prim_module = module.def_submodule("prim", "Primitive binary operations"); detail::bind_primitive_binary_operation( diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/common/binary_op_types.hpp b/ttnn/cpp/ttnn/operations/eltwise/binary/common/binary_op_types.hpp index db1a36836aed..35c306ad12cb 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary/common/binary_op_types.hpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/common/binary_op_types.hpp @@ -24,6 +24,11 @@ enum class BinaryOpType { LOGICAL_XOR, LDEXP, LOGADDEXP2, - DIV_FAST + DIV_FAST, + RSUB, + POWER, + BITWISE_XOR, + BITWISE_AND, + BITWISE_OR }; } diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/common/binary_op_utils.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary/common/binary_op_utils.cpp index da7b72cda757..644baf5aec3a 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary/common/binary_op_utils.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/common/binary_op_utils.cpp @@ -67,6 +67,12 @@ std::map get_defines( op_binary_type = "EltwiseBinaryType::ELWADD"; defines.merge(get_defines(UnaryOpType::LOG, std::nullopt, "0", idst)); break; + case BinaryOpType::RSUB: + // rsub(a,b) = b - a + 
defines.merge(get_defines(UnaryOpType::NEG, std::nullopt, "PRE_IN0_0")); + op_name = "add_tiles"; + op_binary_type = "EltwiseBinaryType::ELWADD"; + break; case BinaryOpType::DIV_FAST: // Divide by a non-zero tensor defines.merge(get_defines(UnaryOpType::RECIP, std::nullopt, "PRE_IN1_0")); @@ -159,4 +165,128 @@ std::map get_defines( return defines; } +std::map get_defines_fp32( + BinaryOpType op_type, + const std::optional input_a_dtype, + const std::optional input_b_dtype, + const std::optional>& fused_activations, + const std::optional& input_tensor_a_activation) { + std::map new_defines; + std::string op_name = "sub_binary_tile"; + std::string idst1 = "i*2"; // tile index for input A in dst and final output + std::string idst2 = "i*2+1"; // tile index for input B in dst + std::string idst = "i"; // tile index for input prescaling + + using ttnn::operations::unary::utils::get_defines; + switch (op_type) { + case BinaryOpType::ADD: + if (input_a_dtype == DataType::INT32 && input_b_dtype == DataType::INT32) { + new_defines.insert({"ADD_INT32_INIT", fmt::format("add_int32_tile_init();")}); + op_name = "add_int32_tile"; + } else { + op_name = "add_binary_tile"; + } + break; + case BinaryOpType::SUB: op_name = "sub_binary_tile"; break; + case BinaryOpType::MUL: op_name = "mul_binary_tile"; break; + case BinaryOpType::RSUB: op_name = "rsub_binary_tile"; break; + case BinaryOpType::POWER: op_name = "power_binary_tile"; break; + case BinaryOpType::DIV_FAST: op_name = "div_binary_tile"; break; + case BinaryOpType::BITWISE_AND: + new_defines.insert({"BITWISE_INIT", fmt::format("binary_bitwise_tile_init();")}); + op_name = "and_binary_tile"; + break; + case BinaryOpType::BITWISE_OR: + new_defines.insert({"BITWISE_INIT", fmt::format("binary_bitwise_tile_init();")}); + op_name = "or_binary_tile"; + break; + case BinaryOpType::BITWISE_XOR: + new_defines.insert({"BITWISE_INIT", fmt::format("binary_bitwise_tile_init();")}); + op_name = "xor_binary_tile"; + break; + case 
BinaryOpType::LOGADDEXP: + // PRE_IN0_0 ===> Applies prescaling for first input + // PRE_IN1_0 ====> Applies prescaling for second input + new_defines.merge(get_defines(UnaryOpType::EXP, std::vector{0}, "PRE_IN0_0")); + new_defines.merge(get_defines(UnaryOpType::EXP, std::vector{0}, "PRE_IN1_0")); + op_name = "add_binary_tile"; + new_defines.merge(get_defines(UnaryOpType::LOG, std::nullopt, "0", idst1)); + break; + case BinaryOpType::LOGADDEXP2: + new_defines.merge(get_defines(UnaryOpType::EXP2, std::nullopt, "PRE_IN0_0")); + new_defines.merge(get_defines(UnaryOpType::EXP2, std::nullopt, "PRE_IN1_0")); + op_name = "add_binary_tile"; + new_defines.merge(get_defines(UnaryOpType::LOG2, std::nullopt, "0", idst1)); + break; + case BinaryOpType::LDEXP: + new_defines.merge(get_defines(UnaryOpType::EXP2, std::nullopt, "PRE_IN1_0")); + op_name = "mul_binary_tile"; + break; + case BinaryOpType::SQUARED_DIFFERENCE: + op_name = "sub_binary_tile"; + new_defines.merge(get_defines(UnaryOpType::SQUARE, std::nullopt, "0", idst1)); + break; + case BinaryOpType::LOGICAL_AND: + op_name = "mul_binary_tile"; + new_defines.merge(get_defines(UnaryOpType::NEZ, std::nullopt, "0", idst1)); + break; + case BinaryOpType::BIAS_GELU: + op_name = "add_binary_tile"; + new_defines.merge(get_defines(UnaryOpType::GELU, std::vector{0}, "0", idst1)); + break; + case BinaryOpType::LOGICAL_OR: + new_defines.merge(get_defines(UnaryOpType::NEZ, std::nullopt, "PRE_IN0_0")); + new_defines.merge(get_defines(UnaryOpType::NEZ, std::nullopt, "PRE_IN1_0")); + op_name = "add_binary_tile"; + new_defines.merge(get_defines(UnaryOpType::GTZ, std::nullopt, "0", idst1)); + break; + case BinaryOpType::LOGICAL_XOR: + new_defines.merge(get_defines(UnaryOpType::NEZ, std::nullopt, "PRE_IN0_0")); + new_defines.merge(get_defines(UnaryOpType::NEZ, std::nullopt, "PRE_IN1_0")); + op_name = "sub_binary_tile"; + new_defines.merge(get_defines(UnaryOpType::NEZ, std::nullopt, "0", idst1)); + break; + // applied on A-B + case 
BinaryOpType::GT: + op_name = "sub_binary_tile"; + new_defines.merge(get_defines(UnaryOpType::GTZ, std::nullopt, "0", idst1)); break; + case BinaryOpType::LT: + op_name = "sub_binary_tile"; + new_defines.merge(get_defines(UnaryOpType::LTZ, std::nullopt, "0", idst1)); break; + case BinaryOpType::GTE: + op_name = "sub_binary_tile"; + new_defines.merge(get_defines(UnaryOpType::GEZ, std::nullopt, "0", idst1)); break; + case BinaryOpType::LTE: + op_name = "sub_binary_tile"; + new_defines.merge(get_defines(UnaryOpType::LEZ, std::nullopt, "0", idst1)); break; + case BinaryOpType::EQ: + op_name = "sub_binary_tile"; + new_defines.merge(get_defines(UnaryOpType::EQZ, std::nullopt, "0", idst1)); break; + case BinaryOpType::NE: + op_name = "sub_binary_tile"; + new_defines.merge(get_defines(UnaryOpType::NEZ, std::nullopt, "0", idst1)); break; + default: + tt::log_debug(tt::LogOp, "Undefined op type {}", op_type); + TT_FATAL(false, "Undefined op type for binary sfpu operation {}", op_type); + } + + new_defines.insert({"BINARY_SFPU_OP", fmt::format("{}({}, {});", op_name, idst1, idst2)}); + + if (fused_activations.has_value()) { + if (op_type == BinaryOpType::ADD and fused_activations.value().size() == 1 and + fused_activations.value().at(0).op_type == UnaryOpType::RELU) { + new_defines["PACK_RELU"] = "1"; + } else { + new_defines.merge(ttnn::operations::unary::utils::get_block_defines(fused_activations.value(), "0", idst1)); + } + } + + if (input_tensor_a_activation.has_value()) { + new_defines.merge(ttnn::operations::unary::utils::get_defines( + input_tensor_a_activation.value().op_type, std::nullopt, "PRE_IN0_0", idst)); + } + + return new_defines; +} + } // namespace ttnn::operations::binary::utils diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/common/binary_op_utils.hpp b/ttnn/cpp/ttnn/operations/eltwise/binary/common/binary_op_utils.hpp index be41ea68bd35..cb4f598fcf68 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary/common/binary_op_utils.hpp +++ 
b/ttnn/cpp/ttnn/operations/eltwise/binary/common/binary_op_utils.hpp @@ -22,4 +22,10 @@ std::map get_defines( const std::optional& fused_activations = std::nullopt, const std::optional& input_tensor_a_activation = std::nullopt); +std::map get_defines_fp32( + BinaryOpType op_type, + const std::optional in_a_dtype = std::nullopt, + const std::optional in_b_dtype = std::nullopt, + const std::optional& fused_activations = std::nullopt, + const std::optional& input_tensor_a_activation = std::nullopt); } diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_composite_op.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_composite_op.cpp index ecbd4efc6278..f09d2b08e8ac 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_composite_op.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_composite_op.cpp @@ -131,7 +131,7 @@ Tensor ExecuteMaximum::invoke( Tensor ExecuteMaximum::invoke( const Tensor& input_a, float value, const std::optional& output_mem_config) { - Tensor t_diff = ttnn::rsub(input_a, value, output_mem_config); + Tensor t_diff = ttnn::rsub_unary(input_a, value, output_mem_config); Tensor result = ttnn::where(t_diff, value, input_a); return result; } @@ -361,7 +361,7 @@ Tensor ExecuteBinaryRemainder::invoke( output_mem_config); result = ttnn::where(ttnn::ge(result, b), ttnn::subtract(result, b), result); result = ttnn::where(ttnn::ltz(b), ttnn::add(result, b), result); - result = ttnn::where(ttnn::eq(a, b, std::nullopt, output_mem_config), 0.0f, result); + result = ttnn::where(ttnn::eq(input_a, input_b, std::nullopt, output_mem_config), 0.0f, result); return typecast(result, input_dtype); } @@ -382,7 +382,7 @@ Tensor ExecuteBinaryFmod::invoke( Tensor div_res = typecast(ttnn::div(input_a, input_b, true, "trunc", output_mem_config), DataType::FLOAT32); Tensor result = ttnn::subtract(a, ttnn::multiply(div_res, b, std::nullopt, output_mem_config), std::nullopt, output_mem_config); - result = ttnn::where(ttnn::eq(a, b, 
std::nullopt, output_mem_config), 0.0f, result); + result = ttnn::where(ttnn::eq(input_a, input_b, std::nullopt, output_mem_config), 0.0f, result); return typecast(result, input_dtype); } @@ -526,4 +526,254 @@ Tensor ExecuteLCM::invoke( return ttnn::abs(result); } +Tensor ExecuteRsub::invoke( + uint8_t queue_id, + const Tensor& input_tensor_a, + const Tensor& input_tensor_b, + const std::optional& output_dtype, + const std::optional& memory_config, + const std::optional& optional_output_tensor, + const std::optional& activations, + const std::optional& input_tensor_a_activation) { + + return ttnn::rsub_binary( + queue_id, + input_tensor_a, + input_tensor_b, + output_dtype, + memory_config, + optional_output_tensor, + activations, + input_tensor_a_activation); +} + +Tensor ExecuteRsub::invoke( + const Tensor& input_tensor_a, + const Tensor& input_tensor_b, + const std::optional& output_dtype, + const std::optional& memory_config, + const std::optional& optional_output_tensor, + const std::optional& activations, + const std::optional& input_tensor_a_activation) { + + return ExecuteRsub::invoke( + ttnn::DefaultQueueId, + input_tensor_a, + input_tensor_b, + output_dtype, + memory_config, + optional_output_tensor, + activations, + input_tensor_a_activation); +} + +Tensor ExecuteRsub::invoke( + uint8_t queue_id, + const Tensor& input_tensor_a, + const float input_b, + const std::optional& memory_config, + const std::optional& optional_output_tensor) { + + return ttnn::rsub_unary( + queue_id, + input_tensor_a, + input_b, + memory_config, + optional_output_tensor); +} + +Tensor ExecuteRsub::invoke( + const Tensor& input_tensor_a, + const float input_b, + const std::optional& memory_config, + const std::optional& optional_output_tensor) { + + return ExecuteRsub::invoke( + ttnn::DefaultQueueId, + input_tensor_a, + input_b, + memory_config, + std::move(optional_output_tensor)); +} + +// Bitwise AND +Tensor ExecuteBitwiseAnd::invoke( + uint8_t queue_id, + const Tensor& 
input_tensor_a, + const Tensor& input_tensor_b, + const std::optional& memory_config, + const std::optional& optional_output_tensor) { + + return ttnn::bitwise_and_binary( + queue_id, + input_tensor_a, + input_tensor_b, + std::nullopt, + memory_config, + optional_output_tensor); +} + +Tensor ExecuteBitwiseAnd::invoke( + const Tensor& input_tensor_a, + const Tensor& input_tensor_b, + const std::optional& memory_config, + const std::optional& optional_output_tensor) { + + return ExecuteBitwiseAnd::invoke( + ttnn::DefaultQueueId, + input_tensor_a, + input_tensor_b, + memory_config, + optional_output_tensor); +} + +Tensor ExecuteBitwiseAnd::invoke( + uint8_t queue_id, + const Tensor& input_tensor_a, + const int32_t input_b, + const std::optional& memory_config, + const std::optional& optional_output_tensor) { + + return ttnn::bitwise_and_unary( + queue_id, + input_tensor_a, + input_b, + memory_config, + optional_output_tensor); +} + +Tensor ExecuteBitwiseAnd::invoke( + const Tensor& input_tensor_a, + const int32_t input_b, + const std::optional& memory_config, + const std::optional& optional_output_tensor) { + + return ExecuteBitwiseAnd::invoke( + ttnn::DefaultQueueId, + input_tensor_a, + input_b, + memory_config, + std::move(optional_output_tensor)); +} + +// Bitwise OR +Tensor ExecuteBitwiseOr::invoke( + uint8_t queue_id, + const Tensor& input_tensor_a, + const Tensor& input_tensor_b, + const std::optional& memory_config, + const std::optional& optional_output_tensor) { + + return ttnn::bitwise_or_binary( + queue_id, + input_tensor_a, + input_tensor_b, + std::nullopt, + memory_config, + optional_output_tensor); +} + +Tensor ExecuteBitwiseOr::invoke( + const Tensor& input_tensor_a, + const Tensor& input_tensor_b, + const std::optional& memory_config, + const std::optional& optional_output_tensor) { + + return ExecuteBitwiseOr::invoke( + ttnn::DefaultQueueId, + input_tensor_a, + input_tensor_b, + memory_config, + optional_output_tensor); +} + +Tensor 
ExecuteBitwiseOr::invoke( + uint8_t queue_id, + const Tensor& input_tensor_a, + const int32_t input_b, + const std::optional& memory_config, + const std::optional& optional_output_tensor) { + + return ttnn::bitwise_or_unary( + queue_id, + input_tensor_a, + input_b, + memory_config, + optional_output_tensor); +} + +Tensor ExecuteBitwiseOr::invoke( + const Tensor& input_tensor_a, + const int32_t input_b, + const std::optional& memory_config, + const std::optional& optional_output_tensor) { + + return ExecuteBitwiseOr::invoke( + ttnn::DefaultQueueId, + input_tensor_a, + input_b, + memory_config, + std::move(optional_output_tensor)); +} + +// Bitwise XOR +Tensor ExecuteBitwiseXor::invoke( + uint8_t queue_id, + const Tensor& input_tensor_a, + const Tensor& input_tensor_b, + const std::optional& memory_config, + const std::optional& optional_output_tensor) { + + return ttnn::bitwise_xor_binary( + queue_id, + input_tensor_a, + input_tensor_b, + std::nullopt, + memory_config, + optional_output_tensor); +} + +Tensor ExecuteBitwiseXor::invoke( + const Tensor& input_tensor_a, + const Tensor& input_tensor_b, + const std::optional& memory_config, + const std::optional& optional_output_tensor) { + + return ExecuteBitwiseXor::invoke( + ttnn::DefaultQueueId, + input_tensor_a, + input_tensor_b, + memory_config, + optional_output_tensor); +} + +Tensor ExecuteBitwiseXor::invoke( + uint8_t queue_id, + const Tensor& input_tensor_a, + const int32_t input_b, + const std::optional& memory_config, + const std::optional& optional_output_tensor) { + + return ttnn::bitwise_xor_unary( + queue_id, + input_tensor_a, + input_b, + memory_config, + optional_output_tensor); +} + +Tensor ExecuteBitwiseXor::invoke( + const Tensor& input_tensor_a, + const int32_t input_b, + const std::optional& memory_config, + const std::optional& optional_output_tensor) { + + return ExecuteBitwiseXor::invoke( + ttnn::DefaultQueueId, + input_tensor_a, + input_b, + memory_config, + std::move(optional_output_tensor)); 
+} + } // namespace ttnn::operations::binary diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_device_operation.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_device_operation.cpp index 692edc87486d..ce524ac4ae6c 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_device_operation.cpp @@ -13,6 +13,38 @@ namespace ttnn::operations::binary { +namespace utils { + bool is_binary_sfpu_op(BinaryOpType val, DataType a, DataType b) { + switch (val) { + case BinaryOpType::ADD: return ((a == DataType::FLOAT32 && b == DataType::FLOAT32) || (a == DataType::INT32 && b == DataType::INT32)); + case BinaryOpType::SUB: + case BinaryOpType::MUL: + // case BinaryOpType::DIV_FAST: will be enabled after #15780 is resolved + case BinaryOpType::RSUB: + case BinaryOpType::LOGADDEXP: + case BinaryOpType::LOGADDEXP2: + case BinaryOpType::LDEXP: + case BinaryOpType::SQUARED_DIFFERENCE: + case BinaryOpType::LOGICAL_OR: + case BinaryOpType::LOGICAL_XOR: + case BinaryOpType::LOGICAL_AND: + case BinaryOpType::BIAS_GELU: + case BinaryOpType::GT: + case BinaryOpType::LT: + case BinaryOpType::GTE: + case BinaryOpType::LTE: + case BinaryOpType::EQ: + case BinaryOpType::NE: return (a == DataType::FLOAT32 && b == DataType::FLOAT32); + case BinaryOpType::BITWISE_XOR: + case BinaryOpType::BITWISE_AND: + case BinaryOpType::BITWISE_OR: return (a == DataType::INT32 && b == DataType::INT32); + case BinaryOpType::POWER: return true; + default: return false; + } + return false; +} +} // utils + BinaryDeviceOperation::program_factory_t BinaryDeviceOperation::select_program_factory( const operation_attributes_t& operation_attributes, const tensor_args_t& tensor_args) { ZoneScopedN("BinaryDeviceOperation::select_program_factory"); @@ -31,7 +63,17 @@ BinaryDeviceOperation::program_factory_t BinaryDeviceOperation::select_program_f auto width_b = input_shape_b[-1]; if (height_a == height_b 
and width_a == width_b) { - return ElementWiseMultiCore{}; + bool device_check = tensor_args.input_tensor_a.device()->arch() != tt::ARCH::GRAYSKULL; + BinaryOpType op = operation_attributes.binary_op_type; + DataType dtype1 = tensor_args.input_tensor_a.get_dtype(); + DataType dtype2 = tensor_args.input_tensor_b->get_dtype(); + bool sfpu_op_check = utils::is_binary_sfpu_op(op, dtype1, dtype2); + + if(device_check && sfpu_op_check){ + return ElementWiseMultiCoreSfpu{}; + } else { + return ElementWiseMultiCore{}; + } } if (height_b == 1 or width_b == 1) { if (height_b == 1 and width_b == 1) { @@ -192,7 +234,7 @@ BinaryDeviceOperation::spec_return_value_t BinaryDeviceOperation::compute_output auto output_shape = compute_broadcasted_output(input_shape_a, input_shape_b); auto program_factory = select_program_factory(operation_attributes, tensor_args); - if (std::holds_alternative(program_factory)) { + if (std::holds_alternative(program_factory) or std::holds_alternative(program_factory)) { const auto& input_tensor_b = *tensor_args.input_tensor_b; if (operation_attributes.memory_config.is_sharded()) { ShardSpec shard_spec{CoreRangeSet(), {0, 0}}; diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_device_operation.hpp b/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_device_operation.hpp index d4c42ca98100..11a77a206e9f 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_device_operation.hpp @@ -77,6 +77,32 @@ struct BinaryDeviceOperation { tensor_return_value_t& tensor_return_value); }; + struct ElementWiseMultiCoreSfpu { + struct shared_variables_t { + KernelHandle binary_reader_kernel_id; + KernelHandle unary_writer_kernel_id; + KernelHandle eltwise_binary_kernel_id; + CBHandle cb_src0; + CBHandle cb_src1; + CBHandle cb_output; + CoreCoord compute_with_storage_grid_size; + uint32_t src0_single_tile_size; + uint32_t src1_single_tile_size; + uint32_t 
dst_single_tile_size; + }; + using cached_program_t = ttnn::device_operation::CachedProgram; + + static cached_program_t create( + const operation_attributes_t& operation_attributes, + const tensor_args_t& tensor_args, + tensor_return_value_t& tensor_return_value); + + static void override_runtime_arguments( + cached_program_t& cached_program, + const operation_attributes_t& operation_attributes, + const tensor_args_t& tensor_args, + tensor_return_value_t& tensor_return_value); + }; struct BroadcastWidthMultiCore { struct shared_variables_t { KernelHandle binary_reader_kernel_id; @@ -192,6 +218,7 @@ struct BinaryDeviceOperation { using program_factory_t = std::variant< ElementWiseMultiCore, + ElementWiseMultiCoreSfpu, BroadcastWidthMultiCore, BroadcastHeightMultiCore, BroadcastHeightAndWidthMultiCore, diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/device/element_wise_multi_core_sfpu_pgm_factory.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary/device/element_wise_multi_core_sfpu_pgm_factory.cpp new file mode 100644 index 000000000000..f23f9b7fe608 --- /dev/null +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/device/element_wise_multi_core_sfpu_pgm_factory.cpp @@ -0,0 +1,521 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "binary_device_operation.hpp" +#include "ttnn/operations/eltwise/unary/common/unary_op_types.hpp" + +#include "tt_metal/common/work_split.hpp" + +#include "tt_metal/common/constants.hpp" +#include "tt_metal/detail/util.hpp" +#include "tt_metal/host_api.hpp" + +namespace ttnn::operations::binary { + +template +inline __attribute__((always_inline)) void set_eltwise_binary_sfpu_runtime_args( + Program& program, + const Tensor& a, + const Tensor& b, + const Tensor& output, + const KernelHandle binary_reader_kernel_id, + const KernelHandle unary_writer_kernel_id, + const KernelHandle eltwise_binary_kernel_id, + const CBHandle cb_src0, + const CBHandle cb_src1, + const CBHandle cb_output, + const CoreCoord compute_with_storage_grid_size, + const uint32_t src0_single_tile_size, + const uint32_t src1_single_tile_size, + const uint32_t dst_single_tile_size) { + using namespace tt; + using namespace tt::tt_metal; + using namespace tt::constants; + + auto src_buffer_a = a.buffer(); + auto src_buffer_b = b.buffer(); + auto dst_buffer = output.buffer(); + + CoreRangeSet all_cores, core_group_1, core_group_2; + + std::optional shard_spec = std::nullopt; + std::optional sharded_layout = std::nullopt; + bool src0_sharded = a.memory_config().is_sharded(); + bool src1_sharded = b.memory_config().is_sharded(); + bool out_sharded = output.memory_config().is_sharded(); + + bool block_or_width_sharded = false; + if (src0_sharded) { + shard_spec = a.shard_spec().value(); + block_or_width_sharded = a.memory_config().memory_layout != TensorMemoryLayout::HEIGHT_SHARDED; + sharded_layout = a.memory_config().memory_layout; + } else if (src1_sharded) { + shard_spec = b.shard_spec().value(); + block_or_width_sharded = b.memory_config().memory_layout != TensorMemoryLayout::HEIGHT_SHARDED; + sharded_layout = b.memory_config().memory_layout; + } else if (out_sharded) { + shard_spec = output.shard_spec().value(); + 
block_or_width_sharded = output.memory_config().memory_layout != TensorMemoryLayout::HEIGHT_SHARDED; + sharded_layout = output.memory_config().memory_layout; + } + + uint32_t num_tiles = a.volume() / TILE_HW; + + uint32_t num_cores_x = compute_with_storage_grid_size.x; + uint32_t num_cores_y = compute_with_storage_grid_size.y; + uint32_t num_cores, num_tiles_per_core_group_1, num_tiles_per_core_group_2; + uint32_t num_cores_total = num_cores_x * num_cores_y; + + uint32_t block_size_per_core_group_1 = 1, block_size_per_core_group_2 = 1, max_block_size = 1; + + uint32_t block_cnt_per_core_group_1, block_cnt_per_core_group_2; + + bool row_major; + uint32_t block_height = 0, block_width = 0, block_size = 0, output_width = 0, last_unpadded_block_height = 0, + last_unpadded_block_width = 0; + CoreCoord end_core; + std::vector cores; + + if (shard_spec.has_value()) { + all_cores = shard_spec.value().grid; + num_cores = all_cores.num_cores(); + core_group_1 = all_cores; + core_group_2 = CoreRangeSet(); + num_tiles_per_core_group_1 = shard_spec.value().shape[0] * shard_spec.value().shape[1] / TILE_HW; + num_tiles_per_core_group_2 = 0; + block_size_per_core_group_1 = find_max_block_size(num_tiles_per_core_group_1); + max_block_size = block_size_per_core_group_1; + + block_cnt_per_core_group_1 = num_tiles_per_core_group_1 / block_size_per_core_group_1; + block_cnt_per_core_group_2 = num_tiles_per_core_group_2 / block_size_per_core_group_2; + row_major = shard_spec.value().orientation == ShardOrientation::ROW_MAJOR; + block_height = shard_spec.value().shape[0] / TILE_HEIGHT; + block_width = shard_spec.value().shape[1] / TILE_WIDTH; + if (block_or_width_sharded) { + block_size = block_width * block_height; + end_core = (*shard_spec.value().grid.ranges().begin()).end_coord; + output_width = output.get_legacy_shape()[-1] / TILE_WIDTH; + uint32_t output_height = output.volume() / output.get_legacy_shape()[-1] / TILE_HEIGHT; + last_unpadded_block_height = block_height - 
(round_up(output_height, block_height) - output_height); + last_unpadded_block_width = block_width - (round_up(output_width, block_width) - output_width); + } + auto bbox = core_group_1.bounding_box(); + cores = grid_to_cores_with_noop(bbox.end_coord.x, bbox.end_coord.y, num_cores_x, num_cores_y, row_major); + } else { + row_major = true; + std::tie( + num_cores, all_cores, core_group_1, core_group_2, num_tiles_per_core_group_1, num_tiles_per_core_group_2) = + tt::tt_metal::split_work_to_cores(compute_with_storage_grid_size, num_tiles, row_major); + block_cnt_per_core_group_1 = num_tiles_per_core_group_1; + block_cnt_per_core_group_2 = num_tiles_per_core_group_2; + cores = grid_to_cores(num_cores_total, num_cores_x, num_cores_y, row_major); + } + + uint32_t g1_numcores = core_group_1.num_cores(); + uint32_t g2_numcores = core_group_2.num_cores(); + + std::vector> binary_reader_args; + std::vector> eltwise_binary_args; + std::vector> unary_writer_args; + if constexpr (initialize_args) { + binary_reader_args = {cores.size(), std::vector(7)}; + eltwise_binary_args = {cores.size(), std::vector(2)}; + if (block_or_width_sharded and not out_sharded) { + unary_writer_args = {cores.size(), std::vector(7)}; + } else { + unary_writer_args = {cores.size(), std::vector(3)}; + } + } + + auto& cached_reader_args = GetRuntimeArgs(program, binary_reader_kernel_id); + auto& cached_eltwise_args = GetRuntimeArgs(program, eltwise_binary_kernel_id); + auto& cached_writer_args = GetRuntimeArgs(program, unary_writer_kernel_id); + + for (uint32_t i = 0, num_tiles_read = 0; i < num_cores_total; ++i) { + const CoreCoord& core = cores.at(i); + uint32_t num_tiles_per_core = 0; + uint32_t block_cnt_per_core = 0; + uint32_t block_size_per_core = 0; + uint32_t num_shardes_per_height = 0; + uint32_t num_shardes_per_width = 0; + uint32_t start_id = 0; + if (shard_spec.has_value()) { + if (sharded_layout == tt::tt_metal::TensorMemoryLayout::HEIGHT_SHARDED) { + num_shardes_per_height = num_cores; + 
num_shardes_per_width = 1; + } else if (sharded_layout == tt::tt_metal::TensorMemoryLayout::WIDTH_SHARDED) { + num_shardes_per_width = num_cores; + num_shardes_per_height = 1; + } else { // block sharded + auto bbox = core_group_1.bounding_box(); + if (shard_spec.value().orientation == ShardOrientation::ROW_MAJOR) { + num_shardes_per_height = bbox.end_coord.y - bbox.start_coord.y + 1; + num_shardes_per_width = bbox.end_coord.x - bbox.start_coord.x + 1; + } else { + num_shardes_per_height = bbox.end_coord.x - bbox.start_coord.x + 1; + num_shardes_per_width = bbox.end_coord.y - bbox.start_coord.y + 1; + } + } + start_id = (i / num_shardes_per_width) * (block_height * block_width * num_shardes_per_width) + + (i % num_shardes_per_width) * block_width; + } else { + start_id = num_tiles_read; + } + + if (i < g1_numcores) { + num_tiles_per_core = num_tiles_per_core_group_1; + block_cnt_per_core = block_cnt_per_core_group_1; + block_size_per_core = block_size_per_core_group_1; + } else if (i < num_cores) { + num_tiles_per_core = num_tiles_per_core_group_2; + block_cnt_per_core = block_cnt_per_core_group_2; + block_size_per_core = block_size_per_core_group_2; + } else { + // Zero out non-working cores RT args. Only necessary in override + // since initialization pushes zero vectors to unused cores. 
+ if constexpr (!initialize_args) { + auto& reader_args = cached_reader_args.at(core.x).at(core.y); + reader_args[2] = 0; + auto& eltwise_args = cached_eltwise_args.at(core.x).at(core.y); + eltwise_args[0] = 0; + auto& writer_args = cached_writer_args.at(core.x).at(core.y); + writer_args[1] = 0; + } + continue; + } + if constexpr (initialize_args) { + binary_reader_args[i] = { + src_buffer_a->address(), + src_buffer_b->address(), + num_tiles_per_core, + start_id, + block_height, + block_width, + num_shardes_per_width, + num_shardes_per_width}; + eltwise_binary_args[i] = {block_cnt_per_core, block_size_per_core}; + } else { + auto& reader_args = cached_reader_args.at(core.x).at(core.y); + reader_args[0] = src_buffer_a->address(); + reader_args[1] = src_buffer_b->address(); + reader_args[2] = num_tiles_per_core; + reader_args[3] = start_id; + reader_args[4] = block_height; + reader_args[5] = block_width; + reader_args[6] = num_shardes_per_width; + auto& eltwise_args = cached_eltwise_args.at(core.x).at(core.y); + eltwise_args[0] = block_cnt_per_core; + eltwise_args[1] = block_size_per_core; + } + if (block_or_width_sharded and not out_sharded) { + uint32_t unpadded_block_height = block_height; + uint32_t unpadded_block_width = block_width; + if (row_major) { + if (core.x == end_core.x) { + unpadded_block_width = last_unpadded_block_width; + } + if (core.y == end_core.y) { + unpadded_block_height = last_unpadded_block_height; + } + } else { + if (core.y == end_core.y) { + unpadded_block_width = last_unpadded_block_width; + } + if (core.x == end_core.x) { + unpadded_block_height = last_unpadded_block_height; + } + } + if constexpr (initialize_args) { + unary_writer_args[i] = { + dst_buffer->address(), + block_height, + block_width, + unpadded_block_height, + unpadded_block_width, + output_width, + block_size, + (i / num_shardes_per_width) * (block_height * block_width * num_shardes_per_width) + + (i % num_shardes_per_width) * block_width, + 0}; + } else { + auto& 
writer_args = cached_writer_args.at(core.x).at(core.y); + writer_args[0] = dst_buffer->address(); + writer_args[1] = block_height; + writer_args[2] = block_width; + writer_args[3] = unpadded_block_height; + writer_args[4] = unpadded_block_width; + writer_args[5] = output_width; + writer_args[6] = block_size; + writer_args[7] = (i / num_shardes_per_width) * (block_height * block_width * num_shardes_per_width) + + (i % num_shardes_per_width) * block_width; + writer_args[8] = 0; + } + } else { + if constexpr (initialize_args) { + unary_writer_args[i] = {dst_buffer->address(), num_tiles_per_core, num_tiles_read}; + } else { + auto& writer_args = cached_writer_args.at(core.x).at(core.y); + writer_args[0] = dst_buffer->address(); + writer_args[1] = num_tiles_per_core; + writer_args[2] = num_tiles_read; + } + } + num_tiles_read += num_tiles_per_core; + } + + if constexpr (initialize_args) { + SetRuntimeArgs(program, binary_reader_kernel_id, cores, binary_reader_args); + SetRuntimeArgs(program, eltwise_binary_kernel_id, cores, eltwise_binary_args); + SetRuntimeArgs(program, unary_writer_kernel_id, cores, unary_writer_args); + } + + if (src0_sharded) { + UpdateDynamicCircularBufferAddressAndTotalSize( + program, cb_src0, *src_buffer_a, num_tiles_per_core_group_1 * src0_single_tile_size); + } + if (src1_sharded) { + UpdateDynamicCircularBufferAddressAndTotalSize( + program, cb_src1, *src_buffer_b, num_tiles_per_core_group_1 * src1_single_tile_size); + } + if (out_sharded) { + UpdateDynamicCircularBufferAddressAndTotalSize( + program, cb_output, *dst_buffer, num_tiles_per_core_group_1 * dst_single_tile_size); + } +} +BinaryDeviceOperation::ElementWiseMultiCoreSfpu::cached_program_t +BinaryDeviceOperation::ElementWiseMultiCoreSfpu::create( + const operation_attributes_t& operation_attributes, + const tensor_args_t& tensor_args, + tensor_return_value_t& tensor_return_value) { + using namespace tt; + using namespace tt::tt_metal; + using ttnn::operations::unary::UnaryWithParam; 
+ using namespace tt::constants; + + const auto& a = tensor_args.input_tensor_a; + const auto& b = tensor_args.input_tensor_b; + auto& output = tensor_return_value; + const auto& op_type = operation_attributes.binary_op_type; + + std::vector fused_activations = + operation_attributes.activations.value_or(std::vector{}); + + Program program{}; + + tt::DataFormat src0_cb_data_format = tt_metal::datatype_to_dataformat_converter(a.get_dtype()); + uint32_t src0_single_tile_size = tt_metal::detail::TileSize(src0_cb_data_format); + tt::DataFormat src1_cb_data_format = tt_metal::datatype_to_dataformat_converter(b->get_dtype()); + uint32_t src1_single_tile_size = tt_metal::detail::TileSize(src1_cb_data_format); + tt::DataFormat dst_cb_data_format = tt_metal::datatype_to_dataformat_converter(output.get_dtype()); + uint32_t dst_single_tile_size = tt_metal::detail::TileSize(dst_cb_data_format); + + tt::DataFormat interim_cb0_format = src0_cb_data_format; + tt::DataFormat interim_cb1_format = src1_cb_data_format; + + tt_metal::Buffer* src0_buffer = a.buffer(); + tt_metal::Buffer* src1_buffer = b->buffer(); + + tt_metal::Device* device = a.device(); + + std::optional shard_spec = std::nullopt; + bool src0_sharded = a.memory_config().is_sharded(); + bool src1_sharded = b->memory_config().is_sharded(); + bool out_sharded = output.memory_config().is_sharded(); + + auto compute_with_storage_grid_size = device->compute_with_storage_grid_size(); + uint32_t num_cores_x = compute_with_storage_grid_size.x; + uint32_t num_cores_y = compute_with_storage_grid_size.y; + + bool block_or_width_sharded = false; + + if (src0_sharded) { + shard_spec = a.shard_spec().value(); + block_or_width_sharded = a.memory_config().memory_layout != TensorMemoryLayout::HEIGHT_SHARDED; + } else if (src1_sharded) { + shard_spec = b->shard_spec().value(); + block_or_width_sharded = b->memory_config().memory_layout != TensorMemoryLayout::HEIGHT_SHARDED; + } else if (out_sharded) { + shard_spec = 
output.shard_spec().value(); + block_or_width_sharded = output.memory_config().memory_layout != TensorMemoryLayout::HEIGHT_SHARDED; + } + + uint32_t max_block_size = 1, num_tiles_per_shard = 0; + if (shard_spec.has_value()) { + num_tiles_per_shard = shard_spec.value().shape[0] * shard_spec.value().shape[1] / TILE_HW; + max_block_size = find_max_block_size(num_tiles_per_shard); + } + + tt_metal::Buffer* dst_buffer = output.buffer(); + TT_ASSERT(dst_buffer != nullptr, "Output buffer should be allocated on device!"); + + auto all_device_cores = CoreRange({0, 0}, {num_cores_x - 1, num_cores_y - 1}); + + uint32_t src0_cb_index = tt::CBIndex::c_0; + uint32_t num_input_tiles = src0_sharded ? num_tiles_per_shard : 2 * max_block_size; + tt_metal::CircularBufferConfig cb_src0_config = + tt_metal::CircularBufferConfig(num_input_tiles * src0_single_tile_size, {{src0_cb_index, src0_cb_data_format}}) + .set_page_size(src0_cb_index, src0_single_tile_size); + if (src0_sharded) { + cb_src0_config = cb_src0_config.set_globally_allocated_address(*a.buffer()); + } + auto cb_src0 = tt_metal::CreateCircularBuffer(program, all_device_cores, cb_src0_config); + + uint32_t src1_cb_index = tt::CBIndex::c_1; + num_input_tiles = src1_sharded ? 
num_tiles_per_shard : 2 * max_block_size; + tt_metal::CircularBufferConfig cb_src1_config = + tt_metal::CircularBufferConfig(num_input_tiles * src1_single_tile_size, {{src1_cb_index, src1_cb_data_format}}) + .set_page_size(src1_cb_index, src1_single_tile_size); + if (src1_sharded) { + cb_src1_config = cb_src1_config.set_globally_allocated_address(*b->buffer()); + } + auto cb_src1 = tt_metal::CreateCircularBuffer(program, all_device_cores, cb_src1_config); + + std::map eltwise_defines = utils::get_defines_fp32( + op_type, a.get_dtype(), b->get_dtype(), fused_activations, operation_attributes.input_tensor_a_activation); + + uint32_t src0interim_cb_index = tt::CBIndex::c_3; + if (eltwise_defines.find("SFPU_OP_INIT_PRE_IN0_0") != eltwise_defines.end()) { + uint32_t interim0_single_tile_size = tt_metal::detail::TileSize(interim_cb0_format); + tt_metal::CircularBufferConfig cb_interm_config = + tt_metal::CircularBufferConfig( + max_block_size * interim0_single_tile_size, {{tt::CBIndex::c_3, interim_cb0_format}}) + .set_page_size(tt::CBIndex::c_3, interim0_single_tile_size); + auto cb_interm = tt_metal::CreateCircularBuffer(program, all_device_cores, cb_interm_config); + } + uint32_t src1interim_cb_index = tt::CBIndex::c_4; + if (eltwise_defines.find("SFPU_OP_INIT_PRE_IN1_0") != eltwise_defines.end()) { + uint32_t interim1_single_tile_size = tt_metal::detail::TileSize(interim_cb1_format); + tt_metal::CircularBufferConfig cb_interm2_config = + tt_metal::CircularBufferConfig( + max_block_size * interim1_single_tile_size, {{tt::CBIndex::c_4, interim_cb1_format}}) + .set_page_size(tt::CBIndex::c_4, interim1_single_tile_size); + auto cb_interm2 = tt_metal::CreateCircularBuffer(program, all_device_cores, cb_interm2_config); + } + + uint32_t output_cb_index = tt::CBIndex::c_2; + uint32_t num_output_tiles = (out_sharded || block_or_width_sharded) ? 
num_tiles_per_shard : 2 * max_block_size; + tt_metal::CircularBufferConfig cb_output_config = + tt_metal::CircularBufferConfig(num_output_tiles * dst_single_tile_size, {{output_cb_index, dst_cb_data_format}}) + .set_page_size(output_cb_index, dst_single_tile_size); + if (out_sharded) { + cb_output_config = cb_output_config.set_globally_allocated_address(*output.buffer()); + } + auto cb_output = tt_metal::CreateCircularBuffer(program, all_device_cores, cb_output_config); + + std::map reader_defines; + if (src0_sharded) { + reader_defines["IN0_SHARDED"] = "1"; + } + if (src1_sharded) { + reader_defines["IN1_SHARDED"] = "1"; + } + std::map writer_defines; + if (out_sharded) { + writer_defines["OUT_SHARDED"] = "1"; + } + + bool src0_is_dram = src0_buffer->buffer_type() == tt_metal::BufferType::DRAM ? 1 : 0; + bool src1_is_dram = src1_buffer->buffer_type() == tt_metal::BufferType::DRAM ? 1 : 0; + std::vector reader_compile_time_args = { + (std::uint32_t)src0_is_dram, (std::uint32_t)src1_is_dram, (std::uint32_t)block_or_width_sharded}; + + bool dst_is_dram = dst_buffer->buffer_type() == tt_metal::BufferType::DRAM ? 1 : 0; + std::vector writer_compile_time_args = {(std::uint32_t)output_cb_index, (std::uint32_t)dst_is_dram}; + + KernelHandle binary_reader_kernel_id = tt_metal::CreateKernel( + program, + "ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_binary_interleaved_start_id.cpp", + all_device_cores, + tt_metal::ReaderDataMovementConfig(reader_compile_time_args, reader_defines)); + + KernelHandle unary_writer_kernel_id = tt_metal::CreateKernel( + program, + (block_or_width_sharded and not out_sharded) + ? 
"ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/dataflow/" + "writer_unary_sharded_blocks_interleaved_start_id.cpp" + : "ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp", + all_device_cores, + tt_metal::WriterDataMovementConfig(writer_compile_time_args, writer_defines)); + + bool fp32_dest_acc_en = (dst_cb_data_format == tt::DataFormat::Float32) || + (dst_cb_data_format == tt::DataFormat::Int32) || + (dst_cb_data_format == tt::DataFormat::UInt32); + + std::vector unpack_to_dest_mode(NUM_CIRCULAR_BUFFERS, UnpackToDestMode::Default); + unpack_to_dest_mode[src0_cb_index] = UnpackToDestMode::UnpackToDestFp32; + unpack_to_dest_mode[src1_cb_index] = UnpackToDestMode::UnpackToDestFp32; + unpack_to_dest_mode[src0interim_cb_index] = UnpackToDestMode::UnpackToDestFp32; + unpack_to_dest_mode[src1interim_cb_index] = UnpackToDestMode::UnpackToDestFp32; + + + auto eltwise_binary_kernel_id = tt_metal::CreateKernel( + program, + "ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_sfpu_kernel.cpp", + all_device_cores, + tt_metal::ComputeConfig{ + .fp32_dest_acc_en = fp32_dest_acc_en, + .unpack_to_dest_mode = unpack_to_dest_mode, + .defines = eltwise_defines}); + + set_eltwise_binary_sfpu_runtime_args( + program, + a, + *b, + output, + binary_reader_kernel_id, + unary_writer_kernel_id, + eltwise_binary_kernel_id, + cb_src0, + cb_src1, + cb_output, + compute_with_storage_grid_size, + src0_single_tile_size, + src1_single_tile_size, + dst_single_tile_size); + + return { + std::move(program), + {binary_reader_kernel_id, + unary_writer_kernel_id, + eltwise_binary_kernel_id, + cb_src0, + cb_src1, + cb_output, + compute_with_storage_grid_size, + src0_single_tile_size, + src1_single_tile_size, + dst_single_tile_size}}; +} + +void BinaryDeviceOperation::ElementWiseMultiCoreSfpu::override_runtime_arguments( + cached_program_t& cached_program, + const operation_attributes_t& operation_attributes, + 
const tensor_args_t& tensor_args, + tensor_return_value_t& tensor_return_value) { + const auto& input_tensor_a = tensor_args.input_tensor_a; + const auto& input_tensor_b = tensor_args.input_tensor_b; + auto& output_tensor = tensor_return_value; + + const auto& shared_variables = cached_program.shared_variables; + + set_eltwise_binary_sfpu_runtime_args( + cached_program.program, + input_tensor_a, + *input_tensor_b, + output_tensor, + shared_variables.binary_reader_kernel_id, + shared_variables.unary_writer_kernel_id, + shared_variables.eltwise_binary_kernel_id, + shared_variables.cb_src0, + shared_variables.cb_src1, + shared_variables.cb_output, + shared_variables.compute_with_storage_grid_size, + shared_variables.src0_single_tile_size, + shared_variables.src1_single_tile_size, + shared_variables.dst_single_tile_size); +} + +} // namespace ttnn::operations::binary diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_sfpu_kernel.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_sfpu_kernel.cpp new file mode 100644 index 000000000000..f714d939fb57 --- /dev/null +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_sfpu_kernel.cpp @@ -0,0 +1,145 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "compute_kernel_api/eltwise_binary.h" +#include "compute_kernel_api/tile_move_copy.h" +#include "compute_kernel_api/eltwise_unary/sfpu_split_includes.h" + +#include "compute_kernel_api/common.h" +#include "compute_kernel_api/eltwise_unary/eltwise_unary.h" +#include "compute_kernel_api/eltwise_binary_sfpu.h" +#include "compute_kernel_api/binary_bitwise_sfpu.h" +#include "compute_kernel_api/add_int32_sfpu.h" + +#define PRE_SCALE defined SFPU_OP_INIT_PRE_IN0_0 || defined SFPU_OP_INIT_PRE_IN1_0 + +#if defined(ADD_INT32_INIT) || defined(BITWISE_INIT) +#define INT32_INIT +#endif + +namespace NAMESPACE { +void MAIN { + uint32_t per_core_block_cnt = get_arg_val(0); + uint32_t per_core_block_size = get_arg_val(1); + + constexpr auto cb_in0 = tt::CBIndex::c_0; + constexpr auto cb_in1 = tt::CBIndex::c_1; + +#ifdef SFPU_OP_INIT_PRE_IN0_0 + constexpr auto cb_inp0 = tt::CBIndex::c_3; +#else + constexpr auto cb_inp0 = cb_in0; +#endif + +#ifdef SFPU_OP_INIT_PRE_IN1_0 + constexpr auto cb_inp1 = tt::CBIndex::c_4; +#else + constexpr auto cb_inp1 = cb_in1; +#endif + + constexpr auto cb_out0 = tt::CBIndex::c_2; + + unary_op_init_common(cb_in0, cb_out0); + +#ifdef PACK_RELU + PACK((llk_pack_relu_config(ReluType::ZERO_RELU))); +#endif + + for (uint32_t block = 0; block < per_core_block_cnt; ++block) { + +#if PRE_SCALE + copy_tile_to_dst_init_short(); // need to copy from CB to DST to be able to run sfpu math +#endif + +#ifdef SFPU_OP_INIT_PRE_IN0_0 + cb_wait_front(cb_in0, per_core_block_size); + cb_reserve_back(cb_inp0, per_core_block_size); + + tile_regs_acquire(); + SFPU_OP_INIT_PRE_IN0_0 + for (uint32_t i = 0; i < per_core_block_size; ++i) { + copy_tile(cb_in0, i, i); // copy from c_in[0] to DST[0] + SFPU_OP_FUNC_PRE_IN0_0 + } + tile_regs_commit(); + + tile_regs_wait(); + for (uint32_t i = 0; i < per_core_block_size; ++i) { + pack_tile(i, cb_inp0); // DST[0]->cb + } + tile_regs_release(); + + cb_pop_front(cb_in0, 
per_core_block_size); + cb_push_back(cb_inp0, per_core_block_size); +#endif + +#ifdef SFPU_OP_INIT_PRE_IN1_0 + cb_wait_front(cb_in1, per_core_block_size); + cb_reserve_back(cb_inp1, per_core_block_size); + + tile_regs_acquire(); + SFPU_OP_INIT_PRE_IN1_0 + for (uint32_t i = 0; i < per_core_block_size; ++i) { + copy_tile(cb_in1, i, i); // copy from c_in[0] to DST[0] + SFPU_OP_FUNC_PRE_IN1_0 + } + tile_regs_commit(); + + tile_regs_wait(); + for (uint32_t i = 0; i < per_core_block_size; ++i) { + pack_tile(i, cb_inp1); // DST[0]->cb + } + tile_regs_release(); + + cb_pop_front(cb_in1, per_core_block_size); + cb_push_back(cb_inp1, per_core_block_size); +#endif + cb_wait_front(cb_inp0, per_core_block_size); + cb_wait_front(cb_inp1, per_core_block_size); + cb_reserve_back(cb_out0, per_core_block_size); + + tile_regs_acquire(); + tile_regs_wait(); + copy_tile_to_dst_init_short_with_dt(cb_inp1, cb_inp0); + for (uint32_t i = 0; i < per_core_block_size; ++i) { + copy_tile(cb_inp0, i, i * 2); + } + copy_tile_to_dst_init_short_with_dt(cb_inp0, cb_inp1); + for (uint32_t i = 0; i < per_core_block_size; ++i) { + copy_tile(cb_inp1, i, i * 2 + 1); + +#ifndef INT32_INIT + eltwise_binop_tile_init(); +#endif + +#ifdef ADD_INT32_INIT + ADD_INT32_INIT +#endif +#ifdef BITWISE_INIT + BITWISE_INIT +#endif + +#ifdef BINARY_SFPU_OP + BINARY_SFPU_OP +#endif +#ifdef SFPU_OP_INIT_0 + SFPU_OP_INIT_0 + SFPU_OP_FUNC_0 +#endif + +#ifdef SFPU_OP_CHAIN_0 + SFPU_OP_CHAIN_0 +#endif + pack_tile(i * 2, cb_out0); + } + tile_regs_commit(); + tile_regs_release(); + + cb_pop_front(cb_inp0, per_core_block_size); + cb_pop_front(cb_inp1, per_core_block_size); + cb_push_back(cb_out0, per_core_block_size); + } +} +} // namespace NAMESPACE diff --git a/ttnn/cpp/ttnn/operations/eltwise/ternary_backward/ternary_backward.cpp b/ttnn/cpp/ttnn/operations/eltwise/ternary_backward/ternary_backward.cpp index bc67eb9a10f8..5e261d58cce8 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/ternary_backward/ternary_backward.cpp +++ 
b/ttnn/cpp/ttnn/operations/eltwise/ternary_backward/ternary_backward.cpp @@ -7,7 +7,7 @@ #include "ttnn/operations/eltwise/binary/binary.hpp" #include "ttnn/operations/data_movement/bcast/bcast.hpp" #include "ttnn/cpp/ttnn/operations/eltwise/ternary/where.hpp" - +#include "ttnn/operations/eltwise/binary/binary_composite.hpp" #include "tt_metal/common/constants.hpp" #include "tt_metal/host_api.hpp" #include "tt_metal/tools/profiler/op_profiler.hpp" diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_composite_op.cpp b/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_composite_op.cpp index a30bdc2cc5ee..a170e3d98558 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_composite_op.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_composite_op.cpp @@ -43,7 +43,7 @@ Tensor _tanhshrink(const Tensor& x, const std::optional& output_me } // power - floating point exponent -Tensor _power( +Tensor ExecutePower::invoke( uint8_t queue_id, const Tensor& input_a, float exponent, @@ -79,8 +79,17 @@ Tensor _power( return result; } +// power - floating point exponent +Tensor ExecutePower::invoke( + const Tensor& input_a, + float exponent, + const std::optional& output_mem_config, + std::optional output_tensor) { + return ExecutePower::invoke(DefaultQueueId, input_a, exponent, output_mem_config, std::move(output_tensor)); +} + // power - integer exponent -Tensor _power( +Tensor ExecutePower::invoke( uint8_t queue_id, const Tensor& input, uint32_t exponent, @@ -89,6 +98,54 @@ Tensor _power( return ttnn::power(queue_id, input, exponent, output_mem_config, output_tensor); } +// power - integer exponent +Tensor ExecutePower::invoke( + const Tensor& input, + uint32_t exponent, + const std::optional& output_mem_config, + std::optional output_tensor) { + return ExecutePower::invoke(DefaultQueueId, input, exponent, output_mem_config, std::move(output_tensor)); +} + +// power - tensor exponent +Tensor ExecutePower::invoke( + uint8_t queue_id, + const 
Tensor& input, + const Tensor& exponent, + const std::optional& output_mem_config, + std::optional output_tensor) { + return ttnn::power_binary(queue_id, input, exponent, std::nullopt, output_mem_config, output_tensor); +} + +// power - tensor exponent +Tensor ExecutePower::invoke( + const Tensor& input, + const Tensor& exponent, + const std::optional& output_mem_config, + std::optional output_tensor) { + return ExecutePower::invoke(DefaultQueueId, input, exponent, output_mem_config, std::move(output_tensor)); +} + +// power - scalar input +Tensor ExecutePower::invoke( + uint8_t queue_id, + float input_a, + const Tensor& exponent, + const std::optional& output_mem_config, + std::optional output_tensor) { + Tensor input = ttnn::full_like(exponent, input_a); + return ExecutePower::invoke(queue_id, input, exponent, output_mem_config, std::move(output_tensor)); +} + +// power - scalar input +Tensor ExecutePower::invoke( + float input_a, + const Tensor& exponent, + const std::optional& output_mem_config, + std::optional output_tensor) { + return ExecutePower::invoke(DefaultQueueId, input_a, exponent, output_mem_config, std::move(output_tensor)); +} + // acosh(x) = log(x + sqrt(x^2 - 1)) Tensor _acosh(const Tensor& input_a, const std::optional& output_mem_config) { TT_FATAL(input_a.storage_type() == StorageType::DEVICE, "Unary operation requires input to be on Device."); diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_composite_op.hpp b/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_composite_op.hpp index ed6dca8a7d33..584ec6fadb57 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_composite_op.hpp +++ b/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_composite_op.hpp @@ -98,8 +98,6 @@ Tensor _glu(const Tensor&, int32_t, const std::optional&); Tensor _reglu(const Tensor&, int32_t, const std::optional&); Tensor _geglu(const Tensor&, int32_t, const std::optional&); Tensor _swiglu(const Tensor&, int32_t, const std::optional&); -Tensor 
_power(uint8_t, const Tensor&, float, const std::optional&, std::optional); -Tensor _power(uint8_t, const Tensor&, uint32_t, const std::optional&, std::optional); Tensor _tril(const Tensor&, int32_t diag = 0, const std::optional& output_mem_config = std::nullopt); Tensor _triu(const Tensor&, int32_t diag = 0, const std::optional& output_mem_config = std::nullopt); Tensor _round(const Tensor&, int32_t decimal = 0, const std::optional& output_mem_config = std::nullopt); @@ -313,26 +311,6 @@ struct OpHandler { } }; -template <> -struct OpHandler { - static Tensor handle( - uint8_t q_id, - const Tensor& input, - float exponent, - const std::optional& mem_cfg, - std::optional output) { - return _power(q_id, input, exponent, mem_cfg, output); - } - static Tensor handle( - uint8_t q_id, - const Tensor& input, - uint32_t exponent, - const std::optional& mem_cfg, - std::optional output) { - return _power(q_id, input, exponent, mem_cfg, output); - } -}; - template <> struct OpHandler { static Tensor handle(const Tensor& t1, float lambd, const std::optional& mem_cfg) { diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary/unary.hpp b/ttnn/cpp/ttnn/operations/eltwise/unary/unary.hpp index 7c034bfe66e7..a5a8d89087de 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/unary/unary.hpp +++ b/ttnn/cpp/ttnn/operations/eltwise/unary/unary.hpp @@ -341,7 +341,7 @@ REGISTER_UNARY_OPERATION_WITH_FAST_AND_APPROXIMATE_MODE(rsqrt, RSQRT); // Unaries with float parameter REGISTER_UNARY_OPERATION_WITH_FLOAT_PARAMETER(elu, ELU); -REGISTER_UNARY_OPERATION_WITH_FLOAT_PARAMETER(rsub, RSUB); +REGISTER_UNARY_OPERATION_WITH_FLOAT_PARAMETER(rsub_unary, RSUB); REGISTER_UNARY_OPERATION_WITH_FLOAT_PARAMETER(heaviside, HEAVISIDE); REGISTER_UNARY_OPERATION_WITH_FLOAT_PARAMETER(leaky_relu, LEAKY_RELU); REGISTER_UNARY_OPERATION_WITH_FLOAT_PARAMETER(relu_max, RELU_MAX); @@ -357,9 +357,9 @@ REGISTER_UNARY_OPERATION_WITH_FLOAT_PARAMETER(ne_unary, UNARY_NE); REGISTER_UNARY_OPERATION_WITH_INTEGER_PARAMETER(power, POWER, 
uint32_t); REGISTER_UNARY_OPERATION_WITH_INTEGER_PARAMETER(bitwise_left_shift, LEFT_SHIFT, int32_t); REGISTER_UNARY_OPERATION_WITH_INTEGER_PARAMETER(bitwise_right_shift, RIGHT_SHIFT, int32_t); -REGISTER_UNARY_OPERATION_WITH_INTEGER_PARAMETER(bitwise_and, BITWISE_AND, int32_t); -REGISTER_UNARY_OPERATION_WITH_INTEGER_PARAMETER(bitwise_or, BITWISE_OR, int32_t); -REGISTER_UNARY_OPERATION_WITH_INTEGER_PARAMETER(bitwise_xor, BITWISE_XOR, int32_t); +REGISTER_UNARY_OPERATION_WITH_INTEGER_PARAMETER(bitwise_and_unary, BITWISE_AND, int32_t); +REGISTER_UNARY_OPERATION_WITH_INTEGER_PARAMETER(bitwise_or_unary, BITWISE_OR, int32_t); +REGISTER_UNARY_OPERATION_WITH_INTEGER_PARAMETER(bitwise_xor_unary, BITWISE_XOR, int32_t); // Other unaries constexpr auto dropout = diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary/unary_composite.hpp b/ttnn/cpp/ttnn/operations/eltwise/unary/unary_composite.hpp index b7eb155973ce..a523e09cb303 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/unary/unary_composite.hpp +++ b/ttnn/cpp/ttnn/operations/eltwise/unary/unary_composite.hpp @@ -12,60 +12,67 @@ namespace ttnn { namespace operations { namespace unary { +/** + * @brief Performs element-wise power operation on the input with the exponent. + * When exponent is Tensor, the supported dtypes are float32 and bfloat16. + * The tested range for the input is (-30,30) and for the exponent is (-20, 20). + * + * @param input The input tensor, i.e the base. 
+ * @param exponent The exponent + * @return The result tensor + */ struct ExecutePower { static Tensor invoke( uint8_t queue_id, const Tensor& input_tensor, uint32_t exponent, const std::optional& memory_config = std::nullopt, - std::optional optional_output_tensor = std::nullopt) { - return OpHandler::handle( - queue_id, - input_tensor, - exponent, - memory_config.value_or(input_tensor.memory_config()), - optional_output_tensor); - } + std::optional optional_output_tensor = std::nullopt); static Tensor invoke( const Tensor& input_tensor, uint32_t exponent, const std::optional& memory_config = std::nullopt, - std::optional optional_output_tensor = std::nullopt) { - return OpHandler::handle( - DefaultQueueId, - input_tensor, - exponent, - memory_config.value_or(input_tensor.memory_config()), - optional_output_tensor); - } + std::optional optional_output_tensor = std::nullopt); static Tensor invoke( uint8_t queue_id, const Tensor& input_tensor, float exponent, const std::optional& memory_config = std::nullopt, - std::optional optional_output_tensor = std::nullopt) { - return OpHandler::handle( - queue_id, - input_tensor, - exponent, - memory_config.value_or(input_tensor.memory_config()), - optional_output_tensor); - } + std::optional optional_output_tensor = std::nullopt); static Tensor invoke( const Tensor& input_tensor, float exponent, const std::optional& memory_config = std::nullopt, - std::optional optional_output_tensor = std::nullopt) { - return OpHandler::handle( - DefaultQueueId, - input_tensor, - exponent, - memory_config.value_or(input_tensor.memory_config()), - optional_output_tensor); - } + std::optional optional_output_tensor = std::nullopt); + + static Tensor invoke( + uint8_t queue_id, + float input_a, + const Tensor& exponent, + const std::optional& memory_config = std::nullopt, + std::optional optional_output_tensor = std::nullopt); + + static Tensor invoke( + float input_a, + const Tensor& exponent, + const std::optional& memory_config = 
std::nullopt, + std::optional optional_output_tensor = std::nullopt); + + static Tensor invoke( + uint8_t queue_id, + const Tensor& input_tensor, + const Tensor& exponent, + const std::optional& memory_config = std::nullopt, + std::optional optional_output_tensor = std::nullopt); + + static Tensor invoke( + const Tensor& input_tensor, + const Tensor& exponent, + const std::optional& memory_config = std::nullopt, + std::optional optional_output_tensor = std::nullopt); }; template diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary/unary_pybind.hpp b/ttnn/cpp/ttnn/operations/eltwise/unary/unary_pybind.hpp index eb3fb90bc0f4..5f0a6514b351 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/unary/unary_pybind.hpp +++ b/ttnn/cpp/ttnn/operations/eltwise/unary/unary_pybind.hpp @@ -979,10 +979,10 @@ template void bind_power(py::module& module, const unary_operation_t& operation, const std::string& note = "") { auto doc = fmt::format( R"doc( - Applies {0} to :attr:`input_tensor` element-wise. + Perform element-wise {0} operation on :attr:`input_tensor` with :attr:`exponent`. .. math:: - \mathrm{{output\_tensor}}_i = \verb|{0}|(\mathrm{{input\_tensor}}_i) + \mathrm{{output\_tensor}}_i = \verb|{0}|(\mathrm{{input\_tensor}}_i ** \mathrm{{exponent}}_i) Args: input_tensor (ttnn.Tensor): the input tensor. 
@@ -1056,7 +1056,41 @@ void bind_power(py::module& module, const unary_operation_t& operation, const st py::kw_only(), py::arg("memory_config") = std::nullopt, py::arg("output_tensor") = std::nullopt, - py::arg("queue_id") = 0} + py::arg("queue_id") = ttnn::DefaultQueueId}, + + // tensor exponent + ttnn::pybind_overload_t{ + [](const unary_operation_t& self, + const Tensor& input_tensor, + const Tensor& exponent, + const std::optional& memory_config, + std::optional output_tensor, + const uint8_t queue_id) -> ttnn::Tensor { + return self(queue_id, input_tensor, exponent, memory_config, output_tensor); + }, + py::arg("input_tensor"), + py::arg("exponent"), + py::kw_only(), + py::arg("memory_config") = std::nullopt, + py::arg("output_tensor") = std::nullopt, + py::arg("queue_id") = ttnn::DefaultQueueId}, + + // scalar input - tensor exponent + ttnn::pybind_overload_t{ + [](const unary_operation_t& self, + float input, + const Tensor& exponent, + const std::optional& memory_config, + std::optional output_tensor, + const uint8_t queue_id) -> ttnn::Tensor { + return self(queue_id, input, exponent, memory_config, output_tensor); + }, + py::arg("input"), + py::arg("exponent"), + py::kw_only(), + py::arg("memory_config") = std::nullopt, + py::arg("output_tensor") = std::nullopt, + py::arg("queue_id") = ttnn::DefaultQueueId} ); } @@ -1815,9 +1849,6 @@ void py_module(py::module& module) { // Unaries with float parameter detail::bind_unary_operation_with_float_parameter(module, ttnn::elu, "alpha", "The alpha parameter for the ELU function","",R"doc(BFLOAT16, BFLOAT8_B)doc"); - detail::bind_unary_operation_with_float_parameter(module, ttnn::rsub, "value", "subtrahent value which is actually calculated as minuend", - "Returns tensor with respective elements of the input tensor subtracted from the value.", R"doc(BFLOAT16, BFLOAT8_B)doc", - R"doc(System memory is not supported.)doc"); detail::bind_unary_operation_with_float_parameter(module, ttnn::heaviside, "value", "The value 
parameter for the Heaviside function", "", R"doc(BFLOAT16, BFLOAT8_B)doc"); detail::bind_unary_operation_with_float_parameter(module, ttnn::leaky_relu, "negative_slope", "The slope parameter for the Leaky ReLU function", "",R"doc(BFLOAT16, BFLOAT8_B)doc"); detail::bind_unary_operation_with_float_parameter(module, ttnn::fill, "fill_value", "The value to be filled in the output tensor", @@ -1832,10 +1863,6 @@ void py_module(py::module& module) { // Unaries with integer parameter detail::bind_unary_operation_with_integer_parameter(module, ttnn::bitwise_left_shift, "shift_bits", "integer within range (0, 31)", "INT32", "Support provided for Wormhole_B0 only."); detail::bind_unary_operation_with_integer_parameter(module, ttnn::bitwise_right_shift, "shift_bits", "integer within range (0, 31)", "INT32", "Support provided for Wormhole_B0 only."); - detail::bind_unary_operation_with_integer_parameter(module, ttnn::bitwise_and, "value", "scalar value", "INT32", "Input tensor needs to be positive. Support provided only for Wormhole_B0."); - detail::bind_unary_operation_with_integer_parameter(module, ttnn::bitwise_or, "value", "scalar value", "INT32", "Input tensor needs to be positive. Support provided only for Wormhole_B0."); - detail::bind_unary_operation_with_integer_parameter(module, ttnn::bitwise_xor, "value", "scalar value","INT32", "Input tensor needs to be positive. 
Support provided only for Wormhole_B0."); - // Unary ops with dim parameter detail::bind_unary_operation_with_dim_parameter( @@ -1896,7 +1923,7 @@ void py_module(py::module& module) { detail::bind_sigmoid_accurate(module, ttnn::sigmoid_accurate); detail::bind_unary_chain(module, ttnn::unary_chain); detail::bind_identity(module, ttnn::identity); - detail::bind_power(module, ttnn::pow); + detail::bind_power(module, ttnn::pow, R"doc(When :attr:`exponent` is a Tensor, supported dtypes are: BFLOAT16, FLOAT32)doc"); // unary composite imported into ttnn detail::bind_unary_composite(module, ttnn::deg2rad, R"doc(Performs deg2rad function on :attr:`input_tensor`.)doc", "", R"doc(BFLOAT16, BFLOAT8_B)doc"); diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary_backward/unary_backward.cpp b/ttnn/cpp/ttnn/operations/eltwise/unary_backward/unary_backward.cpp index f6f8db7c40c8..423a0a6775ca 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/unary_backward/unary_backward.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/unary_backward/unary_backward.cpp @@ -24,6 +24,7 @@ #include "ttnn/operations/eltwise/complex_unary/complex_unary.hpp" #include "ttnn/operations/eltwise/complex_binary/device/complex_binary_op.hpp" #include "ttnn/operations/reduction/generic/generic_reductions.hpp" +#include "ttnn/operations/eltwise/binary/binary_composite.hpp" namespace ttnn::operations::unary_backward { From e8465bc101b3e98721bd094309f8448f1800ae6f Mon Sep 17 00:00:00 2001 From: Nilaykumar Patel Date: Wed, 11 Dec 2024 14:28:07 +0530 Subject: [PATCH 48/59] #15814: Refactor tensor_utils.cpp. 
Signed-off-by: Nilaykumar Patel --- ttnn/cpp/ttnn/tensor/tensor_utils.cpp | 410 ++++++++------------------ ttnn/cpp/ttnn/tensor/tensor_utils.hpp | 2 +- 2 files changed, 132 insertions(+), 280 deletions(-) diff --git a/ttnn/cpp/ttnn/tensor/tensor_utils.cpp b/ttnn/cpp/ttnn/tensor/tensor_utils.cpp index ef1e685998d5..ef6cf99e223e 100644 --- a/ttnn/cpp/ttnn/tensor/tensor_utils.cpp +++ b/ttnn/cpp/ttnn/tensor/tensor_utils.cpp @@ -6,7 +6,7 @@ #include "ttnn/distributed/api.hpp" #include "ttnn/tensor/host_buffer/functions.hpp" -#include "ttnn/tensor/host_buffer/types.hpp" +#include "ttnn/tensor/types.hpp" namespace tt { @@ -32,6 +32,55 @@ Tensor convert_tensor(const Tensor& input_tensor, compute_& compute) { return ttnn::distributed::is_multi_device_tensor(input_tensor) ? transform(input_tensor, convert_tensor) : convert_tensor(input_tensor); } +template +Tensor convert_tensor_to_tiled_layout_common( + const Tensor& input_tensor, + std::optional output_dtype, + const std::unordered_map& function_map, + Args&&... 
args) { + TT_ASSERT( + input_tensor.get_layout() == Layout::ROW_MAJOR && + "Tensor(weight/bias) should be in row major layout for conversion to tilized layout."); + + if (output_dtype.has_value()) { + if (output_dtype == DataType::BFLOAT8_B || output_dtype == DataType::BFLOAT4_B) { + TT_ASSERT(input_tensor.get_dtype() == DataType::FLOAT32); + } else { + TT_ASSERT(input_tensor.get_dtype() == input_tensor.get_dtype()); + } + } + auto entry = function_map.find(input_tensor.get_dtype()); + if (entry == function_map.end()) { + TT_THROW("Unsupported data type"); + } + return entry->second(input_tensor, std::forward(args)..., output_dtype.value_or(input_tensor.get_dtype())); +} + +template +Tensor create_tensor_from_owned_buffer( + owned_buffer::Buffer& buf, DataType& output_dtype, ttnn::SimpleShape& output_shape) { + if constexpr (std::is_same::value) { + if (output_dtype == DataType::BFLOAT8_B || output_dtype == DataType::BFLOAT4_B) { + auto tensor = + Tensor(std::move(OwnedStorage{std::move(buf)}), output_shape, DataType::FLOAT32, Layout::ROW_MAJOR) + .to(Layout::TILE); + auto output_float_data = owned_buffer::get_as(tensor).get(); + auto output_packed_data = + output_dtype == DataType::BFLOAT8_B + ? 
pack_fp32_vec_as_bfp8_tiles(output_float_data, /*row_major_input=*/false, /*is_exp_a=*/false) + : pack_fp32_vec_as_bfp4_tiles(output_float_data, /*row_major_input=*/false, /*is_exp_a=*/false); + auto output_uint32_buffer = owned_buffer::create(std::move(output_packed_data)); + return Tensor( + std::move(OwnedStorage{std::move(output_uint32_buffer)}), output_shape, output_dtype, Layout::TILE); + } + } else { + TT_FATAL( + (output_dtype != DataType::BFLOAT8_B) || (output_dtype != DataType::BFLOAT4_B), + "Unsupported output datatype"); + } + auto rm_tensor = Tensor(std::move(OwnedStorage{std::move(buf)}), output_shape, output_dtype, Layout::ROW_MAJOR); + return rm_tensor.to(Layout::TILE); +} template Tensor to_weight_special_padding_tile_layout( @@ -65,29 +114,7 @@ Tensor to_weight_special_padding_tile_layout( } } } - if constexpr (std::is_same::value) { - if (output_dtype == DataType::BFLOAT8_B || output_dtype == DataType::BFLOAT4_B) { - auto tensor = Tensor( - std::move(OwnedStorage{std::move(output_buffer)}), - output_shape, - DataType::FLOAT32, - Layout::ROW_MAJOR) - .to(Layout::TILE); - auto output_float_data = owned_buffer::get_as(tensor).get(); - auto output_packed_data = - output_dtype == DataType::BFLOAT8_B - ? 
pack_fp32_vec_as_bfp8_tiles(output_float_data, /*row_major_input=*/false, /*is_exp_a=*/false) - : pack_fp32_vec_as_bfp4_tiles(output_float_data, /*row_major_input=*/false, /*is_exp_a=*/false); - auto output_uint32_buffer = owned_buffer::create(std::move(output_packed_data)); - return Tensor( - std::move(OwnedStorage{std::move(output_uint32_buffer)}), output_shape, output_dtype, Layout::TILE); - } - } else { - TT_ASSERT((output_dtype != DataType::BFLOAT8_B) || (output_dtype != DataType::BFLOAT4_B)); - } - auto rm_tensor = - Tensor(std::move(OwnedStorage{std::move(output_buffer)}), output_shape, output_dtype, Layout::ROW_MAJOR); - return rm_tensor.to(Layout::TILE); + return create_tensor_from_owned_buffer(output_buffer, output_dtype, output_shape); }; return convert_tensor(conv_weight_tensor, compute); } @@ -126,29 +153,7 @@ Tensor to_weight_tile_layout( } } } - if constexpr (std::is_same::value) { - if (output_dtype == DataType::BFLOAT8_B || output_dtype == DataType::BFLOAT4_B) { - auto tensor = Tensor( - std::move(OwnedStorage{std::move(output_buffer)}), - output_shape, - DataType::FLOAT32, - Layout::ROW_MAJOR) - .to(Layout::TILE); - auto output_float_data = owned_buffer::get_as(tensor).get(); - auto output_packed_data = - output_dtype == DataType::BFLOAT8_B - ? 
pack_fp32_vec_as_bfp8_tiles(output_float_data, /*row_major_input=*/false, /*is_exp_a=*/false) - : pack_fp32_vec_as_bfp4_tiles(output_float_data, /*row_major_input=*/false, /*is_exp_a=*/false); - auto output_uint32_buffer = owned_buffer::create(std::move(output_packed_data)); - return Tensor( - std::move(OwnedStorage{std::move(output_uint32_buffer)}), output_shape, output_dtype, Layout::TILE); - } - } else { - TT_ASSERT((output_dtype != DataType::BFLOAT8_B) || (output_dtype != DataType::BFLOAT4_B)); - } - auto rm_tensor = - Tensor(std::move(OwnedStorage{std::move(output_buffer)}), output_shape, output_dtype, Layout::ROW_MAJOR); - return rm_tensor.to(Layout::TILE); + return create_tensor_from_owned_buffer(output_buffer, output_dtype, output_shape); }; return convert_tensor(conv_weight_tensor, compute); @@ -161,30 +166,14 @@ Tensor convert_conv_weight_tensor_to_tiled_layout( uint32_t in1_block_h, uint32_t in1_block_w, std::optional output_dtype) { - TT_ASSERT( - conv_weight_tensor.get_layout() == Layout::ROW_MAJOR && - "Convolution weights should be in row major layout for conversion to tilized layout."); - - if (output_dtype.has_value()) { - if (output_dtype == DataType::BFLOAT8_B || output_dtype == DataType::BFLOAT4_B) { - TT_ASSERT(conv_weight_tensor.get_dtype() == DataType::FLOAT32); - } else { - TT_ASSERT(conv_weight_tensor.get_dtype() == conv_weight_tensor.get_dtype()); - } - } - - switch (conv_weight_tensor.get_dtype()) { - case DataType::BFLOAT16: - return to_weight_tile_layout( - conv_weight_tensor, in1_block_h, in1_block_w, output_dtype.value_or(conv_weight_tensor.get_dtype())); - case DataType::FLOAT32: - return to_weight_tile_layout( - conv_weight_tensor, in1_block_h, in1_block_w, output_dtype.value_or(conv_weight_tensor.get_dtype())); - case DataType::UINT32: - return to_weight_tile_layout( - conv_weight_tensor, in1_block_h, in1_block_w, output_dtype.value_or(conv_weight_tensor.get_dtype())); - default: TT_THROW("Unsupported data type"); - } + const 
static std::unordered_map> + to_w_tile_layout_map = { + {DataType::BFLOAT16, &to_weight_tile_layout}, + {DataType::FLOAT32, &to_weight_tile_layout}, + {DataType::UINT32, &to_weight_tile_layout}}; + + return convert_tensor_to_tiled_layout_common( + conv_weight_tensor, output_dtype, to_w_tile_layout_map, in1_block_h, in1_block_w); } template @@ -236,41 +225,7 @@ Tensor to_weight_tile_layout_block_sharded( } } } - if constexpr (std::is_same::value) { - if (output_dtype == DataType::BFLOAT8_B) { - auto tensor = Tensor( - std::move(OwnedStorage{std::move(output_buffer)}), - output_shape, - DataType::FLOAT32, - Layout::ROW_MAJOR) - .to(Layout::TILE); - auto output_float_data = owned_buffer::get_as(tensor).get(); - auto output_packed_data = - pack_fp32_vec_as_bfp8_tiles(output_float_data, /*row_major_input=*/false, /*is_exp_a=*/false); - auto output_uint32_buffer = owned_buffer::create(std::move(output_packed_data)); - return Tensor( - std::move(OwnedStorage{std::move(output_uint32_buffer)}), output_shape, output_dtype, Layout::TILE); - } - if (output_dtype == DataType::BFLOAT4_B) { - auto tensor = Tensor( - std::move(OwnedStorage{std::move(output_buffer)}), - output_shape, - DataType::FLOAT32, - Layout::ROW_MAJOR) - .to(Layout::TILE); - auto output_float_data = owned_buffer::get_as(tensor).get(); - auto output_packed_data = - pack_fp32_vec_as_bfp4_tiles(output_float_data, /*row_major_input=*/false, /*is_exp_a=*/false); - auto output_uint32_buffer = owned_buffer::create(std::move(output_packed_data)); - return Tensor( - std::move(OwnedStorage{std::move(output_uint32_buffer)}), output_shape, output_dtype, Layout::TILE); - } - } else { - TT_ASSERT((output_dtype != DataType::BFLOAT8_B) || (output_dtype != DataType::BFLOAT4_B)); - } - auto rm_tensor = - Tensor(std::move(OwnedStorage{std::move(output_buffer)}), output_shape, output_dtype, Layout::ROW_MAJOR); - return rm_tensor.to(Layout::TILE); + return create_tensor_from_owned_buffer(output_buffer, output_dtype, 
output_shape); }; return convert_tensor(conv_weight_tensor, compute); } @@ -279,25 +234,14 @@ Tensor to_weight_tile_layout_block_sharded( // Returns a new tensor with layout=Tile Tensor convert_conv_weight_tensor_to_tiled_layout_block_sharded( const Tensor& conv_weight_tensor, uint32_t num_channel_shards, std::optional output_dtype) { - TT_ASSERT( - conv_weight_tensor.get_layout() == Layout::ROW_MAJOR && - "Convolution weights should be in row major layout for conversion to tilized layout."); - const static std:: - map> - to_w_tile_layout_map = { - {DataType::BFLOAT16, &to_weight_tile_layout_block_sharded}, - {DataType::FLOAT32, &to_weight_tile_layout_block_sharded}, - {DataType::UINT32, &to_weight_tile_layout_block_sharded}, - }; - if (output_dtype.has_value()) { - if (output_dtype == DataType::BFLOAT8_B || output_dtype == DataType::BFLOAT4_B) { - TT_ASSERT(conv_weight_tensor.get_dtype() == DataType::FLOAT32); - } else { - TT_ASSERT(conv_weight_tensor.get_dtype() == conv_weight_tensor.get_dtype()); - } - } - return to_w_tile_layout_map.at(conv_weight_tensor.get_dtype())( - conv_weight_tensor, num_channel_shards, output_dtype.value_or(conv_weight_tensor.get_dtype())); + const static std::unordered_map> + to_w_tile_layout_map = { + {DataType::BFLOAT16, &to_weight_tile_layout_block_sharded}, + {DataType::FLOAT32, &to_weight_tile_layout_block_sharded}, + {DataType::UINT32, &to_weight_tile_layout_block_sharded}}; + + return convert_tensor_to_tiled_layout_common( + conv_weight_tensor, output_dtype, to_w_tile_layout_map, num_channel_shards); } template @@ -327,41 +271,7 @@ Tensor to_bias_tile_layout_block_sharded( output_buffer[matrix_idx] = input_buffer[idx]; } } - if constexpr (std::is_same::value) { - if (output_dtype == DataType::BFLOAT8_B) { - auto tensor = Tensor( - std::move(OwnedStorage{std::move(output_buffer)}), - output_shape, - DataType::FLOAT32, - Layout::ROW_MAJOR) - .to(Layout::TILE); - auto output_float_data = owned_buffer::get_as(tensor).get(); - auto 
output_packed_data = - pack_fp32_vec_as_bfp8_tiles(output_float_data, /*row_major_input=*/false, /*is_exp_a=*/false); - auto output_uint32_buffer = owned_buffer::create(std::move(output_packed_data)); - return Tensor( - std::move(OwnedStorage{std::move(output_uint32_buffer)}), output_shape, output_dtype, Layout::TILE); - } - if (output_dtype == DataType::BFLOAT4_B) { - auto tensor = Tensor( - std::move(OwnedStorage{std::move(output_buffer)}), - output_shape, - DataType::FLOAT32, - Layout::ROW_MAJOR) - .to(Layout::TILE); - auto output_float_data = owned_buffer::get_as(tensor).get(); - auto output_packed_data = - pack_fp32_vec_as_bfp4_tiles(output_float_data, /*row_major_input=*/false, /*is_exp_a=*/false); - auto output_uint32_buffer = owned_buffer::create(std::move(output_packed_data)); - return Tensor( - std::move(OwnedStorage{std::move(output_uint32_buffer)}), output_shape, output_dtype, Layout::TILE); - } - } else { - TT_ASSERT((output_dtype != DataType::BFLOAT8_B) || (output_dtype != DataType::BFLOAT4_B)); - } - auto rm_tensor = - Tensor(std::move(OwnedStorage{std::move(output_buffer)}), output_shape, output_dtype, Layout::ROW_MAJOR); - return rm_tensor.to(Layout::TILE); + return create_tensor_from_owned_buffer(output_buffer, output_dtype, output_shape); }; return convert_tensor(conv_bias_tensor, compute); @@ -371,25 +281,16 @@ Tensor to_bias_tile_layout_block_sharded( // Returns a new tensor with layout=Tile Tensor convert_conv_bias_tensor_to_tiled_layout_block_sharded( const Tensor& conv_bias_tensor, uint32_t num_channel_shards, std::optional output_dtype) { - TT_ASSERT( - conv_bias_tensor.get_layout() == Layout::ROW_MAJOR && - "Convolution weights should be in row major layout for conversion to tilized layout."); - const static std:: - map> - to_b_tile_layout_map = { - {DataType::BFLOAT16, &to_bias_tile_layout_block_sharded}, - {DataType::FLOAT32, &to_bias_tile_layout_block_sharded}, - {DataType::UINT32, &to_bias_tile_layout_block_sharded}, - }; - if 
(output_dtype.has_value()) { - if (output_dtype == DataType::BFLOAT8_B || output_dtype == DataType::BFLOAT4_B) { - TT_ASSERT(conv_bias_tensor.get_dtype() == DataType::FLOAT32); - } else { - TT_ASSERT(conv_bias_tensor.get_dtype() == conv_bias_tensor.get_dtype()); - } - } - return to_b_tile_layout_map.at(conv_bias_tensor.get_dtype())( - conv_bias_tensor, num_channel_shards, output_dtype.value_or(conv_bias_tensor.get_dtype())); + const static std::unordered_map< + DataType, + std::function> + to_b_tile_layout_map = { + {DataType::BFLOAT16, &to_bias_tile_layout_block_sharded}, + {DataType::FLOAT32, &to_bias_tile_layout_block_sharded}, + {DataType::UINT32, &to_bias_tile_layout_block_sharded}, + }; + return convert_tensor_to_tiled_layout_common( + conv_bias_tensor, output_dtype, to_b_tile_layout_map, num_channel_shards); } // Converts convolution weights to tilized 2d matrix layout. @@ -399,30 +300,14 @@ Tensor convert_conv_weight_tensor_to_special_padding_tiled_layout( uint32_t in1_block_h, uint32_t in1_block_w, std::optional output_dtype) { - TT_ASSERT( - conv_weight_tensor.get_layout() == Layout::ROW_MAJOR && - "Convolution weights should be in row major layout for conversion to tilized layout."); - - if (output_dtype.has_value()) { - if (output_dtype == DataType::BFLOAT8_B || output_dtype == DataType::BFLOAT4_B) { - TT_ASSERT(conv_weight_tensor.get_dtype() == DataType::FLOAT32); - } else { - TT_ASSERT(conv_weight_tensor.get_dtype() == conv_weight_tensor.get_dtype()); - } - } - - switch (conv_weight_tensor.get_dtype()) { - case DataType::BFLOAT16: - return to_weight_special_padding_tile_layout( - conv_weight_tensor, in1_block_h, in1_block_w, output_dtype.value_or(conv_weight_tensor.get_dtype())); - case DataType::FLOAT32: - return to_weight_special_padding_tile_layout( - conv_weight_tensor, in1_block_h, in1_block_w, output_dtype.value_or(conv_weight_tensor.get_dtype())); - case DataType::UINT32: - return to_weight_special_padding_tile_layout( - conv_weight_tensor, 
in1_block_h, in1_block_w, output_dtype.value_or(conv_weight_tensor.get_dtype())); - default: TT_THROW("Unsupported data type"); - } + const static std::unordered_map> + to_w_tile_layout_map = { + {DataType::BFLOAT16, &to_weight_special_padding_tile_layout}, + {DataType::FLOAT32, &to_weight_special_padding_tile_layout}, + {DataType::UINT32, &to_weight_special_padding_tile_layout}}; + + return convert_tensor_to_tiled_layout_common( + conv_weight_tensor, output_dtype, to_w_tile_layout_map, in1_block_h, in1_block_w); } /* @@ -478,7 +363,7 @@ Helper function to aid in converting depthwise weight tensor to broadcasted weig */ template static Tensor conv_depthwise_weight_bcast_helper( - Tensor& conv_weight_tensor, + const Tensor& conv_weight_tensor, const ttnn::SimpleShape& original_weight_shape, const ttnn::SimpleShape& output_weight_shape, DataType output_dtype) { @@ -514,10 +399,6 @@ divided into num_groups for each groupped filter */ Tensor convert_conv_weight_tensor_to_grouped_layout( const Tensor& conv_weight_tensor, uint32_t num_groups, DataType output_dtype) { - TT_ASSERT( - conv_weight_tensor.get_layout() == Layout::ROW_MAJOR && - "Convolution weights should be in row major layout for adding the required padding"); - // Define output tensor shape. 
This is going to be channel dimension of weight tensor * num_groups - this value // should match number of input channels being convolved with the weight tensor auto original_conv_weight_tensor_shape_test = conv_weight_tensor.get_shape(); @@ -532,52 +413,27 @@ Tensor convert_conv_weight_tensor_to_grouped_layout( original_conv_weight_tensor_shape[2], original_conv_weight_tensor_shape[3]}; - // Create newly allocated buffer all initialized to 0 depending on the datatype of the weight tensor - if (output_dtype == DataType::INT32) { - return conv_group_weight_zero_pad_helper( - conv_weight_tensor, - original_conv_weight_tensor_shape, - output_conv_weight_tensor_shape, - num_groups, - output_dtype); - } else if (output_dtype == DataType::FLOAT32) { - return conv_group_weight_zero_pad_helper( - conv_weight_tensor, - original_conv_weight_tensor_shape, - output_conv_weight_tensor_shape, - num_groups, - output_dtype); - } else if (output_dtype == DataType::BFLOAT16) { - return conv_group_weight_zero_pad_helper( - conv_weight_tensor, - original_conv_weight_tensor_shape, - output_conv_weight_tensor_shape, - num_groups, - output_dtype); - } else if (output_dtype == DataType::UINT16) { - return conv_group_weight_zero_pad_helper( - conv_weight_tensor, - original_conv_weight_tensor_shape, - output_conv_weight_tensor_shape, - num_groups, - output_dtype); - } else if (output_dtype == DataType::BFLOAT8_B) { - return conv_group_weight_zero_pad_helper( - conv_weight_tensor, - original_conv_weight_tensor_shape, - output_conv_weight_tensor_shape, - num_groups, - DataType::FLOAT32); - } else { - return conv_group_weight_zero_pad_helper( - conv_weight_tensor, - original_conv_weight_tensor_shape, - output_conv_weight_tensor_shape, - num_groups, - output_dtype); - } - - TT_THROW("Unsupported weight data type given when trying to add zero padding to weight tensor"); + const static std::unordered_map< + DataType, + std::function> + to_w_tile_layout_map = { + {DataType::INT32, 
&conv_group_weight_zero_pad_helper}, + {DataType::FLOAT32, &conv_group_weight_zero_pad_helper}, + {DataType::BFLOAT16, &conv_group_weight_zero_pad_helper}, + {DataType::UINT16, &conv_group_weight_zero_pad_helper}, + {DataType::BFLOAT8_B, &conv_group_weight_zero_pad_helper}, + {DataType::UINT32, &conv_group_weight_zero_pad_helper}, + {DataType::BFLOAT4_B, &conv_group_weight_zero_pad_helper}, + }; + output_dtype = output_dtype == DataType::BFLOAT8_B ? DataType::FLOAT32 : output_dtype; + + return convert_tensor_to_tiled_layout_common( + conv_weight_tensor, + output_dtype, + to_w_tile_layout_map, + original_conv_weight_tensor_shape, + output_conv_weight_tensor_shape, + num_groups); } /* @@ -587,10 +443,7 @@ allocated output tensor with shape [out_channels, act_block_h, H, W] The extra c from the original weight tensor - it would be convolving act_block in conv_matrix in one go */ Tensor convert_conv_weight_tensor_to_depthwise_layout( - Tensor conv_weight_tensor, uint32_t act_block_h_ntiles, DataType output_dtype) { - TT_ASSERT( - conv_weight_tensor.get_layout() == Layout::ROW_MAJOR && - "Convolution weights should be in row major layout for repeating the required dimensions"); + const Tensor& conv_weight_tensor, uint32_t act_block_h_ntiles, DataType output_dtype) { auto original_conv_weight_tensor_shape_test = conv_weight_tensor.get_shape(); uint32_t num_input_channels_to_repeat = act_block_h_ntiles * constants::TILE_HEIGHT; ttnn::SimpleShape original_conv_weight_tensor_shape{ @@ -605,27 +458,26 @@ Tensor convert_conv_weight_tensor_to_depthwise_layout( original_conv_weight_tensor_shape[3]}; // Create newly allocated buffer all initialized to 0 depending on the datatype of the weight tensor - if (output_dtype == DataType::INT32) { - return conv_depthwise_weight_bcast_helper( - conv_weight_tensor, original_conv_weight_tensor_shape, output_conv_weight_tensor_shape, output_dtype); - } else if (output_dtype == DataType::FLOAT32) { - return 
conv_depthwise_weight_bcast_helper( - conv_weight_tensor, original_conv_weight_tensor_shape, output_conv_weight_tensor_shape, output_dtype); - } else if (output_dtype == DataType::BFLOAT16) { - return conv_depthwise_weight_bcast_helper( - conv_weight_tensor, original_conv_weight_tensor_shape, output_conv_weight_tensor_shape, output_dtype); - } else if (output_dtype == DataType::UINT16) { - return conv_depthwise_weight_bcast_helper( - conv_weight_tensor, original_conv_weight_tensor_shape, output_conv_weight_tensor_shape, output_dtype); - } else if (output_dtype == DataType::BFLOAT8_B) { - return conv_depthwise_weight_bcast_helper( - conv_weight_tensor, original_conv_weight_tensor_shape, output_conv_weight_tensor_shape, DataType::FLOAT32); - } else { - return conv_depthwise_weight_bcast_helper( - conv_weight_tensor, original_conv_weight_tensor_shape, output_conv_weight_tensor_shape, DataType::FLOAT32); - } - - TT_THROW("Unsupported weight data type given when trying to add zero padding to weight tensor"); + const static std:: + unordered_map> + to_w_tile_layout_map = { + {DataType::INT32, &conv_depthwise_weight_bcast_helper}, + {DataType::FLOAT32, &conv_depthwise_weight_bcast_helper}, + {DataType::BFLOAT16, &conv_depthwise_weight_bcast_helper}, + {DataType::UINT16, &conv_depthwise_weight_bcast_helper}, + {DataType::BFLOAT8_B, &conv_depthwise_weight_bcast_helper}, + {DataType::UINT32, &conv_depthwise_weight_bcast_helper}, + {DataType::BFLOAT4_B, &conv_depthwise_weight_bcast_helper}, + }; + output_dtype = ((output_dtype == DataType::BFLOAT8_B) || (output_dtype == DataType::BFLOAT4_B)) ? 
DataType::FLOAT32 + : output_dtype; + + return convert_tensor_to_tiled_layout_common( + conv_weight_tensor, + output_dtype, + to_w_tile_layout_map, + original_conv_weight_tensor_shape, + output_conv_weight_tensor_shape); } const ttnn::SimpleShape infer_dims_for_reshape(const Tensor& tensor, tt::stl::Span shape) { diff --git a/ttnn/cpp/ttnn/tensor/tensor_utils.hpp b/ttnn/cpp/ttnn/tensor/tensor_utils.hpp index 3c2565299b96..f4c9b1ae5372 100644 --- a/ttnn/cpp/ttnn/tensor/tensor_utils.hpp +++ b/ttnn/cpp/ttnn/tensor/tensor_utils.hpp @@ -44,7 +44,7 @@ Tensor convert_conv_weight_tensor_to_grouped_layout( // Converts convolution weights to depthwise layout with broadcasted weights Tensor convert_conv_weight_tensor_to_depthwise_layout( - Tensor conv_weight_tensor, uint32_t act_block_h_ntiles, DataType output_dtype); + const Tensor& conv_weight_tensor, uint32_t act_block_h_ntiles, DataType output_dtype); const ttnn::SimpleShape infer_dims_for_reshape(const Tensor& tensor, tt::stl::Span shape); From 8e49222f313fcdcbe088570c806ad86a8abf53f3 Mon Sep 17 00:00:00 2001 From: Martin Chang Date: Wed, 11 Dec 2024 17:25:34 +0800 Subject: [PATCH 49/59] Optimize bfloat16 construction (#15823) --- tt_metal/common/bfloat16.hpp | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/tt_metal/common/bfloat16.hpp b/tt_metal/common/bfloat16.hpp index c03a58513cf7..b332a9d2aeea 100644 --- a/tt_metal/common/bfloat16.hpp +++ b/tt_metal/common/bfloat16.hpp @@ -18,20 +18,14 @@ class bfloat16 { uint16_t uint16_data; public: - static const size_t SIZEOF = 2; - bfloat16() {} + static constexpr size_t SIZEOF = 2; + bfloat16() = default; // create from float: no rounding, just truncate bfloat16(float float_num) { - uint32_t uint32_data; - TT_ASSERT(sizeof float_num == sizeof uint32_data); + static_assert(sizeof float_num == 4, "float must have size 4"); - uint32_data = *reinterpret_cast(&float_num); - // just move upper 16 to lower 16 (truncate) - uint32_data = (uint32_data >> 
16); - - // store lower 16 as 16-bit uint - uint16_data = (uint16_t)uint32_data; + uint16_data = (*reinterpret_cast(&float_num)) >> 16; } // store lower 16 as 16-bit uint From 13f512a464da68660172d3be07e5828419dced8f Mon Sep 17 00:00:00 2001 From: Juan Camilo Vega Date: Wed, 11 Dec 2024 10:37:53 -0500 Subject: [PATCH 50/59] #0: adding new reshape sweeps and correcting existing ones (#15866) ### Problem description Adding the new torch sweeps for reshape ### What's changed Added torch sweeps to reshape Added a parser for their new format Updated reshape.reshape test to test for bfloat16 and float32 rather than bfloat8 and bfloat16 ### Checklist - [x] Post commit CI passes https://github.com/tenstorrent/tt-metal/actions/runs/12269839018 - [ ] Blackhole Post commit (if applicable) - [ ] Model regression CI testing passes (if applicable) - [ ] Device performance regression CI testing passes (if applicable) - [ ] **(For models and ops writers)** Full [new models](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) tests passes - [ ] New/Existing tests provide coverage for changes --- .github/workflows/ttnn-run-sweeps.yaml | 1 + .../sweeps/data_movement/reshape/reshape.py | 4 +- .../data_movement/view/tt_torch_trace.md | 5156 +++++++++++++++++ .../data_movement/view/view_tt_torch.py | 111 + .../data_movement/reshape_view/reshape.cpp | 2 +- 5 files changed, 5271 insertions(+), 3 deletions(-) create mode 100644 tests/sweep_framework/sweeps/data_movement/view/tt_torch_trace.md create mode 100644 tests/sweep_framework/sweeps/data_movement/view/view_tt_torch.py diff --git a/.github/workflows/ttnn-run-sweeps.yaml b/.github/workflows/ttnn-run-sweeps.yaml index 22ef67bb730c..10b7f88d1586 100644 --- a/.github/workflows/ttnn-run-sweeps.yaml +++ b/.github/workflows/ttnn-run-sweeps.yaml @@ -356,6 +356,7 @@ on: - data_movement.squeeze.squeeze_pytorch2 - data_movement.embedding.embedding_pytorch2 - data_movement.view.view_pytorch2 + - 
data_movement.view.view_tt_torch schedule: - cron: "0 21 * * *" # This cron schedule runs the workflow at 9:00pm UTC nightly diff --git a/tests/sweep_framework/sweeps/data_movement/reshape/reshape.py b/tests/sweep_framework/sweeps/data_movement/reshape/reshape.py index e85d2cd55a06..69d188257e12 100644 --- a/tests/sweep_framework/sweeps/data_movement/reshape/reshape.py +++ b/tests/sweep_framework/sweeps/data_movement/reshape/reshape.py @@ -67,8 +67,8 @@ def gen_reshape_shape(input_shape, step=1): "input_shape": gen_shapes([1, 1, 1, 1], [6, 6, 256, 256], [1, 1, 1, 1], 16) + gen_shapes([1, 1, 1], [6, 256, 256], [1, 1, 1], 16) + gen_shapes([1, 1], [256, 256], [1, 1], 16), - "input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b], - "input_a_layout": [ttnn.TILE_LAYOUT], # ttnn.ROW_MAJOR_LAYOUT + "input_a_dtype": [ttnn.bfloat16, ttnn.float32], + "input_a_layout": [ttnn.TILE_LAYOUT, ttnn.ROW_MAJOR_LAYOUT], # ttnn.ROW_MAJOR_LAYOUT "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG], }, } diff --git a/tests/sweep_framework/sweeps/data_movement/view/tt_torch_trace.md b/tests/sweep_framework/sweeps/data_movement/view/tt_torch_trace.md new file mode 100644 index 000000000000..a6da9b66f74b --- /dev/null +++ b/tests/sweep_framework/sweeps/data_movement/view/tt_torch_trace.md @@ -0,0 +1,5156 @@ +# ttnn.reshape + +| Name | Input Shapes | Input Layouts | Attributes | Output Shapes | Output Layouts | Runs on TTNN | PCC | ATOL | +|------|--------------|---------------|------------|---------------|----------------|--------------|-----|------| +| ttnn.reshape | tensor<[1,12,10,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 12 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 12 : i32, 10 : i32, 1 : i32] | tensor<[1,12,10,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 120 + d1 * 10 + d2, d3), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,12,10,f32]> | mapping_from: (d0, 
d1, d2), mapping_to: (d0 * 12 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 12 : i32, 10 : i32, 1 : i32] | tensor<[1,12,10,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 120 + d1 * 10 + d2, d3), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,12,197,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 12 + d1, d2), memory_config: (1, 7, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 12 : i32, 197 : i32, 1 : i32] | tensor<[1,12,197,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2364 + d1 * 197 + d2, d3), memory_config: (74, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,12,197,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 12 + d1, d2), memory_config: (1, 7, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 12 : i32, 197 : i32, 1 : i32] | tensor<[1,12,197,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2364 + d1 * 197 + d2, d3), memory_config: (74, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,12,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 12 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 12 : i32, 1 : i32, 1 : i32] | tensor<[1,12,1,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,12,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 12 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 12 : i32, 1 : i32, 1 : i32] | tensor<[1,12,1,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,12,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 12 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 12 : i32, 1 : i32, 1 : i32] | 
tensor<[1,12,1,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,12,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 12 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 12 : i32, 1 : i32, 1 : i32] | tensor<[1,12,1,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,12,201,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 12 + d1, d2), memory_config: (1, 7, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 12 : i32, 201 : i32, 1 : i32] | tensor<[1,12,201,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2412 + d1 * 201 + d2, d3), memory_config: (76, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,12,201,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 12 + d1, d2), memory_config: (1, 7, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 12 : i32, 201 : i32, 1 : i32] | tensor<[1,12,201,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2412 + d1 * 201 + d2, d3), memory_config: (76, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,12,8,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 12 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 12 : i32, 8 : i32, 1 : i32] | tensor<[1,12,8,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 96 + d1 * 8 + d2, d3), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,12,8,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 12 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 12 : i32, 8 : i32, 1 : i32] | tensor<[1,12,8,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 96 + d1 * 8 + d2, d3), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') 
| N/A | nan | nan | +| ttnn.reshape | tensor<[1,16,10,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 16 : i32, 10 : i32, 1 : i32] | tensor<[1,16,10,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 160 + d1 * 10 + d2, d3), memory_config: (5, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,16,10,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 16 : i32, 10 : i32, 1 : i32] | tensor<[1,16,10,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 160 + d1 * 10 + d2, d3), memory_config: (5, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16,197,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16 + d1, d2), memory_config: (1, 7, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 16 : i32, 197 : i32, 1 : i32] | tensor<[1,16,197,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3152 + d1 * 197 + d2, d3), memory_config: (99, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,16,197,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16 + d1, d2), memory_config: (1, 7, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 16 : i32, 197 : i32, 1 : i32] | tensor<[1,16,197,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3152 + d1 * 197 + d2, d3), memory_config: (99, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 16 : i32, 1 : i32, 1 : i32] | tensor<[1,16,1,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,16,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16 + d1, d2), 
memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 16 : i32, 1 : i32, 1 : i32] | tensor<[1,16,1,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 16 : i32, 1 : i32, 1 : i32] | tensor<[1,16,1,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,16,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 16 : i32, 1 : i32, 1 : i32] | tensor<[1,16,1,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16,32,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 16 : i32, 32 : i32, 1 : i32] | tensor<[1,16,32,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 512 + d1 * 32 + d2, d3), memory_config: (16, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,16,32,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 16 : i32, 32 : i32, 1 : i32] | tensor<[1,16,32,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 512 + d1 * 32 + d2, d3), memory_config: (16, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16,5,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 16 : i32, 5 : i32, 1 : i32] | tensor<[1,16,5,1,f32]> | mapping_from: (d0, d1, d2, d3), 
mapping_to: (d0 * 80 + d1 * 5 + d2, d3), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,16,5,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 16 : i32, 5 : i32, 1 : i32] | tensor<[1,16,5,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 80 + d1 * 5 + d2, d3), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,16384,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 512, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 16384 : i32, 1 : i32] | tensor<[1,1,16384,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 16384 + d2, d3), memory_config: (512, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,16384,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 512, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 16384 : i32, 1 : i32] | tensor<[1,1,16384,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 16384 + d2, d3), memory_config: (512, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,19200,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 600, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 19200 : i32, 1 : i32] | tensor<[1,1,19200,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 19200 + d1 * 19200 + d2, d3), memory_config: (600, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,19200,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 600, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 19200 : i32, 1 : i32] | tensor<[1,1,19200,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 19200 + d1 * 19200 + d2, d3), memory_config: (600, 1, 'tile<32x32, f32>', 'dram') | N/A | nan 
| nan | +| ttnn.reshape | tensor<[1,2,4096,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 2 + d1, d2), memory_config: (1, 128, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 2 : i32, 4096 : i32, 1 : i32] | tensor<[1,2,4096,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 8192 + d1 * 4096 + d2, d3), memory_config: (256, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,2,4096,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 2 + d1, d2), memory_config: (1, 128, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 2 : i32, 4096 : i32, 1 : i32] | tensor<[1,2,4096,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 8192 + d1 * 4096 + d2, d3), memory_config: (256, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,2,4800,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 2 + d1, d2), memory_config: (1, 150, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 2 : i32, 4800 : i32, 1 : i32] | tensor<[1,2,4800,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9600 + d1 * 4800 + d2, d3), memory_config: (300, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,2,4800,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 2 + d1, d2), memory_config: (1, 150, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 2 : i32, 4800 : i32, 1 : i32] | tensor<[1,2,4800,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9600 + d1 * 4800 + d2, d3), memory_config: (300, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,5,1024,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 5 + d1, d2), memory_config: (1, 32, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 5 : i32, 1024 : i32, 1 : i32] | tensor<[1,5,1024,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 5120 + d1 * 1024 + d2, d3), memory_config: (160, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,5,1024,f32]> | mapping_from: (d0, d1, d2), mapping_to: 
(d0 * 5 + d1, d2), memory_config: (1, 32, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 5 : i32, 1024 : i32, 1 : i32] | tensor<[1,5,1024,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 5120 + d1 * 1024 + d2, d3), memory_config: (160, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,5,1200,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 5 + d1, d2), memory_config: (1, 38, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 5 : i32, 1200 : i32, 1 : i32] | tensor<[1,5,1200,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6000 + d1 * 1200 + d2, d3), memory_config: (188, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,5,1200,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 5 + d1, d2), memory_config: (1, 38, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 5 : i32, 1200 : i32, 1 : i32] | tensor<[1,5,1200,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6000 + d1 * 1200 + d2, d3), memory_config: (188, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,6,15,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 6 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 6 : i32, 15 : i32, 1 : i32] | tensor<[1,6,15,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 90 + d1 * 15 + d2, d3), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,6,15,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 6 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 6 : i32, 15 : i32, 1 : i32] | tensor<[1,6,15,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 90 + d1 * 15 + d2, d3), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,6,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 6 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 6 : i32, 1 : i32, 1 : i32] | 
tensor<[1,6,1,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,6,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 6 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 6 : i32, 1 : i32, 1 : i32] | tensor<[1,6,1,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,6,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 6 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 6 : i32, 1 : i32, 1 : i32] | tensor<[1,6,1,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,6,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 6 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 6 : i32, 1 : i32, 1 : i32] | tensor<[1,6,1,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,8,10,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 8 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 8 : i32, 10 : i32, 1 : i32] | tensor<[1,8,10,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 80 + d1 * 10 + d2, d3), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,8,10,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 8 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 8 : i32, 10 : i32, 1 : i32] | tensor<[1,8,10,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 80 + d1 * 10 + d2, d3), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | 
tensor<[1,8,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 8 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 8 : i32, 1 : i32, 1 : i32] | tensor<[1,8,1,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 8 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,8,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 8 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 8 : i32, 1 : i32, 1 : i32] | tensor<[1,8,1,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 8 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,8,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 8 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 8 : i32, 1 : i32, 1 : i32] | tensor<[1,8,1,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 8 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,8,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 8 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 8 : i32, 1 : i32, 1 : i32] | tensor<[1,8,1,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 8 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,8,2048,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 8 + d1, d2), memory_config: (1, 64, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 8 : i32, 2048 : i32, 1 : i32] | tensor<[1,8,2048,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 2048 + d2, d3), memory_config: (512, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,8,2048,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 8 + d1, d2), memory_config: (1, 64, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 8 : i32, 2048 : i32, 1 : i32] 
| tensor<[1,8,2048,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 2048 + d2, d3), memory_config: (512, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,8,256,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 8 + d1, d2), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 8 : i32, 256 : i32, 1 : i32] | tensor<[1,8,256,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2048 + d1 * 256 + d2, d3), memory_config: (64, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,8,256,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 8 + d1, d2), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 8 : i32, 256 : i32, 1 : i32] | tensor<[1,8,256,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2048 + d1 * 256 + d2, d3), memory_config: (64, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,8,256,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 8 + d1, d2), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 8 : i32, 256 : i32, 1 : i32] | tensor<[1,8,256,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2048 + d1 * 256 + d2, d3), memory_config: (64, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,8,256,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 8 + d1, d2), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 8 : i32, 256 : i32, 1 : i32] | tensor<[1,8,256,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2048 + d1 * 256 + d2, d3), memory_config: (64, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,8,300,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 8 + d1, d2), memory_config: (1, 10, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 8 : i32, 300 : i32, 1 : i32] | tensor<[1,8,300,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2400 + d1 * 300 + d2, d3), memory_config: 
(75, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,8,300,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 8 + d1, d2), memory_config: (1, 10, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 8 : i32, 300 : i32, 1 : i32] | tensor<[1,8,300,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2400 + d1 * 300 + d2, d3), memory_config: (75, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[8,100,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | shape: [8 : i32, 100 : i32, 1 : i32] | tensor<[8,100,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 100 + d1, d2), memory_config: (25, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[8,100,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | shape: [8 : i32, 100 : i32, 1 : i32] | tensor<[8,100,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 100 + d1, d2), memory_config: (25, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[8,100,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | shape: [8 : i32, 100 : i32, 1 : i32] | tensor<[8,100,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 100 + d1, d2), memory_config: (25, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[8,100,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | shape: [8 : i32, 100 : i32, 1 : i32] | tensor<[8,100,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 100 + d1, d2), memory_config: (25, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[8,920,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 29, 'tile<32x32, f32>', 'dram') | shape: [8 : i32, 920 : i32, 1 : i32] | tensor<[8,920,1,f32]> | mapping_from: (d0, d1, d2), 
mapping_to: (d0 * 920 + d1, d2), memory_config: (230, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[8,920,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 29, 'tile<32x32, f32>', 'dram') | shape: [8 : i32, 920 : i32, 1 : i32] | tensor<[8,920,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 920 + d1, d2), memory_config: (230, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[10,1024,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 10 : i32, 1024 : i32] | tensor<[1,10,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[10,1024,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 10 : i32, 1024 : i32] | tensor<[1,10,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[10,10,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [100 : i32] | tensor<[100,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[10,10,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [100 : i32] | tensor<[100,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[10,2048,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 64, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 10 : i32, 2048 : i32] | tensor<[1,10,2048,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (1, 64, 
'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[10,2048,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 64, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 10 : i32, 2048 : i32] | tensor<[1,10,2048,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (1, 64, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[10,3072,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 10 : i32, 3072 : i32] | tensor<[1,10,3072,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[10,3072,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 10 : i32, 3072 : i32] | tensor<[1,10,3072,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[10,4096,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 128, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 10 : i32, 4096 : i32] | tensor<[1,10,4096,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (1, 128, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[10,4096,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 128, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 10 : i32, 4096 : i32] | tensor<[1,10,4096,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (1, 128, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[10,512,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 10 : i32, 512 : i32] | tensor<[1,10,512,bf16]> | mapping_from: (d0, d1, 
d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[10,512,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 10 : i32, 512 : i32] | tensor<[1,10,512,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[10,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 10 : i32, 768 : i32] | tensor<[1,10,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[10,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 10 : i32, 768 : i32] | tensor<[1,10,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[12,10,10,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (4, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 12 : i32, 10 : i32, 10 : i32] | tensor<[1,12,10,10,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 120 + d1 * 10 + d2, d3), memory_config: (4, 1, 'tile<32x32, bf16>', 'dram') | yes | -0.01 | 20.88 | +| ttnn.reshape | tensor<[12,10,10,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (4, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 12 : i32, 10 : i32, 10 : i32] | tensor<[1,12,10,10,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 120 + d1 * 10 + d2, d3), memory_config: (4, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[12,10,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), 
memory_config: (4, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 12 : i32, 10 : i32, 64 : i32] | tensor<[1,12,10,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 120 + d1 * 10 + d2, d3), memory_config: (4, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[12,10,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (4, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 12 : i32, 10 : i32, 64 : i32] | tensor<[1,12,10,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 120 + d1 * 10 + d2, d3), memory_config: (4, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[12,197,197,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 197 + d1, d2), memory_config: (74, 7, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 12 : i32, 197 : i32, 197 : i32] | tensor<[1,12,197,197,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2364 + d1 * 197 + d2, d3), memory_config: (74, 7, 'tile<32x32, bf16>', 'dram') | yes | 0.21 | 15104.0 | +| ttnn.reshape | tensor<[12,197,197,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 197 + d1, d2), memory_config: (74, 7, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 12 : i32, 197 : i32, 197 : i32] | tensor<[1,12,197,197,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2364 + d1 * 197 + d2, d3), memory_config: (74, 7, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[12,197,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 197 + d1, d2), memory_config: (74, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 12 : i32, 197 : i32, 64 : i32] | tensor<[1,12,197,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2364 + d1 * 197 + d2, d3), memory_config: (74, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[12,197,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 197 + d1, d2), memory_config: (74, 2, 'tile<32x32, bf16>', 'dram') | shape: 
[1 : i32, 12 : i32, 197 : i32, 64 : i32] | tensor<[1,12,197,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2364 + d1 * 197 + d2, d3), memory_config: (74, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[12,1,10,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 12 : i32, 1 : i32, 10 : i32] | tensor<[1,12,1,10,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.15 | 7.28 | +| ttnn.reshape | tensor<[12,1,10,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 12 : i32, 1 : i32, 10 : i32] | tensor<[1,12,1,10,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[12,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 12 : i32, 1 : i32, 1 : i32] | tensor<[1,12,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | yes | -0.27 | 13.5 | +| ttnn.reshape | tensor<[12,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 12 : i32, 1 : i32, 1 : i32] | tensor<[1,12,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[12,1,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 12 : i32, 1 : i32, 64 : i32] | tensor<[1,12,1,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12 + d1 + d2, d3), memory_config: (1, 
2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[12,1,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 12 : i32, 1 : i32, 64 : i32] | tensor<[1,12,1,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12 + d1 + d2, d3), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[12,201,201,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 201 + d1, d2), memory_config: (76, 7, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 12 : i32, 201 : i32, 201 : i32] | tensor<[1,12,201,201,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2412 + d1 * 201 + d2, d3), memory_config: (76, 7, 'tile<32x32, bf16>', 'dram') | yes | 0.64 | 684.0 | +| ttnn.reshape | tensor<[12,201,201,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 201 + d1, d2), memory_config: (76, 7, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 12 : i32, 201 : i32, 201 : i32] | tensor<[1,12,201,201,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2412 + d1 * 201 + d2, d3), memory_config: (76, 7, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[12,201,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 201 + d1, d2), memory_config: (76, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 12 : i32, 201 : i32, 64 : i32] | tensor<[1,12,201,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2412 + d1 * 201 + d2, d3), memory_config: (76, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[12,201,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 201 + d1, d2), memory_config: (76, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 12 : i32, 201 : i32, 64 : i32] | tensor<[1,12,201,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2412 + d1 * 201 + d2, d3), memory_config: (76, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | 
tensor<[12,8,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 8 + d1, d2), memory_config: (3, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 12 : i32, 8 : i32, 64 : i32] | tensor<[1,12,8,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 96 + d1 * 8 + d2, d3), memory_config: (3, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[12,8,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 8 + d1, d2), memory_config: (3, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 12 : i32, 8 : i32, 64 : i32] | tensor<[1,12,8,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 96 + d1 * 8 + d2, d3), memory_config: (3, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[12,8,8,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 8 + d1, d2), memory_config: (3, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 12 : i32, 8 : i32, 8 : i32] | tensor<[1,12,8,8,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 96 + d1 * 8 + d2, d3), memory_config: (3, 1, 'tile<32x32, bf16>', 'dram') | yes | -0.08 | 86.0 | +| ttnn.reshape | tensor<[12,8,8,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 8 + d1, d2), memory_config: (3, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 12 : i32, 8 : i32, 8 : i32] | tensor<[1,12,8,8,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 96 + d1 * 8 + d2, d3), memory_config: (3, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[15,1024,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 15 : i32, 1024 : i32] | tensor<[1,15,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 15 + d1, d2), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[15,1024,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 15 : i32, 1024 : i32] 
| tensor<[1,15,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 15 + d1, d2), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[15,384,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 12, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 15 : i32, 384 : i32] | tensor<[1,15,384,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 15 + d1, d2), memory_config: (1, 12, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[15,384,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 12, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 15 : i32, 384 : i32] | tensor<[1,15,384,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 15 + d1, d2), memory_config: (1, 12, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[15,512,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 15 : i32, 512 : i32] | tensor<[1,15,512,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 15 + d1, d2), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[15,512,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 15 : i32, 512 : i32] | tensor<[1,15,512,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 15 + d1, d2), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[16,10,10,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (5, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 10 : i32, 10 : i32] | tensor<[1,16,10,10,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 160 + d1 * 10 + d2, d3), memory_config: (5, 1, 'tile<32x32, bf16>', 'dram') | yes | -0.02 | 949978046398464.0 | +| ttnn.reshape | tensor<[16,10,10,bf16]> | mapping_from: (d0, d1, d2), 
mapping_to: (d0 * 10 + d1, d2), memory_config: (5, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 10 : i32, 10 : i32] | tensor<[1,16,10,10,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 160 + d1 * 10 + d2, d3), memory_config: (5, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[16,10,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (5, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 10 : i32, 64 : i32] | tensor<[1,16,10,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 160 + d1 * 10 + d2, d3), memory_config: (5, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[16,10,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (5, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 10 : i32, 64 : i32] | tensor<[1,16,10,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 160 + d1 * 10 + d2, d3), memory_config: (5, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[16,197,197,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 197 + d1, d2), memory_config: (99, 7, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 197 : i32, 197 : i32] | tensor<[1,16,197,197,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3152 + d1 * 197 + d2, d3), memory_config: (99, 7, 'tile<32x32, bf16>', 'dram') | yes | -0.0 | inf | +| ttnn.reshape | tensor<[16,197,197,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 197 + d1, d2), memory_config: (99, 7, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 197 : i32, 197 : i32] | tensor<[1,16,197,197,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3152 + d1 * 197 + d2, d3), memory_config: (99, 7, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[16,197,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 197 + d1, d2), memory_config: (99, 2, 'tile<32x32, 
bf16>', 'dram') | shape: [1 : i32, 16 : i32, 197 : i32, 64 : i32] | tensor<[1,16,197,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3152 + d1 * 197 + d2, d3), memory_config: (99, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[16,197,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 197 + d1, d2), memory_config: (99, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 197 : i32, 64 : i32] | tensor<[1,16,197,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3152 + d1 * 197 + d2, d3), memory_config: (99, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[16,1,10,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 1 : i32, 10 : i32] | tensor<[1,16,1,10,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.3 | 8.5 | +| ttnn.reshape | tensor<[16,1,10,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 1 : i32, 10 : i32] | tensor<[1,16,1,10,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[16,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 1 : i32, 1 : i32] | tensor<[1,16,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | yes | -0.09 | 6.69 | +| ttnn.reshape | tensor<[16,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 1 : i32, 1 : i32] | tensor<[1,16,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: 
(d0 * 16 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[16,1,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 1 : i32, 64 : i32] | tensor<[1,16,1,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16 + d1 + d2, d3), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[16,1,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 1 : i32, 64 : i32] | tensor<[1,16,1,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16 + d1 + d2, d3), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[16,5,5,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 5 + d1, d2), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 16 : i32, 5 : i32, 5 : i32] | tensor<[1,16,5,5,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 80 + d1 * 5 + d2, d3), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[16,5,5,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 5 + d1, d2), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 16 : i32, 5 : i32, 5 : i32] | tensor<[1,16,5,5,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 80 + d1 * 5 + d2, d3), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[16,5,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 5 + d1, d2), memory_config: (3, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 5 : i32, 64 : i32] | tensor<[1,16,5,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 80 + d1 * 5 + d2, d3), memory_config: (3, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[16,5,64,bf16]> | mapping_from: (d0, 
d1, d2), mapping_to: (d0 * 5 + d1, d2), memory_config: (3, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 5 : i32, 64 : i32] | tensor<[1,16,5,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 80 + d1 * 5 + d2, d3), memory_config: (3, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[197,1024,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (7, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 197 : i32, 1024 : i32] | tensor<[1,197,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 197 + d1, d2), memory_config: (7, 32, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[197,1024,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (7, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 197 : i32, 1024 : i32] | tensor<[1,197,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 197 + d1, d2), memory_config: (7, 32, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[197,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (7, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 197 : i32, 768 : i32] | tensor<[1,197,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 197 + d1, d2), memory_config: (7, 24, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[197,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (7, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 197 : i32, 768 : i32] | tensor<[1,197,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 197 + d1, d2), memory_config: (7, 24, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[19,256008,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 8001, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 19 : i32, 256008 : i32] | tensor<[1,19,256008,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 19 + d1, d2), memory_config: (1, 8001, 
'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[19,256008,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 8001, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 19 : i32, 256008 : i32] | tensor<[1,19,256008,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 19 + d1, d2), memory_config: (1, 8001, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1024,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1024 : i32] | tensor<[1,1,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1024,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1024 : i32] | tensor<[1,1,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1024,160,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (32, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1024 : i32, 160 : i32] | tensor<[1,1024,160,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (32, 5, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1024,160,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (32, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1024 : i32, 160 : i32] | tensor<[1,1024,160,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (32, 5, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1024,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (32, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1024 : i32, 256 : 
i32] | tensor<[1,1024,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (32, 8, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1024,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (32, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1024 : i32, 256 : i32] | tensor<[1,1024,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (32, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,10,10,6,4,f32]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 600 + d1 * 60 + d2 * 6 + d3, d4), memory_config: (19, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 600 : i32, 4 : i32] | tensor<[1,600,4,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 600 + d1, d2), memory_config: (19, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,10,10,6,4,f32]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 600 + d1 * 60 + d2 * 6 + d3, d4), memory_config: (19, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 600 : i32, 4 : i32] | tensor<[1,600,4,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 600 + d1, d2), memory_config: (19, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,10,10,6,91,f32]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 600 + d1 * 60 + d2 * 6 + d3, d4), memory_config: (19, 3, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 600 : i32, 91 : i32] | tensor<[1,600,91,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 600 + d1, d2), memory_config: (19, 3, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,10,10,6,91,f32]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 600 + d1 * 60 + d2 * 6 + d3, d4), memory_config: (19, 3, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 600 : i32, 91 : i32] | tensor<[1,600,91,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 600 + d1, d2), 
memory_config: (19, 3, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1200,320,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1200 + d1, d2), memory_config: (38, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1200 : i32, 320 : i32] | tensor<[1,1200,320,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1200 + d1, d2), memory_config: (38, 10, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1200,320,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1200 + d1, d2), memory_config: (38, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1200 : i32, 320 : i32] | tensor<[1,1200,320,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1200 + d1, d2), memory_config: (38, 10, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,12,10,10,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 120 + d1 * 10 + d2, d3), memory_config: (4, 1, 'tile<32x32, bf16>', 'dram') | shape: [12 : i32, 10 : i32, 10 : i32] | tensor<[12,10,10,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (4, 1, 'tile<32x32, bf16>', 'dram') | yes | -0.0 | 32.0 | +| ttnn.reshape | tensor<[1,12,10,10,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 120 + d1 * 10 + d2, d3), memory_config: (4, 1, 'tile<32x32, bf16>', 'dram') | shape: [12 : i32, 10 : i32, 10 : i32] | tensor<[12,10,10,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (4, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,12,1,10,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [12 : i32, 1 : i32, 10 : i32] | tensor<[12,1,10,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.15 | 7.28 | +| ttnn.reshape | tensor<[1,12,1,10,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: 
(d0 * 12 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [12 : i32, 1 : i32, 10 : i32] | tensor<[12,1,10,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,12,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [12 : i32, 1 : i32, 1 : i32] | tensor<[12,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | yes | -0.07 | 17.75 | +| ttnn.reshape | tensor<[1,12,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [12 : i32, 1 : i32, 1 : i32] | tensor<[12,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16384,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16384 + d1, d2), memory_config: (512, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 16384 : i32, 256 : i32] | tensor<[1,1,16384,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 16384 + d2, d3), memory_config: (512, 8, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,16384,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16384 + d1, d2), memory_config: (512, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 16384 : i32, 256 : i32] | tensor<[1,1,16384,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 16384 + d2, d3), memory_config: (512, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16384,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16384 + d1, d2), memory_config: (512, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16384 : i32, 256 : i32] | 
tensor<[1,16384,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16384 + d1, d2), memory_config: (512, 8, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,16384,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16384 + d1, d2), memory_config: (512, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16384 : i32, 256 : i32] | tensor<[1,16384,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16384 + d1, d2), memory_config: (512, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16384,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16384 + d1, d2), memory_config: (512, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 16384 : i32, 32 : i32] | tensor<[1,1,16384,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 16384 + d2, d3), memory_config: (512, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,16384,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16384 + d1, d2), memory_config: (512, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 16384 : i32, 32 : i32] | tensor<[1,1,16384,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 16384 + d2, d3), memory_config: (512, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16384,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16384 + d1, d2), memory_config: (512, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16384 : i32, 32 : i32] | tensor<[1,16384,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16384 + d1, d2), memory_config: (512, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,16384,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16384 + d1, d2), memory_config: (512, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16384 : i32, 32 : i32] | tensor<[1,16384,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16384 + d1, 
d2), memory_config: (512, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16,10,10,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 160 + d1 * 10 + d2, d3), memory_config: (5, 1, 'tile<32x32, bf16>', 'dram') | shape: [16 : i32, 10 : i32, 10 : i32] | tensor<[16,10,10,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (5, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.01 | 53.75 | +| ttnn.reshape | tensor<[1,16,10,10,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 160 + d1 * 10 + d2, d3), memory_config: (5, 1, 'tile<32x32, bf16>', 'dram') | shape: [16 : i32, 10 : i32, 10 : i32] | tensor<[16,10,10,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (5, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16,16,16,16,3,bf16]> | mapping_from: (d0, d1, d2, d3, d4, d5), mapping_to: (d0 * 65536 + d1 * 4096 + d2 * 256 + d3 * 16 + d4, d5), memory_config: (2048, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 768 : i32] | tensor<[1,256,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 24, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,16,16,16,16,3,bf16]> | mapping_from: (d0, d1, d2, d3, d4, d5), mapping_to: (d0 * 65536 + d1 * 4096 + d2 * 256 + d3 * 16 + d4, d5), memory_config: (2048, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 768 : i32] | tensor<[1,256,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 24, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16,1,10,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [16 : i32, 1 : i32, 10 : i32] | tensor<[16,1,10,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | yes | 
0.3 | 8.5 | +| ttnn.reshape | tensor<[1,16,1,10,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [16 : i32, 1 : i32, 10 : i32] | tensor<[16,1,10,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [16 : i32, 1 : i32, 1 : i32] | tensor<[16,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.03 | 11.94 | +| ttnn.reshape | tensor<[1,16,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [16 : i32, 1 : i32, 1 : i32] | tensor<[16,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16,5,5,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 80 + d1 * 5 + d2, d3), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | shape: [16 : i32, 5 : i32, 5 : i32] | tensor<[16,5,5,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 5 + d1, d2), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,16,5,5,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 80 + d1 * 5 + d2, d3), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | shape: [16 : i32, 5 : i32, 5 : i32] | tensor<[16,5,5,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 5 + d1, d2), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,19200,300,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 19200 + d1, d2), memory_config: (600, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 19200 : 
i32, 300 : i32] | tensor<[1,1,19200,300,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 19200 + d1 * 19200 + d2, d3), memory_config: (600, 10, 'tile<32x32, bf16>', 'dram') | yes | 0.19 | 208.0 | +| ttnn.reshape | tensor<[1,19200,300,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 19200 + d1, d2), memory_config: (600, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 19200 : i32, 300 : i32] | tensor<[1,1,19200,300,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 19200 + d1 * 19200 + d2, d3), memory_config: (600, 10, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,19200,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 19200 + d1, d2), memory_config: (600, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 19200 : i32, 64 : i32] | tensor<[1,1,19200,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 19200 + d1 * 19200 + d2, d3), memory_config: (600, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,19200,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 19200 + d1, d2), memory_config: (600, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 19200 : i32, 64 : i32] | tensor<[1,1,19200,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 19200 + d1 * 19200 + d2, d3), memory_config: (600, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,19200,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 19200 + d1, d2), memory_config: (600, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 19200 : i32, 64 : i32] | tensor<[1,19200,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 19200 + d1, d2), memory_config: (600, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,19200,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 19200 + d1, d2), memory_config: (600, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 19200 : i32, 64 : i32] | 
tensor<[1,19200,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 19200 + d1, d2), memory_config: (600, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,19,16,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 304 + d1 * 16 + d2, d3), memory_config: (10, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 19 : i32, 1024 : i32] | tensor<[1,19,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 19 + d1, d2), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,19,16,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 304 + d1 * 16 + d2, d3), memory_config: (10, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 19 : i32, 1024 : i32] | tensor<[1,19,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 19 + d1, d2), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,2048,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 64, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 2048 : i32] | tensor<[1,1,2048,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 64, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,2048,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 64, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 2048 : i32] | tensor<[1,1,2048,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 64, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,20,20,6,4,f32]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 2400 + d1 * 120 + d2 * 6 + d3, d4), memory_config: (75, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 2400 : i32, 4 : i32] | tensor<[1,2400,4,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 2400 + d1, d2), memory_config: (75, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | 
tensor<[1,20,20,6,4,f32]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 2400 + d1 * 120 + d2 * 6 + d3, d4), memory_config: (75, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 2400 : i32, 4 : i32] | tensor<[1,2400,4,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 2400 + d1, d2), memory_config: (75, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,20,20,6,91,f32]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 2400 + d1 * 120 + d2 * 6 + d3, d4), memory_config: (75, 3, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 2400 : i32, 91 : i32] | tensor<[1,2400,91,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 2400 + d1, d2), memory_config: (75, 3, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,20,20,6,91,f32]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 2400 + d1 * 120 + d2 * 6 + d3, d4), memory_config: (75, 3, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 2400 : i32, 91 : i32] | tensor<[1,2400,91,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 2400 + d1, d2), memory_config: (75, 3, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,256,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 256 : i32] | tensor<[1,256,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 8, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,256,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 256 : i32] | tensor<[1,256,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,2,2,6,4,f32]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 24 + d1 * 12 + d2 * 6 + d3, d4), 
memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 24 : i32, 4 : i32] | tensor<[1,24,4,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 24 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,2,2,6,4,f32]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 24 + d1 * 12 + d2 * 6 + d3, d4), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 24 : i32, 4 : i32] | tensor<[1,24,4,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 24 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,2,2,6,91,f32]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 24 + d1 * 12 + d2 * 6 + d3, d4), memory_config: (1, 3, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 24 : i32, 91 : i32] | tensor<[1,24,91,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 24 + d1, d2), memory_config: (1, 3, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,2,2,6,91,f32]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 24 + d1 * 12 + d2 * 6 + d3, d4), memory_config: (1, 3, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 24 : i32, 91 : i32] | tensor<[1,24,91,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 24 + d1, d2), memory_config: (1, 3, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,300,512,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 300 + d1, d2), memory_config: (10, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 300 : i32, 512 : i32] | tensor<[1,300,512,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 300 + d1, d2), memory_config: (10, 16, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,300,512,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 300 + d1, d2), memory_config: (10, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 300 : i32, 512 : i32] | tensor<[1,300,512,bf16]> | mapping_from: (d0, d1, d2), 
mapping_to: (d0 * 300 + d1, d2), memory_config: (10, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,3072,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3072 : i32] | tensor<[1,1,3072,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,3072,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3072 : i32] | tensor<[1,1,3072,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32128,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1004, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 32128 : i32] | tensor<[1,1,32128,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1004, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,32128,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1004, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 32128 : i32] | tensor<[1,1,32128,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1004, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,16,96,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 512 + d1 * 16 + d2, d3), memory_config: (16, 3, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 1536 : i32] | tensor<[1,32,1536,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (1, 48, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,32,16,96,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 512 + d1 * 16 + d2, d3), memory_config: (16, 3, 'tile<32x32, bf16>', 
'dram') | shape: [1 : i32, 32 : i32, 1536 : i32] | tensor<[1,32,1536,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (1, 48, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,7,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 32 : i32, 7 : i32] | tensor<[1,32,7,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,32,7,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 32 : i32, 7 : i32] | tensor<[1,32,7,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,384,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 12, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 384 : i32] | tensor<[1,1,384,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 12, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,384,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 12, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 384 : i32] | tensor<[1,1,384,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 12, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,3,3,6,4,f32]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 54 + d1 * 18 + d2 * 6 + d3, d4), memory_config: (2, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 54 : i32, 4 : i32] | tensor<[1,54,4,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 54 + d1, d2), memory_config: (2, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,3,3,6,4,f32]> | mapping_from: (d0, 
d1, d2, d3, d4), mapping_to: (d0 * 54 + d1 * 18 + d2 * 6 + d3, d4), memory_config: (2, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 54 : i32, 4 : i32] | tensor<[1,54,4,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 54 + d1, d2), memory_config: (2, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,3,3,6,91,f32]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 54 + d1 * 18 + d2 * 6 + d3, d4), memory_config: (2, 3, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 54 : i32, 91 : i32] | tensor<[1,54,91,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 54 + d1, d2), memory_config: (2, 3, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,3,3,6,91,f32]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 54 + d1 * 18 + d2 * 6 + d3, d4), memory_config: (2, 3, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 54 : i32, 91 : i32] | tensor<[1,54,91,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 54 + d1, d2), memory_config: (2, 3, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,4096,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 128, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 4096 : i32] | tensor<[1,1,4096,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 128, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,4096,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 128, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 4096 : i32] | tensor<[1,1,4096,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 128, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,4096,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (128, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 4096 : i32, 256 : i32] | tensor<[1,4096,256,bf16]> | mapping_from: (d0, d1, d2), 
mapping_to: (d0 * 4096 + d1, d2), memory_config: (128, 8, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,4096,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (128, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 4096 : i32, 256 : i32] | tensor<[1,4096,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (128, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,4096,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (128, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 4096 : i32, 64 : i32] | tensor<[1,4096,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (128, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,4096,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (128, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 4096 : i32, 64 : i32] | tensor<[1,4096,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (128, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,4800,128,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4800 + d1, d2), memory_config: (150, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 4800 : i32, 128 : i32] | tensor<[1,4800,128,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4800 + d1, d2), memory_config: (150, 4, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,4800,128,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4800 + d1, d2), memory_config: (150, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 4800 : i32, 128 : i32] | tensor<[1,4800,128,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4800 + d1, d2), memory_config: (150, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,512,bf16]> | mapping_from: 
(d0, d1), mapping_to: (d0, d1), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 512 : i32] | tensor<[1,1,512,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,512,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 512 : i32] | tensor<[1,1,512,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,5,4,4,64,bf16]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 80 + d1 * 16 + d2 * 4 + d3, d4), memory_config: (3, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 5 : i32, 16 : i32, 64 : i32] | tensor<[1,5,16,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 80 + d1 * 16 + d2, d3), memory_config: (3, 2, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,5,4,4,64,bf16]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 80 + d1 * 16 + d2 * 4 + d3, d4), memory_config: (3, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 5 : i32, 16 : i32, 64 : i32] | tensor<[1,5,16,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 80 + d1 * 16 + d2, d3), memory_config: (3, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,5,5,6,4,f32]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 150 + d1 * 30 + d2 * 6 + d3, d4), memory_config: (5, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 150 : i32, 4 : i32] | tensor<[1,150,4,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 150 + d1, d2), memory_config: (5, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,5,5,6,4,f32]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 150 + d1 * 30 + d2 * 6 + d3, d4), memory_config: (5, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 
150 : i32, 4 : i32] | tensor<[1,150,4,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 150 + d1, d2), memory_config: (5, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,5,5,6,91,f32]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 150 + d1 * 30 + d2 * 6 + d3, d4), memory_config: (5, 3, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 150 : i32, 91 : i32] | tensor<[1,150,91,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 150 + d1, d2), memory_config: (5, 3, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,5,5,6,91,f32]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 150 + d1 * 30 + d2 * 6 + d3, d4), memory_config: (5, 3, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 150 : i32, 91 : i32] | tensor<[1,150,91,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 150 + d1, d2), memory_config: (5, 3, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,64,32,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (2, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 64 : i32, 32 : i32] | tensor<[1,64,32,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (2, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,64,32,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (2, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 64 : i32, 32 : i32] | tensor<[1,64,32,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (2, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,6,15,15,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 90 + d1 * 15 + d2, d3), memory_config: (3, 1, 'tile<32x32, bf16>', 'dram') | shape: [6 : i32, 15 : i32, 15 : i32] | tensor<[6,15,15,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 15 + d1, d2), memory_config: (3, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.05 | 25.25 
| +| ttnn.reshape | tensor<[1,6,15,15,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 90 + d1 * 15 + d2, d3), memory_config: (3, 1, 'tile<32x32, bf16>', 'dram') | shape: [6 : i32, 15 : i32, 15 : i32] | tensor<[6,15,15,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 15 + d1, d2), memory_config: (3, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,6,1,15,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [6 : i32, 1 : i32, 15 : i32] | tensor<[6,1,15,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.38 | 9.0 | +| ttnn.reshape | tensor<[1,6,1,15,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [6 : i32, 1 : i32, 15 : i32] | tensor<[6,1,15,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,6,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [6 : i32, 1 : i32, 1 : i32] | tensor<[6,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.12 | 25.12 | +| ttnn.reshape | tensor<[1,6,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [6 : i32, 1 : i32, 1 : i32] | tensor<[6,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 768 : i32] | tensor<[1,1,768,bf16]> | mapping_from: 
(d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 768 : i32] | tensor<[1,1,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,7,4544,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (1, 142, 'tile<32x32, bf16>', 'dram') | shape: [7 : i32, 4544 : i32] | tensor<[7,4544,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 142, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,7,4544,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (1, 142, 'tile<32x32, bf16>', 'dram') | shape: [7 : i32, 4544 : i32] | tensor<[7,4544,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 142, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,7,71,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 497 + d1 * 71 + d2, d3), memory_config: (16, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 4544 : i32] | tensor<[1,7,4544,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (1, 142, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,7,71,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 497 + d1 * 71 + d2, d3), memory_config: (16, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 4544 : i32] | tensor<[1,7,4544,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (1, 142, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,8,10,10,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 80 + d1 * 10 + d2, d3), memory_config: (3, 1, 
'tile<32x32, bf16>', 'dram') | shape: [8 : i32, 10 : i32, 10 : i32] | tensor<[8,10,10,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (3, 1, 'tile<32x32, bf16>', 'dram') | yes | -0.0 | 18.5 | +| ttnn.reshape | tensor<[1,8,10,10,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 80 + d1 * 10 + d2, d3), memory_config: (3, 1, 'tile<32x32, bf16>', 'dram') | shape: [8 : i32, 10 : i32, 10 : i32] | tensor<[8,10,10,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (3, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,8,1,10,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 8 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [8 : i32, 1 : i32, 10 : i32] | tensor<[8,1,10,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.05 | 10.25 | +| ttnn.reshape | tensor<[1,8,1,10,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 8 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [8 : i32, 1 : i32, 10 : i32] | tensor<[8,1,10,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,8,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 8 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [8 : i32, 1 : i32, 1 : i32] | tensor<[8,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | yes | -0.1 | 8.0 | +| ttnn.reshape | tensor<[1,8,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 8 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [8 : i32, 1 : i32, 1 : i32] | tensor<[8,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | 
nan | +| ttnn.reshape | tensor<[20,20,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [400 : i32] | tensor<[400,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 13, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[20,20,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [400 : i32] | tensor<[400,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 13, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[256,1280,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 1280 : i32] | tensor<[1,256,1280,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[256,1280,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 1280 : i32] | tensor<[1,256,1280,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[2,2,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [4 : i32] | tensor<[4,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[2,2,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [4 : i32] | tensor<[4,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[2,4096,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (256, 8, 'tile<32x32, 
bf16>', 'dram') | shape: [1 : i32, 2 : i32, 4096 : i32, 256 : i32] | tensor<[1,2,4096,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 8192 + d1 * 4096 + d2, d3), memory_config: (256, 8, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[2,4096,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (256, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 2 : i32, 4096 : i32, 256 : i32] | tensor<[1,2,4096,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 8192 + d1 * 4096 + d2, d3), memory_config: (256, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[2,4096,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (256, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 2 : i32, 4096 : i32, 32 : i32] | tensor<[1,2,4096,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 8192 + d1 * 4096 + d2, d3), memory_config: (256, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[2,4096,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (256, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 2 : i32, 4096 : i32, 32 : i32] | tensor<[1,2,4096,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 8192 + d1 * 4096 + d2, d3), memory_config: (256, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[2,4800,300,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4800 + d1, d2), memory_config: (300, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 2 : i32, 4800 : i32, 300 : i32] | tensor<[1,2,4800,300,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9600 + d1 * 4800 + d2, d3), memory_config: (300, 10, 'tile<32x32, bf16>', 'dram') | yes | 0.78 | 103.5 | +| ttnn.reshape | tensor<[2,4800,300,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4800 + d1, d2), memory_config: (300, 10, 'tile<32x32, bf16>', 'dram') | shape: 
[1 : i32, 2 : i32, 4800 : i32, 300 : i32] | tensor<[1,2,4800,300,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9600 + d1 * 4800 + d2, d3), memory_config: (300, 10, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[2,4800,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4800 + d1, d2), memory_config: (300, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 2 : i32, 4800 : i32, 64 : i32] | tensor<[1,2,4800,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9600 + d1 * 4800 + d2, d3), memory_config: (300, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[2,4800,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4800 + d1, d2), memory_config: (300, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 2 : i32, 4800 : i32, 64 : i32] | tensor<[1,2,4800,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9600 + d1 * 4800 + d2, d3), memory_config: (300, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[2,8,64,7,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 512 + d1 * 64 + d2, d3), memory_config: (32, 1, 'tile<32x32, f32>', 'dram') | shape: [16 : i32, 64 : i32, 7 : i32] | tensor<[16,64,7,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (32, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[2,8,64,7,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 512 + d1 * 64 + d2, d3), memory_config: (32, 1, 'tile<32x32, f32>', 'dram') | shape: [16 : i32, 64 : i32, 7 : i32] | tensor<[16,64,7,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (32, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[2,8,7,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 56 + d1 * 7 + d2, d3), memory_config: (4, 2, 'tile<32x32, f32>', 'dram') | shape: [16 : i32, 7 : i32, 64 : i32] | tensor<[16,7,64,f32]> | mapping_from: (d0, d1, d2), 
mapping_to: (d0 * 7 + d1, d2), memory_config: (4, 2, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[2,8,7,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 56 + d1 * 7 + d2, d3), memory_config: (4, 2, 'tile<32x32, f32>', 'dram') | shape: [16 : i32, 7 : i32, 64 : i32] | tensor<[16,7,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (4, 2, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[32,11008,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 344, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 11008 : i32] | tensor<[1,32,11008,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (1, 344, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[32,11008,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 344, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 11008 : i32] | tensor<[1,32,11008,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (1, 344, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[32,250880,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 7840, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 250880 : i32] | tensor<[1,32,250880,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (1, 7840, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[32,250880,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 7840, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 250880 : i32] | tensor<[1,32,250880,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (1, 7840, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[32,32000,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1000, 'tile<32x32, bf16>', 
'dram') | shape: [1 : i32, 32 : i32, 32000 : i32] | tensor<[1,32,32000,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (1, 1000, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[32,32000,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1000, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 32000 : i32] | tensor<[1,32,32000,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (1, 1000, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[32,4096,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 128, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 4096 : i32] | tensor<[1,32,4096,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (1, 128, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[32,4096,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 128, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 4096 : i32] | tensor<[1,32,4096,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (1, 128, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[3,3,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [9 : i32] | tensor<[9,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[3,3,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [9 : i32] | tensor<[9,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[4096,320,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 4096 : i32, 320 
: i32] | tensor<[1,4096,320,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[4096,320,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 4096 : i32, 320 : i32] | tensor<[1,4096,320,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[5,1024,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 5 : i32, 1024 : i32] | tensor<[1,5,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 5 + d1, d2), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[5,1024,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 5 : i32, 1024 : i32] | tensor<[1,5,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 5 + d1, d2), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[5,1024,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (160, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 5 : i32, 1024 : i32, 256 : i32] | tensor<[1,5,1024,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 5120 + d1 * 1024 + d2, d3), memory_config: (160, 8, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[5,1024,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (160, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 5 : i32, 1024 : i32, 256 : i32] | tensor<[1,5,1024,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 5120 + d1 * 1024 + d2, d3), memory_config: (160, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| 
ttnn.reshape | tensor<[5,1024,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (160, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 5 : i32, 1024 : i32, 32 : i32] | tensor<[1,5,1024,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 5120 + d1 * 1024 + d2, d3), memory_config: (160, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[5,1024,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (160, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 5 : i32, 1024 : i32, 32 : i32] | tensor<[1,5,1024,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 5120 + d1 * 1024 + d2, d3), memory_config: (160, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[5,1200,300,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1200 + d1, d2), memory_config: (188, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 5 : i32, 1200 : i32, 300 : i32] | tensor<[1,5,1200,300,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6000 + d1 * 1200 + d2, d3), memory_config: (188, 10, 'tile<32x32, bf16>', 'dram') | yes | 0.13 | 70.0 | +| ttnn.reshape | tensor<[5,1200,300,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1200 + d1, d2), memory_config: (188, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 5 : i32, 1200 : i32, 300 : i32] | tensor<[1,5,1200,300,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6000 + d1 * 1200 + d2, d3), memory_config: (188, 10, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[5,1200,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1200 + d1, d2), memory_config: (188, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 5 : i32, 1200 : i32, 64 : i32] | tensor<[1,5,1200,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6000 + d1 * 1200 + d2, d3), memory_config: (188, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | 
tensor<[5,1200,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1200 + d1, d2), memory_config: (188, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 5 : i32, 1200 : i32, 64 : i32] | tensor<[1,5,1200,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6000 + d1 * 1200 + d2, d3), memory_config: (188, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[5,3072,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 5 : i32, 3072 : i32] | tensor<[1,5,3072,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 5 + d1, d2), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[5,3072,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 5 : i32, 3072 : i32] | tensor<[1,5,3072,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 5 + d1, d2), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[5,5,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [25 : i32] | tensor<[25,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[5,5,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [25 : i32] | tensor<[25,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[600,256,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (19, 8, 'tile<32x32, bf16>', 'dram') | shape: [6 : i32, 1 : i32, 100 : i32, 256 : i32] | tensor<[6,1,100,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 100 + d2, d3), memory_config: (19, 8, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| 
ttnn.reshape | tensor<[600,256,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (19, 8, 'tile<32x32, bf16>', 'dram') | shape: [6 : i32, 1 : i32, 100 : i32, 256 : i32] | tensor<[6,1,100,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 100 + d2, d3), memory_config: (19, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[600,4,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (19, 1, 'tile<32x32, bf16>', 'dram') | shape: [6 : i32, 1 : i32, 100 : i32, 4 : i32] | tensor<[6,1,100,4,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 100 + d2, d3), memory_config: (19, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.07 | 16.25 | +| ttnn.reshape | tensor<[600,4,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (19, 1, 'tile<32x32, bf16>', 'dram') | shape: [6 : i32, 1 : i32, 100 : i32, 4 : i32] | tensor<[6,1,100,4,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 100 + d2, d3), memory_config: (19, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[64,1280,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (2, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 1280 : i32] | tensor<[1,64,1280,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (2, 40, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[64,1280,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (2, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 1280 : i32] | tensor<[1,64,1280,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (2, 40, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[6,100,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 100 + d1, d2), memory_config: (19, 8, 'tile<32x32, bf16>', 'dram') | shape: [6 : i32, 1 : i32, 100 : i32, 256 : i32] | 
tensor<[6,1,100,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 100 + d2, d3), memory_config: (19, 8, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[6,100,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 100 + d1, d2), memory_config: (19, 8, 'tile<32x32, bf16>', 'dram') | shape: [6 : i32, 1 : i32, 100 : i32, 256 : i32] | tensor<[6,1,100,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 100 + d2, d3), memory_config: (19, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[6,100,92,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 100 + d1, d2), memory_config: (19, 3, 'tile<32x32, bf16>', 'dram') | shape: [6 : i32, 1 : i32, 100 : i32, 92 : i32] | tensor<[6,1,100,92,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 100 + d2, d3), memory_config: (19, 3, 'tile<32x32, bf16>', 'dram') | yes | 0.02 | 27.25 | +| ttnn.reshape | tensor<[6,100,92,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 100 + d1, d2), memory_config: (19, 3, 'tile<32x32, bf16>', 'dram') | shape: [6 : i32, 1 : i32, 100 : i32, 92 : i32] | tensor<[6,1,100,92,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 100 + d2, d3), memory_config: (19, 3, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[6,1024,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 6 : i32, 1024 : i32] | tensor<[1,6,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 6 + d1, d2), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[6,1024,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 6 : i32, 1024 : i32] | tensor<[1,6,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 6 + d1, d2), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | N/A 
| nan | nan | +| ttnn.reshape | tensor<[6,15,15,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 15 + d1, d2), memory_config: (3, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 6 : i32, 15 : i32, 15 : i32] | tensor<[1,6,15,15,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 90 + d1 * 15 + d2, d3), memory_config: (3, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.08 | 17.5 | +| ttnn.reshape | tensor<[6,15,15,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 15 + d1, d2), memory_config: (3, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 6 : i32, 15 : i32, 15 : i32] | tensor<[1,6,15,15,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 90 + d1 * 15 + d2, d3), memory_config: (3, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[6,15,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 15 + d1, d2), memory_config: (3, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 6 : i32, 15 : i32, 64 : i32] | tensor<[1,6,15,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 90 + d1 * 15 + d2, d3), memory_config: (3, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[6,15,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 15 + d1, d2), memory_config: (3, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 6 : i32, 15 : i32, 64 : i32] | tensor<[1,6,15,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 90 + d1 * 15 + d2, d3), memory_config: (3, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[6,1,100,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 100 + d2, d3), memory_config: (19, 8, 'tile<32x32, bf16>', 'dram') | shape: [6 : i32, 100 : i32, 256 : i32] | tensor<[6,100,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 100 + d1, d2), memory_config: (19, 8, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[6,1,100,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 
+ d1 * 100 + d2, d3), memory_config: (19, 8, 'tile<32x32, bf16>', 'dram') | shape: [6 : i32, 100 : i32, 256 : i32] | tensor<[6,100,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 100 + d1, d2), memory_config: (19, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[6,1,100,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 100 + d2, d3), memory_config: (19, 8, 'tile<32x32, bf16>', 'dram') | shape: [600 : i32, 256 : i32] | tensor<[600,256,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (19, 8, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[6,1,100,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 100 + d2, d3), memory_config: (19, 8, 'tile<32x32, bf16>', 'dram') | shape: [600 : i32, 256 : i32] | tensor<[600,256,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (19, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[6,1,100,4,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 100 + d2, d3), memory_config: (19, 1, 'tile<32x32, bf16>', 'dram') | shape: [600 : i32, 4 : i32] | tensor<[600,4,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (19, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.06 | 16.25 | +| ttnn.reshape | tensor<[6,1,100,4,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 100 + d2, d3), memory_config: (19, 1, 'tile<32x32, bf16>', 'dram') | shape: [600 : i32, 4 : i32] | tensor<[600,4,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (19, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[6,1,100,92,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 100 + d2, d3), memory_config: (19, 3, 'tile<32x32, bf16>', 'dram') | shape: [6 : i32, 100 : i32, 92 : i32] | tensor<[6,100,92,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 100 + d1, d2), memory_config: (19, 
3, 'tile<32x32, bf16>', 'dram') | yes | 0.02 | 28.0 | +| ttnn.reshape | tensor<[6,1,100,92,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 100 + d2, d3), memory_config: (19, 3, 'tile<32x32, bf16>', 'dram') | shape: [6 : i32, 100 : i32, 92 : i32] | tensor<[6,100,92,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 100 + d1, d2), memory_config: (19, 3, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[6,1,15,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 6 : i32, 1 : i32, 15 : i32] | tensor<[1,6,1,15,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.38 | 9.0 | +| ttnn.reshape | tensor<[6,1,15,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 6 : i32, 1 : i32, 15 : i32] | tensor<[1,6,1,15,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[6,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 6 : i32, 1 : i32, 1 : i32] | tensor<[1,6,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.44 | 1.02 | +| ttnn.reshape | tensor<[6,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 6 : i32, 1 : i32, 1 : i32] | tensor<[1,6,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[6,1,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 2, 
'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 6 : i32, 1 : i32, 64 : i32] | tensor<[1,6,1,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6 + d1 + d2, d3), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[6,1,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 6 : i32, 1 : i32, 64 : i32] | tensor<[1,6,1,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6 + d1 + d2, d3), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[6,50272,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1571, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 6 : i32, 50272 : i32] | tensor<[1,6,50272,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 6 + d1, d2), memory_config: (1, 1571, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[6,50272,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1571, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 6 : i32, 50272 : i32] | tensor<[1,6,50272,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 6 + d1, d2), memory_config: (1, 1571, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[6,512,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 6 : i32, 512 : i32] | tensor<[1,6,512,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 6 + d1, d2), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[6,512,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 6 : i32, 512 : i32] | tensor<[1,6,512,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 6 + d1, d2), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | 
tensor<[71,7,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (16, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 71 : i32, 7 : i32, 64 : i32] | tensor<[1,71,7,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 497 + d1 * 7 + d2, d3), memory_config: (16, 2, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[71,7,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (16, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 71 : i32, 7 : i32, 64 : i32] | tensor<[1,71,7,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 497 + d1 * 7 + d2, d3), memory_config: (16, 2, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[71,7,7,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (16, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 71 : i32, 7 : i32, 7 : i32] | tensor<[1,71,7,7,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 497 + d1 * 7 + d2, d3), memory_config: (16, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[71,7,7,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (16, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 71 : i32, 7 : i32, 7 : i32] | tensor<[1,71,7,7,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 497 + d1 * 7 + d2, d3), memory_config: (16, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[7,18176,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 568, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 18176 : i32] | tensor<[1,7,18176,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (1, 568, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[7,18176,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 568, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 18176 : i32] | 
tensor<[1,7,18176,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (1, 568, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[7,2,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 2 : i32] | tensor<[1,7,2,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.13 | 3.94 | +| ttnn.reshape | tensor<[7,2,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 2 : i32] | tensor<[1,7,2,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[7,4544,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 142, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 4544 : i32] | tensor<[1,7,4544,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (1, 142, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[7,4544,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 142, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 4544 : i32] | tensor<[1,7,4544,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (1, 142, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[7,4672,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 146, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 4672 : i32] | tensor<[1,7,4672,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (1, 146, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[7,4672,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 146, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 
7 : i32, 4672 : i32] | tensor<[1,7,4672,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (1, 146, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[7,65024,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 2032, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 65024 : i32] | tensor<[1,7,65024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (1, 2032, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[7,65024,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 2032, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 65024 : i32] | tensor<[1,7,65024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (1, 2032, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[8,10,10,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (3, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 8 : i32, 10 : i32, 10 : i32] | tensor<[1,8,10,10,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 80 + d1 * 10 + d2, d3), memory_config: (3, 1, 'tile<32x32, bf16>', 'dram') | yes | -0.03 | 13.81 | +| ttnn.reshape | tensor<[8,10,10,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (3, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 8 : i32, 10 : i32, 10 : i32] | tensor<[1,8,10,10,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 80 + d1 * 10 + d2, d3), memory_config: (3, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[8,10,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (3, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 8 : i32, 10 : i32, 64 : i32] | tensor<[1,8,10,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 80 + d1 * 10 + d2, d3), memory_config: (3, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 
0.0 | +| ttnn.reshape | tensor<[8,10,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (3, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 8 : i32, 10 : i32, 64 : i32] | tensor<[1,8,10,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 80 + d1 * 10 + d2, d3), memory_config: (3, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[8,1,10,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 8 : i32, 1 : i32, 10 : i32] | tensor<[1,8,1,10,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 8 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.05 | 10.25 | +| ttnn.reshape | tensor<[8,1,10,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 8 : i32, 1 : i32, 10 : i32] | tensor<[1,8,1,10,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 8 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[8,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 8 : i32, 1 : i32, 1 : i32] | tensor<[1,8,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 8 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.4 | 6.0 | +| ttnn.reshape | tensor<[8,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 8 : i32, 1 : i32, 1 : i32] | tensor<[1,8,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 8 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[8,1,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 8 : 
i32, 1 : i32, 64 : i32] | tensor<[1,8,1,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 8 + d1 + d2, d3), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[8,1,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 8 : i32, 1 : i32, 64 : i32] | tensor<[1,8,1,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 8 + d1 + d2, d3), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[8,2048,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 2048 + d1, d2), memory_config: (512, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 8 : i32, 2048 : i32, 256 : i32] | tensor<[1,8,2048,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 2048 + d2, d3), memory_config: (512, 8, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[8,2048,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 2048 + d1, d2), memory_config: (512, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 8 : i32, 2048 : i32, 256 : i32] | tensor<[1,8,2048,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 2048 + d2, d3), memory_config: (512, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[8,2048,96,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 2048 + d1, d2), memory_config: (512, 3, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 8 : i32, 2048 : i32, 96 : i32] | tensor<[1,8,2048,96,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 2048 + d2, d3), memory_config: (512, 3, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[8,2048,96,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 2048 + d1, d2), memory_config: (512, 3, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 8 : i32, 2048 : i32, 96 : i32] | tensor<[1,8,2048,96,bf16]> | mapping_from: (d0, 
d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 2048 + d2, d3), memory_config: (512, 3, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[8,256,160,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (64, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 8 : i32, 256 : i32, 160 : i32] | tensor<[1,8,256,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2048 + d1 * 256 + d2, d3), memory_config: (64, 5, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[8,256,160,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (64, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 8 : i32, 256 : i32, 160 : i32] | tensor<[1,8,256,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2048 + d1 * 256 + d2, d3), memory_config: (64, 5, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[8,256,2048,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (64, 64, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 8 : i32, 256 : i32, 2048 : i32] | tensor<[1,8,256,2048,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2048 + d1 * 256 + d2, d3), memory_config: (64, 64, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[8,256,2048,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (64, 64, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 8 : i32, 256 : i32, 2048 : i32] | tensor<[1,8,256,2048,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2048 + d1 * 256 + d2, d3), memory_config: (64, 64, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[8,256,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (64, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 8 : i32, 256 : i32, 256 : i32] | tensor<[1,8,256,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2048 + d1 * 256 
+ d2, d3), memory_config: (64, 8, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[8,256,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (64, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 8 : i32, 256 : i32, 256 : i32] | tensor<[1,8,256,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2048 + d1 * 256 + d2, d3), memory_config: (64, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[8,256,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (64, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 8 : i32, 256 : i32, 32 : i32] | tensor<[1,8,256,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2048 + d1 * 256 + d2, d3), memory_config: (64, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[8,256,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (64, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 8 : i32, 256 : i32, 32 : i32] | tensor<[1,8,256,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2048 + d1 * 256 + d2, d3), memory_config: (64, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[8,300,300,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 300 + d1, d2), memory_config: (75, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 8 : i32, 300 : i32, 300 : i32] | tensor<[1,8,300,300,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2400 + d1 * 300 + d2, d3), memory_config: (75, 10, 'tile<32x32, bf16>', 'dram') | yes | 0.27 | 173.0 | +| ttnn.reshape | tensor<[8,300,300,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 300 + d1, d2), memory_config: (75, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 8 : i32, 300 : i32, 300 : i32] | tensor<[1,8,300,300,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2400 + d1 * 300 + d2, d3), memory_config: (75, 10, 'tile<32x32, bf16>', 
'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[8,300,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 300 + d1, d2), memory_config: (75, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 8 : i32, 300 : i32, 64 : i32] | tensor<[1,8,300,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2400 + d1 * 300 + d2, d3), memory_config: (75, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[8,300,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 300 + d1, d2), memory_config: (75, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 8 : i32, 300 : i32, 64 : i32] | tensor<[1,8,300,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2400 + d1 * 300 + d2, d3), memory_config: (75, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[920,1,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (29, 8, 'tile<32x32, bf16>', 'dram') | shape: [920 : i32, 1 : i32, 256 : i32] | tensor<[920,1,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (29, 8, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[920,1,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (29, 8, 'tile<32x32, bf16>', 'dram') | shape: [920 : i32, 1 : i32, 256 : i32] | tensor<[920,1,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (29, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[9,1280,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 9 : i32, 1280 : i32] | tensor<[1,9,1280,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (1, 40, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[9,1280,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 9 : 
i32, 1280 : i32] | tensor<[1,9,1280,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (1, 40, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[9,320,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 9 : i32, 320 : i32] | tensor<[1,9,320,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (1, 10, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[9,320,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 9 : i32, 320 : i32] | tensor<[1,9,320,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (1, 10, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[9,640,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 9 : i32, 640 : i32] | tensor<[1,9,640,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (1, 20, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[9,640,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 9 : i32, 640 : i32] | tensor<[1,9,640,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (1, 20, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, 
d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | 
nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: 
(d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, 
f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | 
mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 
'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape 
| tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), 
memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | 
shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | 
mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), 
memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | 
nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), 
mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, 
f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | 
mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: 
(1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| 
ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), 
mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, 
f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | 
tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, 
d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, 
bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| 
ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), 
mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 
'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: 
(d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, 
f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | 
tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), 
memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : 
i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, 
d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | 
nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: 
(d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, 
f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | 
mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 
'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape 
| tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), 
memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32] | tensor<[1,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32] | tensor<[1,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 
1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | 
tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), 
memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | 
nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), 
mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 
'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: 
(d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, 
f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | 
tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,14,14,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 1024 : i32] | tensor<[1,1,196,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 32, 'tile<32x32, bf16>', 'dram') | yes | 0.02 | 1.744611744467702e+37 | +| ttnn.reshape | tensor<[1,1,196,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 1024 : i32] | tensor<[1,14,14,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 32, 'tile<32x32, bf16>', 'dram') | yes | 0.02 | 1.744611744467702e+37 | +| ttnn.reshape | tensor<[1,14,14,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 1024 : i32] | tensor<[1,1,196,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 32, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,196,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 1024 : i32] | tensor<[1,14,14,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 32, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,14,14,1024,bf16]> | mapping_from: (d0, d1, d2, d3), 
mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 1024 : i32] | tensor<[1,1,196,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 32, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,49,2048,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 64, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 7 : i32, 2048 : i32] | tensor<[1,7,7,2048,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 64, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,14,14,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 1024 : i32] | tensor<[1,1,196,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 32, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,49,2048,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 64, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 7 : i32, 2048 : i32] | tensor<[1,7,7,2048,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 64, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,14,14,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 1024 : i32] | tensor<[1,1,196,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 32, 'tile<32x32, bf16>', 'dram') | yes | 0.03 | 1.1564283563328768e+38 | +| ttnn.reshape | tensor<[1,1,196,256,bf16]> | 
mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 256 : i32] | tensor<[1,14,14,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 8, 'tile<32x32, bf16>', 'dram') | yes | 0.03 | 1.1564283563328768e+38 | +| ttnn.reshape | tensor<[1,14,14,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 1024 : i32] | tensor<[1,1,196,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 32, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,196,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 256 : i32] | tensor<[1,14,14,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,14,14,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 1024 : i32] | tensor<[1,1,196,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 32, 'tile<32x32, bf16>', 'dram') | yes | 0.06 | 15.19 | +| ttnn.reshape | tensor<[1,1,196,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 512 : i32] | tensor<[1,14,14,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 16, 'tile<32x32, bf16>', 'dram') | yes | 0.06 | 15.19 | +| 
ttnn.reshape | tensor<[1,14,14,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 1024 : i32] | tensor<[1,1,196,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 32, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,196,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 512 : i32] | tensor<[1,14,14,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16,16,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 256 : i32, 1024 : i32] | tensor<[1,1,256,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 32, 'tile<32x32, bf16>', 'dram') | yes | 0.1 | 13.25 | +| ttnn.reshape | tensor<[1,1,256,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 16 : i32, 1024 : i32] | tensor<[1,16,16,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 32, 'tile<32x32, bf16>', 'dram') | yes | 0.1 | 13.25 | +| ttnn.reshape | tensor<[1024,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | shape: [1024 : i32, 1 : i32, 1 : i32] | tensor<[1024,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (32, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.1 | 13.25 | +| ttnn.reshape | tensor<[1,16,16,1024,bf16]> | 
mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 256 : i32, 1024 : i32] | tensor<[1,1,256,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 32, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,256,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 16 : i32, 1024 : i32] | tensor<[1,16,16,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 32, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1024,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | shape: [1024 : i32, 1 : i32, 1 : i32] | tensor<[1024,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (32, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16,16,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 256 : i32, 1024 : i32] | tensor<[1,1,256,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 32, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,256,255,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 16 : i32, 255 : i32] | tensor<[1,16,16,255,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 8, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[255,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 8, 
'tile<32x32, bf16>', 'dram') | shape: [255 : i32, 1 : i32, 1 : i32] | tensor<[255,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (8, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,16,16,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 256 : i32, 1024 : i32] | tensor<[1,1,256,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 32, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,256,255,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 16 : i32, 255 : i32] | tensor<[1,16,16,255,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[255,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 8, 'tile<32x32, bf16>', 'dram') | shape: [255 : i32, 1 : i32, 1 : i32] | tensor<[255,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (8, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16,16,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 256 : i32, 1024 : i32] | tensor<[1,1,256,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 32, 'tile<32x32, bf16>', 'dram') | yes | 0.39 | 4.69 | +| ttnn.reshape | tensor<[1,1,256,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 16 : i32, 512 : i32] | 
tensor<[1,16,16,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 16, 'tile<32x32, bf16>', 'dram') | yes | 0.39 | 4.69 | +| ttnn.reshape | tensor<[1,16,16,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 256 : i32, 1024 : i32] | tensor<[1,1,256,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 32, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,256,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 16 : i32, 512 : i32] | tensor<[1,16,16,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,28,28,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 1024 : i32] | tensor<[1,1,784,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 32, 'tile<32x32, bf16>', 'dram') | yes | -0.01 | 0.04 | +| ttnn.reshape | tensor<[1,1,784,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 512 : i32] | tensor<[1,28,28,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | yes | -0.01 | 0.04 | +| ttnn.reshape | tensor<[1,28,28,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 
: i32, 784 : i32, 1024 : i32] | tensor<[1,1,784,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 32, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,784,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 512 : i32] | tensor<[1,28,28,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,45,80,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3600 + d1 * 80 + d2, d3), memory_config: (113, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3600 : i32, 1024 : i32] | tensor<[1,1,3600,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3600 + d1 * 80 + d2, d3), memory_config: (113, 32, 'tile<32x32, bf16>', 'dram') | yes | 0.82 | 0.43 | +| ttnn.reshape | tensor<[1,1,920,2048,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 920 + d1 * 40 + d2, d3), memory_config: (29, 64, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 23 : i32, 40 : i32, 2048 : i32] | tensor<[1,23,40,2048,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 920 + d1 * 40 + d2, d3), memory_config: (29, 64, 'tile<32x32, bf16>', 'dram') | yes | 0.82 | 0.43 | +| ttnn.reshape | tensor<[1,45,80,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3600 + d1 * 80 + d2, d3), memory_config: (113, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3600 : i32, 1024 : i32] | tensor<[1,1,3600,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3600 + d1 * 80 + d2, d3), memory_config: (113, 32, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,920,2048,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 920 + d1 * 40 + d2, d3), memory_config: (29, 64, 
'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 23 : i32, 40 : i32, 2048 : i32] | tensor<[1,23,40,2048,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 920 + d1 * 40 + d2, d3), memory_config: (29, 64, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,45,80,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3600 + d1 * 80 + d2, d3), memory_config: (113, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3600 : i32, 1024 : i32] | tensor<[1,1,3600,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3600 + d1 * 80 + d2, d3), memory_config: (113, 32, 'tile<32x32, bf16>', 'dram') | yes | 0.96 | 1.19 | +| ttnn.reshape | tensor<[1,1,3600,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3600 + d1 * 80 + d2, d3), memory_config: (113, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 45 : i32, 80 : i32, 256 : i32] | tensor<[1,45,80,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3600 + d1 * 80 + d2, d3), memory_config: (113, 8, 'tile<32x32, bf16>', 'dram') | yes | 0.96 | 1.19 | +| ttnn.reshape | tensor<[1,45,80,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3600 + d1 * 80 + d2, d3), memory_config: (113, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3600 : i32, 1024 : i32] | tensor<[1,1,3600,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3600 + d1 * 80 + d2, d3), memory_config: (113, 32, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,3600,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3600 + d1 * 80 + d2, d3), memory_config: (113, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 45 : i32, 80 : i32, 256 : i32] | tensor<[1,45,80,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3600 + d1 * 80 + d2, d3), memory_config: (113, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,45,80,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 
* 3600 + d1 * 80 + d2, d3), memory_config: (113, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3600 : i32, 1024 : i32] | tensor<[1,1,3600,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3600 + d1 * 80 + d2, d3), memory_config: (113, 32, 'tile<32x32, bf16>', 'dram') | yes | 0.81 | 0.87 | +| ttnn.reshape | tensor<[1,1,3600,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3600 + d1 * 80 + d2, d3), memory_config: (113, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 45 : i32, 80 : i32, 512 : i32] | tensor<[1,45,80,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3600 + d1 * 80 + d2, d3), memory_config: (113, 16, 'tile<32x32, bf16>', 'dram') | yes | 0.81 | 0.87 | +| ttnn.reshape | tensor<[1,45,80,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3600 + d1 * 80 + d2, d3), memory_config: (113, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3600 : i32, 1024 : i32] | tensor<[1,1,3600,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3600 + d1 * 80 + d2, d3), memory_config: (113, 32, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,3600,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3600 + d1 * 80 + d2, d3), memory_config: (113, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 45 : i32, 80 : i32, 512 : i32] | tensor<[1,45,80,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3600 + d1 * 80 + d2, d3), memory_config: (113, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,56,56,102,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3136 : i32, 102 : i32] | tensor<[1,1,3136,102,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 4, 'tile<32x32, bf16>', 'dram') | yes | 0.25 | 6.72 | +| ttnn.reshape | 
tensor<[1,1,3136,40,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 56 : i32, 56 : i32, 40 : i32] | tensor<[1,56,56,40,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 2, 'tile<32x32, bf16>', 'dram') | yes | 0.25 | 6.72 | +| ttnn.reshape | tensor<[1,56,56,102,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3136 : i32, 102 : i32] | tensor<[1,1,3136,102,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,3136,40,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 56 : i32, 56 : i32, 40 : i32] | tensor<[1,56,56,40,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,7,7,1072,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 34, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 49 : i32, 1072 : i32] | tensor<[1,1,49,1072,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 34, 'tile<32x32, bf16>', 'dram') | yes | -0.01 | 1.0633823966279325e+38 | +| ttnn.reshape | tensor<[1,1,49,462,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 15, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 7 : i32, 462 : i32] | tensor<[1,7,7,462,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 15, 'tile<32x32, bf16>', 'dram') | yes | -0.01 
| 1.0633823966279325e+38 | +| ttnn.reshape | tensor<[1,7,7,1072,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 34, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 49 : i32, 1072 : i32] | tensor<[1,1,49,1072,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 34, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,49,462,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 15, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 7 : i32, 462 : i32] | tensor<[1,7,7,462,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 15, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,20,20,112,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 4, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 400 : i32, 112 : i32] | tensor<[1,1,400,112,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 4, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.01 | +| ttnn.reshape | tensor<[1,1,400,672,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 21, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 20 : i32, 20 : i32, 672 : i32] | tensor<[1,20,20,672,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 21, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.01 | +| ttnn.reshape | tensor<[1,20,20,112,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 4, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 400 : i32, 112 : i32] | tensor<[1,1,400,112,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 4, 'tile<32x32, f32>', 'dram') | 
N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,400,672,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 21, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 20 : i32, 20 : i32, 672 : i32] | tensor<[1,20,20,672,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 21, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,14,14,116,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 116 : i32] | tensor<[1,1,196,116,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 4, 'tile<32x32, bf16>', 'dram') | yes | 0.36 | 2.62 | +| ttnn.reshape | tensor<[1,1,196,40,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 40 : i32] | tensor<[1,14,14,40,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 2, 'tile<32x32, bf16>', 'dram') | yes | 0.36 | 2.62 | +| ttnn.reshape | tensor<[1,14,14,116,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 116 : i32] | tensor<[1,1,196,116,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,196,40,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 40 : i32] | tensor<[1,14,14,40,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 2, 'tile<32x32, bf16>', 'dram') | 
N/A | nan | nan | +| ttnn.reshape | tensor<[1,28,28,118,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 118 : i32] | tensor<[1,1,784,118,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | yes | 0.25 | 1.41 | +| ttnn.reshape | tensor<[1,1,784,34,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 34 : i32] | tensor<[1,28,28,34,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 2, 'tile<32x32, bf16>', 'dram') | yes | 0.25 | 1.41 | +| ttnn.reshape | tensor<[1,28,28,118,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 118 : i32] | tensor<[1,1,784,118,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,784,34,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 34 : i32] | tensor<[1,28,28,34,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,1,120,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 120 : i32] | tensor<[1,1,1,120,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| 
ttnn.reshape | tensor<[1,1,1,32,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 32 : i32] | tensor<[1,1,1,32,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[32,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [32 : i32, 1 : i32, 1 : i32] | tensor<[32,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1,1,120,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 120 : i32] | tensor<[1,1,1,120,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,1,32,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 32 : i32] | tensor<[1,1,1,32,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[32,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [32 : i32, 1 : i32, 1 : i32] | tensor<[32,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,1,120,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 120 : i32] | tensor<[1,1,1,120,f32]> | mapping_from: (d0, d1, d2, d3), 
mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.01 | +| ttnn.reshape | tensor<[1,1,1,480,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 15, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 480 : i32] | tensor<[1,1,1,480,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 15, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.01 | +| ttnn.reshape | tensor<[480,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 15, 'tile<32x32, f32>', 'dram') | shape: [480 : i32, 1 : i32, 1 : i32] | tensor<[480,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (15, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.01 | +| ttnn.reshape | tensor<[1,1,1,120,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 120 : i32] | tensor<[1,1,1,120,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,1,480,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 15, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 480 : i32] | tensor<[1,1,1,480,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 15, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[480,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 15, 'tile<32x32, f32>', 'dram') | shape: [480 : i32, 1 : i32, 1 : i32] | tensor<[480,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (15, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,40,40,120,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1600 + d1 * 40 + d2, d3), memory_config: (50, 4, 
'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 1600 : i32, 120 : i32] | tensor<[1,1,1600,120,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1600 + d1 * 40 + d2, d3), memory_config: (50, 4, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.05 | +| ttnn.reshape | tensor<[1,1,1600,120,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1600 + d1 * 40 + d2, d3), memory_config: (50, 4, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 40 : i32, 40 : i32, 120 : i32] | tensor<[1,40,40,120,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1600 + d1 * 40 + d2, d3), memory_config: (50, 4, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.05 | +| ttnn.reshape | tensor<[1,40,40,120,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1600 + d1 * 40 + d2, d3), memory_config: (50, 4, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 1600 : i32, 120 : i32] | tensor<[1,1,1600,120,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1600 + d1 * 40 + d2, d3), memory_config: (50, 4, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,1600,120,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1600 + d1 * 40 + d2, d3), memory_config: (50, 4, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 40 : i32, 40 : i32, 120 : i32] | tensor<[1,40,40,120,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1600 + d1 * 40 + d2, d3), memory_config: (50, 4, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,40,40,120,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1600 + d1 * 40 + d2, d3), memory_config: (50, 4, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 1600 : i32, 120 : i32] | tensor<[1,1,1600,120,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1600 + d1 * 40 + d2, d3), memory_config: (50, 4, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1,1600,40,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1600 + d1 * 40 + d2, d3), 
memory_config: (50, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 40 : i32, 40 : i32, 40 : i32] | tensor<[1,40,40,40,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1600 + d1 * 40 + d2, d3), memory_config: (50, 2, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,40,40,120,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1600 + d1 * 40 + d2, d3), memory_config: (50, 4, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 1600 : i32, 120 : i32] | tensor<[1,1,1600,120,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1600 + d1 * 40 + d2, d3), memory_config: (50, 4, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,1600,40,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1600 + d1 * 40 + d2, d3), memory_config: (50, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 40 : i32, 40 : i32, 40 : i32] | tensor<[1,40,40,40,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1600 + d1 * 40 + d2, d3), memory_config: (50, 2, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,28,28,122,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 122 : i32] | tensor<[1,1,784,122,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | yes | 0.31 | 2.28 | +| ttnn.reshape | tensor<[1,1,784,46,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 46 : i32] | tensor<[1,28,28,46,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 2, 'tile<32x32, bf16>', 'dram') | yes | 0.31 | 2.28 | +| ttnn.reshape | tensor<[1,28,28,122,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, 
d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 122 : i32] | tensor<[1,1,784,122,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,784,46,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 46 : i32] | tensor<[1,28,28,46,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,56,56,124,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3136 : i32, 124 : i32] | tensor<[1,1,3136,124,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 4, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,3136,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 56 : i32, 56 : i32, 128 : i32] | tensor<[1,56,56,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 4, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,56,56,124,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3136 : i32, 124 : i32] | tensor<[1,1,3136,124,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,3136,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 
3136 + d1 * 56 + d2, d3), memory_config: (98, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 56 : i32, 56 : i32, 128 : i32] | tensor<[1,56,56,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16,16,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 256 : i32, 1280 : i32] | tensor<[1,1,256,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | yes | 0.53 | 9.88 | +| ttnn.reshape | tensor<[1,1,256,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 16 : i32, 1280 : i32] | tensor<[1,16,16,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | yes | 0.53 | 9.88 | +| ttnn.reshape | tensor<[1280,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 40, 'tile<32x32, bf16>', 'dram') | shape: [1280 : i32, 1 : i32, 1 : i32] | tensor<[1280,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (40, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.53 | 9.88 | +| ttnn.reshape | tensor<[1,16,16,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 256 : i32, 1280 : i32] | tensor<[1,1,256,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,256,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 
40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 16 : i32, 1280 : i32] | tensor<[1,16,16,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1280,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 40, 'tile<32x32, bf16>', 'dram') | shape: [1280 : i32, 1 : i32, 1 : i32] | tensor<[1280,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (40, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16,16,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 256 : i32, 1280 : i32] | tensor<[1,1,256,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | yes | 0.52 | 3.69 | +| ttnn.reshape | tensor<[1,1,256,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 16 : i32, 1280 : i32] | tensor<[1,16,16,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | yes | 0.52 | 3.69 | +| ttnn.reshape | tensor<[1280,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 40, 'tile<32x32, bf16>', 'dram') | shape: [1280 : i32, 1 : i32, 1 : i32] | tensor<[1280,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (40, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.52 | 3.69 | +| ttnn.reshape | tensor<[1,16,16,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 256 : i32, 1280 : i32] | tensor<[1,1,256,1280,bf16]> | 
mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,256,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 16 : i32, 1280 : i32] | tensor<[1,16,16,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1280,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 40, 'tile<32x32, bf16>', 'dram') | shape: [1280 : i32, 1 : i32, 1 : i32] | tensor<[1280,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (40, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16,16,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 256 : i32, 1280 : i32] | tensor<[1,1,256,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | yes | 0.5 | 21.5 | +| ttnn.reshape | tensor<[1,1,64,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 64 + d1 * 8 + d2, d3), memory_config: (2, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 8 : i32, 8 : i32, 1280 : i32] | tensor<[1,8,8,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 64 + d1 * 8 + d2, d3), memory_config: (2, 40, 'tile<32x32, bf16>', 'dram') | yes | 0.5 | 21.5 | +| ttnn.reshape | tensor<[1280,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 40, 'tile<32x32, bf16>', 'dram') | shape: [1280 : i32, 1 : i32, 1 : i32] | tensor<[1280,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (40, 1, 'tile<32x32, bf16>', 'dram') | yes | 
0.5 | 21.5 | +| ttnn.reshape | tensor<[1,16,16,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 256 : i32, 1280 : i32] | tensor<[1,1,256,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,64,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 64 + d1 * 8 + d2, d3), memory_config: (2, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 8 : i32, 8 : i32, 1280 : i32] | tensor<[1,8,8,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 64 + d1 * 8 + d2, d3), memory_config: (2, 40, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1280,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 40, 'tile<32x32, bf16>', 'dram') | shape: [1280 : i32, 1 : i32, 1 : i32] | tensor<[1280,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (40, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,30,40,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1200 + d1 * 40 + d2, d3), memory_config: (38, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1200 : i32, 1280 : i32] | tensor<[1,1,1200,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1200 + d1 * 40 + d2, d3), memory_config: (38, 40, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,1200,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1200 + d1 * 40 + d2, d3), memory_config: (38, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 30 : i32, 40 : i32, 1280 : i32] | tensor<[1,30,40,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1200 + d1 * 40 + d2, d3), memory_config: (38, 40, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1280,bf16]> 
| mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 40, 'tile<32x32, bf16>', 'dram') | shape: [1280 : i32, 1 : i32, 1 : i32] | tensor<[1280,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (40, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,30,40,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1200 + d1 * 40 + d2, d3), memory_config: (38, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1200 : i32, 1280 : i32] | tensor<[1,1,1200,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1200 + d1 * 40 + d2, d3), memory_config: (38, 40, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,1200,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1200 + d1 * 40 + d2, d3), memory_config: (38, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 30 : i32, 40 : i32, 1280 : i32] | tensor<[1,30,40,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1200 + d1 * 40 + d2, d3), memory_config: (38, 40, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1280,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 40, 'tile<32x32, bf16>', 'dram') | shape: [1280 : i32, 1 : i32, 1 : i32] | tensor<[1280,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (40, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,32,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1024 : i32, 1280 : i32] | tensor<[1,1,1024,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 40, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 14.25 | +| ttnn.reshape | tensor<[1,1,1024,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 40, 
'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 32 : i32, 1280 : i32] | tensor<[1,32,32,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 40, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 14.25 | +| ttnn.reshape | tensor<[1280,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 40, 'tile<32x32, bf16>', 'dram') | shape: [1280 : i32, 1 : i32, 1 : i32] | tensor<[1280,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (40, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 14.25 | +| ttnn.reshape | tensor<[1,32,32,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1024 : i32, 1280 : i32] | tensor<[1,1,1024,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 40, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,1024,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 32 : i32, 1280 : i32] | tensor<[1,32,32,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 40, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1280,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 40, 'tile<32x32, bf16>', 'dram') | shape: [1280 : i32, 1 : i32, 1 : i32] | tensor<[1280,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (40, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,32,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1024 : i32, 1280 : i32] | 
tensor<[1,1,1024,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 40, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.66 | +| ttnn.reshape | tensor<[1,1,1024,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 32 : i32, 640 : i32] | tensor<[1,32,32,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.66 | +| ttnn.reshape | tensor<[640,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 20, 'tile<32x32, bf16>', 'dram') | shape: [640 : i32, 1 : i32, 1 : i32] | tensor<[640,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (20, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.66 | +| ttnn.reshape | tensor<[1,32,32,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1024 : i32, 1280 : i32] | tensor<[1,1,1024,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 40, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,1024,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 32 : i32, 640 : i32] | tensor<[1,32,32,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[640,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 20, 'tile<32x32, bf16>', 'dram') | shape: [640 : i32, 1 : i32, 1 : i32] | tensor<[640,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), 
memory_config: (20, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,32,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1024 : i32, 1280 : i32] | tensor<[1,1,1024,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 40, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,1024,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 32 : i32, 640 : i32] | tensor<[1,32,32,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[640,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 20, 'tile<32x32, bf16>', 'dram') | shape: [640 : i32, 1 : i32, 1 : i32] | tensor<[640,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (20, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,32,32,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1024 : i32, 1280 : i32] | tensor<[1,1,1024,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 40, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,1024,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 32 : i32, 640 : i32] | tensor<[1,32,32,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 20, 'tile<32x32, 
bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[640,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 20, 'tile<32x32, bf16>', 'dram') | shape: [640 : i32, 1 : i32, 1 : i32] | tensor<[640,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (20, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,8,8,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 64 + d1 * 8 + d2, d3), memory_config: (2, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 64 : i32, 1280 : i32] | tensor<[1,1,64,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 64 + d1 * 8 + d2, d3), memory_config: (2, 40, 'tile<32x32, bf16>', 'dram') | yes | -0.01 | 1.1497822163539522e+38 | +| ttnn.reshape | tensor<[1,1,64,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 64 + d1 * 8 + d2, d3), memory_config: (2, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 8 : i32, 8 : i32, 1280 : i32] | tensor<[1,8,8,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 64 + d1 * 8 + d2, d3), memory_config: (2, 40, 'tile<32x32, bf16>', 'dram') | yes | -0.01 | 1.1497822163539522e+38 | +| ttnn.reshape | tensor<[1280,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 40, 'tile<32x32, bf16>', 'dram') | shape: [1280 : i32, 1 : i32, 1 : i32] | tensor<[1280,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (40, 1, 'tile<32x32, bf16>', 'dram') | yes | -0.01 | 1.1497822163539522e+38 | +| ttnn.reshape | tensor<[1,8,8,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 64 + d1 * 8 + d2, d3), memory_config: (2, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 64 : i32, 1280 : i32] | tensor<[1,1,64,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 64 + d1 * 8 + d2, d3), memory_config: (2, 40, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,64,1280,bf16]> | 
mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 64 + d1 * 8 + d2, d3), memory_config: (2, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 8 : i32, 8 : i32, 1280 : i32] | tensor<[1,8,8,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 64 + d1 * 8 + d2, d3), memory_config: (2, 40, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1280,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 40, 'tile<32x32, bf16>', 'dram') | shape: [1280 : i32, 1 : i32, 1 : i32] | tensor<[1280,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (40, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,8,8,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 64 + d1 * 8 + d2, d3), memory_config: (2, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 64 : i32, 1280 : i32] | tensor<[1,1,64,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 64 + d1 * 8 + d2, d3), memory_config: (2, 40, 'tile<32x32, bf16>', 'dram') | yes | -0.0 | 1.0966130965225556e+38 | +| ttnn.reshape | tensor<[1,1,64,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 64 + d1 * 8 + d2, d3), memory_config: (2, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 8 : i32, 8 : i32, 1280 : i32] | tensor<[1,8,8,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 64 + d1 * 8 + d2, d3), memory_config: (2, 40, 'tile<32x32, bf16>', 'dram') | yes | -0.0 | 1.0966130965225556e+38 | +| ttnn.reshape | tensor<[1280,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 40, 'tile<32x32, bf16>', 'dram') | shape: [1280 : i32, 1 : i32, 1 : i32] | tensor<[1280,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (40, 1, 'tile<32x32, bf16>', 'dram') | yes | -0.0 | 1.0966130965225556e+38 | +| ttnn.reshape | tensor<[1,8,8,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 64 + d1 * 8 + d2, d3), memory_config: (2, 40, 
'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 64 : i32, 1280 : i32] | tensor<[1,1,64,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 64 + d1 * 8 + d2, d3), memory_config: (2, 40, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,64,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 64 + d1 * 8 + d2, d3), memory_config: (2, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 8 : i32, 8 : i32, 1280 : i32] | tensor<[1,8,8,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 64 + d1 * 8 + d2, d3), memory_config: (2, 40, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1280,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 40, 'tile<32x32, bf16>', 'dram') | shape: [1280 : i32, 1 : i32, 1 : i32] | tensor<[1280,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (40, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,112,112,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 12544 : i32, 128 : i32] | tensor<[1,1,12544,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 4, 'tile<32x32, bf16>', 'dram') | yes | 0.88 | 0.13 | +| ttnn.reshape | tensor<[1,1,12544,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 112 : i32, 112 : i32, 128 : i32] | tensor<[1,112,112,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 4, 'tile<32x32, bf16>', 'dram') | yes | 0.88 | 0.13 | +| ttnn.reshape | tensor<[1,112,112,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 4, 'tile<32x32, bf16>', 'dram') | 
shape: [1 : i32, 1 : i32, 12544 : i32, 128 : i32] | tensor<[1,1,12544,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,12544,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 112 : i32, 112 : i32, 128 : i32] | tensor<[1,112,112,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,120,160,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 19200 + d1 * 160 + d2, d3), memory_config: (600, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 19200 : i32, 128 : i32] | tensor<[1,1,19200,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 19200 + d1 * 160 + d2, d3), memory_config: (600, 4, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.38 | +| ttnn.reshape | tensor<[1,1,19200,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 19200 + d1 * 160 + d2, d3), memory_config: (600, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 120 : i32, 160 : i32, 64 : i32] | tensor<[1,120,160,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 19200 + d1 * 160 + d2, d3), memory_config: (600, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.38 | +| ttnn.reshape | tensor<[64,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [64 : i32, 1 : i32, 1 : i32] | tensor<[64,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (2, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.38 | +| ttnn.reshape | tensor<[1,120,160,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 19200 + d1 * 160 + d2, d3), memory_config: (600, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : 
i32, 1 : i32, 19200 : i32, 128 : i32] | tensor<[1,1,19200,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 19200 + d1 * 160 + d2, d3), memory_config: (600, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,19200,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 19200 + d1 * 160 + d2, d3), memory_config: (600, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 120 : i32, 160 : i32, 64 : i32] | tensor<[1,120,160,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 19200 + d1 * 160 + d2, d3), memory_config: (600, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[64,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [64 : i32, 1 : i32, 1 : i32] | tensor<[64,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (2, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,128,128,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 16384 : i32, 128 : i32] | tensor<[1,1,16384,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 4, 'tile<32x32, bf16>', 'dram') | yes | 0.15 | 16.5 | +| ttnn.reshape | tensor<[1,1,16384,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 128 : i32, 128 : i32, 128 : i32] | tensor<[1,128,128,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 4, 'tile<32x32, bf16>', 'dram') | yes | 0.15 | 16.5 | +| ttnn.reshape | tensor<[128,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 4, 'tile<32x32, bf16>', 'dram') | shape: [128 : i32, 1 : i32, 1 : i32] | tensor<[128,1,1,bf16]> | 
mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (4, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.15 | 16.5 | +| ttnn.reshape | tensor<[1,128,128,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 16384 : i32, 128 : i32] | tensor<[1,1,16384,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,16384,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 128 : i32, 128 : i32, 128 : i32] | tensor<[1,128,128,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[128,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 4, 'tile<32x32, bf16>', 'dram') | shape: [128 : i32, 1 : i32, 1 : i32] | tensor<[128,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (4, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,128,128,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 16384 : i32, 128 : i32] | tensor<[1,1,16384,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 4, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.75 | +| ttnn.reshape | tensor<[1,1,4096,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 64 : i32, 256 : i32] | tensor<[1,64,64,256,bf16]> | mapping_from: (d0, d1, d2, d3), 
mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 8, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.75 | +| ttnn.reshape | tensor<[1,128,128,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 16384 : i32, 128 : i32] | tensor<[1,1,16384,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,4096,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 64 : i32, 256 : i32] | tensor<[1,64,64,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,128,128,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 16384 : i32, 128 : i32] | tensor<[1,1,16384,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 4, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.09 | +| ttnn.reshape | tensor<[1,1,16384,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 128 : i32, 128 : i32, 64 : i32] | tensor<[1,128,128,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.09 | +| ttnn.reshape | tensor<[1,128,128,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 16384 : i32, 128 : 
i32] | tensor<[1,1,16384,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,16384,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 128 : i32, 128 : i32, 64 : i32] | tensor<[1,128,128,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,128,128,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 16384 : i32, 128 : i32] | tensor<[1,1,16384,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 4, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.04 | +| ttnn.reshape | tensor<[1,1,16384,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 128 : i32, 128 : i32, 64 : i32] | tensor<[1,128,128,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.04 | +| ttnn.reshape | tensor<[1,128,128,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 16384 : i32, 128 : i32] | tensor<[1,1,16384,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,16384,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 2, 
'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 128 : i32, 128 : i32, 64 : i32] | tensor<[1,128,128,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,180,320,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 57600 + d1 * 320 + d2, d3), memory_config: (1800, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 57600 : i32, 128 : i32] | tensor<[1,1,57600,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 57600 + d1 * 320 + d2, d3), memory_config: (1800, 4, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.02 | +| ttnn.reshape | tensor<[1,1,14400,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 14400 + d1 * 160 + d2, d3), memory_config: (450, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 90 : i32, 160 : i32, 128 : i32] | tensor<[1,90,160,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 14400 + d1 * 160 + d2, d3), memory_config: (450, 4, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.02 | +| ttnn.reshape | tensor<[1,180,320,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 57600 + d1 * 320 + d2, d3), memory_config: (1800, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 57600 : i32, 128 : i32] | tensor<[1,1,57600,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 57600 + d1 * 320 + d2, d3), memory_config: (1800, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,14400,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 14400 + d1 * 160 + d2, d3), memory_config: (450, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 90 : i32, 160 : i32, 128 : i32] | tensor<[1,90,160,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 14400 + d1 * 160 + d2, d3), memory_config: (450, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,1,128,f32]> | mapping_from: (d0, d1, d2, 
d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 128 : i32] | tensor<[1,1,1,128,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,1,128,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 128 : i32] | tensor<[1,1,1,128,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,1,128,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 128 : i32] | tensor<[1,1,1,128,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1,1,24,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 24 : i32] | tensor<[1,1,1,24,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[24,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [24 : i32, 1 : i32, 1 : i32] | tensor<[24,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1,1,128,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 128 : i32] | tensor<[1,1,1,128,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 
+ d2, d3), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,1,24,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 24 : i32] | tensor<[1,1,1,24,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[24,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [24 : i32, 1 : i32, 1 : i32] | tensor<[24,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,1,128,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 128 : i32] | tensor<[1,1,1,128,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.02 | +| ttnn.reshape | tensor<[1,1,1,546,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 18, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 546 : i32] | tensor<[1,1,1,546,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 18, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.02 | +| ttnn.reshape | tensor<[546,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 18, 'tile<32x32, f32>', 'dram') | shape: [546 : i32, 1 : i32, 1 : i32] | tensor<[546,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (18, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.02 | +| ttnn.reshape | tensor<[1,1,1,128,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | shape: [1 : 
i32, 1 : i32, 1 : i32, 128 : i32] | tensor<[1,1,1,128,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,1,546,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 18, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 546 : i32] | tensor<[1,1,1,546,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 18, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[546,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 18, 'tile<32x32, f32>', 'dram') | shape: [546 : i32, 1 : i32, 1 : i32] | tensor<[546,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (18, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,224,224,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 50176 + d1 * 224 + d2, d3), memory_config: (1568, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 50176 : i32, 128 : i32] | tensor<[1,1,50176,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 50176 + d1 * 224 + d2, d3), memory_config: (1568, 4, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.01 | +| ttnn.reshape | tensor<[1,1,50176,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 50176 + d1 * 224 + d2, d3), memory_config: (1568, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 224 : i32, 224 : i32, 64 : i32] | tensor<[1,224,224,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 50176 + d1 * 224 + d2, d3), memory_config: (1568, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.01 | +| ttnn.reshape | tensor<[1,224,224,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 50176 + d1 * 224 + d2, d3), memory_config: (1568, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 50176 : i32, 128 : i32] | tensor<[1,1,50176,128,bf16]> | 
mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 50176 + d1 * 224 + d2, d3), memory_config: (1568, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,50176,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 50176 + d1 * 224 + d2, d3), memory_config: (1568, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 224 : i32, 224 : i32, 64 : i32] | tensor<[1,224,224,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 50176 + d1 * 224 + d2, d3), memory_config: (1568, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,28,28,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 128 : i32] | tensor<[1,1,784,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | yes | 0.47 | 1.25 | +| ttnn.reshape | tensor<[1,1,784,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 128 : i32] | tensor<[1,28,28,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | yes | 0.47 | 1.25 | +| ttnn.reshape | tensor<[128,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 4, 'tile<32x32, bf16>', 'dram') | shape: [128 : i32, 1 : i32, 1 : i32] | tensor<[128,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (4, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.47 | 1.25 | +| ttnn.reshape | tensor<[1,28,28,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 128 : i32] | tensor<[1,1,784,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: 
(d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,784,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 128 : i32] | tensor<[1,28,28,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[128,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 4, 'tile<32x32, bf16>', 'dram') | shape: [128 : i32, 1 : i32, 1 : i32] | tensor<[128,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (4, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,28,28,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 128 : i32] | tensor<[1,1,784,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1,784,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 128 : i32] | tensor<[1,28,28,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,28,28,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 128 : i32] | tensor<[1,1,784,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, 
bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,784,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 128 : i32] | tensor<[1,28,28,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,28,28,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 128 : i32] | tensor<[1,1,784,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,784,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 128 : i32] | tensor<[1,28,28,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[128,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 4, 'tile<32x32, bf16>', 'dram') | shape: [128 : i32, 1 : i32, 1 : i32] | tensor<[128,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (4, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,28,28,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 128 : i32] | tensor<[1,1,784,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | 
tensor<[1,1,784,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 128 : i32] | tensor<[1,28,28,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[128,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 4, 'tile<32x32, bf16>', 'dram') | shape: [128 : i32, 1 : i32, 1 : i32] | tensor<[128,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (4, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,28,28,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 128 : i32] | tensor<[1,1,784,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 3.75 | +| ttnn.reshape | tensor<[1,1,784,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 128 : i32] | tensor<[1,28,28,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 3.75 | +| ttnn.reshape | tensor<[128,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 4, 'tile<32x32, bf16>', 'dram') | shape: [128 : i32, 1 : i32, 1 : i32] | tensor<[128,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (4, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 3.75 | +| ttnn.reshape | tensor<[1,28,28,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, 
bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 128 : i32] | tensor<[1,1,784,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,784,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 128 : i32] | tensor<[1,28,28,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[128,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 4, 'tile<32x32, bf16>', 'dram') | shape: [128 : i32, 1 : i32, 1 : i32] | tensor<[128,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (4, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,28,28,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 128 : i32] | tensor<[1,1,784,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | yes | 0.07 | 2.4059026723706977e+38 | +| ttnn.reshape | tensor<[1,1,784,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 128 : i32] | tensor<[1,28,28,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | yes | 0.07 | 2.4059026723706977e+38 | +| ttnn.reshape | tensor<[1,28,28,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 
: i32, 1 : i32, 784 : i32, 128 : i32] | tensor<[1,1,784,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,784,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 128 : i32] | tensor<[1,28,28,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,28,28,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 128 : i32] | tensor<[1,1,784,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,784,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 128 : i32] | tensor<[1,28,28,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,28,28,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 128 : i32] | tensor<[1,1,784,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,784,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | 
shape: [1 : i32, 28 : i32, 28 : i32, 128 : i32] | tensor<[1,28,28,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,28,28,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 128 : i32] | tensor<[1,1,784,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | yes | 0.24 | 3.25 | +| ttnn.reshape | tensor<[1,1,784,16,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 16 : i32] | tensor<[1,28,28,16,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.24 | 3.25 | +| ttnn.reshape | tensor<[1,28,28,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 128 : i32] | tensor<[1,1,784,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,784,16,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 16 : i32] | tensor<[1,28,28,16,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,28,28,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 
'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 128 : i32] | tensor<[1,1,784,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,784,19,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 19 : i32] | tensor<[1,28,28,19,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[19,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [19 : i32, 1 : i32, 1 : i32] | tensor<[19,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,28,28,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 128 : i32] | tensor<[1,1,784,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,784,19,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 19 : i32] | tensor<[1,28,28,19,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[19,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [19 : i32, 1 : i32, 1 : i32] | tensor<[19,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, 
d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,28,28,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 128 : i32] | tensor<[1,1,784,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | yes | -0.05 | 18.5 | +| ttnn.reshape | tensor<[1,1,784,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 256 : i32] | tensor<[1,28,28,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 8, 'tile<32x32, bf16>', 'dram') | yes | -0.05 | 18.5 | +| ttnn.reshape | tensor<[1,28,28,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 128 : i32] | tensor<[1,1,784,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,784,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 256 : i32] | tensor<[1,28,28,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,28,28,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 128 : i32] | tensor<[1,1,784,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 
+ d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | yes | 0.25 | 0.61 | +| ttnn.reshape | tensor<[1,1,196,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 256 : i32] | tensor<[1,14,14,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 8, 'tile<32x32, bf16>', 'dram') | yes | 0.25 | 0.61 | +| ttnn.reshape | tensor<[1,28,28,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 128 : i32] | tensor<[1,1,784,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,196,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 256 : i32] | tensor<[1,14,14,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,28,28,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 128 : i32] | tensor<[1,1,784,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | yes | 0.21 | 3.0 | +| ttnn.reshape | tensor<[1,1,196,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 256 : i32] | tensor<[1,14,14,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: 
(d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 8, 'tile<32x32, bf16>', 'dram') | yes | 0.21 | 3.0 | +| ttnn.reshape | tensor<[1,28,28,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 128 : i32] | tensor<[1,1,784,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,196,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 256 : i32] | tensor<[1,14,14,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,28,28,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 128 : i32] | tensor<[1,1,784,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | yes | 0.14 | 1.02 | +| ttnn.reshape | tensor<[1,1,784,38,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 38 : i32] | tensor<[1,28,28,38,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 2, 'tile<32x32, bf16>', 'dram') | yes | 0.14 | 1.02 | +| ttnn.reshape | tensor<[38,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [38 : i32, 1 : i32, 1 : i32] | tensor<[38,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (2, 1, 'tile<32x32, 
bf16>', 'dram') | yes | 0.14 | 1.02 | +| ttnn.reshape | tensor<[1,28,28,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 128 : i32] | tensor<[1,1,784,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,784,38,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 38 : i32] | tensor<[1,28,28,38,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[38,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [38 : i32, 1 : i32, 1 : i32] | tensor<[38,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (2, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,28,28,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 128 : i32] | tensor<[1,1,784,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,784,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 512 : i32] | tensor<[1,28,28,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[512,bf16]> 
| mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | shape: [512 : i32, 1 : i32, 1 : i32] | tensor<[512,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (16, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,28,28,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 128 : i32] | tensor<[1,1,784,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,784,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 512 : i32] | tensor<[1,28,28,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[512,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | shape: [512 : i32, 1 : i32, 1 : i32] | tensor<[512,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (16, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,28,28,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 128 : i32] | tensor<[1,1,784,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | yes | 0.42 | 15.69 | +| ttnn.reshape | tensor<[1,1,784,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | 
shape: [1 : i32, 28 : i32, 28 : i32, 512 : i32] | tensor<[1,28,28,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | yes | 0.42 | 15.69 | +| ttnn.reshape | tensor<[1,28,28,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 128 : i32] | tensor<[1,1,784,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,784,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 512 : i32] | tensor<[1,28,28,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,2,2,128,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4 + d1 * 2 + d2, d3), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 4 : i32, 128 : i32] | tensor<[1,1,4,128,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4 + d1 * 2 + d2, d3), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1,4,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4 + d1 * 2 + d2, d3), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 2 : i32, 2 : i32, 256 : i32] | tensor<[1,2,2,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4 + d1 * 2 + d2, d3), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,2,2,128,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4 + d1 * 2 + d2, d3), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 4 
: i32, 128 : i32] | tensor<[1,1,4,128,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4 + d1 * 2 + d2, d3), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,4,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4 + d1 * 2 + d2, d3), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 2 : i32, 2 : i32, 256 : i32] | tensor<[1,2,2,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4 + d1 * 2 + d2, d3), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,30,40,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1200 + d1 * 40 + d2, d3), memory_config: (38, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1200 : i32, 128 : i32] | tensor<[1,1,1200,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1200 + d1 * 40 + d2, d3), memory_config: (38, 4, 'tile<32x32, bf16>', 'dram') | yes | -0.33 | 22.0 | +| ttnn.reshape | tensor<[1,1,1200,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1200 + d1 * 40 + d2, d3), memory_config: (38, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 30 : i32, 40 : i32, 64 : i32] | tensor<[1,30,40,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1200 + d1 * 40 + d2, d3), memory_config: (38, 2, 'tile<32x32, bf16>', 'dram') | yes | -0.33 | 22.0 | +| ttnn.reshape | tensor<[64,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [64 : i32, 1 : i32, 1 : i32] | tensor<[64,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (2, 1, 'tile<32x32, bf16>', 'dram') | yes | -0.33 | 22.0 | +| ttnn.reshape | tensor<[1,30,40,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1200 + d1 * 40 + d2, d3), memory_config: (38, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1200 : i32, 128 : i32] | tensor<[1,1,1200,128,bf16]> | mapping_from: (d0, d1, d2, d3), 
mapping_to: (d0 * 1200 + d1 * 40 + d2, d3), memory_config: (38, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,1200,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1200 + d1 * 40 + d2, d3), memory_config: (38, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 30 : i32, 40 : i32, 64 : i32] | tensor<[1,30,40,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1200 + d1 * 40 + d2, d3), memory_config: (38, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[64,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [64 : i32, 1 : i32, 1 : i32] | tensor<[64,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (2, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,32,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1024 : i32, 128 : i32] | tensor<[1,1,1024,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 4, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.04 | +| ttnn.reshape | tensor<[1,1,1024,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 32 : i32, 256 : i32] | tensor<[1,32,32,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 8, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.04 | +| ttnn.reshape | tensor<[1,32,32,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1024 : i32, 128 : i32] | tensor<[1,1,1024,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: 
(32, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,1024,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 32 : i32, 256 : i32] | tensor<[1,32,32,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,3,3,128,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9 + d1 * 3 + d2, d3), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 9 : i32, 128 : i32] | tensor<[1,1,9,128,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9 + d1 * 3 + d2, d3), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,4,128,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4 + d1 * 2 + d2, d3), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 2 : i32, 2 : i32, 128 : i32] | tensor<[1,2,2,128,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4 + d1 * 2 + d2, d3), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,3,3,128,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9 + d1 * 3 + d2, d3), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 9 : i32, 128 : i32] | tensor<[1,1,9,128,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9 + d1 * 3 + d2, d3), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,4,128,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4 + d1 * 2 + d2, d3), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 2 : i32, 2 : i32, 128 : i32] | tensor<[1,2,2,128,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4 + d1 * 2 + d2, d3), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| 
ttnn.reshape | tensor<[1,3,3,128,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9 + d1 * 3 + d2, d3), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 9 : i32, 128 : i32] | tensor<[1,1,9,128,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9 + d1 * 3 + d2, d3), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.01 | +| ttnn.reshape | tensor<[1,1,9,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9 + d1 * 3 + d2, d3), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 3 : i32, 3 : i32, 256 : i32] | tensor<[1,3,3,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9 + d1 * 3 + d2, d3), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.01 | +| ttnn.reshape | tensor<[1,3,3,128,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9 + d1 * 3 + d2, d3), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 9 : i32, 128 : i32] | tensor<[1,1,9,128,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9 + d1 * 3 + d2, d3), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,9,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9 + d1 * 3 + d2, d3), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 3 : i32, 3 : i32, 256 : i32] | tensor<[1,3,3,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9 + d1 * 3 + d2, d3), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,56,56,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3136 : i32, 128 : i32] | tensor<[1,1,3136,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 4, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,3136,128,bf16]> | 
mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 56 : i32, 56 : i32, 128 : i32] | tensor<[1,56,56,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 4, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,56,56,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3136 : i32, 128 : i32] | tensor<[1,1,3136,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,3136,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 56 : i32, 56 : i32, 128 : i32] | tensor<[1,56,56,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,56,56,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3136 : i32, 128 : i32] | tensor<[1,1,3136,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 4, 'tile<32x32, bf16>', 'dram') | yes | 0.23 | 27.38 | +| ttnn.reshape | tensor<[1,1,784,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 128 : i32] | tensor<[1,28,28,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | yes | 0.23 | 27.38 | +| ttnn.reshape | 
tensor<[1,56,56,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3136 : i32, 128 : i32] | tensor<[1,1,3136,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,784,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 128 : i32] | tensor<[1,28,28,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,56,56,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3136 : i32, 128 : i32] | tensor<[1,1,3136,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 4, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,3136,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 56 : i32, 56 : i32, 128 : i32] | tensor<[1,56,56,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 4, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,56,56,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3136 : i32, 128 : i32] | tensor<[1,1,3136,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | 
nan | +| ttnn.reshape | tensor<[1,1,3136,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 56 : i32, 56 : i32, 128 : i32] | tensor<[1,56,56,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,56,56,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3136 : i32, 128 : i32] | tensor<[1,1,3136,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 4, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,784,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 128 : i32] | tensor<[1,28,28,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,56,56,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3136 : i32, 128 : i32] | tensor<[1,1,3136,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,784,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 128 : i32] | tensor<[1,28,28,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 
'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,56,56,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3136 : i32, 128 : i32] | tensor<[1,1,3136,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 4, 'tile<32x32, bf16>', 'dram') | yes | 0.18 | 0.49 | +| ttnn.reshape | tensor<[1,1,3136,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 56 : i32, 56 : i32, 256 : i32] | tensor<[1,56,56,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 8, 'tile<32x32, bf16>', 'dram') | yes | 0.18 | 0.49 | +| ttnn.reshape | tensor<[1,56,56,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3136 : i32, 128 : i32] | tensor<[1,1,3136,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,3136,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 56 : i32, 56 : i32, 256 : i32] | tensor<[1,56,56,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,5,5,128,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25 + d1 * 5 + d2, d3), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 25 : i32, 128 : i32] | tensor<[1,1,25,128,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25 + d1 * 5 + d2, d3), memory_config: (1, 4, 
'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,9,128,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9 + d1 * 3 + d2, d3), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 3 : i32, 3 : i32, 128 : i32] | tensor<[1,3,3,128,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9 + d1 * 3 + d2, d3), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,5,5,128,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25 + d1 * 5 + d2, d3), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 25 : i32, 128 : i32] | tensor<[1,1,25,128,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25 + d1 * 5 + d2, d3), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,9,128,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9 + d1 * 3 + d2, d3), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 3 : i32, 3 : i32, 128 : i32] | tensor<[1,3,3,128,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9 + d1 * 3 + d2, d3), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,60,80,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4800 + d1 * 80 + d2, d3), memory_config: (150, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 4800 : i32, 128 : i32] | tensor<[1,1,4800,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4800 + d1 * 80 + d2, d3), memory_config: (150, 4, 'tile<32x32, bf16>', 'dram') | yes | 0.18 | 51.25 | +| ttnn.reshape | tensor<[1,1,300,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 300 + d1 * 20 + d2, d3), memory_config: (10, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 15 : i32, 20 : i32, 128 : i32] | tensor<[1,15,20,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 300 + d1 * 20 + d2, d3), memory_config: (10, 4, 'tile<32x32, bf16>', 'dram') | 
yes | 0.18 | 51.25 | +| ttnn.reshape | tensor<[128,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 4, 'tile<32x32, bf16>', 'dram') | shape: [128 : i32, 1 : i32, 1 : i32] | tensor<[128,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (4, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.18 | 51.25 | +| ttnn.reshape | tensor<[1,60,80,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4800 + d1 * 80 + d2, d3), memory_config: (150, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 4800 : i32, 128 : i32] | tensor<[1,1,4800,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4800 + d1 * 80 + d2, d3), memory_config: (150, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,300,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 300 + d1 * 20 + d2, d3), memory_config: (10, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 15 : i32, 20 : i32, 128 : i32] | tensor<[1,15,20,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 300 + d1 * 20 + d2, d3), memory_config: (10, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[128,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 4, 'tile<32x32, bf16>', 'dram') | shape: [128 : i32, 1 : i32, 1 : i32] | tensor<[128,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (4, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,60,80,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4800 + d1 * 80 + d2, d3), memory_config: (150, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 4800 : i32, 128 : i32] | tensor<[1,1,4800,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4800 + d1 * 80 + d2, d3), memory_config: (150, 4, 'tile<32x32, bf16>', 'dram') | yes | 0.86 | 87.5 | +| ttnn.reshape | tensor<[1,1,1200,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1200 + d1 * 
40 + d2, d3), memory_config: (38, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 30 : i32, 40 : i32, 320 : i32] | tensor<[1,30,40,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1200 + d1 * 40 + d2, d3), memory_config: (38, 10, 'tile<32x32, bf16>', 'dram') | yes | 0.86 | 87.5 | +| ttnn.reshape | tensor<[320,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 10, 'tile<32x32, bf16>', 'dram') | shape: [320 : i32, 1 : i32, 1 : i32] | tensor<[320,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (10, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.86 | 87.5 | +| ttnn.reshape | tensor<[1,60,80,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4800 + d1 * 80 + d2, d3), memory_config: (150, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 4800 : i32, 128 : i32] | tensor<[1,1,4800,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4800 + d1 * 80 + d2, d3), memory_config: (150, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,1200,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1200 + d1 * 40 + d2, d3), memory_config: (38, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 30 : i32, 40 : i32, 320 : i32] | tensor<[1,30,40,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1200 + d1 * 40 + d2, d3), memory_config: (38, 10, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[320,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 10, 'tile<32x32, bf16>', 'dram') | shape: [320 : i32, 1 : i32, 1 : i32] | tensor<[320,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (10, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,60,80,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4800 + d1 * 80 + d2, d3), memory_config: (150, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 4800 : i32, 128 : i32] | 
tensor<[1,1,4800,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4800 + d1 * 80 + d2, d3), memory_config: (150, 4, 'tile<32x32, bf16>', 'dram') | yes | 0.39 | 2.67 | +| ttnn.reshape | tensor<[1,1,4800,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4800 + d1 * 80 + d2, d3), memory_config: (150, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 60 : i32, 80 : i32, 64 : i32] | tensor<[1,60,80,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4800 + d1 * 80 + d2, d3), memory_config: (150, 2, 'tile<32x32, bf16>', 'dram') | yes | 0.39 | 2.67 | +| ttnn.reshape | tensor<[64,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [64 : i32, 1 : i32, 1 : i32] | tensor<[64,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (2, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.39 | 2.67 | +| ttnn.reshape | tensor<[1,60,80,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4800 + d1 * 80 + d2, d3), memory_config: (150, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 4800 : i32, 128 : i32] | tensor<[1,1,4800,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4800 + d1 * 80 + d2, d3), memory_config: (150, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,4800,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4800 + d1 * 80 + d2, d3), memory_config: (150, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 60 : i32, 80 : i32, 64 : i32] | tensor<[1,60,80,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4800 + d1 * 80 + d2, d3), memory_config: (150, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[64,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [64 : i32, 1 : i32, 1 : i32] | tensor<[64,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (2, 1, 
'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,60,80,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4800 + d1 * 80 + d2, d3), memory_config: (150, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 4800 : i32, 128 : i32] | tensor<[1,1,4800,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4800 + d1 * 80 + d2, d3), memory_config: (150, 4, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,4800,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4800 + d1 * 80 + d2, d3), memory_config: (150, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 60 : i32, 80 : i32, 64 : i32] | tensor<[1,60,80,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4800 + d1 * 80 + d2, d3), memory_config: (150, 2, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[64,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [64 : i32, 1 : i32, 1 : i32] | tensor<[64,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (2, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,60,80,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4800 + d1 * 80 + d2, d3), memory_config: (150, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 4800 : i32, 128 : i32] | tensor<[1,1,4800,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4800 + d1 * 80 + d2, d3), memory_config: (150, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,4800,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4800 + d1 * 80 + d2, d3), memory_config: (150, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 60 : i32, 80 : i32, 64 : i32] | tensor<[1,60,80,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4800 + d1 * 80 + d2, d3), memory_config: (150, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| 
ttnn.reshape | tensor<[64,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [64 : i32, 1 : i32, 1 : i32] | tensor<[64,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (2, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,64,64,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 4096 : i32, 128 : i32] | tensor<[1,1,4096,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 4, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.04 | +| ttnn.reshape | tensor<[1,1,4096,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 64 : i32, 128 : i32] | tensor<[1,64,64,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 4, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.04 | +| ttnn.reshape | tensor<[1,64,64,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 4096 : i32, 128 : i32] | tensor<[1,1,4096,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,4096,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 64 : i32, 128 : i32] | tensor<[1,64,64,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,64,64,128,bf16]> | 
mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 4096 : i32, 128 : i32] | tensor<[1,1,4096,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 4, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.16 | +| ttnn.reshape | tensor<[1,1,4096,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 64 : i32, 256 : i32] | tensor<[1,64,64,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 8, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.16 | +| ttnn.reshape | tensor<[1,64,64,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 4096 : i32, 128 : i32] | tensor<[1,1,4096,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,4096,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 64 : i32, 256 : i32] | tensor<[1,64,64,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,90,160,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 14400 + d1 * 160 + d2, d3), memory_config: (450, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 14400 : i32, 128 : i32] | tensor<[1,1,14400,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 14400 + d1 * 160 + d2, d3), memory_config: (450, 4, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.04 
| +| ttnn.reshape | tensor<[1,1,14400,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 14400 + d1 * 160 + d2, d3), memory_config: (450, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 90 : i32, 160 : i32, 128 : i32] | tensor<[1,90,160,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 14400 + d1 * 160 + d2, d3), memory_config: (450, 4, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.04 | +| ttnn.reshape | tensor<[1,90,160,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 14400 + d1 * 160 + d2, d3), memory_config: (450, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 14400 : i32, 128 : i32] | tensor<[1,1,14400,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 14400 + d1 * 160 + d2, d3), memory_config: (450, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,14400,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 14400 + d1 * 160 + d2, d3), memory_config: (450, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 90 : i32, 160 : i32, 128 : i32] | tensor<[1,90,160,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 14400 + d1 * 160 + d2, d3), memory_config: (450, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,90,160,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 14400 + d1 * 160 + d2, d3), memory_config: (450, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 14400 : i32, 128 : i32] | tensor<[1,1,14400,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 14400 + d1 * 160 + d2, d3), memory_config: (450, 4, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.01 | +| ttnn.reshape | tensor<[1,1,14400,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 14400 + d1 * 160 + d2, d3), memory_config: (450, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 90 : i32, 160 : i32, 512 : i32] | tensor<[1,90,160,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 14400 + d1 * 160 + d2, 
d3), memory_config: (450, 16, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.01 | +| ttnn.reshape | tensor<[1,90,160,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 14400 + d1 * 160 + d2, d3), memory_config: (450, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 14400 : i32, 128 : i32] | tensor<[1,1,14400,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 14400 + d1 * 160 + d2, d3), memory_config: (450, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,14400,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 14400 + d1 * 160 + d2, d3), memory_config: (450, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 90 : i32, 160 : i32, 512 : i32] | tensor<[1,90,160,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 14400 + d1 * 160 + d2, d3), memory_config: (450, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,56,56,142,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3136 : i32, 142 : i32] | tensor<[1,1,3136,142,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 5, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,3136,68,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 3, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 56 : i32, 56 : i32, 68 : i32] | tensor<[1,56,56,68,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 3, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,56,56,142,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3136 : i32, 142 : i32] | tensor<[1,1,3136,142,bf16]> | mapping_from: (d0, d1, 
d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 5, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,3136,68,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 3, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 56 : i32, 56 : i32, 68 : i32] | tensor<[1,56,56,68,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 3, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,28,28,144,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 144 : i32] | tensor<[1,1,784,144,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 5, 'tile<32x32, bf16>', 'dram') | yes | 0.35 | 4.31 | +| ttnn.reshape | tensor<[1,1,784,28,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 28 : i32] | tensor<[1,28,28,28,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.35 | 4.31 | +| ttnn.reshape | tensor<[1,28,28,144,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 144 : i32] | tensor<[1,1,784,144,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 5, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,784,28,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 28 : i32] | tensor<[1,28,28,28,bf16]> | mapping_from: 
(d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,28,28,144,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 144 : i32] | tensor<[1,1,784,144,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 5, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,784,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 32 : i32] | tensor<[1,28,28,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,28,28,144,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 144 : i32] | tensor<[1,1,784,144,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 5, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,784,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 32 : i32] | tensor<[1,28,28,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,56,56,144,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3136 : i32, 144 : i32] | tensor<[1,1,3136,144,bf16]> | 
mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 5, 'tile<32x32, bf16>', 'dram') | yes | 0.17 | 35.25 | +| ttnn.reshape | tensor<[1,1,3136,144,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 56 : i32, 56 : i32, 144 : i32] | tensor<[1,56,56,144,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 5, 'tile<32x32, bf16>', 'dram') | yes | 0.17 | 35.25 | +| ttnn.reshape | tensor<[1,56,56,144,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3136 : i32, 144 : i32] | tensor<[1,1,3136,144,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 5, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,3136,144,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 56 : i32, 56 : i32, 144 : i32] | tensor<[1,56,56,144,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 5, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,56,56,144,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3136 : i32, 144 : i32] | tensor<[1,1,3136,144,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 5, 'tile<32x32, bf16>', 'dram') | yes | 0.29 | 37.5 | +| ttnn.reshape | tensor<[1,1,784,144,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 144 : 
i32] | tensor<[1,28,28,144,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 5, 'tile<32x32, bf16>', 'dram') | yes | 0.29 | 37.5 | +| ttnn.reshape | tensor<[1,56,56,144,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3136 : i32, 144 : i32] | tensor<[1,1,3136,144,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 5, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,784,144,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 144 : i32] | tensor<[1,28,28,144,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 5, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,56,56,144,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3136 : i32, 144 : i32] | tensor<[1,1,3136,144,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 5, 'tile<32x32, bf16>', 'dram') | yes | 0.01 | 1.3823971156163123e+38 | +| ttnn.reshape | tensor<[1,1,3136,24,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 56 : i32, 56 : i32, 24 : i32] | tensor<[1,56,56,24,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.01 | 1.3823971156163123e+38 | +| ttnn.reshape | tensor<[1,56,56,144,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 5, 'tile<32x32, 
bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3136 : i32, 144 : i32] | tensor<[1,1,3136,144,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 5, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,3136,24,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 56 : i32, 56 : i32, 24 : i32] | tensor<[1,56,56,24,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,28,28,152,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 152 : i32] | tensor<[1,1,784,152,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 5, 'tile<32x32, bf16>', 'dram') | yes | 0.36 | 1.52 | +| ttnn.reshape | tensor<[1,1,784,58,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 58 : i32] | tensor<[1,28,28,58,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 2, 'tile<32x32, bf16>', 'dram') | yes | 0.36 | 1.52 | +| ttnn.reshape | tensor<[1,28,28,152,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 152 : i32] | tensor<[1,1,784,152,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 5, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,784,58,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 2, 
'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 58 : i32] | tensor<[1,28,28,58,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,14,14,156,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 156 : i32] | tensor<[1,1,196,156,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 5, 'tile<32x32, bf16>', 'dram') | yes | 0.37 | 2.45 | +| ttnn.reshape | tensor<[1,1,196,68,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 3, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 68 : i32] | tensor<[1,14,14,68,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 3, 'tile<32x32, bf16>', 'dram') | yes | 0.37 | 2.45 | +| ttnn.reshape | tensor<[1,14,14,156,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 156 : i32] | tensor<[1,1,196,156,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 5, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,196,68,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 3, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 68 : i32] | tensor<[1,14,14,68,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 3, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,32,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 5, 
'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1024 : i32, 160 : i32] | tensor<[1,1,1024,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 5, 'tile<32x32, bf16>', 'dram') | yes | 0.1 | 16.75 | +| ttnn.reshape | tensor<[1,1,256,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 16 : i32, 160 : i32] | tensor<[1,16,16,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 5, 'tile<32x32, bf16>', 'dram') | yes | 0.1 | 16.75 | +| ttnn.reshape | tensor<[160,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 5, 'tile<32x32, bf16>', 'dram') | shape: [160 : i32, 1 : i32, 1 : i32] | tensor<[160,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (5, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.1 | 16.75 | +| ttnn.reshape | tensor<[1,32,32,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1024 : i32, 160 : i32] | tensor<[1,1,1024,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 5, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,256,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 16 : i32, 160 : i32] | tensor<[1,16,16,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 5, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[160,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 5, 'tile<32x32, bf16>', 'dram') | shape: [160 : i32, 1 : i32, 1 : i32] | tensor<[160,1,1,bf16]> | 
mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (5, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,32,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1024 : i32, 160 : i32] | tensor<[1,1,1024,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 5, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 1.0 | +| ttnn.reshape | tensor<[1,1,256,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 16 : i32, 256 : i32] | tensor<[1,16,16,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 8, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 1.0 | +| ttnn.reshape | tensor<[256,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 8, 'tile<32x32, bf16>', 'dram') | shape: [256 : i32, 1 : i32, 1 : i32] | tensor<[256,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (8, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 1.0 | +| ttnn.reshape | tensor<[1,32,32,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1024 : i32, 160 : i32] | tensor<[1,1,1024,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 5, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,256,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 16 : i32, 256 : i32] | tensor<[1,16,16,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), 
memory_config: (8, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[256,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 8, 'tile<32x32, bf16>', 'dram') | shape: [256 : i32, 1 : i32, 1 : i32] | tensor<[256,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (8, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,7,7,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 49 : i32, 160 : i32] | tensor<[1,1,49,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 5, 'tile<32x32, bf16>', 'dram') | yes | -0.03 | 15.62 | +| ttnn.reshape | tensor<[1,1,49,960,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 30, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 7 : i32, 960 : i32] | tensor<[1,7,7,960,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 30, 'tile<32x32, bf16>', 'dram') | yes | -0.03 | 15.62 | +| ttnn.reshape | tensor<[1,7,7,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 49 : i32, 160 : i32] | tensor<[1,1,49,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 5, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,49,960,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 30, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 7 : i32, 960 : i32] | tensor<[1,7,7,960,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 30, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | 
tensor<[1,1,1,168,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 6, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 168 : i32] | tensor<[1,1,1,168,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 6, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.01 | +| ttnn.reshape | tensor<[1,1,1,672,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 21, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 672 : i32] | tensor<[1,1,1,672,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 21, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.01 | +| ttnn.reshape | tensor<[672,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 21, 'tile<32x32, f32>', 'dram') | shape: [672 : i32, 1 : i32, 1 : i32] | tensor<[672,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (21, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.01 | +| ttnn.reshape | tensor<[1,1,1,168,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 6, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 168 : i32] | tensor<[1,1,1,168,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 6, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,1,672,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 21, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 672 : i32] | tensor<[1,1,1,672,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 21, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[672,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 21, 'tile<32x32, f32>', 'dram') | shape: [672 : i32, 1 : i32, 1 : i32] | tensor<[672,1,1,f32]> | mapping_from: (d0, d1, 
d2), mapping_to: (d0 + d1, d2), memory_config: (21, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,112,112,16,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 12544 : i32, 16 : i32] | tensor<[1,1,12544,16,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.15 | 99.0 | +| ttnn.reshape | tensor<[1,1,12544,96,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 3, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 112 : i32, 112 : i32, 96 : i32] | tensor<[1,112,112,96,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 3, 'tile<32x32, bf16>', 'dram') | yes | 0.15 | 99.0 | +| ttnn.reshape | tensor<[1,112,112,16,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 12544 : i32, 16 : i32] | tensor<[1,1,12544,16,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,12544,96,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 3, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 112 : i32, 112 : i32, 96 : i32] | tensor<[1,112,112,96,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 3, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,14,14,16,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 16 : i32] | 
tensor<[1,1,196,16,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.28 | 1.41 | +| ttnn.reshape | tensor<[1,1,196,4,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 4 : i32] | tensor<[1,14,14,4,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.28 | 1.41 | +| ttnn.reshape | tensor<[4,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [4 : i32, 1 : i32, 1 : i32] | tensor<[4,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.28 | 1.41 | +| ttnn.reshape | tensor<[1,14,14,16,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 16 : i32] | tensor<[1,1,196,16,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,196,4,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 4 : i32] | tensor<[1,14,14,4,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[4,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [4 : i32, 1 : i32, 1 : i32] | tensor<[4,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | 
+| ttnn.reshape | tensor<[1,160,160,16,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25600 + d1 * 160 + d2, d3), memory_config: (800, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 25600 : i32, 16 : i32] | tensor<[1,1,25600,16,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25600 + d1 * 160 + d2, d3), memory_config: (800, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.01 | +| ttnn.reshape | tensor<[1,1,25600,16,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25600 + d1 * 160 + d2, d3), memory_config: (800, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 160 : i32, 160 : i32, 16 : i32] | tensor<[1,160,160,16,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25600 + d1 * 160 + d2, d3), memory_config: (800, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.01 | +| ttnn.reshape | tensor<[1,160,160,16,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25600 + d1 * 160 + d2, d3), memory_config: (800, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 25600 : i32, 16 : i32] | tensor<[1,1,25600,16,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25600 + d1 * 160 + d2, d3), memory_config: (800, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,25600,16,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25600 + d1 * 160 + d2, d3), memory_config: (800, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 160 : i32, 160 : i32, 16 : i32] | tensor<[1,160,160,16,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25600 + d1 * 160 + d2, d3), memory_config: (800, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,160,160,16,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25600 + d1 * 160 + d2, d3), memory_config: (800, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 25600 : i32, 16 : i32] | tensor<[1,1,25600,16,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25600 + d1 * 160 + d2, d3), memory_config: (800, 1, 
'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.32 | +| ttnn.reshape | tensor<[1,1,25600,16,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25600 + d1 * 160 + d2, d3), memory_config: (800, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 160 : i32, 160 : i32, 16 : i32] | tensor<[1,160,160,16,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25600 + d1 * 160 + d2, d3), memory_config: (800, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.32 | +| ttnn.reshape | tensor<[1,160,160,16,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25600 + d1 * 160 + d2, d3), memory_config: (800, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 25600 : i32, 16 : i32] | tensor<[1,1,25600,16,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25600 + d1 * 160 + d2, d3), memory_config: (800, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,25600,16,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25600 + d1 * 160 + d2, d3), memory_config: (800, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 160 : i32, 160 : i32, 16 : i32] | tensor<[1,160,160,16,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25600 + d1 * 160 + d2, d3), memory_config: (800, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,160,160,16,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25600 + d1 * 160 + d2, d3), memory_config: (800, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 25600 : i32, 16 : i32] | tensor<[1,1,25600,16,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25600 + d1 * 160 + d2, d3), memory_config: (800, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.04 | +| ttnn.reshape | tensor<[1,1,25600,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25600 + d1 * 160 + d2, d3), memory_config: (800, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 160 : i32, 160 : i32, 64 : i32] | tensor<[1,160,160,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 
25600 + d1 * 160 + d2, d3), memory_config: (800, 2, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.04 | +| ttnn.reshape | tensor<[1,160,160,16,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25600 + d1 * 160 + d2, d3), memory_config: (800, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 25600 : i32, 16 : i32] | tensor<[1,1,25600,16,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25600 + d1 * 160 + d2, d3), memory_config: (800, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,25600,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25600 + d1 * 160 + d2, d3), memory_config: (800, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 160 : i32, 160 : i32, 64 : i32] | tensor<[1,160,160,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25600 + d1 * 160 + d2, d3), memory_config: (800, 2, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,28,28,172,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 6, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 172 : i32] | tensor<[1,1,784,172,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 6, 'tile<32x32, bf16>', 'dram') | yes | 0.62 | 3.62 | +| ttnn.reshape | tensor<[1,1,784,46,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 46 : i32] | tensor<[1,28,28,46,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 2, 'tile<32x32, bf16>', 'dram') | yes | 0.62 | 3.62 | +| ttnn.reshape | tensor<[1,28,28,172,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 6, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 172 : i32] | tensor<[1,1,784,172,bf16]> | mapping_from: (d0, 
d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 6, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,784,46,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 46 : i32] | tensor<[1,28,28,46,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,20,20,184,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 6, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 400 : i32, 184 : i32] | tensor<[1,1,400,184,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 6, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.02 | +| ttnn.reshape | tensor<[1,1,400,184,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 6, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 20 : i32, 20 : i32, 184 : i32] | tensor<[1,20,20,184,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 6, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.02 | +| ttnn.reshape | tensor<[1,20,20,184,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 6, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 400 : i32, 184 : i32] | tensor<[1,1,400,184,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 6, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,400,184,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 6, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 20 : i32, 20 : i32, 184 : i32] | tensor<[1,20,20,184,f32]> | mapping_from: (d0, d1, 
d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 6, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,400,80,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 3, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 20 : i32, 20 : i32, 80 : i32] | tensor<[1,20,20,80,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 3, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.02 | +| ttnn.reshape | tensor<[1,20,20,184,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 6, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 400 : i32, 184 : i32] | tensor<[1,1,400,184,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 6, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,400,80,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 3, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 20 : i32, 20 : i32, 80 : i32] | tensor<[1,20,20,80,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 3, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,28,28,185,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 6, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 185 : i32] | tensor<[1,1,784,185,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 6, 'tile<32x32, bf16>', 'dram') | yes | 0.51 | 2.03 | +| ttnn.reshape | tensor<[1,1,784,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 128 : i32] | tensor<[1,28,28,128,bf16]> | mapping_from: (d0, d1, d2, 
d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | yes | 0.51 | 2.03 | +| ttnn.reshape | tensor<[128,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 4, 'tile<32x32, bf16>', 'dram') | shape: [128 : i32, 1 : i32, 1 : i32] | tensor<[128,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (4, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.51 | 2.03 | +| ttnn.reshape | tensor<[1,28,28,185,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 6, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 185 : i32] | tensor<[1,1,784,185,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 6, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,784,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 128 : i32] | tensor<[1,28,28,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[128,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 4, 'tile<32x32, bf16>', 'dram') | shape: [128 : i32, 1 : i32, 1 : i32] | tensor<[128,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (4, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16,16,1920,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 60, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 256 : i32, 1920 : i32] | tensor<[1,1,256,1920,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 60, 'tile<32x32, bf16>', 'dram') | yes | 0.5 | 17.12 | +| ttnn.reshape | 
tensor<[1,1,256,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 16 : i32, 1280 : i32] | tensor<[1,16,16,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | yes | 0.5 | 17.12 | +| ttnn.reshape | tensor<[1280,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 40, 'tile<32x32, bf16>', 'dram') | shape: [1280 : i32, 1 : i32, 1 : i32] | tensor<[1280,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (40, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.5 | 17.12 | +| ttnn.reshape | tensor<[1,16,16,1920,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 60, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 256 : i32, 1920 : i32] | tensor<[1,1,256,1920,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 60, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,256,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 16 : i32, 1280 : i32] | tensor<[1,16,16,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1280,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 40, 'tile<32x32, bf16>', 'dram') | shape: [1280 : i32, 1 : i32, 1 : i32] | tensor<[1280,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (40, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16,16,1920,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 
60, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 256 : i32, 1920 : i32] | tensor<[1,1,256,1920,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 60, 'tile<32x32, bf16>', 'dram') | yes | 0.82 | 44.75 | +| ttnn.reshape | tensor<[1,1,256,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 16 : i32, 1280 : i32] | tensor<[1,16,16,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | yes | 0.82 | 44.75 | +| ttnn.reshape | tensor<[1280,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 40, 'tile<32x32, bf16>', 'dram') | shape: [1280 : i32, 1 : i32, 1 : i32] | tensor<[1280,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (40, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.82 | 44.75 | +| ttnn.reshape | tensor<[1,16,16,1920,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 60, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 256 : i32, 1920 : i32] | tensor<[1,1,256,1920,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 60, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,256,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 16 : i32, 1280 : i32] | tensor<[1,16,16,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1280,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 40, 'tile<32x32, bf16>', 'dram') | shape: [1280 : i32, 1 : i32, 1 : i32] | 
tensor<[1280,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (40, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,32,1920,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 60, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1024 : i32, 1920 : i32] | tensor<[1,1,1024,1920,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 60, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 2.0 | +| ttnn.reshape | tensor<[1,1,1024,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 32 : i32, 640 : i32] | tensor<[1,32,32,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 2.0 | +| ttnn.reshape | tensor<[640,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 20, 'tile<32x32, bf16>', 'dram') | shape: [640 : i32, 1 : i32, 1 : i32] | tensor<[640,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (20, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 2.0 | +| ttnn.reshape | tensor<[1,32,32,1920,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 60, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1024 : i32, 1920 : i32] | tensor<[1,1,1024,1920,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 60, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,1024,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 32 : i32, 640 : i32] | tensor<[1,32,32,640,bf16]> | mapping_from: (d0, d1, d2, d3), 
mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[640,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 20, 'tile<32x32, bf16>', 'dram') | shape: [640 : i32, 1 : i32, 1 : i32] | tensor<[640,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (20, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,32,1920,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 60, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1024 : i32, 1920 : i32] | tensor<[1,1,1024,1920,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 60, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,1024,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 32 : i32, 640 : i32] | tensor<[1,32,32,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[640,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 20, 'tile<32x32, bf16>', 'dram') | shape: [640 : i32, 1 : i32, 1 : i32] | tensor<[640,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (20, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,32,32,1920,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 60, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1024 : i32, 1920 : i32] | tensor<[1,1,1024,1920,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 60, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| 
ttnn.reshape | tensor<[1,1,1024,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 32 : i32, 640 : i32] | tensor<[1,32,32,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[640,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 20, 'tile<32x32, bf16>', 'dram') | shape: [640 : i32, 1 : i32, 1 : i32] | tensor<[640,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (20, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,14,14,192,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 6, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 192 : i32] | tensor<[1,1,196,192,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 6, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,196,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 64 : i32] | tensor<[1,14,14,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 2, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,14,14,192,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 6, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 192 : i32] | tensor<[1,1,196,192,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 6, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,196,64,bf16]> | mapping_from: (d0, d1, d2, 
d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 64 : i32] | tensor<[1,14,14,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,28,28,192,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 6, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 192 : i32] | tensor<[1,1,784,192,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 6, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,784,192,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 6, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 192 : i32] | tensor<[1,28,28,192,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 6, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,28,28,192,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 6, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 192 : i32] | tensor<[1,1,784,192,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 6, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,784,192,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 6, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 192 : i32] | tensor<[1,28,28,192,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 6, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,28,28,192,bf16]> | mapping_from: (d0, d1, 
d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 6, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 192 : i32] | tensor<[1,1,784,192,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 6, 'tile<32x32, bf16>', 'dram') | yes | 0.21 | 27.88 | +| ttnn.reshape | tensor<[1,1,196,192,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 6, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 192 : i32] | tensor<[1,14,14,192,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 6, 'tile<32x32, bf16>', 'dram') | yes | 0.21 | 27.88 | +| ttnn.reshape | tensor<[1,28,28,192,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 6, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 192 : i32] | tensor<[1,1,784,192,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 6, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,196,192,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 6, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 192 : i32] | tensor<[1,14,14,192,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 6, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,28,28,192,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 6, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 192 : i32] | tensor<[1,1,784,192,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 6, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,784,32,bf16]> | 
mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 32 : i32] | tensor<[1,28,28,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,28,28,192,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 6, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 192 : i32] | tensor<[1,1,784,192,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 6, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,784,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 32 : i32] | tensor<[1,28,28,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,14,14,196,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 7, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 196 : i32] | tensor<[1,1,196,196,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 7, 'tile<32x32, bf16>', 'dram') | yes | 0.02 | 6.87875487818694e+37 | +| ttnn.reshape | tensor<[1,1,196,40,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 40 : i32] | tensor<[1,14,14,40,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 2, 'tile<32x32, bf16>', 'dram') | yes | 0.02 | 6.87875487818694e+37 | +| 
ttnn.reshape | tensor<[1,14,14,196,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 7, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 196 : i32] | tensor<[1,1,196,196,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 7, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,196,40,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 40 : i32] | tensor<[1,14,14,40,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,28,28,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 1 : i32] | tensor<[1,1,784,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.37 | 4.44 | +| ttnn.reshape | tensor<[1,1,784,16,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 16 : i32] | tensor<[1,28,28,16,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.37 | 4.44 | +| ttnn.reshape | tensor<[16,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [16 : i32, 1 : i32, 1 : i32] | tensor<[16,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.37 | 4.44 | +| ttnn.reshape | tensor<[1,28,28,1,bf16]> | mapping_from: (d0, d1, d2, d3), 
mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 1 : i32] | tensor<[1,1,784,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,784,16,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 16 : i32] | tensor<[1,28,28,16,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[16,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [16 : i32, 1 : i32, 1 : i32] | tensor<[16,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,28,28,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 1 : i32] | tensor<[1,1,784,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.58 | 1.28 | +| ttnn.reshape | tensor<[1,1,676,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 676 + d1 * 26 + d2, d3), memory_config: (22, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 26 : i32, 26 : i32, 32 : i32] | tensor<[1,26,26,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 676 + d1 * 26 + d2, d3), memory_config: (22, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.58 | 1.28 | +| ttnn.reshape | tensor<[32,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [32 : i32, 1 : i32, 1 : 
i32] | tensor<[32,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.58 | 1.28 | +| ttnn.reshape | tensor<[1,28,28,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 1 : i32] | tensor<[1,1,784,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,676,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 676 + d1 * 26 + d2, d3), memory_config: (22, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 26 : i32, 26 : i32, 32 : i32] | tensor<[1,26,26,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 676 + d1 * 26 + d2, d3), memory_config: (22, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[32,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [32 : i32, 1 : i32, 1 : i32] | tensor<[32,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,20,20,200,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 7, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 400 : i32, 200 : i32] | tensor<[1,1,400,200,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 7, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.04 | +| ttnn.reshape | tensor<[1,1,400,200,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 7, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 20 : i32, 20 : i32, 200 : i32] | tensor<[1,20,20,200,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), 
memory_config: (13, 7, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.04 | +| ttnn.reshape | tensor<[1,20,20,200,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 7, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 400 : i32, 200 : i32] | tensor<[1,1,400,200,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 7, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,400,200,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 7, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 20 : i32, 20 : i32, 200 : i32] | tensor<[1,20,20,200,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 7, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,20,20,200,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 7, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 400 : i32, 200 : i32] | tensor<[1,1,400,200,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 7, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.01 | +| ttnn.reshape | tensor<[1,1,400,80,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 3, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 20 : i32, 20 : i32, 80 : i32] | tensor<[1,20,20,80,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 3, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.01 | +| ttnn.reshape | tensor<[1,20,20,200,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 7, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 400 : i32, 200 : i32] | tensor<[1,1,400,200,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), 
memory_config: (13, 7, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,400,80,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 3, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 20 : i32, 20 : i32, 80 : i32] | tensor<[1,20,20,80,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 3, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,15,20,2048,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 300 + d1 * 20 + d2, d3), memory_config: (10, 64, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 300 : i32, 2048 : i32] | tensor<[1,1,300,2048,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 300 + d1 * 20 + d2, d3), memory_config: (10, 64, 'tile<32x32, bf16>', 'dram') | yes | -0.0 | 8.307674973655725e+37 | +| ttnn.reshape | tensor<[1,1,300,2048,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 300 + d1 * 20 + d2, d3), memory_config: (10, 64, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 15 : i32, 20 : i32, 2048 : i32] | tensor<[1,15,20,2048,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 300 + d1 * 20 + d2, d3), memory_config: (10, 64, 'tile<32x32, bf16>', 'dram') | yes | -0.0 | 8.307674973655725e+37 | +| ttnn.reshape | tensor<[2048,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 64, 'tile<32x32, bf16>', 'dram') | shape: [2048 : i32, 1 : i32, 1 : i32] | tensor<[2048,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (64, 1, 'tile<32x32, bf16>', 'dram') | yes | -0.0 | 8.307674973655725e+37 | +| ttnn.reshape | tensor<[1,15,20,2048,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 300 + d1 * 20 + d2, d3), memory_config: (10, 64, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 300 : i32, 2048 : i32] | tensor<[1,1,300,2048,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 300 + d1 * 20 + d2, d3), 
memory_config: (10, 64, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,300,2048,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 300 + d1 * 20 + d2, d3), memory_config: (10, 64, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 15 : i32, 20 : i32, 2048 : i32] | tensor<[1,15,20,2048,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 300 + d1 * 20 + d2, d3), memory_config: (10, 64, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[2048,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 64, 'tile<32x32, bf16>', 'dram') | shape: [2048 : i32, 1 : i32, 1 : i32] | tensor<[2048,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (64, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,23,40,2048,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 920 + d1 * 40 + d2, d3), memory_config: (29, 64, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 920 : i32, 2048 : i32] | tensor<[1,1,920,2048,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 920 + d1 * 40 + d2, d3), memory_config: (29, 64, 'tile<32x32, bf16>', 'dram') | yes | 0.09 | 7.44 | +| ttnn.reshape | tensor<[1,1,920,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 920 + d1 * 40 + d2, d3), memory_config: (29, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 23 : i32, 40 : i32, 256 : i32] | tensor<[1,23,40,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 920 + d1 * 40 + d2, d3), memory_config: (29, 8, 'tile<32x32, bf16>', 'dram') | yes | 0.09 | 7.44 | +| ttnn.reshape | tensor<[256,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 8, 'tile<32x32, bf16>', 'dram') | shape: [256 : i32, 1 : i32, 1 : i32] | tensor<[256,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (8, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.09 | 7.44 | +| ttnn.reshape | tensor<[1,23,40,2048,bf16]> | 
mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 920 + d1 * 40 + d2, d3), memory_config: (29, 64, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 920 : i32, 2048 : i32] | tensor<[1,1,920,2048,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 920 + d1 * 40 + d2, d3), memory_config: (29, 64, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,920,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 920 + d1 * 40 + d2, d3), memory_config: (29, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 23 : i32, 40 : i32, 256 : i32] | tensor<[1,23,40,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 920 + d1 * 40 + d2, d3), memory_config: (29, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[256,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 8, 'tile<32x32, bf16>', 'dram') | shape: [256 : i32, 1 : i32, 1 : i32] | tensor<[256,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (8, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,23,40,2048,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 920 + d1 * 40 + d2, d3), memory_config: (29, 64, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 920 : i32, 2048 : i32] | tensor<[1,1,920,2048,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 920 + d1 * 40 + d2, d3), memory_config: (29, 64, 'tile<32x32, bf16>', 'dram') | yes | -0.24 | 3.7 | +| ttnn.reshape | tensor<[1,1,920,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 920 + d1 * 40 + d2, d3), memory_config: (29, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 23 : i32, 40 : i32, 512 : i32] | tensor<[1,23,40,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 920 + d1 * 40 + d2, d3), memory_config: (29, 16, 'tile<32x32, bf16>', 'dram') | yes | -0.24 | 3.7 | +| ttnn.reshape | tensor<[1,23,40,2048,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 
920 + d1 * 40 + d2, d3), memory_config: (29, 64, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 920 : i32, 2048 : i32] | tensor<[1,1,920,2048,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 920 + d1 * 40 + d2, d3), memory_config: (29, 64, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,920,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 920 + d1 * 40 + d2, d3), memory_config: (29, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 23 : i32, 40 : i32, 512 : i32] | tensor<[1,23,40,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 920 + d1 * 40 + d2, d3), memory_config: (29, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,7,7,2048,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 64, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 49 : i32, 2048 : i32] | tensor<[1,1,49,2048,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 64, 'tile<32x32, bf16>', 'dram') | yes | -0.01 | 7.244292577027791e+37 | +| ttnn.reshape | tensor<[1,1,49,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 7 : i32, 512 : i32] | tensor<[1,7,7,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 16, 'tile<32x32, bf16>', 'dram') | yes | -0.01 | 7.244292577027791e+37 | +| ttnn.reshape | tensor<[1,7,7,2048,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 64, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 49 : i32, 2048 : i32] | tensor<[1,1,49,2048,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 64, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,49,512,bf16]> | mapping_from: 
(d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 7 : i32, 512 : i32] | tensor<[1,7,7,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,28,28,218,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 7, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 218 : i32] | tensor<[1,1,784,218,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 7, 'tile<32x32, bf16>', 'dram') | yes | 0.21 | 5.34 | +| ttnn.reshape | tensor<[1,1,784,78,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 3, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 78 : i32] | tensor<[1,28,28,78,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 3, 'tile<32x32, bf16>', 'dram') | yes | 0.21 | 5.34 | +| ttnn.reshape | tensor<[1,28,28,218,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 7, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 218 : i32] | tensor<[1,1,784,218,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 7, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,784,78,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 3, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 78 : i32] | tensor<[1,28,28,78,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 3, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,14,14,236,bf16]> | mapping_from: 
(d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 236 : i32] | tensor<[1,1,196,236,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 8, 'tile<32x32, bf16>', 'dram') | yes | -0.01 | 7.011677677765431e+37 | +| ttnn.reshape | tensor<[1,1,196,68,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 3, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 68 : i32] | tensor<[1,14,14,68,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 3, 'tile<32x32, bf16>', 'dram') | yes | -0.01 | 7.011677677765431e+37 | +| ttnn.reshape | tensor<[1,14,14,236,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 236 : i32] | tensor<[1,1,196,236,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,196,68,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 3, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 68 : i32] | tensor<[1,14,14,68,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 3, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,20,20,240,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 8, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 400 : i32, 240 : i32] | tensor<[1,1,400,240,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 8, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.02 | +| ttnn.reshape | 
tensor<[1,20,20,240,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 8, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 400 : i32, 240 : i32] | tensor<[1,1,400,240,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 8, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,400,80,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 3, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 20 : i32, 20 : i32, 80 : i32] | tensor<[1,20,20,80,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 3, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,40,40,240,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1600 + d1 * 40 + d2, d3), memory_config: (50, 8, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 1600 : i32, 240 : i32] | tensor<[1,1,1600,240,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1600 + d1 * 40 + d2, d3), memory_config: (50, 8, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.02 | +| ttnn.reshape | tensor<[1,1,400,240,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 8, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 20 : i32, 20 : i32, 240 : i32] | tensor<[1,20,20,240,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 8, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.02 | +| ttnn.reshape | tensor<[1,40,40,240,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1600 + d1 * 40 + d2, d3), memory_config: (50, 8, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 1600 : i32, 240 : i32] | tensor<[1,1,1600,240,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1600 + d1 * 40 + d2, d3), memory_config: (50, 8, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | 
tensor<[1,1,400,240,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 8, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 20 : i32, 20 : i32, 240 : i32] | tensor<[1,20,20,240,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 8, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,1,72,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 3, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 72 : i32] | tensor<[1,1,1,72,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 3, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[72,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 3, 'tile<32x32, f32>', 'dram') | shape: [72 : i32, 1 : i32, 1 : i32] | tensor<[72,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1,1,24,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 24 : i32] | tensor<[1,1,1,24,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,1,72,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 3, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 72 : i32] | tensor<[1,1,1,72,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 3, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[72,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 3, 'tile<32x32, f32>', 'dram') | shape: [72 : i32, 1 : i32, 1 : i32] | tensor<[72,1,1,f32]> | mapping_from: 
(d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,56,56,24,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3136 : i32, 24 : i32] | tensor<[1,1,3136,24,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.02 | 2.272979872792206e+38 | +| ttnn.reshape | tensor<[1,1,3136,144,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 56 : i32, 56 : i32, 144 : i32] | tensor<[1,56,56,144,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 5, 'tile<32x32, bf16>', 'dram') | yes | 0.02 | 2.272979872792206e+38 | +| ttnn.reshape | tensor<[1,56,56,24,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3136 : i32, 24 : i32] | tensor<[1,1,3136,24,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,3136,144,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 56 : i32, 56 : i32, 144 : i32] | tensor<[1,56,56,144,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 5, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,56,56,24,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3136 : i32, 24 : i32] | 
tensor<[1,1,3136,24,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.42 | 1.91 | +| ttnn.reshape | tensor<[1,1,3136,14,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 56 : i32, 56 : i32, 14 : i32] | tensor<[1,56,56,14,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.42 | 1.91 | +| ttnn.reshape | tensor<[1,56,56,24,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3136 : i32, 24 : i32] | tensor<[1,1,3136,24,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,3136,14,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 56 : i32, 56 : i32, 14 : i32] | tensor<[1,56,56,14,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,80,80,24,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6400 + d1 * 80 + d2, d3), memory_config: (200, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 6400 : i32, 24 : i32] | tensor<[1,1,6400,24,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6400 + d1 * 80 + d2, d3), memory_config: (200, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.02 | +| ttnn.reshape | tensor<[1,1,6400,72,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6400 + d1 * 80 + d2, d3), memory_config: (200, 3, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 80 : i32, 80 : 
i32, 72 : i32] | tensor<[1,80,80,72,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6400 + d1 * 80 + d2, d3), memory_config: (200, 3, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.02 | +| ttnn.reshape | tensor<[1,80,80,24,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6400 + d1 * 80 + d2, d3), memory_config: (200, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 6400 : i32, 24 : i32] | tensor<[1,1,6400,24,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6400 + d1 * 80 + d2, d3), memory_config: (200, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,6400,72,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6400 + d1 * 80 + d2, d3), memory_config: (200, 3, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 80 : i32, 80 : i32, 72 : i32] | tensor<[1,80,80,72,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6400 + d1 * 80 + d2, d3), memory_config: (200, 3, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16,16,2560,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 80, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 256 : i32, 2560 : i32] | tensor<[1,1,256,2560,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 80, 'tile<32x32, bf16>', 'dram') | yes | 0.5 | 13.38 | +| ttnn.reshape | tensor<[1,1,256,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 16 : i32, 1280 : i32] | tensor<[1,16,16,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | yes | 0.5 | 13.38 | +| ttnn.reshape | tensor<[1280,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 40, 'tile<32x32, bf16>', 'dram') | shape: [1280 : i32, 1 : i32, 1 : i32] | 
tensor<[1280,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (40, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.5 | 13.38 | +| ttnn.reshape | tensor<[1,16,16,2560,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 80, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 256 : i32, 2560 : i32] | tensor<[1,1,256,2560,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 80, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,256,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 16 : i32, 1280 : i32] | tensor<[1,16,16,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1280,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 40, 'tile<32x32, bf16>', 'dram') | shape: [1280 : i32, 1 : i32, 1 : i32] | tensor<[1280,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (40, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16,16,2560,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 80, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 256 : i32, 2560 : i32] | tensor<[1,1,256,2560,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 80, 'tile<32x32, bf16>', 'dram') | yes | 0.59 | 29.62 | +| ttnn.reshape | tensor<[1,1,256,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 16 : i32, 1280 : i32] | tensor<[1,16,16,1280,bf16]> | mapping_from: (d0, d1, d2, d3), 
mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | yes | 0.59 | 29.62 | +| ttnn.reshape | tensor<[1280,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 40, 'tile<32x32, bf16>', 'dram') | shape: [1280 : i32, 1 : i32, 1 : i32] | tensor<[1280,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (40, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.59 | 29.62 | +| ttnn.reshape | tensor<[1,16,16,2560,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 80, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 256 : i32, 2560 : i32] | tensor<[1,1,256,2560,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 80, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,256,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 16 : i32, 1280 : i32] | tensor<[1,16,16,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1280,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 40, 'tile<32x32, bf16>', 'dram') | shape: [1280 : i32, 1 : i32, 1 : i32] | tensor<[1280,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (40, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,8,8,2560,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 64 + d1 * 8 + d2, d3), memory_config: (2, 80, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 64 : i32, 2560 : i32] | tensor<[1,1,64,2560,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 64 + d1 * 8 + d2, d3), memory_config: (2, 80, 'tile<32x32, bf16>', 'dram') | yes | 0.02 | 14.38 | +| 
ttnn.reshape | tensor<[1,1,64,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 64 + d1 * 8 + d2, d3), memory_config: (2, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 8 : i32, 8 : i32, 1280 : i32] | tensor<[1,8,8,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 64 + d1 * 8 + d2, d3), memory_config: (2, 40, 'tile<32x32, bf16>', 'dram') | yes | 0.02 | 14.38 | +| ttnn.reshape | tensor<[1280,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 40, 'tile<32x32, bf16>', 'dram') | shape: [1280 : i32, 1 : i32, 1 : i32] | tensor<[1280,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (40, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.02 | 14.38 | +| ttnn.reshape | tensor<[1,8,8,2560,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 64 + d1 * 8 + d2, d3), memory_config: (2, 80, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 64 : i32, 2560 : i32] | tensor<[1,1,64,2560,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 64 + d1 * 8 + d2, d3), memory_config: (2, 80, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,64,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 64 + d1 * 8 + d2, d3), memory_config: (2, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 8 : i32, 8 : i32, 1280 : i32] | tensor<[1,8,8,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 64 + d1 * 8 + d2, d3), memory_config: (2, 40, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1280,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 40, 'tile<32x32, bf16>', 'dram') | shape: [1280 : i32, 1 : i32, 1 : i32] | tensor<[1280,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (40, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,8,8,2560,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 64 + d1 * 8 + d2, d3), memory_config: (2, 80, 
'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 64 : i32, 2560 : i32] | tensor<[1,1,64,2560,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 64 + d1 * 8 + d2, d3), memory_config: (2, 80, 'tile<32x32, bf16>', 'dram') | yes | -0.12 | 16.62 | +| ttnn.reshape | tensor<[1,1,64,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 64 + d1 * 8 + d2, d3), memory_config: (2, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 8 : i32, 8 : i32, 1280 : i32] | tensor<[1,8,8,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 64 + d1 * 8 + d2, d3), memory_config: (2, 40, 'tile<32x32, bf16>', 'dram') | yes | -0.12 | 16.62 | +| ttnn.reshape | tensor<[1280,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 40, 'tile<32x32, bf16>', 'dram') | shape: [1280 : i32, 1 : i32, 1 : i32] | tensor<[1280,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (40, 1, 'tile<32x32, bf16>', 'dram') | yes | -0.12 | 16.62 | +| ttnn.reshape | tensor<[1,8,8,2560,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 64 + d1 * 8 + d2, d3), memory_config: (2, 80, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 64 : i32, 2560 : i32] | tensor<[1,1,64,2560,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 64 + d1 * 8 + d2, d3), memory_config: (2, 80, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,64,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 64 + d1 * 8 + d2, d3), memory_config: (2, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 8 : i32, 8 : i32, 1280 : i32] | tensor<[1,8,8,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 64 + d1 * 8 + d2, d3), memory_config: (2, 40, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1280,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 40, 'tile<32x32, bf16>', 'dram') | shape: [1280 : i32, 1 : i32, 1 : i32] | tensor<[1280,1,1,bf16]> | mapping_from: 
(d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (40, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,10,10,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 8, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 100 : i32, 256 : i32] | tensor<[1,1,100,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 8, 'tile<32x32, f32>', 'dram') | yes | -0.01 | 60160.01 | +| ttnn.reshape | tensor<[1,1,25,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25 + d1 * 5 + d2, d3), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 5 : i32, 5 : i32, 256 : i32] | tensor<[1,5,5,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25 + d1 * 5 + d2, d3), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | yes | -0.01 | 60160.01 | +| ttnn.reshape | tensor<[1,10,10,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 8, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 100 : i32, 256 : i32] | tensor<[1,1,100,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 8, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,25,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25 + d1 * 5 + d2, d3), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 5 : i32, 5 : i32, 256 : i32] | tensor<[1,5,5,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25 + d1 * 5 + d2, d3), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,112,112,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 12544 : i32, 256 : i32] | tensor<[1,1,12544,256,bf16]> | mapping_from: (d0, d1, d2, d3), 
mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 8, 'tile<32x32, bf16>', 'dram') | yes | 0.88 | 0.06 | +| ttnn.reshape | tensor<[1,1,12544,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 112 : i32, 112 : i32, 128 : i32] | tensor<[1,112,112,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 4, 'tile<32x32, bf16>', 'dram') | yes | 0.88 | 0.06 | +| ttnn.reshape | tensor<[1,112,112,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 12544 : i32, 256 : i32] | tensor<[1,1,12544,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,12544,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 112 : i32, 112 : i32, 128 : i32] | tensor<[1,112,112,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,120,160,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 19200 + d1 * 160 + d2, d3), memory_config: (600, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 19200 : i32, 256 : i32] | tensor<[1,1,19200,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 19200 + d1 * 160 + d2, d3), memory_config: (600, 8, 'tile<32x32, bf16>', 'dram') | yes | 0.32 | 20.25 | +| ttnn.reshape | tensor<[1,1,19200,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 19200 + d1 * 160 + d2, d3), memory_config: (600, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 120 : i32, 
160 : i32, 256 : i32] | tensor<[1,120,160,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 19200 + d1 * 160 + d2, d3), memory_config: (600, 8, 'tile<32x32, bf16>', 'dram') | yes | 0.32 | 20.25 | +| ttnn.reshape | tensor<[256,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 8, 'tile<32x32, bf16>', 'dram') | shape: [256 : i32, 1 : i32, 1 : i32] | tensor<[256,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (8, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.32 | 20.25 | +| ttnn.reshape | tensor<[1,120,160,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 19200 + d1 * 160 + d2, d3), memory_config: (600, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 19200 : i32, 256 : i32] | tensor<[1,1,19200,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 19200 + d1 * 160 + d2, d3), memory_config: (600, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,19200,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 19200 + d1 * 160 + d2, d3), memory_config: (600, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 120 : i32, 160 : i32, 256 : i32] | tensor<[1,120,160,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 19200 + d1 * 160 + d2, d3), memory_config: (600, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[256,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 8, 'tile<32x32, bf16>', 'dram') | shape: [256 : i32, 1 : i32, 1 : i32] | tensor<[256,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (8, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,128,128,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 16384 : i32, 256 : i32] | tensor<[1,1,16384,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: 
(d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 8, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.75 | +| ttnn.reshape | tensor<[1,1,16384,150,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 128 : i32, 128 : i32, 150 : i32] | tensor<[1,128,128,150,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 5, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.75 | +| ttnn.reshape | tensor<[150,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 5, 'tile<32x32, bf16>', 'dram') | shape: [150 : i32, 1 : i32, 1 : i32] | tensor<[150,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (5, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.75 | +| ttnn.reshape | tensor<[1,128,128,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 16384 : i32, 256 : i32] | tensor<[1,1,16384,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,16384,150,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 128 : i32, 128 : i32, 150 : i32] | tensor<[1,128,128,150,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 5, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[150,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 5, 'tile<32x32, bf16>', 'dram') | shape: [150 : i32, 1 : i32, 1 : i32] | tensor<[150,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (5, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | 
nan | +| ttnn.reshape | tensor<[1,14,14,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 256 : i32] | tensor<[1,1,196,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 8, 'tile<32x32, bf16>', 'dram') | yes | 0.42 | 11.12 | +| ttnn.reshape | tensor<[1,1,196,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 1024 : i32] | tensor<[1,14,14,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 32, 'tile<32x32, bf16>', 'dram') | yes | 0.42 | 11.12 | +| ttnn.reshape | tensor<[1,14,14,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 256 : i32] | tensor<[1,1,196,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,196,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 1024 : i32] | tensor<[1,14,14,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 32, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,14,14,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 256 : i32] | tensor<[1,1,196,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 8, 'tile<32x32, bf16>', 
'dram') | yes | 0.32 | 1.27 | +| ttnn.reshape | tensor<[1,1,196,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 256 : i32] | tensor<[1,14,14,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 8, 'tile<32x32, bf16>', 'dram') | yes | 0.32 | 1.27 | +| ttnn.reshape | tensor<[1,14,14,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 256 : i32] | tensor<[1,1,196,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,196,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 256 : i32] | tensor<[1,14,14,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,14,14,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 256 : i32] | tensor<[1,1,196,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 8, 'tile<32x32, bf16>', 'dram') | yes | 0.1 | 1.06 | +| ttnn.reshape | tensor<[1,1,49,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 7 : i32, 512 : i32] | tensor<[1,7,7,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 16, 'tile<32x32, bf16>', 
'dram') | yes | 0.1 | 1.06 | +| ttnn.reshape | tensor<[1,14,14,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 256 : i32] | tensor<[1,1,196,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,49,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 7 : i32, 512 : i32] | tensor<[1,7,7,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,14,14,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 256 : i32] | tensor<[1,1,196,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 8, 'tile<32x32, bf16>', 'dram') | yes | 0.01 | 1.2095974761642734e+38 | +| ttnn.reshape | tensor<[1,1,49,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 7 : i32, 512 : i32] | tensor<[1,7,7,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 16, 'tile<32x32, bf16>', 'dram') | yes | 0.01 | 1.2095974761642734e+38 | +| ttnn.reshape | tensor<[1,14,14,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 256 : i32] | tensor<[1,1,196,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: 
(7, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,49,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 7 : i32, 512 : i32] | tensor<[1,7,7,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16,16,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 256 : i32, 256 : i32] | tensor<[1,1,256,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 8, 'tile<32x32, bf16>', 'dram') | yes | 0.82 | 2.42 | +| ttnn.reshape | tensor<[1,1,256,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 16 : i32, 512 : i32] | tensor<[1,16,16,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 16, 'tile<32x32, bf16>', 'dram') | yes | 0.82 | 2.42 | +| ttnn.reshape | tensor<[1,16,16,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 256 : i32, 256 : i32] | tensor<[1,1,256,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,256,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 16 : i32, 512 : i32] | tensor<[1,16,16,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: 
(8, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,180,320,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 57600 + d1 * 320 + d2, d3), memory_config: (1800, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 57600 : i32, 256 : i32] | tensor<[1,1,57600,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 57600 + d1 * 320 + d2, d3), memory_config: (1800, 8, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.01 | +| ttnn.reshape | tensor<[1,1,57600,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 57600 + d1 * 320 + d2, d3), memory_config: (1800, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 180 : i32, 320 : i32, 128 : i32] | tensor<[1,180,320,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 57600 + d1 * 320 + d2, d3), memory_config: (1800, 4, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.01 | +| ttnn.reshape | tensor<[1,180,320,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 57600 + d1 * 320 + d2, d3), memory_config: (1800, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 57600 : i32, 256 : i32] | tensor<[1,1,57600,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 57600 + d1 * 320 + d2, d3), memory_config: (1800, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,57600,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 57600 + d1 * 320 + d2, d3), memory_config: (1800, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 180 : i32, 320 : i32, 128 : i32] | tensor<[1,180,320,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 57600 + d1 * 320 + d2, d3), memory_config: (1800, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,180,320,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 57600 + d1 * 320 + d2, d3), memory_config: (1800, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 57600 : i32, 256 : i32] | tensor<[1,1,57600,256,bf16]> | 
mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 57600 + d1 * 320 + d2, d3), memory_config: (1800, 8, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,14400,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 14400 + d1 * 160 + d2, d3), memory_config: (450, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 90 : i32, 160 : i32, 512 : i32] | tensor<[1,90,160,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 14400 + d1 * 160 + d2, d3), memory_config: (450, 16, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,180,320,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 57600 + d1 * 320 + d2, d3), memory_config: (1800, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 57600 : i32, 256 : i32] | tensor<[1,1,57600,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 57600 + d1 * 320 + d2, d3), memory_config: (1800, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,14400,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 14400 + d1 * 160 + d2, d3), memory_config: (450, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 90 : i32, 160 : i32, 512 : i32] | tensor<[1,90,160,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 14400 + d1 * 160 + d2, d3), memory_config: (450, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,57600,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 57600 + d1 * 320 + d2, d3), memory_config: (1800, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 180 : i32, 320 : i32, 64 : i32] | tensor<[1,180,320,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 57600 + d1 * 320 + d2, d3), memory_config: (1800, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.01 | +| ttnn.reshape | tensor<[1,180,320,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 57600 + d1 * 320 + d2, d3), memory_config: (1800, 8, 'tile<32x32, bf16>', 'dram') | 
shape: [1 : i32, 1 : i32, 57600 : i32, 256 : i32] | tensor<[1,1,57600,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 57600 + d1 * 320 + d2, d3), memory_config: (1800, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,57600,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 57600 + d1 * 320 + d2, d3), memory_config: (1800, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 180 : i32, 320 : i32, 64 : i32] | tensor<[1,180,320,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 57600 + d1 * 320 + d2, d3), memory_config: (1800, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,28,28,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 256 : i32] | tensor<[1,1,784,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 8, 'tile<32x32, bf16>', 'dram') | yes | 0.25 | 1.84 | +| ttnn.reshape | tensor<[1,1,784,20,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 20 : i32] | tensor<[1,28,28,20,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.25 | 1.84 | +| ttnn.reshape | tensor<[1,28,28,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 256 : i32] | tensor<[1,1,784,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,784,20,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 
1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 20 : i32] | tensor<[1,28,28,20,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,28,28,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 256 : i32] | tensor<[1,1,784,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 8, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,784,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 256 : i32] | tensor<[1,28,28,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 8, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,28,28,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 256 : i32] | tensor<[1,1,784,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,784,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 256 : i32] | tensor<[1,28,28,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,28,28,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), 
memory_config: (25, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 256 : i32] | tensor<[1,1,784,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 8, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,784,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 256 : i32] | tensor<[1,28,28,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 8, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,28,28,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 256 : i32] | tensor<[1,1,784,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,784,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 256 : i32] | tensor<[1,28,28,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,28,28,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 256 : i32] | tensor<[1,1,784,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 8, 'tile<32x32, bf16>', 'dram') | yes | 0.16 | 20.25 | +| ttnn.reshape | tensor<[1,1,196,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 
+ d2, d3), memory_config: (7, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 256 : i32] | tensor<[1,14,14,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 8, 'tile<32x32, bf16>', 'dram') | yes | 0.16 | 20.25 | +| ttnn.reshape | tensor<[1,28,28,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 256 : i32] | tensor<[1,1,784,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,196,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 256 : i32] | tensor<[1,14,14,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,28,28,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 256 : i32] | tensor<[1,1,784,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 8, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,784,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 512 : i32] | tensor<[1,28,28,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,28,28,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 
+ d1 * 28 + d2, d3), memory_config: (25, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 256 : i32] | tensor<[1,1,784,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,784,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 512 : i32] | tensor<[1,28,28,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,28,28,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 256 : i32] | tensor<[1,1,784,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 8, 'tile<32x32, bf16>', 'dram') | yes | 0.11 | 0.53 | +| ttnn.reshape | tensor<[1,1,784,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 512 : i32] | tensor<[1,28,28,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | yes | 0.11 | 0.53 | +| ttnn.reshape | tensor<[1,28,28,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 256 : i32] | tensor<[1,1,784,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,784,512,bf16]> | mapping_from: (d0, d1, d2, d3), 
mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 512 : i32] | tensor<[1,28,28,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,2,2,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4 + d1 * 2 + d2, d3), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 4 : i32, 256 : i32] | tensor<[1,1,4,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4 + d1 * 2 + d2, d3), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1,4,24,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4 + d1 * 2 + d2, d3), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 2 : i32, 2 : i32, 24 : i32] | tensor<[1,2,2,24,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4 + d1 * 2 + d2, d3), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,2,2,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4 + d1 * 2 + d2, d3), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 4 : i32, 256 : i32] | tensor<[1,1,4,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4 + d1 * 2 + d2, d3), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,4,24,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4 + d1 * 2 + d2, d3), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 2 : i32, 2 : i32, 24 : i32] | tensor<[1,2,2,24,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4 + d1 * 2 + d2, d3), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[24,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [24 : 
i32, 1 : i32, 1 : i32] | tensor<[24,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,2,2,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4 + d1 * 2 + d2, d3), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 4 : i32, 256 : i32] | tensor<[1,1,4,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4 + d1 * 2 + d2, d3), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,4,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4 + d1 * 2 + d2, d3), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 2 : i32, 2 : i32, 256 : i32] | tensor<[1,2,2,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4 + d1 * 2 + d2, d3), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,2,2,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4 + d1 * 2 + d2, d3), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 4 : i32, 256 : i32] | tensor<[1,1,4,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4 + d1 * 2 + d2, d3), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,4,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4 + d1 * 2 + d2, d3), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 2 : i32, 2 : i32, 256 : i32] | tensor<[1,2,2,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4 + d1 * 2 + d2, d3), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,2,2,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4 + d1 * 2 + d2, d3), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 4 : i32, 256 : i32] | tensor<[1,1,4,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 
4 + d1 * 2 + d2, d3), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.19 | +| ttnn.reshape | tensor<[1,1,4,546,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4 + d1 * 2 + d2, d3), memory_config: (1, 18, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 2 : i32, 2 : i32, 546 : i32] | tensor<[1,2,2,546,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4 + d1 * 2 + d2, d3), memory_config: (1, 18, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.19 | +| ttnn.reshape | tensor<[546,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 18, 'tile<32x32, f32>', 'dram') | shape: [546 : i32, 1 : i32, 1 : i32] | tensor<[546,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (18, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.19 | +| ttnn.reshape | tensor<[1,2,2,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4 + d1 * 2 + d2, d3), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 4 : i32, 256 : i32] | tensor<[1,1,4,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4 + d1 * 2 + d2, d3), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,4,546,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4 + d1 * 2 + d2, d3), memory_config: (1, 18, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 2 : i32, 2 : i32, 546 : i32] | tensor<[1,2,2,546,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4 + d1 * 2 + d2, d3), memory_config: (1, 18, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[546,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 18, 'tile<32x32, f32>', 'dram') | shape: [546 : i32, 1 : i32, 1 : i32] | tensor<[546,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (18, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,2,2,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4 + d1 * 2 + 
d2, d3), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 4 : i32, 256 : i32] | tensor<[1,1,4,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4 + d1 * 2 + d2, d3), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.01 | +| ttnn.reshape | tensor<[1,1,4,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4 + d1 * 2 + d2, d3), memory_config: (1, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 2 : i32, 2 : i32, 64 : i32] | tensor<[1,2,2,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4 + d1 * 2 + d2, d3), memory_config: (1, 2, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.01 | +| ttnn.reshape | tensor<[1,2,2,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4 + d1 * 2 + d2, d3), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 4 : i32, 256 : i32] | tensor<[1,1,4,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4 + d1 * 2 + d2, d3), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,4,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4 + d1 * 2 + d2, d3), memory_config: (1, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 2 : i32, 2 : i32, 64 : i32] | tensor<[1,2,2,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4 + d1 * 2 + d2, d3), memory_config: (1, 2, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,32,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1024 : i32, 256 : i32] | tensor<[1,1,1024,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 8, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.03 | +| ttnn.reshape | tensor<[1,1,1024,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 4, 'tile<32x32, bf16>', 
'dram') | shape: [1 : i32, 32 : i32, 32 : i32, 128 : i32] | tensor<[1,32,32,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 4, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.03 | +| ttnn.reshape | tensor<[1,32,32,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1024 : i32, 256 : i32] | tensor<[1,1,1024,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,1024,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 32 : i32, 128 : i32] | tensor<[1,32,32,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,32,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1024 : i32, 256 : i32] | tensor<[1,1,1024,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 8, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.07 | +| ttnn.reshape | tensor<[1,1,1024,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 32 : i32, 256 : i32] | tensor<[1,32,32,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 8, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.07 | +| ttnn.reshape | tensor<[1,32,32,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: 
(32, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1024 : i32, 256 : i32] | tensor<[1,1,1024,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,1024,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 32 : i32, 256 : i32] | tensor<[1,32,32,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,32,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1024 : i32, 256 : i32] | tensor<[1,1,1024,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 8, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.41 | +| ttnn.reshape | tensor<[1,1,1024,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 32 : i32, 512 : i32] | tensor<[1,32,32,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 16, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.41 | +| ttnn.reshape | tensor<[1,32,32,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1024 : i32, 256 : i32] | tensor<[1,1,1024,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,1024,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + 
d1 * 32 + d2, d3), memory_config: (32, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 32 : i32, 512 : i32] | tensor<[1,32,32,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,3,3,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9 + d1 * 3 + d2, d3), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 9 : i32, 256 : i32] | tensor<[1,1,9,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9 + d1 * 3 + d2, d3), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.01 | +| ttnn.reshape | tensor<[1,1,9,128,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9 + d1 * 3 + d2, d3), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 3 : i32, 3 : i32, 128 : i32] | tensor<[1,3,3,128,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9 + d1 * 3 + d2, d3), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.01 | +| ttnn.reshape | tensor<[1,3,3,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9 + d1 * 3 + d2, d3), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 9 : i32, 256 : i32] | tensor<[1,1,9,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9 + d1 * 3 + d2, d3), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,9,128,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9 + d1 * 3 + d2, d3), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 3 : i32, 3 : i32, 128 : i32] | tensor<[1,3,3,128,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9 + d1 * 3 + d2, d3), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,9,24,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9 + d1 * 3 + d2, d3), memory_config: (1, 1, 'tile<32x32, f32>', 
'dram') | shape: [1 : i32, 3 : i32, 3 : i32, 24 : i32] | tensor<[1,3,3,24,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9 + d1 * 3 + d2, d3), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.01 | +| ttnn.reshape | tensor<[24,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [24 : i32, 1 : i32, 1 : i32] | tensor<[24,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.01 | +| ttnn.reshape | tensor<[1,3,3,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9 + d1 * 3 + d2, d3), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 9 : i32, 256 : i32] | tensor<[1,1,9,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9 + d1 * 3 + d2, d3), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,9,24,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9 + d1 * 3 + d2, d3), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 3 : i32, 3 : i32, 24 : i32] | tensor<[1,3,3,24,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9 + d1 * 3 + d2, d3), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[24,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [24 : i32, 1 : i32, 1 : i32] | tensor<[24,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,3,3,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9 + d1 * 3 + d2, d3), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 9 : i32, 256 : i32] | tensor<[1,1,9,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9 + d1 * 3 + d2, d3), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | no | nan | 
nan | +| ttnn.reshape | tensor<[1,1,9,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9 + d1 * 3 + d2, d3), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 3 : i32, 3 : i32, 256 : i32] | tensor<[1,3,3,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9 + d1 * 3 + d2, d3), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,3,3,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9 + d1 * 3 + d2, d3), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 9 : i32, 256 : i32] | tensor<[1,1,9,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9 + d1 * 3 + d2, d3), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,9,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9 + d1 * 3 + d2, d3), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 3 : i32, 3 : i32, 256 : i32] | tensor<[1,3,3,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9 + d1 * 3 + d2, d3), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,3,3,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9 + d1 * 3 + d2, d3), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 9 : i32, 256 : i32] | tensor<[1,1,9,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9 + d1 * 3 + d2, d3), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.24 | +| ttnn.reshape | tensor<[1,1,9,546,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9 + d1 * 3 + d2, d3), memory_config: (1, 18, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 3 : i32, 3 : i32, 546 : i32] | tensor<[1,3,3,546,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9 + d1 * 3 + d2, d3), memory_config: (1, 18, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.24 | +| ttnn.reshape | tensor<[546,f32]> | mapping_from: (d0), mapping_to: 
(0, d0), memory_config: (1, 18, 'tile<32x32, f32>', 'dram') | shape: [546 : i32, 1 : i32, 1 : i32] | tensor<[546,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (18, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.24 | +| ttnn.reshape | tensor<[1,3,3,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9 + d1 * 3 + d2, d3), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 9 : i32, 256 : i32] | tensor<[1,1,9,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9 + d1 * 3 + d2, d3), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,9,546,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9 + d1 * 3 + d2, d3), memory_config: (1, 18, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 3 : i32, 3 : i32, 546 : i32] | tensor<[1,3,3,546,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9 + d1 * 3 + d2, d3), memory_config: (1, 18, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[546,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 18, 'tile<32x32, f32>', 'dram') | shape: [546 : i32, 1 : i32, 1 : i32] | tensor<[546,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (18, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,45,80,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3600 + d1 * 80 + d2, d3), memory_config: (113, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3600 : i32, 256 : i32] | tensor<[1,1,3600,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3600 + d1 * 80 + d2, d3), memory_config: (113, 8, 'tile<32x32, bf16>', 'dram') | yes | 0.95 | 0.44 | +| ttnn.reshape | tensor<[1,1,3600,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3600 + d1 * 80 + d2, d3), memory_config: (113, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 45 : i32, 80 : i32, 1024 : i32] | 
tensor<[1,45,80,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3600 + d1 * 80 + d2, d3), memory_config: (113, 32, 'tile<32x32, bf16>', 'dram') | yes | 0.95 | 0.44 | +| ttnn.reshape | tensor<[1,45,80,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3600 + d1 * 80 + d2, d3), memory_config: (113, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3600 : i32, 256 : i32] | tensor<[1,1,3600,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3600 + d1 * 80 + d2, d3), memory_config: (113, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,3600,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3600 + d1 * 80 + d2, d3), memory_config: (113, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 45 : i32, 80 : i32, 1024 : i32] | tensor<[1,45,80,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3600 + d1 * 80 + d2, d3), memory_config: (113, 32, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,45,80,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3600 + d1 * 80 + d2, d3), memory_config: (113, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3600 : i32, 256 : i32] | tensor<[1,1,3600,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3600 + d1 * 80 + d2, d3), memory_config: (113, 8, 'tile<32x32, bf16>', 'dram') | yes | 0.95 | 0.84 | +| ttnn.reshape | tensor<[1,1,3600,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3600 + d1 * 80 + d2, d3), memory_config: (113, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 45 : i32, 80 : i32, 256 : i32] | tensor<[1,45,80,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3600 + d1 * 80 + d2, d3), memory_config: (113, 8, 'tile<32x32, bf16>', 'dram') | yes | 0.95 | 0.84 | +| ttnn.reshape | tensor<[1,45,80,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3600 + d1 * 80 + d2, d3), memory_config: (113, 8, 'tile<32x32, bf16>', 'dram') | 
shape: [1 : i32, 1 : i32, 3600 : i32, 256 : i32] | tensor<[1,1,3600,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3600 + d1 * 80 + d2, d3), memory_config: (113, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,3600,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3600 + d1 * 80 + d2, d3), memory_config: (113, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 45 : i32, 80 : i32, 256 : i32] | tensor<[1,45,80,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3600 + d1 * 80 + d2, d3), memory_config: (113, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,56,56,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3136 : i32, 256 : i32] | tensor<[1,1,3136,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 8, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,3136,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 56 : i32, 56 : i32, 128 : i32] | tensor<[1,56,56,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 4, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,56,56,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3136 : i32, 256 : i32] | tensor<[1,1,3136,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,3136,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 4, 
'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 56 : i32, 56 : i32, 128 : i32] | tensor<[1,56,56,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,56,56,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3136 : i32, 256 : i32] | tensor<[1,1,3136,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 8, 'tile<32x32, bf16>', 'dram') | yes | 0.33 | 0.05 | +| ttnn.reshape | tensor<[1,1,3136,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 56 : i32, 56 : i32, 256 : i32] | tensor<[1,56,56,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 8, 'tile<32x32, bf16>', 'dram') | yes | 0.33 | 0.05 | +| ttnn.reshape | tensor<[1,56,56,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3136 : i32, 256 : i32] | tensor<[1,1,3136,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,3136,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 56 : i32, 56 : i32, 256 : i32] | tensor<[1,56,56,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,56,56,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + 
d2, d3), memory_config: (98, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3136 : i32, 256 : i32] | tensor<[1,1,3136,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 8, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,784,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 512 : i32] | tensor<[1,28,28,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,56,56,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3136 : i32, 256 : i32] | tensor<[1,1,3136,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,784,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 512 : i32] | tensor<[1,28,28,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,56,56,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3136 : i32, 256 : i32] | tensor<[1,1,3136,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 8, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,3136,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: 
(d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 56 : i32, 56 : i32, 64 : i32] | tensor<[1,56,56,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 2, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,56,56,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3136 : i32, 256 : i32] | tensor<[1,1,3136,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,3136,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 56 : i32, 56 : i32, 64 : i32] | tensor<[1,56,56,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,5,5,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25 + d1 * 5 + d2, d3), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 25 : i32, 256 : i32] | tensor<[1,1,25,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25 + d1 * 5 + d2, d3), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.01 | +| ttnn.reshape | tensor<[1,1,25,512,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25 + d1 * 5 + d2, d3), memory_config: (1, 16, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 5 : i32, 5 : i32, 512 : i32] | tensor<[1,5,5,512,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25 + d1 * 5 + d2, d3), memory_config: (1, 16, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.01 | +| ttnn.reshape | tensor<[1,5,5,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25 + d1 
* 5 + d2, d3), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 25 : i32, 256 : i32] | tensor<[1,1,25,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25 + d1 * 5 + d2, d3), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,25,512,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25 + d1 * 5 + d2, d3), memory_config: (1, 16, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 5 : i32, 5 : i32, 512 : i32] | tensor<[1,5,5,512,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25 + d1 * 5 + d2, d3), memory_config: (1, 16, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,64,64,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 4096 : i32, 256 : i32] | tensor<[1,1,4096,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 8, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.09 | +| ttnn.reshape | tensor<[1,1,4096,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 64 : i32, 128 : i32] | tensor<[1,64,64,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 4, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.09 | +| ttnn.reshape | tensor<[1,64,64,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 4096 : i32, 256 : i32] | tensor<[1,1,4096,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,4096,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 
4096 + d1 * 64 + d2, d3), memory_config: (128, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 64 : i32, 128 : i32] | tensor<[1,64,64,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,64,64,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 4096 : i32, 256 : i32] | tensor<[1,1,4096,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 8, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.08 | +| ttnn.reshape | tensor<[1,1,4096,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 64 : i32, 128 : i32] | tensor<[1,64,64,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 4, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.08 | +| ttnn.reshape | tensor<[1,64,64,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 4096 : i32, 256 : i32] | tensor<[1,1,4096,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,4096,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 64 : i32, 128 : i32] | tensor<[1,64,64,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,64,64,256,bf16]> | 
mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 4096 : i32, 256 : i32] | tensor<[1,1,4096,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 8, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,4096,255,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 64 : i32, 255 : i32] | tensor<[1,64,64,255,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 8, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[255,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 8, 'tile<32x32, bf16>', 'dram') | shape: [255 : i32, 1 : i32, 1 : i32] | tensor<[255,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (8, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,64,64,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 4096 : i32, 256 : i32] | tensor<[1,1,4096,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,4096,255,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 64 : i32, 255 : i32] | tensor<[1,64,64,255,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[255,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: 
(1, 8, 'tile<32x32, bf16>', 'dram') | shape: [255 : i32, 1 : i32, 1 : i32] | tensor<[255,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (8, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,64,64,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 4096 : i32, 256 : i32] | tensor<[1,1,4096,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 8, 'tile<32x32, bf16>', 'dram') | yes | 0.18 | 12.62 | +| ttnn.reshape | tensor<[1,1,4096,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 64 : i32, 256 : i32] | tensor<[1,64,64,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 8, 'tile<32x32, bf16>', 'dram') | yes | 0.18 | 12.62 | +| ttnn.reshape | tensor<[256,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 8, 'tile<32x32, bf16>', 'dram') | shape: [256 : i32, 1 : i32, 1 : i32] | tensor<[256,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (8, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.18 | 12.62 | +| ttnn.reshape | tensor<[1,64,64,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 4096 : i32, 256 : i32] | tensor<[1,1,4096,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,4096,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 64 : i32, 
256 : i32] | tensor<[1,64,64,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[256,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 8, 'tile<32x32, bf16>', 'dram') | shape: [256 : i32, 1 : i32, 1 : i32] | tensor<[256,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (8, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,64,64,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 4096 : i32, 256 : i32] | tensor<[1,1,4096,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 8, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.88 | +| ttnn.reshape | tensor<[1,1,1024,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 32 : i32, 512 : i32] | tensor<[1,32,32,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 16, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.88 | +| ttnn.reshape | tensor<[1,64,64,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 4096 : i32, 256 : i32] | tensor<[1,1,4096,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,1024,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 32 : i32, 512 : i32] | tensor<[1,32,32,512,bf16]> | 
mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,90,160,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 14400 + d1 * 160 + d2, d3), memory_config: (450, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 14400 : i32, 256 : i32] | tensor<[1,1,14400,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 14400 + d1 * 160 + d2, d3), memory_config: (450, 8, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.03 | +| ttnn.reshape | tensor<[1,1,3600,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3600 + d1 * 80 + d2, d3), memory_config: (113, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 45 : i32, 80 : i32, 256 : i32] | tensor<[1,45,80,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3600 + d1 * 80 + d2, d3), memory_config: (113, 8, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.03 | +| ttnn.reshape | tensor<[1,90,160,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 14400 + d1 * 160 + d2, d3), memory_config: (450, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 14400 : i32, 256 : i32] | tensor<[1,1,14400,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 14400 + d1 * 160 + d2, d3), memory_config: (450, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,3600,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3600 + d1 * 80 + d2, d3), memory_config: (113, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 45 : i32, 80 : i32, 256 : i32] | tensor<[1,45,80,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3600 + d1 * 80 + d2, d3), memory_config: (113, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,28,28,262,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 9, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 
784 : i32, 262 : i32] | tensor<[1,1,784,262,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 9, 'tile<32x32, bf16>', 'dram') | yes | 0.25 | 2.5 | +| ttnn.reshape | tensor<[1,1,784,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 256 : i32] | tensor<[1,28,28,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 8, 'tile<32x32, bf16>', 'dram') | yes | 0.25 | 2.5 | +| ttnn.reshape | tensor<[1,28,28,262,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 9, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 262 : i32] | tensor<[1,1,784,262,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 9, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,784,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 256 : i32] | tensor<[1,28,28,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,7,7,272,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 9, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 49 : i32, 272 : i32] | tensor<[1,1,49,272,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 9, 'tile<32x32, bf16>', 'dram') | yes | 0.03 | 7.244292577027791e+37 | +| ttnn.reshape | tensor<[1,1,49,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 5, 'tile<32x32, bf16>', 'dram') | shape: 
[1 : i32, 7 : i32, 7 : i32, 160 : i32] | tensor<[1,7,7,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 5, 'tile<32x32, bf16>', 'dram') | yes | 0.03 | 7.244292577027791e+37 | +| ttnn.reshape | tensor<[1,7,7,272,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 9, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 49 : i32, 272 : i32] | tensor<[1,1,49,272,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 9, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,49,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 7 : i32, 160 : i32] | tensor<[1,7,7,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 5, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,28,28,276,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 9, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 276 : i32] | tensor<[1,1,784,276,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 9, 'tile<32x32, bf16>', 'dram') | yes | 0.25 | 2.25 | +| ttnn.reshape | tensor<[1,1,784,34,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 34 : i32] | tensor<[1,28,28,34,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 2, 'tile<32x32, bf16>', 'dram') | yes | 0.25 | 2.25 | +| ttnn.reshape | tensor<[1,28,28,276,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 9, 'tile<32x32, bf16>', 'dram') | 
shape: [1 : i32, 1 : i32, 784 : i32, 276 : i32] | tensor<[1,1,784,276,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 9, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,784,34,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 34 : i32] | tensor<[1,28,28,34,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,28,28,28,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 28 : i32] | tensor<[1,1,784,28,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.01 | 7.643060975763265e+37 | +| ttnn.reshape | tensor<[1,1,784,16,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 16 : i32] | tensor<[1,28,28,16,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.01 | 7.643060975763265e+37 | +| ttnn.reshape | tensor<[1,28,28,28,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 28 : i32] | tensor<[1,1,784,28,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,784,16,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: 
(25, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 16 : i32] | tensor<[1,28,28,16,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,28,28,296,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 296 : i32] | tensor<[1,1,784,296,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 10, 'tile<32x32, bf16>', 'dram') | yes | 0.4 | 4.97 | +| ttnn.reshape | tensor<[1,1,784,134,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 134 : i32] | tensor<[1,28,28,134,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 5, 'tile<32x32, bf16>', 'dram') | yes | 0.4 | 4.97 | +| ttnn.reshape | tensor<[1,28,28,296,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 296 : i32] | tensor<[1,1,784,296,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 10, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,784,134,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 134 : i32] | tensor<[1,28,28,134,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 5, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,14,14,304,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), 
memory_config: (7, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 304 : i32] | tensor<[1,1,196,304,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 10, 'tile<32x32, bf16>', 'dram') | yes | 0.3 | 2.14 | +| ttnn.reshape | tensor<[1,1,196,116,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 116 : i32] | tensor<[1,14,14,116,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 4, 'tile<32x32, bf16>', 'dram') | yes | 0.3 | 2.14 | +| ttnn.reshape | tensor<[1,14,14,304,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 304 : i32] | tensor<[1,1,196,304,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 10, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,196,116,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 116 : i32] | tensor<[1,14,14,116,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,28,28,310,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 310 : i32] | tensor<[1,1,784,310,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 10, 'tile<32x32, bf16>', 'dram') | yes | 0.34 | 1.91 | +| ttnn.reshape | tensor<[1,1,784,58,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 
+ d2, d3), memory_config: (25, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 58 : i32] | tensor<[1,28,28,58,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 2, 'tile<32x32, bf16>', 'dram') | yes | 0.34 | 1.91 | +| ttnn.reshape | tensor<[1,28,28,310,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 310 : i32] | tensor<[1,1,784,310,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 10, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,784,58,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 58 : i32] | tensor<[1,28,28,58,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,14,14,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 320 : i32] | tensor<[1,1,196,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 10, 'tile<32x32, bf16>', 'dram') | yes | 0.09 | 1.24 | +| ttnn.reshape | tensor<[1,1,196,40,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 40 : i32] | tensor<[1,14,14,40,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 2, 'tile<32x32, bf16>', 'dram') | yes | 0.09 | 1.24 | +| ttnn.reshape | tensor<[1,14,14,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + 
d1 * 14 + d2, d3), memory_config: (7, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 320 : i32] | tensor<[1,1,196,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 10, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,196,40,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 40 : i32] | tensor<[1,14,14,40,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,30,40,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1200 + d1 * 40 + d2, d3), memory_config: (38, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1200 : i32, 320 : i32] | tensor<[1,1,1200,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1200 + d1 * 40 + d2, d3), memory_config: (38, 10, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,300,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 300 + d1 * 20 + d2, d3), memory_config: (10, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 15 : i32, 20 : i32, 320 : i32] | tensor<[1,15,20,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 300 + d1 * 20 + d2, d3), memory_config: (10, 10, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[320,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 10, 'tile<32x32, bf16>', 'dram') | shape: [320 : i32, 1 : i32, 1 : i32] | tensor<[320,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (10, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,30,40,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1200 + d1 * 40 + d2, d3), memory_config: (38, 10, 'tile<32x32, bf16>', 
'dram') | shape: [1 : i32, 1 : i32, 1200 : i32, 320 : i32] | tensor<[1,1,1200,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1200 + d1 * 40 + d2, d3), memory_config: (38, 10, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,300,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 300 + d1 * 20 + d2, d3), memory_config: (10, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 15 : i32, 20 : i32, 320 : i32] | tensor<[1,15,20,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 300 + d1 * 20 + d2, d3), memory_config: (10, 10, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[320,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 10, 'tile<32x32, bf16>', 'dram') | shape: [320 : i32, 1 : i32, 1 : i32] | tensor<[320,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (10, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,30,40,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1200 + d1 * 40 + d2, d3), memory_config: (38, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1200 : i32, 320 : i32] | tensor<[1,1,1200,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1200 + d1 * 40 + d2, d3), memory_config: (38, 10, 'tile<32x32, bf16>', 'dram') | yes | -0.0 | inf | +| ttnn.reshape | tensor<[1,1,300,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 300 + d1 * 20 + d2, d3), memory_config: (10, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 15 : i32, 20 : i32, 512 : i32] | tensor<[1,15,20,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 300 + d1 * 20 + d2, d3), memory_config: (10, 16, 'tile<32x32, bf16>', 'dram') | yes | -0.0 | inf | +| ttnn.reshape | tensor<[512,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | shape: [512 : i32, 1 : i32, 1 : i32] | tensor<[512,1,1,bf16]> | mapping_from: (d0, 
d1, d2), mapping_to: (d0 + d1, d2), memory_config: (16, 1, 'tile<32x32, bf16>', 'dram') | yes | -0.0 | inf | +| ttnn.reshape | tensor<[1,30,40,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1200 + d1 * 40 + d2, d3), memory_config: (38, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1200 : i32, 320 : i32] | tensor<[1,1,1200,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1200 + d1 * 40 + d2, d3), memory_config: (38, 10, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,300,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 300 + d1 * 20 + d2, d3), memory_config: (10, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 15 : i32, 20 : i32, 512 : i32] | tensor<[1,15,20,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 300 + d1 * 20 + d2, d3), memory_config: (10, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[512,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | shape: [512 : i32, 1 : i32, 1 : i32] | tensor<[512,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (16, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,30,40,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1200 + d1 * 40 + d2, d3), memory_config: (38, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1200 : i32, 320 : i32] | tensor<[1,1,1200,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1200 + d1 * 40 + d2, d3), memory_config: (38, 10, 'tile<32x32, bf16>', 'dram') | yes | 0.07 | 5.81 | +| ttnn.reshape | tensor<[1,1,1200,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1200 + d1 * 40 + d2, d3), memory_config: (38, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 30 : i32, 40 : i32, 64 : i32] | tensor<[1,30,40,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1200 + d1 * 40 + d2, d3), 
memory_config: (38, 2, 'tile<32x32, bf16>', 'dram') | yes | 0.07 | 5.81 | +| ttnn.reshape | tensor<[64,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [64 : i32, 1 : i32, 1 : i32] | tensor<[64,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (2, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.07 | 5.81 | +| ttnn.reshape | tensor<[1,30,40,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1200 + d1 * 40 + d2, d3), memory_config: (38, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1200 : i32, 320 : i32] | tensor<[1,1,1200,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1200 + d1 * 40 + d2, d3), memory_config: (38, 10, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,1200,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1200 + d1 * 40 + d2, d3), memory_config: (38, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 30 : i32, 40 : i32, 64 : i32] | tensor<[1,30,40,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1200 + d1 * 40 + d2, d3), memory_config: (38, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[64,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [64 : i32, 1 : i32, 1 : i32] | tensor<[64,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (2, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,32,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1024 : i32, 320 : i32] | tensor<[1,1,1024,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 10, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.16 | +| ttnn.reshape | tensor<[1,1,1024,640,bf16]> | mapping_from: 
(d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 32 : i32, 640 : i32] | tensor<[1,32,32,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.16 | +| ttnn.reshape | tensor<[640,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 20, 'tile<32x32, bf16>', 'dram') | shape: [640 : i32, 1 : i32, 1 : i32] | tensor<[640,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (20, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.16 | +| ttnn.reshape | tensor<[1,32,32,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1024 : i32, 320 : i32] | tensor<[1,1,1024,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 10, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,1024,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 32 : i32, 640 : i32] | tensor<[1,32,32,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[640,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 20, 'tile<32x32, bf16>', 'dram') | shape: [640 : i32, 1 : i32, 1 : i32] | tensor<[640,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (20, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,32,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 
: i32, 1 : i32, 1024 : i32, 320 : i32] | tensor<[1,1,1024,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 10, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.38 | +| ttnn.reshape | tensor<[1,1,1024,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 32 : i32, 640 : i32] | tensor<[1,32,32,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.38 | +| ttnn.reshape | tensor<[640,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 20, 'tile<32x32, bf16>', 'dram') | shape: [640 : i32, 1 : i32, 1 : i32] | tensor<[640,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (20, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.38 | +| ttnn.reshape | tensor<[1,32,32,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1024 : i32, 320 : i32] | tensor<[1,1,1024,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 10, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,1024,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 32 : i32, 640 : i32] | tensor<[1,32,32,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[640,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 20, 'tile<32x32, bf16>', 'dram') | shape: [640 : i32, 1 : i32, 1 : i32] | tensor<[640,1,1,bf16]> | mapping_from: (d0, d1, d2), 
mapping_to: (d0 + d1, d2), memory_config: (20, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,64,64,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 4096 : i32, 320 : i32] | tensor<[1,1,4096,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.02 | +| ttnn.reshape | tensor<[1,1,4096,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 64 : i32, 320 : i32] | tensor<[1,64,64,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.02 | +| ttnn.reshape | tensor<[320,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 10, 'tile<32x32, bf16>', 'dram') | shape: [320 : i32, 1 : i32, 1 : i32] | tensor<[320,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (10, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.02 | +| ttnn.reshape | tensor<[1,64,64,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 4096 : i32, 320 : i32] | tensor<[1,1,4096,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,4096,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 64 : i32, 320 : i32] | tensor<[1,64,64,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), 
memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[320,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 10, 'tile<32x32, bf16>', 'dram') | shape: [320 : i32, 1 : i32, 1 : i32] | tensor<[320,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (10, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,64,64,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 4096 : i32, 320 : i32] | tensor<[1,1,4096,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,4096,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 64 : i32, 320 : i32] | tensor<[1,64,64,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[320,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 10, 'tile<32x32, bf16>', 'dram') | shape: [320 : i32, 1 : i32, 1 : i32] | tensor<[320,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (10, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,64,64,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 4096 : i32, 320 : i32] | tensor<[1,1,4096,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,4096,320,bf16]> | 
mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 64 : i32, 320 : i32] | tensor<[1,64,64,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[320,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 10, 'tile<32x32, bf16>', 'dram') | shape: [320 : i32, 1 : i32, 1 : i32] | tensor<[320,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (10, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,64,64,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 4096 : i32, 320 : i32] | tensor<[1,1,4096,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.53 | +| ttnn.reshape | tensor<[1,1,1024,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 32 : i32, 320 : i32] | tensor<[1,32,32,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 10, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.53 | +| ttnn.reshape | tensor<[320,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 10, 'tile<32x32, bf16>', 'dram') | shape: [320 : i32, 1 : i32, 1 : i32] | tensor<[320,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (10, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.53 | +| ttnn.reshape | tensor<[1,64,64,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 10, 'tile<32x32, bf16>', 
'dram') | shape: [1 : i32, 1 : i32, 4096 : i32, 320 : i32] | tensor<[1,1,4096,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,1024,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 32 : i32, 320 : i32] | tensor<[1,32,32,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 10, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[320,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 10, 'tile<32x32, bf16>', 'dram') | shape: [320 : i32, 1 : i32, 1 : i32] | tensor<[320,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (10, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,64,64,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 4096 : i32, 320 : i32] | tensor<[1,1,4096,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.16 | +| ttnn.reshape | tensor<[1,1,4096,4,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 64 : i32, 4 : i32] | tensor<[1,64,64,4,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.16 | +| ttnn.reshape | tensor<[4,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [4 : i32, 1 : i32, 1 : i32] | tensor<[4,1,1,bf16]> | mapping_from: (d0, d1, 
d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.16 | +| ttnn.reshape | tensor<[1,64,64,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 4096 : i32, 320 : i32] | tensor<[1,1,4096,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,4096,4,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 64 : i32, 4 : i32] | tensor<[1,64,64,4,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[4,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [4 : i32, 1 : i32, 1 : i32] | tensor<[4,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,7,7,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 49 : i32, 320 : i32] | tensor<[1,1,49,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 10, 'tile<32x32, bf16>', 'dram') | yes | -0.01 | 14.25 | +| ttnn.reshape | tensor<[1,1,49,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 7 : i32, 1280 : i32] | tensor<[1,7,7,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 40, 'tile<32x32, bf16>', 
'dram') | yes | -0.01 | 14.25 | +| ttnn.reshape | tensor<[1,7,7,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 49 : i32, 320 : i32] | tensor<[1,1,49,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 10, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,49,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 7 : i32, 1280 : i32] | tensor<[1,7,7,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 40, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,28,28,328,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 11, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 328 : i32] | tensor<[1,1,784,328,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 11, 'tile<32x32, bf16>', 'dram') | yes | 0.12 | 2.0 | +| ttnn.reshape | tensor<[1,1,784,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 320 : i32] | tensor<[1,28,28,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 10, 'tile<32x32, bf16>', 'dram') | yes | 0.12 | 2.0 | +| ttnn.reshape | tensor<[1,28,28,328,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 11, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 328 : i32] | tensor<[1,1,784,328,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 11, 'tile<32x32, 
bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,784,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 320 : i32] | tensor<[1,28,28,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 10, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,112,112,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 12544 : i32, 32 : i32] | tensor<[1,1,12544,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.75 | 22.25 | +| ttnn.reshape | tensor<[1,1,12544,16,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 112 : i32, 112 : i32, 16 : i32] | tensor<[1,112,112,16,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.75 | 22.25 | +| ttnn.reshape | tensor<[1,112,112,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 12544 : i32, 32 : i32] | tensor<[1,1,12544,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,12544,16,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 112 : i32, 112 : i32, 16 : i32] | tensor<[1,112,112,16,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 
12544 + d1 * 112 + d2, d3), memory_config: (392, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,112,112,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 12544 : i32, 32 : i32] | tensor<[1,1,12544,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.41 | 10.12 | +| ttnn.reshape | tensor<[1,1,12544,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 112 : i32, 112 : i32, 32 : i32] | tensor<[1,112,112,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.41 | 10.12 | +| ttnn.reshape | tensor<[1,112,112,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 12544 : i32, 32 : i32] | tensor<[1,1,12544,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,12544,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 112 : i32, 112 : i32, 32 : i32] | tensor<[1,112,112,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,112,112,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 12544 : i32, 32 : i32] | 
tensor<[1,1,12544,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.83 | 62.75 | +| ttnn.reshape | tensor<[1,1,12544,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 112 : i32, 112 : i32, 64 : i32] | tensor<[1,112,112,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 2, 'tile<32x32, bf16>', 'dram') | yes | 0.83 | 62.75 | +| ttnn.reshape | tensor<[1,112,112,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 12544 : i32, 32 : i32] | tensor<[1,1,12544,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,12544,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 112 : i32, 112 : i32, 64 : i32] | tensor<[1,112,112,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,112,112,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 12544 : i32, 32 : i32] | tensor<[1,1,12544,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.45 | 11.88 | +| ttnn.reshape | tensor<[1,1,12544,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 2, 
'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 112 : i32, 112 : i32, 64 : i32] | tensor<[1,112,112,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 2, 'tile<32x32, bf16>', 'dram') | yes | 0.45 | 11.88 | +| ttnn.reshape | tensor<[1,112,112,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 12544 : i32, 32 : i32] | tensor<[1,1,12544,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,12544,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 112 : i32, 112 : i32, 64 : i32] | tensor<[1,112,112,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,120,160,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 19200 + d1 * 160 + d2, d3), memory_config: (600, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 19200 : i32, 32 : i32] | tensor<[1,1,19200,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 19200 + d1 * 160 + d2, d3), memory_config: (600, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.06 | +| ttnn.reshape | tensor<[1,1,19200,2,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 19200 + d1 * 160 + d2, d3), memory_config: (600, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 120 : i32, 160 : i32, 2 : i32] | tensor<[1,120,160,2,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 19200 + d1 * 160 + d2, d3), memory_config: (600, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.06 | +| ttnn.reshape | tensor<[2,bf16]> | mapping_from: (d0), mapping_to: (0, d0), 
memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [2 : i32, 1 : i32, 1 : i32] | tensor<[2,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.06 | +| ttnn.reshape | tensor<[1,120,160,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 19200 + d1 * 160 + d2, d3), memory_config: (600, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 19200 : i32, 32 : i32] | tensor<[1,1,19200,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 19200 + d1 * 160 + d2, d3), memory_config: (600, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,19200,2,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 19200 + d1 * 160 + d2, d3), memory_config: (600, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 120 : i32, 160 : i32, 2 : i32] | tensor<[1,120,160,2,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 19200 + d1 * 160 + d2, d3), memory_config: (600, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[2,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [2 : i32, 1 : i32, 1 : i32] | tensor<[2,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,128,128,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 16384 : i32, 32 : i32] | tensor<[1,1,16384,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.25 | 69.5 | +| ttnn.reshape | tensor<[1,1,256,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 16 
: i32, 32 : i32] | tensor<[1,16,16,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.25 | 69.5 | +| ttnn.reshape | tensor<[32,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [32 : i32, 1 : i32, 1 : i32] | tensor<[32,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.25 | 69.5 | +| ttnn.reshape | tensor<[1,128,128,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 16384 : i32, 32 : i32] | tensor<[1,1,16384,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,256,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 16 : i32, 32 : i32] | tensor<[1,16,16,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[32,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [32 : i32, 1 : i32, 1 : i32] | tensor<[32,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,128,128,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 16384 : i32, 32 : i32] | tensor<[1,1,16384,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), 
memory_config: (512, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.05 | +| ttnn.reshape | tensor<[1,1,4096,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 64 : i32, 64 : i32] | tensor<[1,64,64,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.05 | +| ttnn.reshape | tensor<[64,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [64 : i32, 1 : i32, 1 : i32] | tensor<[64,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (2, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.05 | +| ttnn.reshape | tensor<[1,128,128,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 16384 : i32, 32 : i32] | tensor<[1,1,16384,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,4096,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 64 : i32, 64 : i32] | tensor<[1,64,64,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[64,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [64 : i32, 1 : i32, 1 : i32] | tensor<[64,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (2, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,128,128,32,bf16]> | mapping_from: 
(d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 16384 : i32, 32 : i32] | tensor<[1,1,16384,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.02 | +| ttnn.reshape | tensor<[1,1,16384,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 128 : i32, 128 : i32, 64 : i32] | tensor<[1,128,128,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.02 | +| ttnn.reshape | tensor<[1,128,128,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 16384 : i32, 32 : i32] | tensor<[1,1,16384,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,16384,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 128 : i32, 128 : i32, 64 : i32] | tensor<[1,128,128,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,1,32,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 32 : i32] | tensor<[1,1,1,32,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.01 | +| ttnn.reshape | tensor<[120,f32]> | 
mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | shape: [120 : i32, 1 : i32, 1 : i32] | tensor<[120,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.01 | +| ttnn.reshape | tensor<[1,1,1,32,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 32 : i32] | tensor<[1,1,1,32,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,1,120,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 120 : i32] | tensor<[1,1,1,120,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[120,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | shape: [120 : i32, 1 : i32, 1 : i32] | tensor<[120,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,256,256,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 65536 + d1 * 256 + d2, d3), memory_config: (2048, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 65536 : i32, 32 : i32] | tensor<[1,1,65536,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 65536 + d1 * 256 + d2, d3), memory_config: (2048, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,65536,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 65536 + d1 * 256 + d2, d3), memory_config: (2048, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 256 : i32, 1 : i32] | 
tensor<[1,256,256,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 65536 + d1 * 256 + d2, d3), memory_config: (2048, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32] | tensor<[1,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,256,256,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 65536 + d1 * 256 + d2, d3), memory_config: (2048, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 65536 : i32, 32 : i32] | tensor<[1,1,65536,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 65536 + d1 * 256 + d2, d3), memory_config: (2048, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,65536,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 65536 + d1 * 256 + d2, d3), memory_config: (2048, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 256 : i32, 1 : i32] | tensor<[1,256,256,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 65536 + d1 * 256 + d2, d3), memory_config: (2048, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32] | tensor<[1,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,256,256,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 65536 + d1 * 256 + d2, d3), memory_config: (2048, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 65536 : i32, 32 : i32] | tensor<[1,1,65536,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 65536 + d1 * 256 + d2, d3), memory_config: 
(2048, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.02 | +| ttnn.reshape | tensor<[1,1,65536,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 65536 + d1 * 256 + d2, d3), memory_config: (2048, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 256 : i32, 32 : i32] | tensor<[1,256,256,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 65536 + d1 * 256 + d2, d3), memory_config: (2048, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.02 | +| ttnn.reshape | tensor<[1,256,256,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 65536 + d1 * 256 + d2, d3), memory_config: (2048, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 65536 : i32, 32 : i32] | tensor<[1,1,65536,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 65536 + d1 * 256 + d2, d3), memory_config: (2048, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,65536,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 65536 + d1 * 256 + d2, d3), memory_config: (2048, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 256 : i32, 32 : i32] | tensor<[1,256,256,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 65536 + d1 * 256 + d2, d3), memory_config: (2048, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,256,256,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 65536 + d1 * 256 + d2, d3), memory_config: (2048, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 65536 : i32, 32 : i32] | tensor<[1,1,65536,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 65536 + d1 * 256 + d2, d3), memory_config: (2048, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.19 | +| ttnn.reshape | tensor<[1,1,65536,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 65536 + d1 * 256 + d2, d3), memory_config: (2048, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 256 : i32, 64 : i32] | tensor<[1,256,256,64,bf16]> | mapping_from: 
(d0, d1, d2, d3), mapping_to: (d0 * 65536 + d1 * 256 + d2, d3), memory_config: (2048, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.19 | +| ttnn.reshape | tensor<[1,256,256,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 65536 + d1 * 256 + d2, d3), memory_config: (2048, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 65536 : i32, 32 : i32] | tensor<[1,1,65536,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 65536 + d1 * 256 + d2, d3), memory_config: (2048, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,65536,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 65536 + d1 * 256 + d2, d3), memory_config: (2048, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 256 : i32, 64 : i32] | tensor<[1,256,256,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 65536 + d1 * 256 + d2, d3), memory_config: (2048, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,26,26,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 676 + d1 * 26 + d2, d3), memory_config: (22, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 676 : i32, 32 : i32] | tensor<[1,1,676,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 676 + d1 * 26 + d2, d3), memory_config: (22, 1, 'tile<32x32, bf16>', 'dram') | yes | -0.0 | 7.643060975763265e+37 | +| ttnn.reshape | tensor<[1,1,576,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 576 + d1 * 24 + d2, d3), memory_config: (18, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 24 : i32, 24 : i32, 64 : i32] | tensor<[1,24,24,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 576 + d1 * 24 + d2, d3), memory_config: (18, 2, 'tile<32x32, bf16>', 'dram') | yes | -0.0 | 7.643060975763265e+37 | +| ttnn.reshape | tensor<[64,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [64 : i32, 1 : i32, 1 : i32] | tensor<[64,1,1,bf16]> | 
mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (2, 1, 'tile<32x32, bf16>', 'dram') | yes | -0.0 | 7.643060975763265e+37 | +| ttnn.reshape | tensor<[1,26,26,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 676 + d1 * 26 + d2, d3), memory_config: (22, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 676 : i32, 32 : i32] | tensor<[1,1,676,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 676 + d1 * 26 + d2, d3), memory_config: (22, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,576,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 576 + d1 * 24 + d2, d3), memory_config: (18, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 24 : i32, 24 : i32, 64 : i32] | tensor<[1,24,24,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 576 + d1 * 24 + d2, d3), memory_config: (18, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[64,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [64 : i32, 1 : i32, 1 : i32] | tensor<[64,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (2, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,28,28,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 32 : i32] | tensor<[1,1,784,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.04 | 33.0 | +| ttnn.reshape | tensor<[1,1,784,192,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 6, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 192 : i32] | tensor<[1,28,28,192,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), 
memory_config: (25, 6, 'tile<32x32, bf16>', 'dram') | yes | 0.04 | 33.0 | +| ttnn.reshape | tensor<[1,28,28,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 32 : i32] | tensor<[1,1,784,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,784,192,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 6, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 192 : i32] | tensor<[1,28,28,192,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 6, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,30,40,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1200 + d1 * 40 + d2, d3), memory_config: (38, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1200 : i32, 32 : i32] | tensor<[1,1,1200,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1200 + d1 * 40 + d2, d3), memory_config: (38, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.17 | 6.280602280083728e+37 | +| ttnn.reshape | tensor<[1,1,1200,2,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1200 + d1 * 40 + d2, d3), memory_config: (38, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 30 : i32, 40 : i32, 2 : i32] | tensor<[1,30,40,2,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1200 + d1 * 40 + d2, d3), memory_config: (38, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.17 | 6.280602280083728e+37 | +| ttnn.reshape | tensor<[2,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [2 : i32, 1 : i32, 1 : i32] | tensor<[2,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, 
bf16>', 'dram') | yes | 0.17 | 6.280602280083728e+37 | +| ttnn.reshape | tensor<[1,30,40,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1200 + d1 * 40 + d2, d3), memory_config: (38, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1200 : i32, 32 : i32] | tensor<[1,1,1200,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1200 + d1 * 40 + d2, d3), memory_config: (38, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,1200,2,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1200 + d1 * 40 + d2, d3), memory_config: (38, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 30 : i32, 40 : i32, 2 : i32] | tensor<[1,30,40,2,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1200 + d1 * 40 + d2, d3), memory_config: (38, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[2,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [2 : i32, 1 : i32, 1 : i32] | tensor<[2,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,512,512,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 262144 + d1 * 512 + d2, d3), memory_config: (8192, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 262144 : i32, 32 : i32] | tensor<[1,1,262144,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 262144 + d1 * 512 + d2, d3), memory_config: (8192, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.5 | +| ttnn.reshape | tensor<[1,1,65536,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 65536 + d1 * 256 + d2, d3), memory_config: (2048, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 256 : i32, 64 : i32] | tensor<[1,256,256,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 65536 + d1 * 256 + d2, d3), memory_config: (2048, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 
| 0.5 | +| ttnn.reshape | tensor<[1,512,512,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 262144 + d1 * 512 + d2, d3), memory_config: (8192, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 262144 : i32, 32 : i32] | tensor<[1,1,262144,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 262144 + d1 * 512 + d2, d3), memory_config: (8192, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,65536,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 65536 + d1 * 256 + d2, d3), memory_config: (2048, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 256 : i32, 64 : i32] | tensor<[1,256,256,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 65536 + d1 * 256 + d2, d3), memory_config: (2048, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,60,80,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4800 + d1 * 80 + d2, d3), memory_config: (150, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 4800 : i32, 32 : i32] | tensor<[1,1,4800,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4800 + d1 * 80 + d2, d3), memory_config: (150, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.84 | 3.55 | +| ttnn.reshape | tensor<[1,1,4800,2,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4800 + d1 * 80 + d2, d3), memory_config: (150, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 60 : i32, 80 : i32, 2 : i32] | tensor<[1,60,80,2,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4800 + d1 * 80 + d2, d3), memory_config: (150, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.84 | 3.55 | +| ttnn.reshape | tensor<[2,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [2 : i32, 1 : i32, 1 : i32] | tensor<[2,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.84 | 3.55 | +| ttnn.reshape | 
tensor<[1,60,80,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4800 + d1 * 80 + d2, d3), memory_config: (150, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 4800 : i32, 32 : i32] | tensor<[1,1,4800,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4800 + d1 * 80 + d2, d3), memory_config: (150, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,4800,2,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4800 + d1 * 80 + d2, d3), memory_config: (150, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 60 : i32, 80 : i32, 2 : i32] | tensor<[1,60,80,2,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4800 + d1 * 80 + d2, d3), memory_config: (150, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[2,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [2 : i32, 1 : i32, 1 : i32] | tensor<[2,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,28,28,34,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 34 : i32] | tensor<[1,1,784,34,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 2, 'tile<32x32, bf16>', 'dram') | yes | 0.38 | 0.93 | +| ttnn.reshape | tensor<[1,1,784,20,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 20 : i32] | tensor<[1,28,28,20,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.38 | 0.93 | +| ttnn.reshape | tensor<[1,28,28,34,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: 
(d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 34 : i32] | tensor<[1,1,784,34,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,784,20,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 20 : i32] | tensor<[1,28,28,20,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,14,14,360,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 12, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 360 : i32] | tensor<[1,1,196,360,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 12, 'tile<32x32, bf16>', 'dram') | yes | 0.18 | 3.66 | +| ttnn.reshape | tensor<[1,1,196,68,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 3, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 68 : i32] | tensor<[1,14,14,68,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 3, 'tile<32x32, bf16>', 'dram') | yes | 0.18 | 3.66 | +| ttnn.reshape | tensor<[1,14,14,360,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 12, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 360 : i32] | tensor<[1,1,196,360,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 12, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,196,68,bf16]> | mapping_from: (d0, d1, d2, d3), 
mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 3, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 68 : i32] | tensor<[1,14,14,68,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 3, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,28,28,368,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 12, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 368 : i32] | tensor<[1,1,784,368,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 12, 'tile<32x32, bf16>', 'dram') | yes | 0.26 | 2.33 | +| ttnn.reshape | tensor<[1,1,784,98,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 98 : i32] | tensor<[1,28,28,98,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | yes | 0.26 | 2.33 | +| ttnn.reshape | tensor<[1,28,28,368,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 12, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 368 : i32] | tensor<[1,1,784,368,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 12, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,784,98,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 98 : i32] | tensor<[1,28,28,98,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,14,14,384,bf16]> | mapping_from: (d0, d1, 
d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 12, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 384 : i32] | tensor<[1,1,196,384,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 12, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,196,384,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 12, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 384 : i32] | tensor<[1,14,14,384,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 12, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,14,14,384,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 12, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 384 : i32] | tensor<[1,1,196,384,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 12, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,196,384,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 12, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 384 : i32] | tensor<[1,14,14,384,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 12, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,14,14,384,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 12, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 384 : i32] | tensor<[1,1,196,384,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 12, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,196,64,bf16]> | mapping_from: 
(d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 64 : i32] | tensor<[1,14,14,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 2, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,14,14,384,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 12, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 384 : i32] | tensor<[1,1,196,384,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 12, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,196,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 64 : i32] | tensor<[1,14,14,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,14,14,384,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 12, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 384 : i32] | tensor<[1,1,196,384,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 12, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,196,96,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 3, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 96 : i32] | tensor<[1,14,14,96,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 3, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,14,14,384,bf16]> | mapping_from: (d0, 
d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 12, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 384 : i32] | tensor<[1,1,196,384,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 12, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,196,96,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 3, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 96 : i32] | tensor<[1,14,14,96,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 3, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,64,64,384,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 12, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 4096 : i32, 384 : i32] | tensor<[1,1,4096,384,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 12, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.19 | +| ttnn.reshape | tensor<[1,1,4096,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 64 : i32, 128 : i32] | tensor<[1,64,64,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 4, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.19 | +| ttnn.reshape | tensor<[1,64,64,384,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 12, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 4096 : i32, 384 : i32] | tensor<[1,1,4096,384,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 12, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | 
tensor<[1,1,4096,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 64 : i32, 128 : i32] | tensor<[1,64,64,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,224,224,3,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 50176 + d1 * 224 + d2, d3), memory_config: (1568, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 50176 : i32, 3 : i32] | tensor<[1,1,50176,3,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 50176 + d1 * 224 + d2, d3), memory_config: (1568, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,196,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 1024 : i32] | tensor<[1,14,14,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 32, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1024,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | shape: [1024 : i32, 1 : i32, 1 : i32] | tensor<[1024,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (32, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,224,224,3,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 50176 + d1 * 224 + d2, d3), memory_config: (1568, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 50176 : i32, 3 : i32] | tensor<[1,1,50176,3,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 50176 + d1 * 224 + d2, d3), memory_config: (1568, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,196,1024,bf16]> | 
mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 1024 : i32] | tensor<[1,14,14,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 32, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1024,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | shape: [1024 : i32, 1 : i32, 1 : i32] | tensor<[1024,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (32, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,224,224,3,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 50176 + d1 * 224 + d2, d3), memory_config: (1568, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 50176 : i32, 3 : i32] | tensor<[1,1,50176,3,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 50176 + d1 * 224 + d2, d3), memory_config: (1568, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.06 | +| ttnn.reshape | tensor<[1,1,12544,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 112 : i32, 112 : i32, 32 : i32] | tensor<[1,112,112,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.06 | +| ttnn.reshape | tensor<[1,224,224,3,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 50176 + d1 * 224 + d2, d3), memory_config: (1568, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 50176 : i32, 3 : i32] | tensor<[1,1,50176,3,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 50176 + d1 * 224 + d2, d3), memory_config: (1568, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,12544,32,bf16]> | mapping_from: (d0, d1, d2, 
d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 112 : i32, 112 : i32, 32 : i32] | tensor<[1,112,112,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,224,224,3,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 50176 + d1 * 224 + d2, d3), memory_config: (1568, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 50176 : i32, 3 : i32] | tensor<[1,1,50176,3,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 50176 + d1 * 224 + d2, d3), memory_config: (1568, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.03 | +| ttnn.reshape | tensor<[1,1,50176,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 50176 + d1 * 224 + d2, d3), memory_config: (1568, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 224 : i32, 224 : i32, 64 : i32] | tensor<[1,224,224,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 50176 + d1 * 224 + d2, d3), memory_config: (1568, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.03 | +| ttnn.reshape | tensor<[1,224,224,3,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 50176 + d1 * 224 + d2, d3), memory_config: (1568, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 50176 : i32, 3 : i32] | tensor<[1,1,50176,3,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 50176 + d1 * 224 + d2, d3), memory_config: (1568, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,50176,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 50176 + d1 * 224 + d2, d3), memory_config: (1568, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 224 : i32, 224 : i32, 64 : i32] | tensor<[1,224,224,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 50176 + d1 * 224 + d2, d3), memory_config: (1568, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| 
ttnn.reshape | tensor<[1,224,224,3,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 50176 + d1 * 224 + d2, d3), memory_config: (1568, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 50176 : i32, 3 : i32] | tensor<[1,1,50176,3,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 50176 + d1 * 224 + d2, d3), memory_config: (1568, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.26 | +| ttnn.reshape | tensor<[1,1,12544,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 112 : i32, 112 : i32, 64 : i32] | tensor<[1,112,112,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.26 | +| ttnn.reshape | tensor<[1,224,224,3,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 50176 + d1 * 224 + d2, d3), memory_config: (1568, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 50176 : i32, 3 : i32] | tensor<[1,1,50176,3,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 50176 + d1 * 224 + d2, d3), memory_config: (1568, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,12544,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 112 : i32, 112 : i32, 64 : i32] | tensor<[1,112,112,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,224,224,3,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 50176 + d1 * 224 + d2, d3), memory_config: (1568, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 50176 : i32, 3 : i32] | tensor<[1,1,50176,3,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 50176 + d1 * 224 + d2, d3), 
memory_config: (1568, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,196,768,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 768 : i32] | tensor<[1,14,14,768,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 24, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[768,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [768 : i32, 1 : i32, 1 : i32] | tensor<[768,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (24, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,224,224,3,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 50176 + d1 * 224 + d2, d3), memory_config: (1568, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 50176 : i32, 3 : i32] | tensor<[1,1,50176,3,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 50176 + d1 * 224 + d2, d3), memory_config: (1568, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,196,768,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 768 : i32] | tensor<[1,14,14,768,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 24, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[768,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [768 : i32, 1 : i32, 1 : i32] | tensor<[768,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (24, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,224,224,3,bf16]> | mapping_from: 
(d0, d1, d2, d3), mapping_to: (d0 * 50176 + d1 * 224 + d2, d3), memory_config: (1568, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 50176 : i32, 3 : i32] | tensor<[1,1,50176,3,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 50176 + d1 * 224 + d2, d3), memory_config: (1568, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,49,768,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 7 : i32, 768 : i32] | tensor<[1,7,7,768,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 24, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,224,224,3,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 50176 + d1 * 224 + d2, d3), memory_config: (1568, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 50176 : i32, 3 : i32] | tensor<[1,1,50176,3,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 50176 + d1 * 224 + d2, d3), memory_config: (1568, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,49,768,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 7 : i32, 768 : i32] | tensor<[1,7,7,768,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 24, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,256,256,3,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 65536 + d1 * 256 + d2, d3), memory_config: (2048, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 65536 : i32, 3 : i32] | tensor<[1,1,65536,3,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 65536 + d1 * 256 + d2, d3), memory_config: (2048, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.02 | +| ttnn.reshape | 
tensor<[1,256,256,3,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 65536 + d1 * 256 + d2, d3), memory_config: (2048, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 65536 : i32, 3 : i32] | tensor<[1,1,65536,3,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 65536 + d1 * 256 + d2, d3), memory_config: (2048, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,65536,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 65536 + d1 * 256 + d2, d3), memory_config: (2048, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 256 : i32, 32 : i32] | tensor<[1,256,256,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 65536 + d1 * 256 + d2, d3), memory_config: (2048, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,320,320,3,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 102400 + d1 * 320 + d2, d3), memory_config: (3200, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 102400 : i32, 3 : i32] | tensor<[1,1,102400,3,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 102400 + d1 * 320 + d2, d3), memory_config: (3200, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.16 | +| ttnn.reshape | tensor<[1,1,25600,16,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25600 + d1 * 160 + d2, d3), memory_config: (800, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 160 : i32, 160 : i32, 16 : i32] | tensor<[1,160,160,16,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25600 + d1 * 160 + d2, d3), memory_config: (800, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.16 | +| ttnn.reshape | tensor<[1,320,320,3,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 102400 + d1 * 320 + d2, d3), memory_config: (3200, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 102400 : i32, 3 : i32] | tensor<[1,1,102400,3,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 102400 + d1 * 320 + d2, d3), memory_config: (3200, 1, 
'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,25600,16,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25600 + d1 * 160 + d2, d3), memory_config: (800, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 160 : i32, 160 : i32, 16 : i32] | tensor<[1,160,160,16,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25600 + d1 * 160 + d2, d3), memory_config: (800, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,384,512,3,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196608 + d1 * 512 + d2, d3), memory_config: (6144, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196608 : i32, 3 : i32] | tensor<[1,1,196608,3,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196608 + d1 * 512 + d2, d3), memory_config: (6144, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,192,768,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 192 + d1 * 16 + d2, d3), memory_config: (6, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 12 : i32, 16 : i32, 768 : i32] | tensor<[1,12,16,768,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 192 + d1 * 16 + d2, d3), memory_config: (6, 24, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[768,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [768 : i32, 1 : i32, 1 : i32] | tensor<[768,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (24, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,384,512,3,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196608 + d1 * 512 + d2, d3), memory_config: (6144, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196608 : i32, 3 : i32] | tensor<[1,1,196608,3,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196608 + d1 * 512 + d2, d3), memory_config: (6144, 1, 'tile<32x32, bf16>', 'dram') | N/A 
| nan | nan | +| ttnn.reshape | tensor<[1,1,192,768,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 192 + d1 * 16 + d2, d3), memory_config: (6, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 12 : i32, 16 : i32, 768 : i32] | tensor<[1,12,16,768,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 192 + d1 * 16 + d2, d3), memory_config: (6, 24, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[768,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [768 : i32, 1 : i32, 1 : i32] | tensor<[768,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (24, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,480,640,3,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 307200 + d1 * 640 + d2, d3), memory_config: (9600, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 307200 : i32, 3 : i32] | tensor<[1,1,307200,3,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 307200 + d1 * 640 + d2, d3), memory_config: (9600, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.12 | +| ttnn.reshape | tensor<[1,1,19200,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 19200 + d1 * 160 + d2, d3), memory_config: (600, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 120 : i32, 160 : i32, 64 : i32] | tensor<[1,120,160,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 19200 + d1 * 160 + d2, d3), memory_config: (600, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.12 | +| ttnn.reshape | tensor<[64,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [64 : i32, 1 : i32, 1 : i32] | tensor<[64,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (2, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.12 | +| ttnn.reshape | tensor<[1,480,640,3,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 307200 + 
d1 * 640 + d2, d3), memory_config: (9600, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 307200 : i32, 3 : i32] | tensor<[1,1,307200,3,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 307200 + d1 * 640 + d2, d3), memory_config: (9600, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,19200,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 19200 + d1 * 160 + d2, d3), memory_config: (600, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 120 : i32, 160 : i32, 64 : i32] | tensor<[1,120,160,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 19200 + d1 * 160 + d2, d3), memory_config: (600, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[64,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [64 : i32, 1 : i32, 1 : i32] | tensor<[64,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (2, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,512,512,3,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 262144 + d1 * 512 + d2, d3), memory_config: (8192, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 262144 : i32, 3 : i32] | tensor<[1,1,262144,3,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 262144 + d1 * 512 + d2, d3), memory_config: (8192, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.09 | +| ttnn.reshape | tensor<[1,1,262144,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 262144 + d1 * 512 + d2, d3), memory_config: (8192, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 512 : i32, 512 : i32, 32 : i32] | tensor<[1,512,512,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 262144 + d1 * 512 + d2, d3), memory_config: (8192, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.09 | +| ttnn.reshape | tensor<[1,512,512,3,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 262144 + d1 * 512 + d2, 
d3), memory_config: (8192, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 262144 : i32, 3 : i32] | tensor<[1,1,262144,3,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 262144 + d1 * 512 + d2, d3), memory_config: (8192, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,262144,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 262144 + d1 * 512 + d2, d3), memory_config: (8192, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 512 : i32, 512 : i32, 32 : i32] | tensor<[1,512,512,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 262144 + d1 * 512 + d2, d3), memory_config: (8192, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,512,512,3,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 262144 + d1 * 512 + d2, d3), memory_config: (8192, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 262144 : i32, 3 : i32] | tensor<[1,1,262144,3,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 262144 + d1 * 512 + d2, d3), memory_config: (8192, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.11 | +| ttnn.reshape | tensor<[1,1,16384,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 128 : i32, 128 : i32, 32 : i32] | tensor<[1,128,128,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.11 | +| ttnn.reshape | tensor<[32,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [32 : i32, 1 : i32, 1 : i32] | tensor<[32,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.11 | +| ttnn.reshape | tensor<[1,512,512,3,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 262144 + d1 * 512 + d2, d3), 
memory_config: (8192, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 262144 : i32, 3 : i32] | tensor<[1,1,262144,3,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 262144 + d1 * 512 + d2, d3), memory_config: (8192, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,16384,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 128 : i32, 128 : i32, 32 : i32] | tensor<[1,128,128,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[32,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [32 : i32, 1 : i32, 1 : i32] | tensor<[32,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,512,672,3,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 344064 + d1 * 672 + d2, d3), memory_config: (10752, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 344064 : i32, 3 : i32] | tensor<[1,1,344064,3,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 344064 + d1 * 672 + d2, d3), memory_config: (10752, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,1344,192,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1344 + d1 * 42 + d2, d3), memory_config: (42, 6, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 42 : i32, 192 : i32] | tensor<[1,32,42,192,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1344 + d1 * 42 + d2, d3), memory_config: (42, 6, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[192,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 6, 'tile<32x32, bf16>', 'dram') | shape: [192 : i32, 1 : 
i32, 1 : i32] | tensor<[192,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (6, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,512,672,3,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 344064 + d1 * 672 + d2, d3), memory_config: (10752, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 344064 : i32, 3 : i32] | tensor<[1,1,344064,3,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 344064 + d1 * 672 + d2, d3), memory_config: (10752, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,1344,192,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1344 + d1 * 42 + d2, d3), memory_config: (42, 6, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 42 : i32, 192 : i32] | tensor<[1,32,42,192,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1344 + d1 * 42 + d2, d3), memory_config: (42, 6, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[192,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 6, 'tile<32x32, bf16>', 'dram') | shape: [192 : i32, 1 : i32, 1 : i32] | tensor<[192,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (6, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,720,1280,3,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 921600 + d1 * 1280 + d2, d3), memory_config: (28800, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 921600 : i32, 3 : i32] | tensor<[1,1,921600,3,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 921600 + d1 * 1280 + d2, d3), memory_config: (28800, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,230400,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 230400 + d1 * 640 + d2, d3), memory_config: (7200, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 360 : i32, 640 : i32, 64 : i32] | tensor<[1,360,640,64,bf16]> | 
mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 230400 + d1 * 640 + d2, d3), memory_config: (7200, 2, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,720,1280,3,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 921600 + d1 * 1280 + d2, d3), memory_config: (28800, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 921600 : i32, 3 : i32] | tensor<[1,1,921600,3,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 921600 + d1 * 1280 + d2, d3), memory_config: (28800, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,230400,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 230400 + d1 * 640 + d2, d3), memory_config: (7200, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 360 : i32, 640 : i32, 64 : i32] | tensor<[1,360,640,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 230400 + d1 * 640 + d2, d3), memory_config: (7200, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,40,40,40,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1600 + d1 * 40 + d2, d3), memory_config: (50, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 1600 : i32, 40 : i32] | tensor<[1,1,1600,40,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1600 + d1 * 40 + d2, d3), memory_config: (50, 2, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.01 | +| ttnn.reshape | tensor<[1,1,1600,120,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1600 + d1 * 40 + d2, d3), memory_config: (50, 4, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 40 : i32, 40 : i32, 120 : i32] | tensor<[1,40,40,120,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1600 + d1 * 40 + d2, d3), memory_config: (50, 4, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.01 | +| ttnn.reshape | tensor<[1,40,40,40,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1600 + d1 * 40 + d2, d3), memory_config: (50, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 1600 : 
i32, 40 : i32] | tensor<[1,1,1600,40,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1600 + d1 * 40 + d2, d3), memory_config: (50, 2, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,1600,120,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1600 + d1 * 40 + d2, d3), memory_config: (50, 4, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 40 : i32, 40 : i32, 120 : i32] | tensor<[1,40,40,120,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1600 + d1 * 40 + d2, d3), memory_config: (50, 4, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,40,40,40,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1600 + d1 * 40 + d2, d3), memory_config: (50, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 1600 : i32, 40 : i32] | tensor<[1,1,1600,40,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1600 + d1 * 40 + d2, d3), memory_config: (50, 2, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.02 | +| ttnn.reshape | tensor<[1,1,1600,240,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1600 + d1 * 40 + d2, d3), memory_config: (50, 8, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 40 : i32, 40 : i32, 240 : i32] | tensor<[1,40,40,240,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1600 + d1 * 40 + d2, d3), memory_config: (50, 8, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.02 | +| ttnn.reshape | tensor<[1,40,40,40,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1600 + d1 * 40 + d2, d3), memory_config: (50, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 1600 : i32, 40 : i32] | tensor<[1,1,1600,40,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1600 + d1 * 40 + d2, d3), memory_config: (50, 2, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,1600,240,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1600 + d1 * 40 + d2, d3), memory_config: (50, 8, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 40 : i32, 40 
: i32, 240 : i32] | tensor<[1,40,40,240,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1600 + d1 * 40 + d2, d3), memory_config: (50, 8, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,56,56,40,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3136 : i32, 40 : i32] | tensor<[1,1,3136,40,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 2, 'tile<32x32, bf16>', 'dram') | yes | 0.02 | 8.340905673550347e+37 | +| ttnn.reshape | tensor<[1,1,3136,14,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 56 : i32, 56 : i32, 14 : i32] | tensor<[1,56,56,14,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.02 | 8.340905673550347e+37 | +| ttnn.reshape | tensor<[1,56,56,40,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3136 : i32, 40 : i32] | tensor<[1,1,3136,40,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,3136,14,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 56 : i32, 56 : i32, 14 : i32] | tensor<[1,56,56,14,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,14,14,428,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 14, 
'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 428 : i32] | tensor<[1,1,196,428,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 14, 'tile<32x32, bf16>', 'dram') | yes | 0.01 | 4.419683085984844e+37 | +| ttnn.reshape | tensor<[1,1,196,116,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 116 : i32] | tensor<[1,14,14,116,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 4, 'tile<32x32, bf16>', 'dram') | yes | 0.01 | 4.419683085984844e+37 | +| ttnn.reshape | tensor<[1,14,14,428,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 14, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 428 : i32] | tensor<[1,1,196,428,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 14, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,196,116,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 116 : i32] | tensor<[1,14,14,116,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,28,28,466,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 15, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 466 : i32] | tensor<[1,1,784,466,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 15, 'tile<32x32, bf16>', 'dram') | yes | 0.36 | 3.27 | +| ttnn.reshape | tensor<[1,1,784,168,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 
784 + d1 * 28 + d2, d3), memory_config: (25, 6, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 168 : i32] | tensor<[1,28,28,168,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 6, 'tile<32x32, bf16>', 'dram') | yes | 0.36 | 3.27 | +| ttnn.reshape | tensor<[1,28,28,466,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 15, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 466 : i32] | tensor<[1,1,784,466,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 15, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,784,168,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 6, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 168 : i32] | tensor<[1,28,28,168,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 6, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,28,28,46,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 46 : i32] | tensor<[1,1,784,46,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 2, 'tile<32x32, bf16>', 'dram') | yes | 0.31 | 2.06 | +| ttnn.reshape | tensor<[1,1,784,16,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 16 : i32] | tensor<[1,28,28,16,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.31 | 2.06 | +| ttnn.reshape | tensor<[1,28,28,46,bf16]> | mapping_from: (d0, d1, d2, d3), 
mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 46 : i32] | tensor<[1,1,784,46,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,784,16,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 16 : i32] | tensor<[1,28,28,16,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,10,10,480,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 15, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 100 : i32, 480 : i32] | tensor<[1,1,100,480,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 15, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.01 | +| ttnn.reshape | tensor<[1,1,100,24,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 10 : i32, 10 : i32, 24 : i32] | tensor<[1,10,10,24,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.01 | +| ttnn.reshape | tensor<[1,10,10,480,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 15, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 100 : i32, 480 : i32] | tensor<[1,1,100,480,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 15, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,100,24,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: 
(d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 10 : i32, 10 : i32, 24 : i32] | tensor<[1,10,10,24,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[24,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [24 : i32, 1 : i32, 1 : i32] | tensor<[24,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,100,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 8, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 10 : i32, 10 : i32, 256 : i32] | tensor<[1,10,10,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 8, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.01 | +| ttnn.reshape | tensor<[1,10,10,480,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 15, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 100 : i32, 480 : i32] | tensor<[1,1,100,480,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 15, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,100,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 8, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 10 : i32, 10 : i32, 256 : i32] | tensor<[1,10,10,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 8, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,100,480,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 15, 'tile<32x32, f32>', 'dram') | shape: [1 : 
i32, 10 : i32, 10 : i32, 480 : i32] | tensor<[1,10,10,480,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 15, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.01 | +| ttnn.reshape | tensor<[1,10,10,480,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 15, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 100 : i32, 480 : i32] | tensor<[1,1,100,480,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 15, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,100,480,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 15, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 10 : i32, 10 : i32, 480 : i32] | tensor<[1,10,10,480,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 15, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,10,10,480,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 15, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 100 : i32, 480 : i32] | tensor<[1,1,100,480,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 15, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,100,480,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 15, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 10 : i32, 10 : i32, 480 : i32] | tensor<[1,10,10,480,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 15, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,10,10,480,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 15, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : 
i32, 100 : i32, 480 : i32] | tensor<[1,1,100,480,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 15, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,100,480,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 15, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 10 : i32, 10 : i32, 480 : i32] | tensor<[1,10,10,480,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 15, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,10,10,480,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 15, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 100 : i32, 480 : i32] | tensor<[1,1,100,480,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 15, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.35 | +| ttnn.reshape | tensor<[1,1,100,546,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 18, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 10 : i32, 10 : i32, 546 : i32] | tensor<[1,10,10,546,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 18, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.35 | +| ttnn.reshape | tensor<[546,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 18, 'tile<32x32, f32>', 'dram') | shape: [546 : i32, 1 : i32, 1 : i32] | tensor<[546,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (18, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.35 | +| ttnn.reshape | tensor<[1,10,10,480,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 15, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 100 : i32, 480 : i32] | tensor<[1,1,100,480,f32]> | mapping_from: (d0, d1, d2, 
d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 15, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,100,546,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 18, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 10 : i32, 10 : i32, 546 : i32] | tensor<[1,10,10,546,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 18, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[546,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 18, 'tile<32x32, f32>', 'dram') | shape: [546 : i32, 1 : i32, 1 : i32] | tensor<[546,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (18, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,100,80,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 3, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 10 : i32, 10 : i32, 80 : i32] | tensor<[1,10,10,80,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 3, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.01 | +| ttnn.reshape | tensor<[1,10,10,480,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 15, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 100 : i32, 480 : i32] | tensor<[1,1,100,480,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 15, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,100,80,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 3, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 10 : i32, 10 : i32, 80 : i32] | tensor<[1,10,10,80,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 3, 'tile<32x32, f32>', 
'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,1,480,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 15, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 480 : i32] | tensor<[1,1,1,480,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 15, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[120,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | shape: [120 : i32, 1 : i32, 1 : i32] | tensor<[120,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1,1,480,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 15, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 480 : i32] | tensor<[1,1,1,480,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 15, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,1,120,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 120 : i32] | tensor<[1,1,1,120,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[120,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | shape: [120 : i32, 1 : i32, 1 : i32] | tensor<[120,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,20,20,480,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 15, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 400 : i32, 480 : i32] | 
tensor<[1,1,400,480,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 15, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1,400,112,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 4, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 20 : i32, 20 : i32, 112 : i32] | tensor<[1,20,20,112,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 4, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,20,20,480,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 15, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 400 : i32, 480 : i32] | tensor<[1,1,400,480,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 15, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,400,112,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 4, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 20 : i32, 20 : i32, 112 : i32] | tensor<[1,20,20,112,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 4, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,20,20,480,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 15, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 400 : i32, 480 : i32] | tensor<[1,1,400,480,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 15, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.02 | +| ttnn.reshape | tensor<[1,1,400,480,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 15, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 20 : i32, 20 : i32, 480 : i32] | 
tensor<[1,20,20,480,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 15, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.02 | +| ttnn.reshape | tensor<[1,20,20,480,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 15, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 400 : i32, 480 : i32] | tensor<[1,1,400,480,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 15, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,400,480,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 15, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 20 : i32, 20 : i32, 480 : i32] | tensor<[1,20,20,480,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 15, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,64,64,4,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 4096 : i32, 4 : i32] | tensor<[1,1,4096,4,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1,4096,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 64 : i32, 320 : i32] | tensor<[1,64,64,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[320,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 10, 'tile<32x32, bf16>', 'dram') | shape: [320 : i32, 1 : i32, 1 : i32] | tensor<[320,1,1,bf16]> | mapping_from: 
(d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (10, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,64,64,4,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 4096 : i32, 4 : i32] | tensor<[1,1,4096,4,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,4096,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 64 : i32, 320 : i32] | tensor<[1,64,64,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[320,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 10, 'tile<32x32, bf16>', 'dram') | shape: [320 : i32, 1 : i32, 1 : i32] | tensor<[320,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (10, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,14,14,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 512 : i32] | tensor<[1,1,196,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 16, 'tile<32x32, bf16>', 'dram') | yes | 0.15 | 0.46 | +| ttnn.reshape | tensor<[1,1,196,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 1024 : i32] | tensor<[1,14,14,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), 
memory_config: (7, 32, 'tile<32x32, bf16>', 'dram') | yes | 0.15 | 0.46 | +| ttnn.reshape | tensor<[1,14,14,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 512 : i32] | tensor<[1,1,196,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,196,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 1024 : i32] | tensor<[1,14,14,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 32, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,14,14,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 512 : i32] | tensor<[1,1,196,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 16, 'tile<32x32, bf16>', 'dram') | yes | -0.01 | 7.1113697774493e+37 | +| ttnn.reshape | tensor<[1,1,49,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 7 : i32, 512 : i32] | tensor<[1,7,7,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 16, 'tile<32x32, bf16>', 'dram') | yes | -0.01 | 7.1113697774493e+37 | +| ttnn.reshape | tensor<[1,14,14,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 512 : i32] | tensor<[1,1,196,512,bf16]> | mapping_from: (d0, d1, d2, d3), 
mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,49,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 7 : i32, 512 : i32] | tensor<[1,7,7,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,15,20,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 300 + d1 * 20 + d2, d3), memory_config: (10, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 300 : i32, 512 : i32] | tensor<[1,1,300,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 300 + d1 * 20 + d2, d3), memory_config: (10, 16, 'tile<32x32, bf16>', 'dram') | yes | 0.08 | 13.62 | +| ttnn.reshape | tensor<[1,1,300,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 300 + d1 * 20 + d2, d3), memory_config: (10, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 15 : i32, 20 : i32, 64 : i32] | tensor<[1,15,20,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 300 + d1 * 20 + d2, d3), memory_config: (10, 2, 'tile<32x32, bf16>', 'dram') | yes | 0.08 | 13.62 | +| ttnn.reshape | tensor<[64,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [64 : i32, 1 : i32, 1 : i32] | tensor<[64,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (2, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.08 | 13.62 | +| ttnn.reshape | tensor<[1,15,20,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 300 + d1 * 20 + d2, d3), memory_config: (10, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 300 : i32, 512 : i32] | tensor<[1,1,300,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 300 + d1 * 20 + d2, d3), memory_config: (10, 16, 
'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,300,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 300 + d1 * 20 + d2, d3), memory_config: (10, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 15 : i32, 20 : i32, 64 : i32] | tensor<[1,15,20,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 300 + d1 * 20 + d2, d3), memory_config: (10, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[64,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [64 : i32, 1 : i32, 1 : i32] | tensor<[64,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (2, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16,16,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 256 : i32, 512 : i32] | tensor<[1,1,256,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 16, 'tile<32x32, bf16>', 'dram') | yes | 0.54 | 11.69 | +| ttnn.reshape | tensor<[1,1,256,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 16 : i32, 1024 : i32] | tensor<[1,16,16,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 32, 'tile<32x32, bf16>', 'dram') | yes | 0.54 | 11.69 | +| ttnn.reshape | tensor<[1,16,16,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 256 : i32, 512 : i32] | tensor<[1,1,256,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape 
| tensor<[1,1,256,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 16 : i32, 1024 : i32] | tensor<[1,16,16,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 32, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16,16,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 256 : i32, 512 : i32] | tensor<[1,1,256,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 16, 'tile<32x32, bf16>', 'dram') | yes | 0.39 | 4.16 | +| ttnn.reshape | tensor<[1,1,256,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 16 : i32, 256 : i32] | tensor<[1,16,16,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 8, 'tile<32x32, bf16>', 'dram') | yes | 0.39 | 4.16 | +| ttnn.reshape | tensor<[1,16,16,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 256 : i32, 512 : i32] | tensor<[1,1,256,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,256,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 16 : i32, 256 : i32] | tensor<[1,16,16,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| 
ttnn.reshape | tensor<[1,16,16,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 256 : i32, 512 : i32] | tensor<[1,1,256,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 16, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,256,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 16 : i32, 512 : i32] | tensor<[1,16,16,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 16, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,16,16,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 256 : i32, 512 : i32] | tensor<[1,1,256,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,256,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 16 : i32, 512 : i32] | tensor<[1,16,16,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,23,40,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 920 + d1 * 40 + d2, d3), memory_config: (29, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 920 : i32, 512 : i32] | tensor<[1,1,920,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 920 + d1 * 40 + d2, d3), memory_config: (29, 16, 'tile<32x32, bf16>', 'dram') | yes | 0.44 
| 0.3 | +| ttnn.reshape | tensor<[1,1,920,2048,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 920 + d1 * 40 + d2, d3), memory_config: (29, 64, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 23 : i32, 40 : i32, 2048 : i32] | tensor<[1,23,40,2048,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 920 + d1 * 40 + d2, d3), memory_config: (29, 64, 'tile<32x32, bf16>', 'dram') | yes | 0.44 | 0.3 | +| ttnn.reshape | tensor<[1,23,40,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 920 + d1 * 40 + d2, d3), memory_config: (29, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 920 : i32, 512 : i32] | tensor<[1,1,920,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 920 + d1 * 40 + d2, d3), memory_config: (29, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,920,2048,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 920 + d1 * 40 + d2, d3), memory_config: (29, 64, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 23 : i32, 40 : i32, 2048 : i32] | tensor<[1,23,40,2048,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 920 + d1 * 40 + d2, d3), memory_config: (29, 64, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,23,40,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 920 + d1 * 40 + d2, d3), memory_config: (29, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 920 : i32, 512 : i32] | tensor<[1,1,920,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 920 + d1 * 40 + d2, d3), memory_config: (29, 16, 'tile<32x32, bf16>', 'dram') | yes | 0.05 | 0.39 | +| ttnn.reshape | tensor<[1,1,920,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 920 + d1 * 40 + d2, d3), memory_config: (29, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 23 : i32, 40 : i32, 512 : i32] | tensor<[1,23,40,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 920 + d1 * 40 + d2, d3), memory_config: (29, 16, 
'tile<32x32, bf16>', 'dram') | yes | 0.05 | 0.39 | +| ttnn.reshape | tensor<[1,23,40,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 920 + d1 * 40 + d2, d3), memory_config: (29, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 920 : i32, 512 : i32] | tensor<[1,1,920,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 920 + d1 * 40 + d2, d3), memory_config: (29, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,920,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 920 + d1 * 40 + d2, d3), memory_config: (29, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 23 : i32, 40 : i32, 512 : i32] | tensor<[1,23,40,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 920 + d1 * 40 + d2, d3), memory_config: (29, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,28,28,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 512 : i32] | tensor<[1,1,784,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | yes | 0.33 | 13.81 | +| ttnn.reshape | tensor<[1,1,196,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 1024 : i32] | tensor<[1,14,14,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 32, 'tile<32x32, bf16>', 'dram') | yes | 0.33 | 13.81 | +| ttnn.reshape | tensor<[1,28,28,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 512 : i32] | tensor<[1,1,784,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, 
d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,196,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 1024 : i32] | tensor<[1,14,14,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 32, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,28,28,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 512 : i32] | tensor<[1,1,784,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,784,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 128 : i32] | tensor<[1,28,28,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[128,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 4, 'tile<32x32, bf16>', 'dram') | shape: [128 : i32, 1 : i32, 1 : i32] | tensor<[128,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (4, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,28,28,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 512 : i32] | tensor<[1,1,784,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | N/A 
| nan | nan | +| ttnn.reshape | tensor<[1,1,784,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 128 : i32] | tensor<[1,28,28,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[128,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 4, 'tile<32x32, bf16>', 'dram') | shape: [128 : i32, 1 : i32, 1 : i32] | tensor<[128,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (4, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,28,28,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 512 : i32] | tensor<[1,1,784,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | yes | 0.43 | 22.5 | +| ttnn.reshape | tensor<[1,1,784,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 128 : i32] | tensor<[1,28,28,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | yes | 0.43 | 22.5 | +| ttnn.reshape | tensor<[1,28,28,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 512 : i32] | tensor<[1,1,784,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,784,128,bf16]> | 
mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 128 : i32] | tensor<[1,28,28,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,28,28,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 512 : i32] | tensor<[1,1,784,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,784,19,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 19 : i32] | tensor<[1,28,28,19,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[19,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [19 : i32, 1 : i32, 1 : i32] | tensor<[19,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,28,28,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 512 : i32] | tensor<[1,1,784,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,784,19,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), 
memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 19 : i32] | tensor<[1,28,28,19,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[19,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [19 : i32, 1 : i32, 1 : i32] | tensor<[19,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,28,28,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 512 : i32] | tensor<[1,1,784,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | yes | 0.1 | 20.88 | +| ttnn.reshape | tensor<[1,1,784,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 256 : i32] | tensor<[1,28,28,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 8, 'tile<32x32, bf16>', 'dram') | yes | 0.1 | 20.88 | +| ttnn.reshape | tensor<[1,28,28,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 512 : i32] | tensor<[1,1,784,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,784,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : 
i32, 28 : i32, 28 : i32, 256 : i32] | tensor<[1,28,28,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,28,28,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 512 : i32] | tensor<[1,1,784,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | yes | 0.07 | 0.9 | +| ttnn.reshape | tensor<[1,1,784,38,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 38 : i32] | tensor<[1,28,28,38,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 2, 'tile<32x32, bf16>', 'dram') | yes | 0.07 | 0.9 | +| ttnn.reshape | tensor<[38,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [38 : i32, 1 : i32, 1 : i32] | tensor<[38,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (2, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.07 | 0.9 | +| ttnn.reshape | tensor<[1,28,28,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 512 : i32] | tensor<[1,1,784,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,784,38,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 38 : i32] | tensor<[1,28,28,38,bf16]> | 
mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[38,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [38 : i32, 1 : i32, 1 : i32] | tensor<[38,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (2, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,28,28,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 512 : i32] | tensor<[1,1,784,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,784,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 512 : i32] | tensor<[1,28,28,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,28,28,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 512 : i32] | tensor<[1,1,784,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,784,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 512 : i32] | tensor<[1,28,28,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + 
d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,28,28,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 512 : i32] | tensor<[1,1,784,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,784,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 512 : i32] | tensor<[1,28,28,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,28,28,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 512 : i32] | tensor<[1,1,784,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,784,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 512 : i32] | tensor<[1,28,28,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,28,28,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 512 : i32] | tensor<[1,1,784,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 
* 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,784,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 512 : i32] | tensor<[1,28,28,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,28,28,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 512 : i32] | tensor<[1,1,784,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,784,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 512 : i32] | tensor<[1,28,28,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,28,28,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 512 : i32] | tensor<[1,1,784,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | yes | 0.11 | 0.02 | +| ttnn.reshape | tensor<[1,1,784,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 512 : i32] | tensor<[1,28,28,512,bf16]> | mapping_from: (d0, d1, d2, 
d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | yes | 0.11 | 0.02 | +| ttnn.reshape | tensor<[1,28,28,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 512 : i32] | tensor<[1,1,784,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,784,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 512 : i32] | tensor<[1,28,28,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,32,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1024 : i32, 512 : i32] | tensor<[1,1,1024,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 16, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 1.0 | +| ttnn.reshape | tensor<[1,1,256,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 16 : i32, 1024 : i32] | tensor<[1,16,16,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 32, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 1.0 | +| ttnn.reshape | tensor<[1,32,32,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1024 : i32, 512 : i32] | 
tensor<[1,1,1024,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,256,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 16 : i32, 1024 : i32] | tensor<[1,16,16,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 32, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,32,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1024 : i32, 512 : i32] | tensor<[1,1,1024,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 16, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,1024,255,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 32 : i32, 255 : i32] | tensor<[1,32,32,255,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 8, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[255,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 8, 'tile<32x32, bf16>', 'dram') | shape: [255 : i32, 1 : i32, 1 : i32] | tensor<[255,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (8, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,32,32,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1024 : i32, 512 : i32] | tensor<[1,1,1024,512,bf16]> | mapping_from: (d0, d1, 
d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,1024,255,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 32 : i32, 255 : i32] | tensor<[1,32,32,255,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[255,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 8, 'tile<32x32, bf16>', 'dram') | shape: [255 : i32, 1 : i32, 1 : i32] | tensor<[255,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (8, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,32,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1024 : i32, 512 : i32] | tensor<[1,1,1024,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 16, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.05 | +| ttnn.reshape | tensor<[1,1,1024,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 32 : i32, 256 : i32] | tensor<[1,32,32,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 8, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.05 | +| ttnn.reshape | tensor<[1,32,32,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1024 : i32, 512 : i32] | tensor<[1,1,1024,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, 
d3), memory_config: (32, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,1024,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 32 : i32, 256 : i32] | tensor<[1,32,32,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,32,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1024 : i32, 512 : i32] | tensor<[1,1,1024,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 16, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.33 | +| ttnn.reshape | tensor<[1,1,1024,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 32 : i32, 256 : i32] | tensor<[1,32,32,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 8, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.33 | +| ttnn.reshape | tensor<[1,32,32,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1024 : i32, 512 : i32] | tensor<[1,1,1024,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,1024,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 32 : i32, 256 : i32] | tensor<[1,32,32,256,bf16]> | mapping_from: (d0, d1, d2, d3), 
mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,45,80,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3600 + d1 * 80 + d2, d3), memory_config: (113, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3600 : i32, 512 : i32] | tensor<[1,1,3600,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3600 + d1 * 80 + d2, d3), memory_config: (113, 16, 'tile<32x32, bf16>', 'dram') | yes | 0.78 | 0.7 | +| ttnn.reshape | tensor<[1,1,920,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 920 + d1 * 40 + d2, d3), memory_config: (29, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 23 : i32, 40 : i32, 512 : i32] | tensor<[1,23,40,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 920 + d1 * 40 + d2, d3), memory_config: (29, 16, 'tile<32x32, bf16>', 'dram') | yes | 0.78 | 0.7 | +| ttnn.reshape | tensor<[1,45,80,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3600 + d1 * 80 + d2, d3), memory_config: (113, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3600 : i32, 512 : i32] | tensor<[1,1,3600,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3600 + d1 * 80 + d2, d3), memory_config: (113, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,920,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 920 + d1 * 40 + d2, d3), memory_config: (29, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 23 : i32, 40 : i32, 512 : i32] | tensor<[1,23,40,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 920 + d1 * 40 + d2, d3), memory_config: (29, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,56,56,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3136 : i32, 512 : i32] | 
tensor<[1,1,3136,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 16, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,3136,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 56 : i32, 56 : i32, 256 : i32] | tensor<[1,56,56,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 8, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,56,56,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3136 : i32, 512 : i32] | tensor<[1,1,3136,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,3136,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 56 : i32, 56 : i32, 256 : i32] | tensor<[1,56,56,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,5,5,512,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25 + d1 * 5 + d2, d3), memory_config: (1, 16, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 25 : i32, 512 : i32] | tensor<[1,1,25,512,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25 + d1 * 5 + d2, d3), memory_config: (1, 16, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.02 | +| ttnn.reshape | tensor<[1,1,25,128,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25 + d1 * 5 + d2, d3), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 5 : i32, 5 : i32, 128 : 
i32] | tensor<[1,5,5,128,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25 + d1 * 5 + d2, d3), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.02 | +| ttnn.reshape | tensor<[1,5,5,512,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25 + d1 * 5 + d2, d3), memory_config: (1, 16, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 25 : i32, 512 : i32] | tensor<[1,1,25,512,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25 + d1 * 5 + d2, d3), memory_config: (1, 16, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,25,128,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25 + d1 * 5 + d2, d3), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 5 : i32, 5 : i32, 128 : i32] | tensor<[1,5,5,128,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25 + d1 * 5 + d2, d3), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,5,5,512,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25 + d1 * 5 + d2, d3), memory_config: (1, 16, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 25 : i32, 512 : i32] | tensor<[1,1,25,512,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25 + d1 * 5 + d2, d3), memory_config: (1, 16, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.01 | +| ttnn.reshape | tensor<[1,1,25,24,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25 + d1 * 5 + d2, d3), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 5 : i32, 5 : i32, 24 : i32] | tensor<[1,5,5,24,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25 + d1 * 5 + d2, d3), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.01 | +| ttnn.reshape | tensor<[1,5,5,512,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25 + d1 * 5 + d2, d3), memory_config: (1, 16, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 25 : i32, 512 : i32] | tensor<[1,1,25,512,f32]> | mapping_from: (d0, d1, 
d2, d3), mapping_to: (d0 * 25 + d1 * 5 + d2, d3), memory_config: (1, 16, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,25,24,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25 + d1 * 5 + d2, d3), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 5 : i32, 5 : i32, 24 : i32] | tensor<[1,5,5,24,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25 + d1 * 5 + d2, d3), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[24,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [24 : i32, 1 : i32, 1 : i32] | tensor<[24,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,5,5,512,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25 + d1 * 5 + d2, d3), memory_config: (1, 16, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 25 : i32, 512 : i32] | tensor<[1,1,25,512,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25 + d1 * 5 + d2, d3), memory_config: (1, 16, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,25,512,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25 + d1 * 5 + d2, d3), memory_config: (1, 16, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 5 : i32, 5 : i32, 512 : i32] | tensor<[1,5,5,512,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25 + d1 * 5 + d2, d3), memory_config: (1, 16, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,5,5,512,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25 + d1 * 5 + d2, d3), memory_config: (1, 16, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 25 : i32, 512 : i32] | tensor<[1,1,25,512,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25 + d1 * 5 + d2, d3), memory_config: (1, 16, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| 
ttnn.reshape | tensor<[1,1,25,512,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25 + d1 * 5 + d2, d3), memory_config: (1, 16, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 5 : i32, 5 : i32, 512 : i32] | tensor<[1,5,5,512,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25 + d1 * 5 + d2, d3), memory_config: (1, 16, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,5,5,512,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25 + d1 * 5 + d2, d3), memory_config: (1, 16, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 25 : i32, 512 : i32] | tensor<[1,1,25,512,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25 + d1 * 5 + d2, d3), memory_config: (1, 16, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.31 | +| ttnn.reshape | tensor<[1,1,25,546,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25 + d1 * 5 + d2, d3), memory_config: (1, 18, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 5 : i32, 5 : i32, 546 : i32] | tensor<[1,5,5,546,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25 + d1 * 5 + d2, d3), memory_config: (1, 18, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.31 | +| ttnn.reshape | tensor<[546,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 18, 'tile<32x32, f32>', 'dram') | shape: [546 : i32, 1 : i32, 1 : i32] | tensor<[546,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (18, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.31 | +| ttnn.reshape | tensor<[1,5,5,512,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25 + d1 * 5 + d2, d3), memory_config: (1, 16, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 25 : i32, 512 : i32] | tensor<[1,1,25,512,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25 + d1 * 5 + d2, d3), memory_config: (1, 16, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,25,546,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25 + d1 * 5 + d2, d3), 
memory_config: (1, 18, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 5 : i32, 5 : i32, 546 : i32] | tensor<[1,5,5,546,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25 + d1 * 5 + d2, d3), memory_config: (1, 18, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[546,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 18, 'tile<32x32, f32>', 'dram') | shape: [546 : i32, 1 : i32, 1 : i32] | tensor<[546,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (18, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,60,80,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4800 + d1 * 80 + d2, d3), memory_config: (150, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 4800 : i32, 512 : i32] | tensor<[1,1,4800,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4800 + d1 * 80 + d2, d3), memory_config: (150, 16, 'tile<32x32, bf16>', 'dram') | yes | 0.25 | 9.5 | +| ttnn.reshape | tensor<[1,1,4800,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4800 + d1 * 80 + d2, d3), memory_config: (150, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 60 : i32, 80 : i32, 512 : i32] | tensor<[1,60,80,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4800 + d1 * 80 + d2, d3), memory_config: (150, 16, 'tile<32x32, bf16>', 'dram') | yes | 0.25 | 9.5 | +| ttnn.reshape | tensor<[512,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | shape: [512 : i32, 1 : i32, 1 : i32] | tensor<[512,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (16, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.25 | 9.5 | +| ttnn.reshape | tensor<[1,60,80,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4800 + d1 * 80 + d2, d3), memory_config: (150, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 4800 : i32, 512 : i32] | tensor<[1,1,4800,512,bf16]> | 
mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4800 + d1 * 80 + d2, d3), memory_config: (150, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,4800,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4800 + d1 * 80 + d2, d3), memory_config: (150, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 60 : i32, 80 : i32, 512 : i32] | tensor<[1,60,80,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4800 + d1 * 80 + d2, d3), memory_config: (150, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[512,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | shape: [512 : i32, 1 : i32, 1 : i32] | tensor<[512,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (16, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,7,7,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 49 : i32, 512 : i32] | tensor<[1,1,49,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 16, 'tile<32x32, bf16>', 'dram') | yes | 0.14 | 8.31 | +| ttnn.reshape | tensor<[1,1,49,2048,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 64, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 7 : i32, 2048 : i32] | tensor<[1,7,7,2048,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 64, 'tile<32x32, bf16>', 'dram') | yes | 0.14 | 8.31 | +| ttnn.reshape | tensor<[1,7,7,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 49 : i32, 512 : i32] | tensor<[1,1,49,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), 
memory_config: (2, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,49,2048,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 64, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 7 : i32, 2048 : i32] | tensor<[1,7,7,2048,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 64, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,7,7,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 49 : i32, 512 : i32] | tensor<[1,1,49,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 16, 'tile<32x32, bf16>', 'dram') | yes | 0.01 | 1.1830129162485751e+38 | +| ttnn.reshape | tensor<[1,1,49,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 7 : i32, 512 : i32] | tensor<[1,7,7,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 16, 'tile<32x32, bf16>', 'dram') | yes | 0.01 | 1.1830129162485751e+38 | +| ttnn.reshape | tensor<[1,7,7,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 49 : i32, 512 : i32] | tensor<[1,1,49,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,49,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 7 : i32, 512 : i32] | tensor<[1,7,7,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 
+ d2, d3), memory_config: (2, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,90,160,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 14400 + d1 * 160 + d2, d3), memory_config: (450, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 14400 : i32, 512 : i32] | tensor<[1,1,14400,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 14400 + d1 * 160 + d2, d3), memory_config: (450, 16, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.01 | +| ttnn.reshape | tensor<[1,1,3600,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3600 + d1 * 80 + d2, d3), memory_config: (113, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 45 : i32, 80 : i32, 1024 : i32] | tensor<[1,45,80,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3600 + d1 * 80 + d2, d3), memory_config: (113, 32, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.01 | +| ttnn.reshape | tensor<[1,90,160,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 14400 + d1 * 160 + d2, d3), memory_config: (450, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 14400 : i32, 512 : i32] | tensor<[1,1,14400,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 14400 + d1 * 160 + d2, d3), memory_config: (450, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,3600,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3600 + d1 * 80 + d2, d3), memory_config: (113, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 45 : i32, 80 : i32, 1024 : i32] | tensor<[1,45,80,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3600 + d1 * 80 + d2, d3), memory_config: (113, 32, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,90,160,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 14400 + d1 * 160 + d2, d3), memory_config: (450, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 14400 : i32, 512 : i32] | 
tensor<[1,1,14400,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 14400 + d1 * 160 + d2, d3), memory_config: (450, 16, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.02 | +| ttnn.reshape | tensor<[1,90,160,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 14400 + d1 * 160 + d2, d3), memory_config: (450, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 14400 : i32, 512 : i32] | tensor<[1,1,14400,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 14400 + d1 * 160 + d2, d3), memory_config: (450, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,14400,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 14400 + d1 * 160 + d2, d3), memory_config: (450, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 90 : i32, 160 : i32, 128 : i32] | tensor<[1,90,160,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 14400 + d1 * 160 + d2, d3), memory_config: (450, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,14400,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 14400 + d1 * 160 + d2, d3), memory_config: (450, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 90 : i32, 160 : i32, 256 : i32] | tensor<[1,90,160,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 14400 + d1 * 160 + d2, d3), memory_config: (450, 8, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.02 | +| ttnn.reshape | tensor<[1,90,160,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 14400 + d1 * 160 + d2, d3), memory_config: (450, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 14400 : i32, 512 : i32] | tensor<[1,1,14400,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 14400 + d1 * 160 + d2, d3), memory_config: (450, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,14400,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 14400 + d1 * 160 + d2, d3), memory_config: (450, 8, 
'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 90 : i32, 160 : i32, 256 : i32] | tensor<[1,90,160,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 14400 + d1 * 160 + d2, d3), memory_config: (450, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,14,14,544,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 17, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 544 : i32] | tensor<[1,1,196,544,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 17, 'tile<32x32, bf16>', 'dram') | yes | 0.0 | 5.981525981032121e+37 | +| ttnn.reshape | tensor<[1,1,196,196,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 7, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 196 : i32] | tensor<[1,14,14,196,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 7, 'tile<32x32, bf16>', 'dram') | yes | 0.0 | 5.981525981032121e+37 | +| ttnn.reshape | tensor<[1,14,14,544,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 17, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 544 : i32] | tensor<[1,1,196,544,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 17, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,196,196,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 7, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 196 : i32] | tensor<[1,14,14,196,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 7, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,56,56,54,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 
3136 + d1 * 56 + d2, d3), memory_config: (98, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3136 : i32, 54 : i32] | tensor<[1,1,3136,54,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 2, 'tile<32x32, bf16>', 'dram') | yes | 0.05 | 6.214140880294482e+37 | +| ttnn.reshape | tensor<[1,1,3136,24,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 56 : i32, 56 : i32, 24 : i32] | tensor<[1,56,56,24,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.05 | 6.214140880294482e+37 | +| ttnn.reshape | tensor<[1,56,56,54,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3136 : i32, 54 : i32] | tensor<[1,1,3136,54,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,3136,24,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 56 : i32, 56 : i32, 24 : i32] | tensor<[1,56,56,24,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,14,14,576,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 18, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 576 : i32] | tensor<[1,1,196,576,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 18, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,196,576,bf16]> | 
mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 18, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 576 : i32] | tensor<[1,14,14,576,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 18, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,14,14,576,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 18, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 576 : i32] | tensor<[1,1,196,576,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 18, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,196,576,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 18, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 576 : i32] | tensor<[1,14,14,576,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 18, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,14,14,576,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 18, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 576 : i32] | tensor<[1,1,196,576,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 18, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,49,576,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 18, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 7 : i32, 576 : i32] | tensor<[1,7,7,576,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 18, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,14,14,576,bf16]> | 
mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 18, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 576 : i32] | tensor<[1,1,196,576,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 18, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,49,576,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 18, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 7 : i32, 576 : i32] | tensor<[1,7,7,576,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 18, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,14,14,576,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 18, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 576 : i32] | tensor<[1,1,196,576,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 18, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,196,96,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 3, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 96 : i32] | tensor<[1,14,14,96,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 3, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,14,14,576,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 18, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 576 : i32] | tensor<[1,1,196,576,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 18, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,196,96,bf16]> | 
mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 3, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 96 : i32] | tensor<[1,14,14,96,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 3, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,7,7,576,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 18, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 49 : i32, 576 : i32] | tensor<[1,1,49,576,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 18, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,49,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 7 : i32, 160 : i32] | tensor<[1,7,7,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 5, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,7,7,576,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 18, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 49 : i32, 576 : i32] | tensor<[1,1,49,576,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 18, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,49,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 7 : i32, 160 : i32] | tensor<[1,7,7,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 5, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,28,28,58,bf16]> | mapping_from: (d0, d1, d2, d3), 
mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 58 : i32] | tensor<[1,1,784,58,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 2, 'tile<32x32, bf16>', 'dram') | yes | 0.17 | 1.38 | +| ttnn.reshape | tensor<[1,1,784,20,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 20 : i32] | tensor<[1,28,28,20,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.17 | 1.38 | +| ttnn.reshape | tensor<[1,28,28,58,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 58 : i32] | tensor<[1,1,784,58,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,784,20,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 20 : i32] | tensor<[1,28,28,20,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,28,28,62,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 62 : i32] | tensor<[1,1,784,62,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 2, 'tile<32x32, bf16>', 'dram') | yes | 0.05 | 4.652297985247206e+37 | +| ttnn.reshape | tensor<[1,1,784,28,bf16]> | mapping_from: 
(d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 28 : i32] | tensor<[1,28,28,28,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.05 | 4.652297985247206e+37 | +| ttnn.reshape | tensor<[1,28,28,62,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 62 : i32] | tensor<[1,1,784,62,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,784,28,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 28 : i32] | tensor<[1,28,28,28,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16,16,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 256 : i32, 640 : i32] | tensor<[1,1,256,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 20, 'tile<32x32, bf16>', 'dram') | yes | 0.5 | 7.97 | +| ttnn.reshape | tensor<[1,1,256,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 16 : i32, 1280 : i32] | tensor<[1,16,16,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | yes | 0.5 | 7.97 | +| ttnn.reshape | 
tensor<[1280,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 40, 'tile<32x32, bf16>', 'dram') | shape: [1280 : i32, 1 : i32, 1 : i32] | tensor<[1280,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (40, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.5 | 7.97 | +| ttnn.reshape | tensor<[1,16,16,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 256 : i32, 640 : i32] | tensor<[1,1,256,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 20, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,256,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 16 : i32, 1280 : i32] | tensor<[1,16,16,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1280,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 40, 'tile<32x32, bf16>', 'dram') | shape: [1280 : i32, 1 : i32, 1 : i32] | tensor<[1280,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (40, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16,16,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 256 : i32, 640 : i32] | tensor<[1,1,256,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 20, 'tile<32x32, bf16>', 'dram') | yes | 0.78 | 22.75 | +| ttnn.reshape | tensor<[1,1,256,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 40, 
'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 16 : i32, 1280 : i32] | tensor<[1,16,16,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | yes | 0.78 | 22.75 | +| ttnn.reshape | tensor<[1280,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 40, 'tile<32x32, bf16>', 'dram') | shape: [1280 : i32, 1 : i32, 1 : i32] | tensor<[1280,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (40, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.78 | 22.75 | +| ttnn.reshape | tensor<[1,16,16,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 256 : i32, 640 : i32] | tensor<[1,1,256,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 20, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,256,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 16 : i32, 1280 : i32] | tensor<[1,16,16,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1280,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 40, 'tile<32x32, bf16>', 'dram') | shape: [1280 : i32, 1 : i32, 1 : i32] | tensor<[1280,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (40, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,32,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1024 : i32, 640 : i32] | tensor<[1,1,1024,640,bf16]> | 
mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | yes | 0.08 | 29.75 | +| ttnn.reshape | tensor<[1,1,1024,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 32 : i32, 640 : i32] | tensor<[1,32,32,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | yes | 0.08 | 29.75 | +| ttnn.reshape | tensor<[640,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 20, 'tile<32x32, bf16>', 'dram') | shape: [640 : i32, 1 : i32, 1 : i32] | tensor<[640,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (20, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.08 | 29.75 | +| ttnn.reshape | tensor<[1,32,32,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1024 : i32, 640 : i32] | tensor<[1,1,1024,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,1024,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 32 : i32, 640 : i32] | tensor<[1,32,32,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[640,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 20, 'tile<32x32, bf16>', 'dram') | shape: [640 : i32, 1 : i32, 1 : i32] | tensor<[640,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (20, 1, 'tile<32x32, 
bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,32,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1024 : i32, 640 : i32] | tensor<[1,1,1024,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.16 | +| ttnn.reshape | tensor<[1,32,32,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1024 : i32, 640 : i32] | tensor<[1,1,1024,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,1024,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 32 : i32, 640 : i32] | tensor<[1,32,32,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[640,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 20, 'tile<32x32, bf16>', 'dram') | shape: [640 : i32, 1 : i32, 1 : i32] | tensor<[640,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (20, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,32,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1024 : i32, 640 : i32] | tensor<[1,1,1024,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.22 | +| 
ttnn.reshape | tensor<[1,1,1024,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 32 : i32, 640 : i32] | tensor<[1,32,32,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.22 | +| ttnn.reshape | tensor<[640,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 20, 'tile<32x32, bf16>', 'dram') | shape: [640 : i32, 1 : i32, 1 : i32] | tensor<[640,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (20, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.22 | +| ttnn.reshape | tensor<[1,32,32,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1024 : i32, 640 : i32] | tensor<[1,1,1024,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,1024,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 32 : i32, 640 : i32] | tensor<[1,32,32,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[640,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 20, 'tile<32x32, bf16>', 'dram') | shape: [640 : i32, 1 : i32, 1 : i32] | tensor<[640,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (20, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,32,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), 
memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1024 : i32, 640 : i32] | tensor<[1,1,1024,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 1.62 | +| ttnn.reshape | tensor<[1,1,256,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 16 : i32, 640 : i32] | tensor<[1,16,16,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 20, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 1.62 | +| ttnn.reshape | tensor<[640,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 20, 'tile<32x32, bf16>', 'dram') | shape: [640 : i32, 1 : i32, 1 : i32] | tensor<[640,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (20, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 1.62 | +| ttnn.reshape | tensor<[1,32,32,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1024 : i32, 640 : i32] | tensor<[1,1,1024,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,256,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 16 : i32, 640 : i32] | tensor<[1,16,16,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 20, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[640,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 20, 'tile<32x32, bf16>', 'dram') | shape: [640 : i32, 1 : i32, 1 : i32] | 
tensor<[640,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (20, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,64,64,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 4096 : i32, 640 : i32] | tensor<[1,1,4096,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 20, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.11 | +| ttnn.reshape | tensor<[1,1,4096,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 64 : i32, 320 : i32] | tensor<[1,64,64,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.11 | +| ttnn.reshape | tensor<[320,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 10, 'tile<32x32, bf16>', 'dram') | shape: [320 : i32, 1 : i32, 1 : i32] | tensor<[320,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (10, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.11 | +| ttnn.reshape | tensor<[1,64,64,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 4096 : i32, 640 : i32] | tensor<[1,1,4096,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 20, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,4096,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 64 : i32, 320 : i32] | tensor<[1,64,64,320,bf16]> | mapping_from: (d0, d1, d2, d3), 
mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[320,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 10, 'tile<32x32, bf16>', 'dram') | shape: [320 : i32, 1 : i32, 1 : i32] | tensor<[320,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (10, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,64,64,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 4096 : i32, 640 : i32] | tensor<[1,1,4096,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 20, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,4096,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 64 : i32, 320 : i32] | tensor<[1,64,64,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[320,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 10, 'tile<32x32, bf16>', 'dram') | shape: [320 : i32, 1 : i32, 1 : i32] | tensor<[320,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (10, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,64,64,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 4096 : i32, 640 : i32] | tensor<[1,1,4096,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 20, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| 
ttnn.reshape | tensor<[1,1,4096,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 64 : i32, 320 : i32] | tensor<[1,64,64,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[320,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 10, 'tile<32x32, bf16>', 'dram') | shape: [320 : i32, 1 : i32, 1 : i32] | tensor<[320,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (10, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,64,64,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 4096 : i32, 640 : i32] | tensor<[1,1,4096,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 20, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,4096,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 64 : i32, 640 : i32] | tensor<[1,64,64,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 20, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[640,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 20, 'tile<32x32, bf16>', 'dram') | shape: [640 : i32, 1 : i32, 1 : i32] | tensor<[640,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (20, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,64,64,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), 
memory_config: (128, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 4096 : i32, 640 : i32] | tensor<[1,1,4096,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 20, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,4096,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 64 : i32, 640 : i32] | tensor<[1,64,64,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 20, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[640,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 20, 'tile<32x32, bf16>', 'dram') | shape: [640 : i32, 1 : i32, 1 : i32] | tensor<[640,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (20, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,7,7,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 49 : i32, 640 : i32] | tensor<[1,1,49,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 20, 'tile<32x32, bf16>', 'dram') | yes | 0.03 | 9.570441569651394e+37 | +| ttnn.reshape | tensor<[1,1,49,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 7 : i32, 160 : i32] | tensor<[1,7,7,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 5, 'tile<32x32, bf16>', 'dram') | yes | 0.03 | 9.570441569651394e+37 | +| ttnn.reshape | tensor<[1,7,7,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 20, 'tile<32x32, 
bf16>', 'dram') | shape: [1 : i32, 1 : i32, 49 : i32, 640 : i32] | tensor<[1,1,49,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 20, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,49,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 7 : i32, 160 : i32] | tensor<[1,7,7,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 5, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,112,112,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 12544 : i32, 64 : i32] | tensor<[1,1,12544,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 2, 'tile<32x32, bf16>', 'dram') | yes | 0.89 | 0.24 | +| ttnn.reshape | tensor<[1,1,12544,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 112 : i32, 112 : i32, 128 : i32] | tensor<[1,112,112,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 4, 'tile<32x32, bf16>', 'dram') | yes | 0.89 | 0.24 | +| ttnn.reshape | tensor<[1,112,112,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 12544 : i32, 64 : i32] | tensor<[1,1,12544,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,12544,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, 
d3), memory_config: (392, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 112 : i32, 112 : i32, 128 : i32] | tensor<[1,112,112,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,112,112,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 12544 : i32, 64 : i32] | tensor<[1,1,12544,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 2, 'tile<32x32, bf16>', 'dram') | yes | 0.43 | 11.75 | +| ttnn.reshape | tensor<[1,1,3136,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 56 : i32, 56 : i32, 64 : i32] | tensor<[1,56,56,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 2, 'tile<32x32, bf16>', 'dram') | yes | 0.43 | 11.75 | +| ttnn.reshape | tensor<[1,112,112,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 12544 : i32, 64 : i32] | tensor<[1,1,12544,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,3136,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 56 : i32, 56 : i32, 64 : i32] | tensor<[1,56,56,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,120,160,64,bf16]> | mapping_from: (d0, 
d1, d2, d3), mapping_to: (d0 * 19200 + d1 * 160 + d2, d3), memory_config: (600, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 19200 : i32, 64 : i32] | tensor<[1,1,19200,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 19200 + d1 * 160 + d2, d3), memory_config: (600, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.12 | +| ttnn.reshape | tensor<[1,1,4800,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4800 + d1 * 80 + d2, d3), memory_config: (150, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 60 : i32, 80 : i32, 128 : i32] | tensor<[1,60,80,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4800 + d1 * 80 + d2, d3), memory_config: (150, 4, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.12 | +| ttnn.reshape | tensor<[128,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 4, 'tile<32x32, bf16>', 'dram') | shape: [128 : i32, 1 : i32, 1 : i32] | tensor<[128,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (4, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.12 | +| ttnn.reshape | tensor<[1,120,160,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 19200 + d1 * 160 + d2, d3), memory_config: (600, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 19200 : i32, 64 : i32] | tensor<[1,1,19200,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 19200 + d1 * 160 + d2, d3), memory_config: (600, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,4800,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4800 + d1 * 80 + d2, d3), memory_config: (150, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 60 : i32, 80 : i32, 128 : i32] | tensor<[1,60,80,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4800 + d1 * 80 + d2, d3), memory_config: (150, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[128,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 
4, 'tile<32x32, bf16>', 'dram') | shape: [128 : i32, 1 : i32, 1 : i32] | tensor<[128,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (4, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,120,160,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 19200 + d1 * 160 + d2, d3), memory_config: (600, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 19200 : i32, 64 : i32] | tensor<[1,1,19200,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 19200 + d1 * 160 + d2, d3), memory_config: (600, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.31 | +| ttnn.reshape | tensor<[1,1,19200,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 19200 + d1 * 160 + d2, d3), memory_config: (600, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 120 : i32, 160 : i32, 32 : i32] | tensor<[1,120,160,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 19200 + d1 * 160 + d2, d3), memory_config: (600, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.31 | +| ttnn.reshape | tensor<[32,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [32 : i32, 1 : i32, 1 : i32] | tensor<[32,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.31 | +| ttnn.reshape | tensor<[1,120,160,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 19200 + d1 * 160 + d2, d3), memory_config: (600, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 19200 : i32, 64 : i32] | tensor<[1,1,19200,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 19200 + d1 * 160 + d2, d3), memory_config: (600, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,19200,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 19200 + d1 * 160 + d2, d3), memory_config: (600, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 120 : i32, 160 
: i32, 32 : i32] | tensor<[1,120,160,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 19200 + d1 * 160 + d2, d3), memory_config: (600, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[32,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [32 : i32, 1 : i32, 1 : i32] | tensor<[32,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,120,160,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 19200 + d1 * 160 + d2, d3), memory_config: (600, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 19200 : i32, 64 : i32] | tensor<[1,1,19200,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 19200 + d1 * 160 + d2, d3), memory_config: (600, 2, 'tile<32x32, bf16>', 'dram') | yes | 0.15 | 122.5 | +| ttnn.reshape | tensor<[1,1,300,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 300 + d1 * 20 + d2, d3), memory_config: (10, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 15 : i32, 20 : i32, 64 : i32] | tensor<[1,15,20,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 300 + d1 * 20 + d2, d3), memory_config: (10, 2, 'tile<32x32, bf16>', 'dram') | yes | 0.15 | 122.5 | +| ttnn.reshape | tensor<[64,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [64 : i32, 1 : i32, 1 : i32] | tensor<[64,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (2, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.15 | 122.5 | +| ttnn.reshape | tensor<[1,120,160,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 19200 + d1 * 160 + d2, d3), memory_config: (600, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 19200 : i32, 64 : i32] | tensor<[1,1,19200,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 19200 + d1 * 160 + d2, 
d3), memory_config: (600, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,300,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 300 + d1 * 20 + d2, d3), memory_config: (10, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 15 : i32, 20 : i32, 64 : i32] | tensor<[1,15,20,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 300 + d1 * 20 + d2, d3), memory_config: (10, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[64,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [64 : i32, 1 : i32, 1 : i32] | tensor<[64,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (2, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,128,128,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 16384 : i32, 64 : i32] | tensor<[1,1,16384,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.38 | +| ttnn.reshape | tensor<[1,1,16384,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 128 : i32, 128 : i32, 128 : i32] | tensor<[1,128,128,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 4, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.38 | +| ttnn.reshape | tensor<[1,128,128,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 16384 : i32, 64 : i32] | tensor<[1,1,16384,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 2, 
'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,16384,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 128 : i32, 128 : i32, 128 : i32] | tensor<[1,128,128,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,128,128,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 16384 : i32, 64 : i32] | tensor<[1,1,16384,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.02 | +| ttnn.reshape | tensor<[1,128,128,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 16384 : i32, 64 : i32] | tensor<[1,1,16384,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,16384,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 128 : i32, 128 : i32, 64 : i32] | tensor<[1,128,128,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,14,14,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 64 : i32] | tensor<[1,1,196,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: 
(d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 2, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,196,384,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 12, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 384 : i32] | tensor<[1,14,14,384,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 12, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,14,14,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 64 : i32] | tensor<[1,1,196,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,196,384,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 12, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 384 : i32] | tensor<[1,14,14,384,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 12, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,160,160,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25600 + d1 * 160 + d2, d3), memory_config: (800, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 25600 : i32, 64 : i32] | tensor<[1,1,25600,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25600 + d1 * 160 + d2, d3), memory_config: (800, 2, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.06 | +| ttnn.reshape | tensor<[1,1,6400,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6400 + d1 * 80 + d2, d3), memory_config: (200, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 80 : i32, 80 : i32, 64 : i32] | tensor<[1,80,80,64,f32]> | mapping_from: (d0, d1, d2, d3), 
mapping_to: (d0 * 6400 + d1 * 80 + d2, d3), memory_config: (200, 2, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.06 | +| ttnn.reshape | tensor<[1,160,160,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25600 + d1 * 160 + d2, d3), memory_config: (800, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 25600 : i32, 64 : i32] | tensor<[1,1,25600,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 25600 + d1 * 160 + d2, d3), memory_config: (800, 2, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,6400,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6400 + d1 * 80 + d2, d3), memory_config: (200, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 80 : i32, 80 : i32, 64 : i32] | tensor<[1,80,80,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6400 + d1 * 80 + d2, d3), memory_config: (200, 2, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,180,320,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 57600 + d1 * 320 + d2, d3), memory_config: (1800, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 57600 : i32, 64 : i32] | tensor<[1,1,57600,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 57600 + d1 * 320 + d2, d3), memory_config: (1800, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1,57600,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 57600 + d1 * 320 + d2, d3), memory_config: (1800, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 180 : i32, 320 : i32, 256 : i32] | tensor<[1,180,320,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 57600 + d1 * 320 + d2, d3), memory_config: (1800, 8, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,180,320,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 57600 + d1 * 320 + d2, d3), memory_config: (1800, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 57600 : i32, 64 : i32] | 
tensor<[1,1,57600,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 57600 + d1 * 320 + d2, d3), memory_config: (1800, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,57600,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 57600 + d1 * 320 + d2, d3), memory_config: (1800, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 180 : i32, 320 : i32, 256 : i32] | tensor<[1,180,320,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 57600 + d1 * 320 + d2, d3), memory_config: (1800, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,180,320,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 57600 + d1 * 320 + d2, d3), memory_config: (1800, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 57600 : i32, 64 : i32] | tensor<[1,1,57600,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 57600 + d1 * 320 + d2, d3), memory_config: (1800, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.01 | +| ttnn.reshape | tensor<[1,180,320,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 57600 + d1 * 320 + d2, d3), memory_config: (1800, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 57600 : i32, 64 : i32] | tensor<[1,1,57600,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 57600 + d1 * 320 + d2, d3), memory_config: (1800, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,57600,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 57600 + d1 * 320 + d2, d3), memory_config: (1800, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 180 : i32, 320 : i32, 64 : i32] | tensor<[1,180,320,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 57600 + d1 * 320 + d2, d3), memory_config: (1800, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,180,320,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 57600 + d1 * 320 + d2, d3), memory_config: (1800, 2, 
'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 57600 : i32, 64 : i32] | tensor<[1,1,57600,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 57600 + d1 * 320 + d2, d3), memory_config: (1800, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.03 | +| ttnn.reshape | tensor<[1,1,57600,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 57600 + d1 * 320 + d2, d3), memory_config: (1800, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 180 : i32, 320 : i32, 64 : i32] | tensor<[1,180,320,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 57600 + d1 * 320 + d2, d3), memory_config: (1800, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.03 | +| ttnn.reshape | tensor<[1,180,320,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 57600 + d1 * 320 + d2, d3), memory_config: (1800, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 57600 : i32, 64 : i32] | tensor<[1,1,57600,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 57600 + d1 * 320 + d2, d3), memory_config: (1800, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,57600,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 57600 + d1 * 320 + d2, d3), memory_config: (1800, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 180 : i32, 320 : i32, 64 : i32] | tensor<[1,180,320,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 57600 + d1 * 320 + d2, d3), memory_config: (1800, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,1,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 64 : i32] | tensor<[1,1,1,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 2, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1,1,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: 
(1, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 64 : i32] | tensor<[1,1,1,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 2, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,1,128,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 128 : i32] | tensor<[1,1,1,128,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,224,224,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 50176 + d1 * 224 + d2, d3), memory_config: (1568, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 50176 : i32, 64 : i32] | tensor<[1,1,50176,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 50176 + d1 * 224 + d2, d3), memory_config: (1568, 2, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,50176,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 50176 + d1 * 224 + d2, d3), memory_config: (1568, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 224 : i32, 224 : i32, 1 : i32] | tensor<[1,224,224,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 50176 + d1 * 224 + d2, d3), memory_config: (1568, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32] | tensor<[1,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,224,224,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 50176 + d1 * 224 + d2, d3), memory_config: (1568, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 50176 : i32, 64 : i32] | 
tensor<[1,1,50176,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 50176 + d1 * 224 + d2, d3), memory_config: (1568, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,50176,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 50176 + d1 * 224 + d2, d3), memory_config: (1568, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 224 : i32, 224 : i32, 1 : i32] | tensor<[1,224,224,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 50176 + d1 * 224 + d2, d3), memory_config: (1568, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32] | tensor<[1,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,224,224,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 50176 + d1 * 224 + d2, d3), memory_config: (1568, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 50176 : i32, 64 : i32] | tensor<[1,1,50176,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 50176 + d1 * 224 + d2, d3), memory_config: (1568, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.02 | +| ttnn.reshape | tensor<[1,1,50176,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 50176 + d1 * 224 + d2, d3), memory_config: (1568, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 224 : i32, 224 : i32, 64 : i32] | tensor<[1,224,224,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 50176 + d1 * 224 + d2, d3), memory_config: (1568, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.02 | +| ttnn.reshape | tensor<[1,224,224,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 50176 + d1 * 224 + d2, d3), memory_config: (1568, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 50176 : i32, 64 : i32] | 
tensor<[1,1,50176,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 50176 + d1 * 224 + d2, d3), memory_config: (1568, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,50176,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 50176 + d1 * 224 + d2, d3), memory_config: (1568, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 224 : i32, 224 : i32, 64 : i32] | tensor<[1,224,224,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 50176 + d1 * 224 + d2, d3), memory_config: (1568, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,256,256,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 65536 + d1 * 256 + d2, d3), memory_config: (2048, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 65536 : i32, 64 : i32] | tensor<[1,1,65536,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 65536 + d1 * 256 + d2, d3), memory_config: (2048, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.5 | +| ttnn.reshape | tensor<[1,1,16384,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 128 : i32, 128 : i32, 128 : i32] | tensor<[1,128,128,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 4, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.5 | +| ttnn.reshape | tensor<[1,256,256,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 65536 + d1 * 256 + d2, d3), memory_config: (2048, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 65536 : i32, 64 : i32] | tensor<[1,1,65536,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 65536 + d1 * 256 + d2, d3), memory_config: (2048, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,16384,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 4, 
'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 128 : i32, 128 : i32, 128 : i32] | tensor<[1,128,128,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,256,256,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 65536 + d1 * 256 + d2, d3), memory_config: (2048, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 65536 : i32, 64 : i32] | tensor<[1,1,65536,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 65536 + d1 * 256 + d2, d3), memory_config: (2048, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.12 | +| ttnn.reshape | tensor<[1,1,65536,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 65536 + d1 * 256 + d2, d3), memory_config: (2048, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 256 : i32, 32 : i32] | tensor<[1,256,256,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 65536 + d1 * 256 + d2, d3), memory_config: (2048, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.12 | +| ttnn.reshape | tensor<[1,256,256,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 65536 + d1 * 256 + d2, d3), memory_config: (2048, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 65536 : i32, 64 : i32] | tensor<[1,1,65536,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 65536 + d1 * 256 + d2, d3), memory_config: (2048, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,65536,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 65536 + d1 * 256 + d2, d3), memory_config: (2048, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 256 : i32, 32 : i32] | tensor<[1,256,256,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 65536 + d1 * 256 + d2, d3), memory_config: (2048, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,256,256,64,bf16]> | mapping_from: (d0, d1, 
d2, d3), mapping_to: (d0 * 65536 + d1 * 256 + d2, d3), memory_config: (2048, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 65536 : i32, 64 : i32] | tensor<[1,1,65536,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 65536 + d1 * 256 + d2, d3), memory_config: (2048, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.06 | +| ttnn.reshape | tensor<[1,1,65536,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 65536 + d1 * 256 + d2, d3), memory_config: (2048, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 256 : i32, 32 : i32] | tensor<[1,256,256,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 65536 + d1 * 256 + d2, d3), memory_config: (2048, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.06 | +| ttnn.reshape | tensor<[1,256,256,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 65536 + d1 * 256 + d2, d3), memory_config: (2048, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 65536 : i32, 64 : i32] | tensor<[1,1,65536,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 65536 + d1 * 256 + d2, d3), memory_config: (2048, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,65536,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 65536 + d1 * 256 + d2, d3), memory_config: (2048, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 256 : i32, 32 : i32] | tensor<[1,256,256,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 65536 + d1 * 256 + d2, d3), memory_config: (2048, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,2,2,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4 + d1 * 2 + d2, d3), memory_config: (1, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 4 : i32, 64 : i32] | tensor<[1,1,4,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4 + d1 * 2 + d2, d3), memory_config: (1, 2, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | 
tensor<[1,1,1,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 64 : i32] | tensor<[1,1,1,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 2, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,2,2,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4 + d1 * 2 + d2, d3), memory_config: (1, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 4 : i32, 64 : i32] | tensor<[1,1,4,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4 + d1 * 2 + d2, d3), memory_config: (1, 2, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,1,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 64 : i32] | tensor<[1,1,1,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 2, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,30,40,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1200 + d1 * 40 + d2, d3), memory_config: (38, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1200 : i32, 64 : i32] | tensor<[1,1,1200,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1200 + d1 * 40 + d2, d3), memory_config: (38, 2, 'tile<32x32, bf16>', 'dram') | yes | -0.0 | 4.918143584404189e+37 | +| ttnn.reshape | tensor<[1,1,1200,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1200 + d1 * 40 + d2, d3), memory_config: (38, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 30 : i32, 40 : i32, 32 : i32] | tensor<[1,30,40,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1200 + d1 * 40 + d2, d3), memory_config: (38, 1, 'tile<32x32, bf16>', 'dram') | yes | -0.0 | 4.918143584404189e+37 | +| ttnn.reshape | tensor<[32,bf16]> | mapping_from: (d0), 
mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [32 : i32, 1 : i32, 1 : i32] | tensor<[32,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | yes | -0.0 | 4.918143584404189e+37 | +| ttnn.reshape | tensor<[1,30,40,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1200 + d1 * 40 + d2, d3), memory_config: (38, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1200 : i32, 64 : i32] | tensor<[1,1,1200,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1200 + d1 * 40 + d2, d3), memory_config: (38, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,1200,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1200 + d1 * 40 + d2, d3), memory_config: (38, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 30 : i32, 40 : i32, 32 : i32] | tensor<[1,30,40,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1200 + d1 * 40 + d2, d3), memory_config: (38, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[32,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [32 : i32, 1 : i32, 1 : i32] | tensor<[32,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,480,640,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 307200 + d1 * 640 + d2, d3), memory_config: (9600, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 307200 : i32, 64 : i32] | tensor<[1,1,307200,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 307200 + d1 * 640 + d2, d3), memory_config: (9600, 2, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,307200,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 307200 + d1 * 640 + d2, d3), memory_config: (9600, 1, 'tile<32x32, bf16>', 
'dram') | shape: [1 : i32, 480 : i32, 640 : i32, 1 : i32] | tensor<[1,480,640,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 307200 + d1 * 640 + d2, d3), memory_config: (9600, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32] | tensor<[1,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,480,640,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 307200 + d1 * 640 + d2, d3), memory_config: (9600, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 307200 : i32, 64 : i32] | tensor<[1,1,307200,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 307200 + d1 * 640 + d2, d3), memory_config: (9600, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,307200,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 307200 + d1 * 640 + d2, d3), memory_config: (9600, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 480 : i32, 640 : i32, 1 : i32] | tensor<[1,480,640,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 307200 + d1 * 640 + d2, d3), memory_config: (9600, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32] | tensor<[1,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,480,640,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 307200 + d1 * 640 + d2, d3), memory_config: (9600, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 307200 : i32, 64 : i32] | tensor<[1,1,307200,64,bf16]> | mapping_from: (d0, d1, 
d2, d3), mapping_to: (d0 * 307200 + d1 * 640 + d2, d3), memory_config: (9600, 2, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,307200,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 307200 + d1 * 640 + d2, d3), memory_config: (9600, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 480 : i32, 640 : i32, 64 : i32] | tensor<[1,480,640,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 307200 + d1 * 640 + d2, d3), memory_config: (9600, 2, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[64,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [64 : i32, 1 : i32, 1 : i32] | tensor<[64,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (2, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,480,640,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 307200 + d1 * 640 + d2, d3), memory_config: (9600, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 307200 : i32, 64 : i32] | tensor<[1,1,307200,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 307200 + d1 * 640 + d2, d3), memory_config: (9600, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,307200,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 307200 + d1 * 640 + d2, d3), memory_config: (9600, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 480 : i32, 640 : i32, 64 : i32] | tensor<[1,480,640,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 307200 + d1 * 640 + d2, d3), memory_config: (9600, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[64,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [64 : i32, 1 : i32, 1 : i32] | tensor<[64,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (2, 1, 'tile<32x32, bf16>', 
'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,56,56,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3136 : i32, 64 : i32] | tensor<[1,1,3136,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 2, 'tile<32x32, bf16>', 'dram') | yes | 0.33 | 21.88 | +| ttnn.reshape | tensor<[1,1,3136,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 56 : i32, 56 : i32, 128 : i32] | tensor<[1,56,56,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 4, 'tile<32x32, bf16>', 'dram') | yes | 0.33 | 21.88 | +| ttnn.reshape | tensor<[1,56,56,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3136 : i32, 64 : i32] | tensor<[1,1,3136,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,3136,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 56 : i32, 56 : i32, 128 : i32] | tensor<[1,56,56,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,56,56,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3136 : i32, 64 : i32] | tensor<[1,1,3136,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: 
(98, 2, 'tile<32x32, bf16>', 'dram') | yes | 0.57 | 2.28 | +| ttnn.reshape | tensor<[1,1,784,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 128 : i32] | tensor<[1,28,28,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | yes | 0.57 | 2.28 | +| ttnn.reshape | tensor<[1,56,56,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3136 : i32, 64 : i32] | tensor<[1,1,3136,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,784,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 128 : i32] | tensor<[1,28,28,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,56,56,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3136 : i32, 64 : i32] | tensor<[1,1,3136,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 2, 'tile<32x32, bf16>', 'dram') | yes | -0.03 | 1.2494743160378207e+38 | +| ttnn.reshape | tensor<[1,1,784,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 128 : i32] | tensor<[1,28,28,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 
784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | yes | -0.03 | 1.2494743160378207e+38 | +| ttnn.reshape | tensor<[1,56,56,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3136 : i32, 64 : i32] | tensor<[1,1,3136,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,784,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 128 : i32] | tensor<[1,28,28,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,56,56,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3136 : i32, 64 : i32] | tensor<[1,1,3136,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 2, 'tile<32x32, bf16>', 'dram') | yes | 0.03 | 5.4469361728556725e+29 | +| ttnn.reshape | tensor<[1,1,3136,14,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 56 : i32, 56 : i32, 14 : i32] | tensor<[1,56,56,14,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.03 | 5.4469361728556725e+29 | +| ttnn.reshape | tensor<[1,56,56,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3136 : i32, 64 : i32] | 
tensor<[1,1,3136,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,3136,14,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 56 : i32, 56 : i32, 14 : i32] | tensor<[1,56,56,14,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,56,56,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3136 : i32, 64 : i32] | tensor<[1,1,3136,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 2, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,3136,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 56 : i32, 56 : i32, 256 : i32] | tensor<[1,56,56,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 8, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,56,56,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3136 : i32, 64 : i32] | tensor<[1,1,3136,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,3136,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 56 : i32, 56 : 
i32, 256 : i32] | tensor<[1,56,56,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,56,56,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3136 : i32, 64 : i32] | tensor<[1,1,3136,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 2, 'tile<32x32, bf16>', 'dram') | yes | 0.56 | 85.0 | +| ttnn.reshape | tensor<[1,1,3136,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 56 : i32, 56 : i32, 64 : i32] | tensor<[1,56,56,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 2, 'tile<32x32, bf16>', 'dram') | yes | 0.56 | 85.0 | +| ttnn.reshape | tensor<[1,56,56,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3136 : i32, 64 : i32] | tensor<[1,1,3136,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,3136,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 56 : i32, 56 : i32, 64 : i32] | tensor<[1,56,56,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,56,56,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : 
i32, 1 : i32, 3136 : i32, 64 : i32] | tensor<[1,1,3136,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 2, 'tile<32x32, bf16>', 'dram') | yes | 0.58 | 3.89 | +| ttnn.reshape | tensor<[1,1,3136,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 56 : i32, 56 : i32, 64 : i32] | tensor<[1,56,56,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 2, 'tile<32x32, bf16>', 'dram') | yes | 0.58 | 3.89 | +| ttnn.reshape | tensor<[1,56,56,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3136 : i32, 64 : i32] | tensor<[1,1,3136,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,3136,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 56 : i32, 56 : i32, 64 : i32] | tensor<[1,56,56,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,60,80,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4800 + d1 * 80 + d2, d3), memory_config: (150, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 4800 : i32, 64 : i32] | tensor<[1,1,4800,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4800 + d1 * 80 + d2, d3), memory_config: (150, 2, 'tile<32x32, bf16>', 'dram') | yes | 0.64 | 8.25 | +| ttnn.reshape | tensor<[1,1,4800,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4800 + d1 * 80 + d2, d3), memory_config: (150, 1, 'tile<32x32, 
bf16>', 'dram') | shape: [1 : i32, 60 : i32, 80 : i32, 32 : i32] | tensor<[1,60,80,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4800 + d1 * 80 + d2, d3), memory_config: (150, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.64 | 8.25 | +| ttnn.reshape | tensor<[32,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [32 : i32, 1 : i32, 1 : i32] | tensor<[32,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.64 | 8.25 | +| ttnn.reshape | tensor<[1,60,80,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4800 + d1 * 80 + d2, d3), memory_config: (150, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 4800 : i32, 64 : i32] | tensor<[1,1,4800,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4800 + d1 * 80 + d2, d3), memory_config: (150, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,4800,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4800 + d1 * 80 + d2, d3), memory_config: (150, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 60 : i32, 80 : i32, 32 : i32] | tensor<[1,60,80,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4800 + d1 * 80 + d2, d3), memory_config: (150, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[32,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [32 : i32, 1 : i32, 1 : i32] | tensor<[32,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,64,64,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 4096 : i32, 64 : i32] | tensor<[1,1,4096,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: 
(d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.02 | +| ttnn.reshape | tensor<[1,1,4096,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 64 : i32, 128 : i32] | tensor<[1,64,64,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 4, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.02 | +| ttnn.reshape | tensor<[1,64,64,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 4096 : i32, 64 : i32] | tensor<[1,1,4096,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,4096,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 64 : i32, 128 : i32] | tensor<[1,64,64,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,64,64,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 4096 : i32, 64 : i32] | tensor<[1,1,4096,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.12 | +| ttnn.reshape | tensor<[1,1,1024,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 32 : i32, 160 : i32] | tensor<[1,32,32,160,bf16]> | 
mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 5, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.12 | +| ttnn.reshape | tensor<[160,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 5, 'tile<32x32, bf16>', 'dram') | shape: [160 : i32, 1 : i32, 1 : i32] | tensor<[160,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (5, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.12 | +| ttnn.reshape | tensor<[1,64,64,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 4096 : i32, 64 : i32] | tensor<[1,1,4096,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,1024,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 32 : i32, 160 : i32] | tensor<[1,32,32,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 5, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[160,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 5, 'tile<32x32, bf16>', 'dram') | shape: [160 : i32, 1 : i32, 1 : i32] | tensor<[160,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (5, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,64,64,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 4096 : i32, 64 : i32] | tensor<[1,1,4096,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 2, 'tile<32x32, bf16>', 'dram') | yes | 
0.17 | 32.75 | +| ttnn.reshape | tensor<[1,1,256,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 16 : i32, 64 : i32] | tensor<[1,16,16,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 2, 'tile<32x32, bf16>', 'dram') | yes | 0.17 | 32.75 | +| ttnn.reshape | tensor<[64,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [64 : i32, 1 : i32, 1 : i32] | tensor<[64,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (2, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.17 | 32.75 | +| ttnn.reshape | tensor<[1,64,64,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 4096 : i32, 64 : i32] | tensor<[1,1,4096,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,256,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 16 : i32, 64 : i32] | tensor<[1,16,16,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[64,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [64 : i32, 1 : i32, 1 : i32] | tensor<[64,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (2, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,80,80,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6400 + d1 * 80 + d2, d3), memory_config: 
(200, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 6400 : i32, 64 : i32] | tensor<[1,1,6400,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6400 + d1 * 80 + d2, d3), memory_config: (200, 2, 'tile<32x32, f32>', 'dram') | yes | 0.68 | 7.57 | +| ttnn.reshape | tensor<[1,1,6400,24,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6400 + d1 * 80 + d2, d3), memory_config: (200, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 80 : i32, 80 : i32, 24 : i32] | tensor<[1,80,80,24,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6400 + d1 * 80 + d2, d3), memory_config: (200, 1, 'tile<32x32, f32>', 'dram') | yes | 0.68 | 7.57 | +| ttnn.reshape | tensor<[1,80,80,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6400 + d1 * 80 + d2, d3), memory_config: (200, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 6400 : i32, 64 : i32] | tensor<[1,1,6400,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6400 + d1 * 80 + d2, d3), memory_config: (200, 2, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,6400,24,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6400 + d1 * 80 + d2, d3), memory_config: (200, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 80 : i32, 80 : i32, 24 : i32] | tensor<[1,80,80,24,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6400 + d1 * 80 + d2, d3), memory_config: (200, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,14,14,654,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 21, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 654 : i32] | tensor<[1,1,196,654,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 21, 'tile<32x32, bf16>', 'dram') | yes | 0.08 | 1.82 | +| ttnn.reshape | tensor<[1,1,196,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), 
memory_config: (7, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 640 : i32] | tensor<[1,14,14,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 20, 'tile<32x32, bf16>', 'dram') | yes | 0.08 | 1.82 | +| ttnn.reshape | tensor<[1,14,14,654,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 21, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 654 : i32] | tensor<[1,1,196,654,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 21, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,196,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 640 : i32] | tensor<[1,14,14,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 20, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,10,10,672,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 21, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 100 : i32, 672 : i32] | tensor<[1,1,100,672,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 21, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.01 | +| ttnn.reshape | tensor<[1,10,10,672,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 21, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 100 : i32, 672 : i32] | tensor<[1,1,100,672,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 21, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,100,80,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, 
d3), memory_config: (4, 3, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 10 : i32, 10 : i32, 80 : i32] | tensor<[1,10,10,80,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 3, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[168,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 6, 'tile<32x32, f32>', 'dram') | shape: [168 : i32, 1 : i32, 1 : i32] | tensor<[168,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (6, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.01 | +| ttnn.reshape | tensor<[1,1,1,672,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 21, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 672 : i32] | tensor<[1,1,1,672,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 21, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,1,168,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 6, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 168 : i32] | tensor<[1,1,1,168,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 6, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[168,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 6, 'tile<32x32, f32>', 'dram') | shape: [168 : i32, 1 : i32, 1 : i32] | tensor<[168,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (6, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,20,20,672,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 21, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 400 : i32, 672 : i32] | tensor<[1,1,400,672,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: 
(13, 21, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,20,20,672,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 21, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 400 : i32, 672 : i32] | tensor<[1,1,400,672,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 21, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,400,112,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 4, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 20 : i32, 20 : i32, 112 : i32] | tensor<[1,20,20,112,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 4, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,20,20,672,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 21, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 400 : i32, 672 : i32] | tensor<[1,1,400,672,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 21, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.06 | +| ttnn.reshape | tensor<[1,1,400,24,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 20 : i32, 20 : i32, 24 : i32] | tensor<[1,20,20,24,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.06 | +| ttnn.reshape | tensor<[24,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [24 : i32, 1 : i32, 1 : i32] | tensor<[24,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.06 | +| ttnn.reshape | 
tensor<[1,20,20,672,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 21, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 400 : i32, 672 : i32] | tensor<[1,1,400,672,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 21, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,400,24,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 20 : i32, 20 : i32, 24 : i32] | tensor<[1,20,20,24,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[24,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [24 : i32, 1 : i32, 1 : i32] | tensor<[24,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,20,20,672,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 21, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 400 : i32, 672 : i32] | tensor<[1,1,400,672,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 21, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.31 | +| ttnn.reshape | tensor<[1,1,400,546,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 18, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 20 : i32, 20 : i32, 546 : i32] | tensor<[1,20,20,546,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 18, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.31 | +| ttnn.reshape | tensor<[1,20,20,672,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 
* 20 + d2, d3), memory_config: (13, 21, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 400 : i32, 672 : i32] | tensor<[1,1,400,672,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 21, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,400,546,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 18, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 20 : i32, 20 : i32, 546 : i32] | tensor<[1,20,20,546,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 18, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[546,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 18, 'tile<32x32, f32>', 'dram') | shape: [546 : i32, 1 : i32, 1 : i32] | tensor<[546,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (18, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,20,20,672,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 21, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 400 : i32, 672 : i32] | tensor<[1,1,400,672,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 21, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.02 | +| ttnn.reshape | tensor<[1,1,400,672,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 21, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 20 : i32, 20 : i32, 672 : i32] | tensor<[1,20,20,672,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 21, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.02 | +| ttnn.reshape | tensor<[1,20,20,672,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 21, 'tile<32x32, f32>', 'dram') | shape: 
[1 : i32, 1 : i32, 400 : i32, 672 : i32] | tensor<[1,1,400,672,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 21, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,400,672,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 21, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 20 : i32, 20 : i32, 672 : i32] | tensor<[1,20,20,672,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 21, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,20,20,672,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 21, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 400 : i32, 672 : i32] | tensor<[1,1,400,672,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 21, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,100,672,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 21, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 10 : i32, 10 : i32, 672 : i32] | tensor<[1,10,10,672,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 21, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,20,20,672,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 21, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 400 : i32, 672 : i32] | tensor<[1,1,400,672,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 21, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,100,672,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 21, 'tile<32x32, f32>', 'dram') | shape: [1 : 
i32, 10 : i32, 10 : i32, 672 : i32] | tensor<[1,10,10,672,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 21, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,14,14,68,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 3, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 68 : i32] | tensor<[1,1,196,68,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 3, 'tile<32x32, bf16>', 'dram') | yes | 0.28 | 1.45 | +| ttnn.reshape | tensor<[1,1,196,40,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 40 : i32] | tensor<[1,14,14,40,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 2, 'tile<32x32, bf16>', 'dram') | yes | 0.28 | 1.45 | +| ttnn.reshape | tensor<[1,14,14,68,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 3, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 68 : i32] | tensor<[1,1,196,68,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 3, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,196,40,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 40 : i32] | tensor<[1,14,14,40,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,1,72,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 3, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 
72 : i32] | tensor<[1,1,1,72,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 3, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,1,24,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 24 : i32] | tensor<[1,1,1,24,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[24,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [24 : i32, 1 : i32, 1 : i32] | tensor<[24,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,40,40,72,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1600 + d1 * 40 + d2, d3), memory_config: (50, 3, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 1600 : i32, 72 : i32] | tensor<[1,1,1600,72,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1600 + d1 * 40 + d2, d3), memory_config: (50, 3, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.01 | +| ttnn.reshape | tensor<[1,1,1600,40,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1600 + d1 * 40 + d2, d3), memory_config: (50, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 40 : i32, 40 : i32, 40 : i32] | tensor<[1,40,40,40,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1600 + d1 * 40 + d2, d3), memory_config: (50, 2, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.01 | +| ttnn.reshape | tensor<[1,40,40,72,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1600 + d1 * 40 + d2, d3), memory_config: (50, 3, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 1600 : i32, 72 : i32] | tensor<[1,1,1600,72,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1600 + d1 * 40 + d2, d3), memory_config: (50, 
3, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,1600,40,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1600 + d1 * 40 + d2, d3), memory_config: (50, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 40 : i32, 40 : i32, 40 : i32] | tensor<[1,40,40,40,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1600 + d1 * 40 + d2, d3), memory_config: (50, 2, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,80,80,72,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6400 + d1 * 80 + d2, d3), memory_config: (200, 3, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 6400 : i32, 72 : i32] | tensor<[1,1,6400,72,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6400 + d1 * 80 + d2, d3), memory_config: (200, 3, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.01 | +| ttnn.reshape | tensor<[1,1,6400,24,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6400 + d1 * 80 + d2, d3), memory_config: (200, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 80 : i32, 80 : i32, 24 : i32] | tensor<[1,80,80,24,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6400 + d1 * 80 + d2, d3), memory_config: (200, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.01 | +| ttnn.reshape | tensor<[1,80,80,72,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6400 + d1 * 80 + d2, d3), memory_config: (200, 3, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 6400 : i32, 72 : i32] | tensor<[1,1,6400,72,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6400 + d1 * 80 + d2, d3), memory_config: (200, 3, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,6400,24,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6400 + d1 * 80 + d2, d3), memory_config: (200, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 80 : i32, 80 : i32, 24 : i32] | tensor<[1,80,80,24,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6400 + d1 * 80 + d2, d3), memory_config: 
(200, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,80,80,72,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6400 + d1 * 80 + d2, d3), memory_config: (200, 3, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 6400 : i32, 72 : i32] | tensor<[1,1,6400,72,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6400 + d1 * 80 + d2, d3), memory_config: (200, 3, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.03 | +| ttnn.reshape | tensor<[1,1,6400,72,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6400 + d1 * 80 + d2, d3), memory_config: (200, 3, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 80 : i32, 80 : i32, 72 : i32] | tensor<[1,80,80,72,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6400 + d1 * 80 + d2, d3), memory_config: (200, 3, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.03 | +| ttnn.reshape | tensor<[1,80,80,72,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6400 + d1 * 80 + d2, d3), memory_config: (200, 3, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 6400 : i32, 72 : i32] | tensor<[1,1,6400,72,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6400 + d1 * 80 + d2, d3), memory_config: (200, 3, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,6400,72,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6400 + d1 * 80 + d2, d3), memory_config: (200, 3, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 80 : i32, 80 : i32, 72 : i32] | tensor<[1,80,80,72,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6400 + d1 * 80 + d2, d3), memory_config: (200, 3, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,80,80,72,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6400 + d1 * 80 + d2, d3), memory_config: (200, 3, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 6400 : i32, 72 : i32] | tensor<[1,1,6400,72,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6400 + d1 * 80 + d2, d3), 
memory_config: (200, 3, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.08 | +| ttnn.reshape | tensor<[1,1,1600,72,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1600 + d1 * 40 + d2, d3), memory_config: (50, 3, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 40 : i32, 40 : i32, 72 : i32] | tensor<[1,40,40,72,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1600 + d1 * 40 + d2, d3), memory_config: (50, 3, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.08 | +| ttnn.reshape | tensor<[1,80,80,72,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6400 + d1 * 80 + d2, d3), memory_config: (200, 3, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 6400 : i32, 72 : i32] | tensor<[1,1,6400,72,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6400 + d1 * 80 + d2, d3), memory_config: (200, 3, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,1600,72,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1600 + d1 * 40 + d2, d3), memory_config: (50, 3, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 40 : i32, 40 : i32, 72 : i32] | tensor<[1,40,40,72,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1600 + d1 * 40 + d2, d3), memory_config: (50, 3, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,14,14,740,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 740 : i32] | tensor<[1,1,196,740,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 24, 'tile<32x32, bf16>', 'dram') | yes | 0.02 | 1.1497822163539522e+38 | +| ttnn.reshape | tensor<[1,1,196,334,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 11, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 334 : i32] | tensor<[1,14,14,334,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 
196 + d1 * 14 + d2, d3), memory_config: (7, 11, 'tile<32x32, bf16>', 'dram') | yes | 0.02 | 1.1497822163539522e+38 | +| ttnn.reshape | tensor<[1,14,14,740,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 740 : i32] | tensor<[1,1,196,740,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 24, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,196,334,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 11, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 334 : i32] | tensor<[1,14,14,334,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 11, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,32,768,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1024 : i32, 768 : i32] | tensor<[1,1,1024,768,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 24, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.19 | +| ttnn.reshape | tensor<[1,1,1024,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 32 : i32, 256 : i32] | tensor<[1,32,32,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 8, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.19 | +| ttnn.reshape | tensor<[1,32,32,768,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1024 : i32, 768 : i32] | tensor<[1,1,1024,768,bf16]> | 
mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 24, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,1024,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 32 : i32, 256 : i32] | tensor<[1,32,32,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,7,7,782,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 25, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 49 : i32, 782 : i32] | tensor<[1,1,49,782,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 25, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,49,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 7 : i32, 1024 : i32] | tensor<[1,7,7,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 32, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,7,7,782,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 25, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 49 : i32, 782 : i32] | tensor<[1,1,49,782,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 25, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,49,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 7 : i32, 1024 : i32] | tensor<[1,7,7,1024,bf16]> | 
mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 32, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,28,28,78,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 3, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 78 : i32] | tensor<[1,1,784,78,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 3, 'tile<32x32, bf16>', 'dram') | yes | 0.51 | 1.58 | +| ttnn.reshape | tensor<[1,1,784,16,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 16 : i32] | tensor<[1,28,28,16,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.51 | 1.58 | +| ttnn.reshape | tensor<[1,28,28,78,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 3, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 78 : i32] | tensor<[1,1,784,78,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 3, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,784,16,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 16 : i32] | tensor<[1,28,28,16,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,28,28,78,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 3, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 78 : i32] | tensor<[1,1,784,78,bf16]> | 
mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 3, 'tile<32x32, bf16>', 'dram') | yes | 0.48 | 1.18 | +| ttnn.reshape | tensor<[1,1,784,34,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 34 : i32] | tensor<[1,28,28,34,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 2, 'tile<32x32, bf16>', 'dram') | yes | 0.48 | 1.18 | +| ttnn.reshape | tensor<[1,28,28,78,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 3, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 78 : i32] | tensor<[1,1,784,78,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 3, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,784,34,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 34 : i32] | tensor<[1,28,28,34,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,56,56,78,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 3, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3136 : i32, 78 : i32] | tensor<[1,1,3136,78,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 3, 'tile<32x32, bf16>', 'dram') | yes | 0.34 | 5.31 | +| ttnn.reshape | tensor<[1,1,3136,24,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 56 : i32, 56 : i32, 24 : i32] | 
tensor<[1,56,56,24,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.34 | 5.31 | +| ttnn.reshape | tensor<[1,56,56,78,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 3, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3136 : i32, 78 : i32] | tensor<[1,1,3136,78,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 3, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,3136,24,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 56 : i32, 56 : i32, 24 : i32] | tensor<[1,56,56,24,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,7,7,800,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 25, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 49 : i32, 800 : i32] | tensor<[1,1,49,800,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 25, 'tile<32x32, bf16>', 'dram') | yes | 0.01 | 1.136489936396103e+38 | +| ttnn.reshape | tensor<[1,1,49,272,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 9, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 7 : i32, 272 : i32] | tensor<[1,7,7,272,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 9, 'tile<32x32, bf16>', 'dram') | yes | 0.01 | 1.136489936396103e+38 | +| ttnn.reshape | tensor<[1,7,7,800,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 25, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : 
i32, 49 : i32, 800 : i32] | tensor<[1,1,49,800,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 25, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,49,272,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 9, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 7 : i32, 272 : i32] | tensor<[1,7,7,272,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 9, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,10,10,80,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 3, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 100 : i32, 80 : i32] | tensor<[1,1,100,80,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 3, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.02 | +| ttnn.reshape | tensor<[1,1,100,480,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 15, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 10 : i32, 10 : i32, 480 : i32] | tensor<[1,10,10,480,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 15, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.02 | +| ttnn.reshape | tensor<[1,10,10,80,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 3, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 100 : i32, 80 : i32] | tensor<[1,1,100,80,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 3, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,100,480,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 15, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 10 : i32, 10 : i32, 480 : i32] 
| tensor<[1,10,10,480,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 10 + d2, d3), memory_config: (4, 15, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,20,20,80,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 3, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 400 : i32, 80 : i32] | tensor<[1,1,400,80,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 3, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.05 | +| ttnn.reshape | tensor<[1,1,400,184,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 6, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 20 : i32, 20 : i32, 184 : i32] | tensor<[1,20,20,184,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 6, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.05 | +| ttnn.reshape | tensor<[1,20,20,80,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 3, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 400 : i32, 80 : i32] | tensor<[1,1,400,80,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 3, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,400,184,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 6, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 20 : i32, 20 : i32, 184 : i32] | tensor<[1,20,20,184,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 6, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,20,20,80,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 3, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 400 : i32, 80 : i32] | 
tensor<[1,1,400,80,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 3, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.01 | +| ttnn.reshape | tensor<[1,1,400,200,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 7, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 20 : i32, 20 : i32, 200 : i32] | tensor<[1,20,20,200,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 7, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.01 | +| ttnn.reshape | tensor<[1,20,20,80,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 3, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 400 : i32, 80 : i32] | tensor<[1,1,400,80,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 3, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,400,200,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 7, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 20 : i32, 20 : i32, 200 : i32] | tensor<[1,20,20,200,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 7, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,20,20,80,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 3, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 400 : i32, 80 : i32] | tensor<[1,1,400,80,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 3, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.08 | +| ttnn.reshape | tensor<[1,1,400,480,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 15, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 20 : i32, 20 : i32, 480 : i32] | 
tensor<[1,20,20,480,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 15, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.08 | +| ttnn.reshape | tensor<[1,20,20,80,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 3, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 400 : i32, 80 : i32] | tensor<[1,1,400,80,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 3, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,400,480,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 15, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 20 : i32, 20 : i32, 480 : i32] | tensor<[1,20,20,480,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 400 + d1 * 20 + d2, d3), memory_config: (13, 15, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,28,28,94,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 3, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 94 : i32] | tensor<[1,1,784,94,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 3, 'tile<32x32, bf16>', 'dram') | yes | 0.12 | 2.28 | +| ttnn.reshape | tensor<[1,1,784,28,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 28 : i32] | tensor<[1,28,28,28,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.12 | 2.28 | +| ttnn.reshape | tensor<[1,28,28,94,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 3, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 94 : i32] | 
tensor<[1,1,784,94,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 3, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,784,28,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 28 : i32] | tensor<[1,28,28,28,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,32,960,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 30, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1024 : i32, 960 : i32] | tensor<[1,1,1024,960,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 30, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.28 | +| ttnn.reshape | tensor<[1,1,1024,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 32 : i32, 640 : i32] | tensor<[1,32,32,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.28 | +| ttnn.reshape | tensor<[640,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 20, 'tile<32x32, bf16>', 'dram') | shape: [640 : i32, 1 : i32, 1 : i32] | tensor<[640,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (20, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.28 | +| ttnn.reshape | tensor<[1,32,32,960,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 30, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1024 : i32, 960 : i32] | tensor<[1,1,1024,960,bf16]> | mapping_from: (d0, d1, 
d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 30, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,1024,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 32 : i32, 640 : i32] | tensor<[1,32,32,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[640,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 20, 'tile<32x32, bf16>', 'dram') | shape: [640 : i32, 1 : i32, 1 : i32] | tensor<[640,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (20, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,32,960,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 30, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1024 : i32, 960 : i32] | tensor<[1,1,1024,960,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 30, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.46 | +| ttnn.reshape | tensor<[1,1,1024,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 32 : i32, 640 : i32] | tensor<[1,32,32,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.46 | +| ttnn.reshape | tensor<[640,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 20, 'tile<32x32, bf16>', 'dram') | shape: [640 : i32, 1 : i32, 1 : i32] | tensor<[640,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (20, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.46 
| +| ttnn.reshape | tensor<[1,32,32,960,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 30, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1024 : i32, 960 : i32] | tensor<[1,1,1024,960,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 30, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,1024,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 32 : i32, 640 : i32] | tensor<[1,32,32,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[640,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 20, 'tile<32x32, bf16>', 'dram') | shape: [640 : i32, 1 : i32, 1 : i32] | tensor<[640,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (20, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,64,64,960,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 30, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 4096 : i32, 960 : i32] | tensor<[1,1,4096,960,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 30, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.34 | +| ttnn.reshape | tensor<[1,1,4096,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 64 : i32, 320 : i32] | tensor<[1,64,64,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.34 | +| ttnn.reshape | tensor<[320,bf16]> 
| mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 10, 'tile<32x32, bf16>', 'dram') | shape: [320 : i32, 1 : i32, 1 : i32] | tensor<[320,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (10, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.34 | +| ttnn.reshape | tensor<[1,64,64,960,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 30, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 4096 : i32, 960 : i32] | tensor<[1,1,4096,960,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 30, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,4096,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 64 : i32, 320 : i32] | tensor<[1,64,64,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[320,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 10, 'tile<32x32, bf16>', 'dram') | shape: [320 : i32, 1 : i32, 1 : i32] | tensor<[320,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (10, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,64,64,960,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 30, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 4096 : i32, 960 : i32] | tensor<[1,1,4096,960,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 30, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,4096,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 10, 
'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 64 : i32, 320 : i32] | tensor<[1,64,64,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[320,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 10, 'tile<32x32, bf16>', 'dram') | shape: [320 : i32, 1 : i32, 1 : i32] | tensor<[320,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (10, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,64,64,960,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 30, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 4096 : i32, 960 : i32] | tensor<[1,1,4096,960,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 30, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,4096,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 64 : i32, 320 : i32] | tensor<[1,64,64,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[320,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 10, 'tile<32x32, bf16>', 'dram') | shape: [320 : i32, 1 : i32, 1 : i32] | tensor<[320,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (10, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,7,7,960,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 30, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 49 : i32, 960 : i32] | tensor<[1,1,49,960,bf16]> | mapping_from: (d0, d1, 
d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 30, 'tile<32x32, bf16>', 'dram') | yes | 0.12 | 5.88 | +| ttnn.reshape | tensor<[1,1,49,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 7 : i32, 160 : i32] | tensor<[1,7,7,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 5, 'tile<32x32, bf16>', 'dram') | yes | 0.12 | 5.88 | +| ttnn.reshape | tensor<[1,7,7,960,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 30, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 49 : i32, 960 : i32] | tensor<[1,1,49,960,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 30, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,49,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 7 : i32, 160 : i32] | tensor<[1,7,7,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 5, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,7,7,960,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 30, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 49 : i32, 960 : i32] | tensor<[1,1,49,960,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 30, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,49,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 7 : i32, 320 : i32] | tensor<[1,7,7,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + 
d1 * 7 + d2, d3), memory_config: (2, 10, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,7,7,960,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 30, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 49 : i32, 960 : i32] | tensor<[1,1,49,960,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 30, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,49,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 7 : i32, 320 : i32] | tensor<[1,7,7,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 10, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,7,7,960,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 30, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 49 : i32, 960 : i32] | tensor<[1,1,49,960,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 30, 'tile<32x32, bf16>', 'dram') | yes | -0.01 | 1.9406728738459768e+38 | +| ttnn.reshape | tensor<[1,1,49,960,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 30, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 7 : i32, 960 : i32] | tensor<[1,7,7,960,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 30, 'tile<32x32, bf16>', 'dram') | yes | -0.01 | 1.9406728738459768e+38 | +| ttnn.reshape | tensor<[1,7,7,960,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 30, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 49 : i32, 960 : i32] | tensor<[1,1,49,960,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: 
(d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 30, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,49,960,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 30, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 7 : i32, 960 : i32] | tensor<[1,7,7,960,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 49 + d1 * 7 + d2, d3), memory_config: (2, 30, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,112,112,96,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 3, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 12544 : i32, 96 : i32] | tensor<[1,1,12544,96,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 3, 'tile<32x32, bf16>', 'dram') | yes | 0.72 | 11.5 | +| ttnn.reshape | tensor<[1,1,3136,96,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 3, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 56 : i32, 56 : i32, 96 : i32] | tensor<[1,56,56,96,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 3, 'tile<32x32, bf16>', 'dram') | yes | 0.72 | 11.5 | +| ttnn.reshape | tensor<[1,112,112,96,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 3, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 12544 : i32, 96 : i32] | tensor<[1,1,12544,96,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12544 + d1 * 112 + d2, d3), memory_config: (392, 3, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,3136,96,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 3, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 56 : i32, 56 : i32, 96 : i32] | tensor<[1,56,56,96,bf16]> | mapping_from: (d0, 
d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 3, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,14,14,96,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 3, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 96 : i32] | tensor<[1,1,196,96,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 3, 'tile<32x32, bf16>', 'dram') | yes | 0.02 | 22.25 | +| ttnn.reshape | tensor<[1,1,196,576,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 18, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 576 : i32] | tensor<[1,14,14,576,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 18, 'tile<32x32, bf16>', 'dram') | yes | 0.02 | 22.25 | +| ttnn.reshape | tensor<[1,14,14,96,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 3, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 196 : i32, 96 : i32] | tensor<[1,1,196,96,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 3, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,196,576,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 18, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32, 576 : i32] | tensor<[1,14,14,576,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 196 + d1 * 14 + d2, d3), memory_config: (7, 18, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,56,56,96,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 3, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3136 : i32, 96 : i32] | tensor<[1,1,3136,96,bf16]> | 
mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 3, 'tile<32x32, bf16>', 'dram') | yes | 0.28 | 26.75 | +| ttnn.reshape | tensor<[1,1,3136,24,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 56 : i32, 56 : i32, 24 : i32] | tensor<[1,56,56,24,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.28 | 26.75 | +| ttnn.reshape | tensor<[1,56,56,96,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 3, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 3136 : i32, 96 : i32] | tensor<[1,1,3136,96,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 3, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,3136,24,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 56 : i32, 56 : i32, 24 : i32] | tensor<[1,56,56,24,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3136 + d1 * 56 + d2, d3), memory_config: (98, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,28,28,98,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 98 : i32] | tensor<[1,1,784,98,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | yes | 0.15 | 0.85 | +| ttnn.reshape | tensor<[1,1,784,20,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 20 : i32] | 
tensor<[1,28,28,20,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.15 | 0.85 | +| ttnn.reshape | tensor<[1,28,28,98,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 784 : i32, 98 : i32] | tensor<[1,1,784,98,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,784,20,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 28 : i32, 28 : i32, 20 : i32] | tensor<[1,28,28,20,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 784 + d1 * 28 + d2, d3), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), 
mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 
'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | 
tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: 
(d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: 
(1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | 
shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> 
| mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: 
(0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 
'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| 
ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: 
(0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 
'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> 
| mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 
1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| 
ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: 
(d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), 
memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, 
bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,10,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : 
i32, 10 : i32, 1 : i32] | tensor<[1,10,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,10,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 10 : i32, 1 : i32] | tensor<[1,10,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,10,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 10 : i32, 1 : i32] | tensor<[1,10,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,10,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 10 : i32, 1 : i32] | tensor<[1,10,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| 
ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,10,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 10 : i32, 1 : i32] | tensor<[1,10,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,10,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 10 : i32, 1 : i32] | tensor<[1,10,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,120,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 120 : i32, 1 : i32, 1 : i32] | tensor<[1,120,1,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 120 + d1 + d2, d3), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | 
mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,120,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 120 : i32, 1 : i32, 1 : i32] | tensor<[1,120,1,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 120 + d1 + d2, d3), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1280,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1280 : i32, 1 : i32, 1 : i32] | tensor<[1,1280,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1280 + d1 + d2, d3), memory_config: (40, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1280,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1280 : i32, 1 : i32, 1 : i32] | tensor<[1,1280,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1280 + d1 + d2, d3), memory_config: (40, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 
'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,15,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 15 : i32, 1 : i32] | tensor<[1,15,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 15 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,15,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 15 : i32, 1 : i32] | tensor<[1,15,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 15 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: 
(0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32] | tensor<[1,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32] | tensor<[1,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32] | tensor<[1,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | 
shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32] | tensor<[1,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32] | tensor<[1,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32] | tensor<[1,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,2048,bf16]> | mapping_from: (d0, 
d1), mapping_to: (d0, d1), memory_config: (1, 64, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 2048 : i32, 1 : i32, 1 : i32] | tensor<[1,2048,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2048 + d1 + d2, d3), memory_config: (64, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,2048,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 64, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 2048 : i32, 1 : i32, 1 : i32] | tensor<[1,2048,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2048 + d1 + d2, d3), memory_config: (64, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 32 : i32, 1 : i32] | tensor<[1,32,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,32,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 32 : i32, 1 : i32] 
| tensor<[1,32,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,480,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 15, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 480 : i32, 1 : i32, 1 : i32] | tensor<[1,480,1,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 480 + d1 + d2, d3), memory_config: (15, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,480,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 15, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 480 : i32, 1 : i32, 1 : i32] | tensor<[1,480,1,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 480 + d1 + d2, d3), memory_config: (15, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,480,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 15, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 480 : i32, 1 : i32, 1 : i32] | tensor<[1,480,1,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 480 + d1 + d2, d3), memory_config: (15, 1, 
'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,480,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 15, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 480 : i32, 1 : i32, 1 : i32] | tensor<[1,480,1,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 480 + d1 + d2, d3), memory_config: (15, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,512,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 512 : i32, 1 : i32, 1 : i32] | tensor<[1,512,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 512 + d1 + d2, d3), memory_config: (16, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 
'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,512,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 512 : i32, 1 : i32, 1 : i32] | tensor<[1,512,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 512 + d1 + d2, d3), memory_config: (16, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,672,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 21, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 672 : i32, 1 : i32, 1 : i32] | tensor<[1,672,1,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 672 + d1 + d2, d3), memory_config: (21, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,672,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 21, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 672 : i32, 1 : i32, 1 : i32] | tensor<[1,672,1,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 672 + d1 + d2, d3), memory_config: (21, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), 
memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,672,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 21, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 672 : i32, 1 : i32, 1 : i32] | tensor<[1,672,1,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 672 + d1 + d2, d3), memory_config: (21, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,672,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 21, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 672 : i32, 1 : i32, 1 : i32] | tensor<[1,672,1,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 672 + d1 + d2, d3), memory_config: (21, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,72,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 3, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 72 : i32, 1 : i32, 1 : i32] | tensor<[1,72,1,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 72 + d1 + d2, d3), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,72,f32]> | 
mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 3, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 72 : i32, 1 : i32, 1 : i32] | tensor<[1,72,1,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 72 + d1 + d2, d3), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | 
tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), 
memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : 
i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, 
d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | 
nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: 
(d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, 
f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | 
mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 
'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape 
| tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), 
memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : 
i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, 
d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | 
nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: 
(d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, 
f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | 
mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 
'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape 
| tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), 
memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : 
i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, 
d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | 
nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: 
(d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, 
f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | 
mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 
'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape 
| tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), 
memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : 
i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, 
d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | 
nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: 
(d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 
'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | 
shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | 
mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), 
memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | 
N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | 
tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: 
(d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), 
memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: 
[1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), 
mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 
'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | 
nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: 
(d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), 
memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 
'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | 
tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: 
(d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), 
memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | 
N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | 
mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 
'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> 
| mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 
1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| 
ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: 
(0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | 
shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), 
mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, 
bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | 
tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), 
memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : 
i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, 
d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | 
nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: 
(d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, 
f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | 
mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 
'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape 
| tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), 
memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : 
i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, 
d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | 
nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: 
(d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, 
f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | 
mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 
'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape 
| tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, 
d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 
'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: 
(d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, 
f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | 
tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[6,2,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 6 : i32, 2 : i32] | tensor<[1,6,2,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 6 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,6,2,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 6 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 6 : i32, 1 : i32, 2 : i32] | tensor<[1,6,1,2,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,6,1,2,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [6 : i32, 1 : i32, 2 : i32] | tensor<[6,1,2,f32]> | 
mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[6,1,2,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [6 : i32, 2 : i32] | tensor<[6,2,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[6,2,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 6 : i32, 2 : i32] | tensor<[1,6,2,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 6 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,6,2,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 6 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 6 : i32, 1 : i32, 2 : i32] | tensor<[1,6,1,2,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,6,1,2,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [6 : i32, 1 : i32, 2 : i32] | tensor<[6,1,2,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[6,1,2,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [6 : i32, 2 : i32] | tensor<[6,2,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), 
memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, 
bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | 
+| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> 
| mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: 
(0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[196,196,1,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 196 + d1, d2), memory_config: (1201, 1, 'tile<32x32, u32>', 'dram') | shape: [196 : i32, 196 : i32] | tensor<[196,196,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (7, 7, 'tile<32x32, u32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[196,196,1,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 196 + d1, d2), memory_config: (1201, 1, 'tile<32x32, u32>', 'dram') | shape: [196 : i32, 196 : i32] | tensor<[196,196,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (7, 7, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[196,196,1,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 196 + d1, d2), memory_config: (1201, 1, 'tile<32x32, u32>', 'dram') | shape: [196 : i32, 196 : i32] | tensor<[196,196,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (7, 7, 'tile<32x32, u32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[196,196,1,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 196 + d1, d2), memory_config: (1201, 1, 'tile<32x32, u32>', 'dram') | shape: [196 : i32, 196 : i32] | tensor<[196,196,i32]> | mapping_from: 
(d0, d1), mapping_to: (d0, d1), memory_config: (7, 7, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32] | tensor<[1,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | nan | nan | +| ttnn.reshape | tensor<[1,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32] | tensor<[1,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,197,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 7, 'tile<32x32, u32>', 'dram') | shape: [197 : i32] | tensor<[197,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 7, 'tile<32x32, u32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,197,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 7, 'tile<32x32, u32>', 'dram') | shape: [197 : i32] | tensor<[197,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 7, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[197,1,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (7, 1, 'tile<32x32, u32>', 'dram') | shape: [197 : i32] | tensor<[197,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 7, 'tile<32x32, u32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[197,1,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (7, 1, 'tile<32x32, u32>', 'dram') | shape: [197 : i32] | tensor<[197,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 7, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32] | tensor<[1,i32]> | mapping_from: 
(d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32] | tensor<[1,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32] | tensor<[1,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32] | tensor<[1,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32] | tensor<[1,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32] | tensor<[1,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32] | tensor<[1,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 768 : i32] | tensor<[1,768,bf16]> | mapping_from: 
(d0, d1), mapping_to: (d0, d1), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 768 : i32] | tensor<[1,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32] | tensor<[1,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | nan | nan | +| ttnn.reshape | tensor<[1,1,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32] | tensor<[1,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32] | tensor<[1,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32] | tensor<[1,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,12,16,i32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12 + d1 * 12 + d2, d3), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 12 : i32, 16 : i32] | tensor<[1,12,16,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 12 + d1, d2), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | 1.0 | nan | +| ttnn.reshape | tensor<[1,1,12,16,i32]> | mapping_from: (d0, d1, d2, 
d3), mapping_to: (d0 * 12 + d1 * 12 + d2, d3), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 12 : i32, 16 : i32] | tensor<[1,12,16,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 12 + d1, d2), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,23,40,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 23 + d1 * 23 + d2, d3), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 23 : i32, 40 : i32] | tensor<[1,23,40,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 23 + d1, d2), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,23,40,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 23 + d1 * 23 + d2, d3), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 23 : i32, 40 : i32] | tensor<[1,23,40,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 23 + d1, d2), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 768 : i32] | tensor<[1,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 768 : i32] | tensor<[1,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32] | tensor<[1,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | nan | nan | +| ttnn.reshape | tensor<[1,1,i32]> | 
mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32] | tensor<[1,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,120,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 120 + d1 * 120 + d2, d3), memory_config: (4, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 120 : i32, 160 : i32] | tensor<[1,120,160,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 120 + d1, d2), memory_config: (4, 5, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1,120,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 120 + d1 * 120 + d2, d3), memory_config: (4, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 120 : i32, 160 : i32] | tensor<[1,120,160,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 120 + d1, d2), memory_config: (4, 5, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,120,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 120 + d1 * 120 + d2, d3), memory_config: (4, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 120 : i32, 160 : i32] | tensor<[1,120,160,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 120 + d1, d2), memory_config: (4, 5, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,30,40,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 30 + d1 * 30 + d2, d3), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 30 : i32, 40 : i32] | tensor<[1,30,40,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 30 + d1, d2), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | yes | 0.66 | 0.48 | +| ttnn.reshape | tensor<[1,1,30,40,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 30 + d1 * 30 + d2, d3), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 30 : i32, 40 : i32] | tensor<[1,30,40,bf16]> | mapping_from: (d0, 
d1, d2), mapping_to: (d0 * 30 + d1, d2), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,30,40,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 30 + d1 * 30 + d2, d3), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 30 : i32, 40 : i32] | tensor<[1,30,40,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 30 + d1, d2), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | yes | -0.31 | 0.76 | +| ttnn.reshape | tensor<[1,1,30,40,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 30 + d1 * 30 + d2, d3), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 30 : i32, 40 : i32] | tensor<[1,30,40,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 30 + d1, d2), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,60,80,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 60 + d1 * 60 + d2, d3), memory_config: (2, 3, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 60 : i32, 80 : i32] | tensor<[1,60,80,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 60 + d1, d2), memory_config: (2, 3, 'tile<32x32, bf16>', 'dram') | yes | 0.15 | 0.71 | +| ttnn.reshape | tensor<[1,1,60,80,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 60 + d1 * 60 + d2, d3), memory_config: (2, 3, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 60 : i32, 80 : i32] | tensor<[1,60,80,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 60 + d1, d2), memory_config: (2, 3, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,60,80,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 60 + d1 * 60 + d2, d3), memory_config: (2, 3, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 60 : i32, 80 : i32] | tensor<[1,60,80,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 60 + d1, d2), memory_config: (2, 3, 'tile<32x32, bf16>', 'dram') | yes | -0.27 | 0.82 | +| ttnn.reshape | tensor<[1,1,60,80,bf16]> | 
mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 60 + d1 * 60 + d2, d3), memory_config: (2, 3, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 60 : i32, 80 : i32] | tensor<[1,60,80,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 60 + d1, d2), memory_config: (2, 3, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,16,1,96,bf16]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 512 + d1 * 16 + d2 + d3, d4), memory_config: (16, 3, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 16 : i32, 96 : i32] | tensor<[1,32,16,96,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 512 + d1 * 16 + d2, d3), memory_config: (16, 3, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,32,16,1,96,bf16]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 512 + d1 * 16 + d2 + d3, d4), memory_config: (16, 3, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 16 : i32, 96 : i32] | tensor<[1,32,16,96,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 512 + d1 * 16 + d2, d3), memory_config: (16, 3, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,16,1,96,bf16]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 512 + d1 * 16 + d2 + d3, d4), memory_config: (16, 3, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 16 : i32, 96 : i32] | tensor<[1,32,16,96,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 512 + d1 * 16 + d2, d3), memory_config: (16, 3, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,16,1,96,bf16]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 512 + d1 * 16 + d2 + d3, d4), memory_config: (16, 3, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 16 : i32, 96 : i32] | tensor<[1,32,16,96,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 512 + d1 * 16 + d2, d3), memory_config: (16, 3, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | 
tensor<[1,3,320,320,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 960 + d1 * 320 + d2, d3), memory_config: (30, 10, 'tile<32x32, f32>', 'dram') | shape: [3 : i32, 320 : i32, 320 : i32] | tensor<[3,320,320,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 320 + d1, d2), memory_config: (30, 10, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,3,320,320,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 960 + d1 * 320 + d2, d3), memory_config: (30, 10, 'tile<32x32, f32>', 'dram') | shape: [3 : i32, 320 : i32, 320 : i32] | tensor<[3,320,320,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 320 + d1, d2), memory_config: (30, 10, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,3,720,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2160 + d1 * 720 + d2, d3), memory_config: (68, 40, 'tile<32x32, bf16>', 'dram') | shape: [3 : i32, 720 : i32, 1280 : i32] | tensor<[3,720,1280,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 720 + d1, d2), memory_config: (68, 40, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,3,720,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2160 + d1 * 720 + d2, d3), memory_config: (68, 40, 'tile<32x32, bf16>', 'dram') | shape: [3 : i32, 720 : i32, 1280 : i32] | tensor<[3,720,1280,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 720 + d1, d2), memory_config: (68, 40, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,192,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 6, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 192 : i32] | tensor<[1,192,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 6, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1,192,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 6, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 192 : 
i32] | tensor<[1,192,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 6, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,45,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 2, 'tile<32x32, u32>', 'dram') | shape: [45 : i32] | tensor<[45,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 2, 'tile<32x32, u32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,45,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 2, 'tile<32x32, u32>', 'dram') | shape: [45 : i32] | tensor<[45,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 2, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 768 : i32] | tensor<[1,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,5,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [5 : i32] | tensor<[5,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,5,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [5 : i32] | tensor<[5,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,6,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [6 : i32] | tensor<[6,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,6,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 
'tile<32x32, u32>', 'dram') | shape: [6 : i32] | tensor<[6,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32] | tensor<[1,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | nan | nan | +| ttnn.reshape | tensor<[1,1,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32] | tensor<[1,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,720,1280,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 720 + d1, d2), memory_config: (23, 40, 'tile<32x32, bf16>', 'dram') | shape: [720 : i32, 1280 : i32] | tensor<[720,1280,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (23, 40, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,720,1280,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 720 + d1, d2), memory_config: (23, 40, 'tile<32x32, bf16>', 'dram') | shape: [720 : i32, 1280 : i32] | tensor<[720,1280,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (23, 40, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 768 : i32] | tensor<[1,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 768 : i32] | tensor<[1,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), 
memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[3234,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (102, 1, 'tile<32x32, f32>', 'dram') | shape: [3234 : i32] | tensor<[3234,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 102, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[3234,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (102, 1, 'tile<32x32, f32>', 'dram') | shape: [3234 : i32] | tensor<[3234,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 102, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[3234,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (102, 1, 'tile<32x32, f32>', 'dram') | shape: [3234 : i32] | tensor<[3234,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 102, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[3234,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (102, 1, 'tile<32x32, f32>', 'dram') | shape: [3234 : i32] | tensor<[3234,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 102, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[3234,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (102, 1, 'tile<32x32, f32>', 'dram') | shape: [3234 : i32] | tensor<[3234,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 102, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[3234,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (102, 1, 'tile<32x32, f32>', 'dram') | shape: [3234 : i32] | tensor<[3234,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 102, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[3234,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (102, 1, 'tile<32x32, f32>', 'dram') | shape: 
[3234 : i32] | tensor<[3234,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 102, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[3234,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (102, 1, 'tile<32x32, f32>', 'dram') | shape: [3234 : i32] | tensor<[3234,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 102, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32] | tensor<[1,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | nan | nan | +| ttnn.reshape | tensor<[1,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32] | tensor<[1,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32] | tensor<[1,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | nan | nan | +| ttnn.reshape | tensor<[1,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32] | tensor<[1,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,100,4,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 100 + d2, d3), memory_config: (4, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 100 : i32, 4 : i32] | tensor<[1,100,4,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 100 + d1, d2), memory_config: (4, 1, 'tile<32x32, bf16>', 'dram') | yes | -0.03 | 3.2 | +| ttnn.reshape | tensor<[1,1,100,4,bf16]> | mapping_from: (d0, d1, d2, 
d3), mapping_to: (d0 * 100 + d1 * 100 + d2, d3), memory_config: (4, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 100 : i32, 4 : i32] | tensor<[1,100,4,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 100 + d1, d2), memory_config: (4, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,100,92,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 100 + d2, d3), memory_config: (4, 3, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 100 : i32, 92 : i32] | tensor<[1,100,92,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 100 + d1, d2), memory_config: (4, 3, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,100,92,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 100 + d2, d3), memory_config: (4, 3, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 100 : i32, 92 : i32] | tensor<[1,100,92,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 100 + d1, d2), memory_config: (4, 3, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[3,320,320,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 320 + d1, d2), memory_config: (30, 10, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 3 : i32, 320 : i32, 320 : i32] | tensor<[1,3,320,320,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 960 + d1 * 320 + d2, d3), memory_config: (30, 10, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[3,320,320,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 320 + d1, d2), memory_config: (30, 10, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 3 : i32, 320 : i32, 320 : i32] | tensor<[1,3,320,320,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 960 + d1 * 320 + d2, d3), memory_config: (30, 10, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[3,720,1280,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 720 + d1, d2), memory_config: (68, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 3 : 
i32, 720 : i32, 1280 : i32] | tensor<[1,3,720,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2160 + d1 * 720 + d2, d3), memory_config: (68, 40, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[3,720,1280,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 720 + d1, d2), memory_config: (68, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 3 : i32, 720 : i32, 1280 : i32] | tensor<[1,3,720,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2160 + d1 * 720 + d2, d3), memory_config: (68, 40, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,14,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 14 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32] | tensor<[1,14,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.21 | 10.06 | +| ttnn.reshape | tensor<[1,14,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 14 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32] | tensor<[1,14,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,19,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [19 : i32] | tensor<[19,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,19,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [19 : i32] | tensor<[19,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,480,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 480 + d1 * 480 + d2, d3), memory_config: (15, 20, 'tile<32x32, bf16>', 'dram') 
| shape: [1 : i32, 480 : i32, 640 : i32] | tensor<[1,480,640,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 480 + d1, d2), memory_config: (15, 20, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1,480,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 480 + d1 * 480 + d2, d3), memory_config: (15, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 480 : i32, 640 : i32] | tensor<[1,480,640,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 480 + d1, d2), memory_config: (15, 20, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,256,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32] | tensor<[1,256,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 8, 'tile<32x32, bf16>', 'dram') | yes | -0.11 | 949978046398464.0 | +| ttnn.reshape | tensor<[1,256,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32] | tensor<[1,256,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,25,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 25 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 25 : i32] | tensor<[1,25,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.16 | 12.06 | +| ttnn.reshape | tensor<[1,25,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 25 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 25 : i32] | tensor<[1,25,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,5,16,16,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 
80 + d1 * 16 + d2, d3), memory_config: (3, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 5 : i32, 16 : i32, 16 : i32, 1 : i32] | tensor<[1,5,16,16,1,bf16]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 1280 + d1 * 256 + d2 * 16 + d3, d4), memory_config: (40, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,5,16,16,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 80 + d1 * 16 + d2, d3), memory_config: (3, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 5 : i32, 16 : i32, 16 : i32, 1 : i32] | tensor<[1,5,16,16,1,bf16]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 1280 + d1 * 256 + d2 * 16 + d3, d4), memory_config: (40, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[100,1,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (4, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 100 : i32, 1 : i32, 256 : i32] | tensor<[1,100,1,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 + d2, d3), memory_config: (4, 8, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[100,1,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (4, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 100 : i32, 1 : i32, 256 : i32] | tensor<[1,100,1,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 + d2, d3), memory_config: (4, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[100,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | shape: [100 : i32, 1 : i32] | tensor<[100,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[100,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | shape: [100 : i32, 1 : i32] | tensor<[100,1,f32]> | mapping_from: (d0, d1), 
mapping_to: (d0, d1), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[12,16,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [12 : i32, 16 : i32, 1 : i32] | tensor<[12,16,1,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16 + d1, d2), memory_config: (6, 1, 'tile<32x32, u32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[12,16,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [12 : i32, 16 : i32, 1 : i32] | tensor<[12,16,1,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16 + d1, d2), memory_config: (6, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[14,14,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32] | tensor<[1,14,14,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 14 + d1, d2), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[14,14,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 14 : i32, 14 : i32] | tensor<[1,14,14,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 14 + d1, d2), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32] | tensor<[1,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32] | tensor<[1,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | 
+| ttnn.reshape | tensor<[25,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [25 : i32, 1 : i32] | tensor<[25,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[25,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [25 : i32, 1 : i32] | tensor<[25,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[3234,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (102, 1, 'tile<32x32, f32>', 'dram') | shape: [3234 : i32, 1 : i32, 1 : i32] | tensor<[3234,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (102, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[3234,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (102, 1, 'tile<32x32, f32>', 'dram') | shape: [3234 : i32, 1 : i32, 1 : i32] | tensor<[3234,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (102, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[3234,2,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (102, 1, 'tile<32x32, f32>', 'dram') | shape: [3234 : i32, 2 : i32, 1 : i32] | tensor<[3234,2,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 2 + d1, d2), memory_config: (203, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[3234,2,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (102, 1, 'tile<32x32, f32>', 'dram') | shape: [3234 : i32, 2 : i32, 1 : i32] | tensor<[3234,2,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 2 + d1, d2), memory_config: (203, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[400,f32]> | mapping_from: (d0), mapping_to: 
(0, d0), memory_config: (1, 13, 'tile<32x32, f32>', 'dram') | shape: [400 : i32, 1 : i32] | tensor<[400,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (13, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[400,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 13, 'tile<32x32, f32>', 'dram') | shape: [400 : i32, 1 : i32] | tensor<[400,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (13, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[4,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [4 : i32, 1 : i32] | tensor<[4,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[4,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [4 : i32, 1 : i32] | tensor<[4,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[9,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [9 : i32, 1 : i32] | tensor<[9,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[9,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [9 : i32, 1 : i32] | tensor<[9,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | 
tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), 
memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : 
i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, 
d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[19,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [19 : i32, 1 : i32] | tensor<[19,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[19,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [19 : i32, 1 : i32] | tensor<[19,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32] | tensor<[1,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32] | tensor<[1,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32] | tensor<[1,1,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32] | 
tensor<[1,1,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[2,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [2 : i32, 1 : i32] | tensor<[2,1,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[2,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [2 : i32, 1 : i32] | tensor<[2,1,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1024,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 32, 'tile<32x32, f32>', 'dram') | shape: [1024 : i32, 1 : i32] | tensor<[1024,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (32, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1024,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 32, 'tile<32x32, f32>', 'dram') | shape: [1024 : i32, 1 : i32] | tensor<[1024,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (32, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1024,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (32, 1, 'tile<32x32, f32>', 'dram') | shape: [1024 : i32, 1 : i32, 1 : i32] | tensor<[1024,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (32, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1024,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (32, 1, 'tile<32x32, f32>', 'dram') | shape: [1024 : i32, 1 : i32, 1 : i32] | tensor<[1024,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (32, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan 
| +| ttnn.reshape | tensor<[10,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 10 : i32] | tensor<[1,10,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | 0.37 | nan | +| ttnn.reshape | tensor<[10,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 10 : i32] | tensor<[1,10,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[10,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [10 : i32, 1 : i32] | tensor<[10,1,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | 0.37 | nan | +| ttnn.reshape | tensor<[10,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [10 : i32, 1 : i32] | tensor<[10,1,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[112,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | shape: [112 : i32, 1 : i32] | tensor<[112,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[112,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | shape: [112 : i32, 1 : i32] | tensor<[112,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[112,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | shape: [112 : i32, 1 : i32, 1 : i32] | tensor<[112,1,1,f32]> | 
mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[112,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | shape: [112 : i32, 1 : i32, 1 : i32] | tensor<[112,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[116,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | shape: [116 : i32, 1 : i32] | tensor<[116,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[116,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | shape: [116 : i32, 1 : i32] | tensor<[116,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[116,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | shape: [116 : i32, 1 : i32, 1 : i32] | tensor<[116,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[116,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | shape: [116 : i32, 1 : i32, 1 : i32] | tensor<[116,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[120,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | shape: [120 : i32, 1 : i32] | tensor<[120,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | 
+| ttnn.reshape | tensor<[120,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | shape: [120 : i32, 1 : i32] | tensor<[120,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[120,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | shape: [120 : i32, 1 : i32, 1 : i32] | tensor<[120,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[120,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | shape: [120 : i32, 1 : i32, 1 : i32] | tensor<[120,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1280,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 40, 'tile<32x32, f32>', 'dram') | shape: [1280 : i32, 1 : i32] | tensor<[1280,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (40, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1280,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 40, 'tile<32x32, f32>', 'dram') | shape: [1280 : i32, 1 : i32] | tensor<[1280,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (40, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1280,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1280 : i32] | tensor<[1,1280,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 40, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1280,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 40, 'tile<32x32, bf16>', 'dram') 
| shape: [1 : i32, 1280 : i32] | tensor<[1,1280,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 40, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1280,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (40, 1, 'tile<32x32, f32>', 'dram') | shape: [1280 : i32, 1 : i32, 1 : i32] | tensor<[1280,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (40, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1280,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (40, 1, 'tile<32x32, f32>', 'dram') | shape: [1280 : i32, 1 : i32, 1 : i32] | tensor<[1280,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (40, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[128,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | shape: [128 : i32, 1 : i32] | tensor<[128,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[128,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | shape: [128 : i32, 1 : i32] | tensor<[128,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[128,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | shape: [128 : i32, 1 : i32, 1 : i32] | tensor<[128,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[128,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | shape: [128 : i32, 1 : i32, 1 : i32] | tensor<[128,1,1,f32]> | mapping_from: (d0, d1, d2), 
mapping_to: (d0 + d1, d2), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[12,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [12 : i32, 1 : i32] | tensor<[12,1,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | 0.38 | nan | +| ttnn.reshape | tensor<[12,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [12 : i32, 1 : i32] | tensor<[12,1,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[12,10,10,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (4, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 12 : i32, 10 : i32, 10 : i32] | tensor<[1,12,10,10,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 120 + d1 * 10 + d2, d3), memory_config: (4, 1, 'tile<32x32, bf16>', 'dram') | yes | -0.01 | 30.75 | +| ttnn.reshape | tensor<[12,10,10,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (4, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 12 : i32, 10 : i32, 10 : i32] | tensor<[1,12,10,10,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 120 + d1 * 10 + d2, d3), memory_config: (4, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[12,16,2,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16 + d1, d2), memory_config: (6, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 12 : i32, 16 : i32, 2 : i32] | tensor<[1,12,16,2,i32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 192 + d1 * 16 + d2, d3), memory_config: (6, 1, 'tile<32x32, u32>', 'dram') | yes | -0.03 | nan | +| ttnn.reshape | tensor<[12,16,2,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16 + d1, d2), memory_config: (6, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 
12 : i32, 16 : i32, 2 : i32] | tensor<[1,12,16,2,i32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 192 + d1 * 16 + d2, d3), memory_config: (6, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[12,197,197,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 197 + d1, d2), memory_config: (74, 7, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 12 : i32, 197 : i32, 197 : i32] | tensor<[1,12,197,197,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2364 + d1 * 197 + d2, d3), memory_config: (74, 7, 'tile<32x32, bf16>', 'dram') | yes | 0.23 | 6.19 | +| ttnn.reshape | tensor<[12,197,197,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 197 + d1, d2), memory_config: (74, 7, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 12 : i32, 197 : i32, 197 : i32] | tensor<[1,12,197,197,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2364 + d1 * 197 + d2, d3), memory_config: (74, 7, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[12,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 12 : i32, 1 : i32, 1 : i32] | tensor<[1,12,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.08 | 22.0 | +| ttnn.reshape | tensor<[12,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 12 : i32, 1 : i32, 1 : i32] | tensor<[1,12,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[134,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 5, 'tile<32x32, f32>', 'dram') | shape: [134 : i32, 1 : i32] | tensor<[134,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (5, 1, 'tile<32x32, f32>', 'dram') | yes 
| 1.0 | 0.0 | +| ttnn.reshape | tensor<[134,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 5, 'tile<32x32, f32>', 'dram') | shape: [134 : i32, 1 : i32] | tensor<[134,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (5, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[134,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (5, 1, 'tile<32x32, f32>', 'dram') | shape: [134 : i32, 1 : i32, 1 : i32] | tensor<[134,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (5, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[134,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (5, 1, 'tile<32x32, f32>', 'dram') | shape: [134 : i32, 1 : i32, 1 : i32] | tensor<[134,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (5, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[144,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 5, 'tile<32x32, f32>', 'dram') | shape: [144 : i32, 1 : i32] | tensor<[144,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (5, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[144,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 5, 'tile<32x32, f32>', 'dram') | shape: [144 : i32, 1 : i32] | tensor<[144,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (5, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[144,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (5, 1, 'tile<32x32, f32>', 'dram') | shape: [144 : i32, 1 : i32, 1 : i32] | tensor<[144,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (5, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[144,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (5, 1, 
'tile<32x32, f32>', 'dram') | shape: [144 : i32, 1 : i32, 1 : i32] | tensor<[144,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (5, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[14,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [14 : i32, 1 : i32] | tensor<[14,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[14,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [14 : i32, 1 : i32] | tensor<[14,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[14,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [14 : i32, 1 : i32, 1 : i32] | tensor<[14,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[14,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [14 : i32, 1 : i32, 1 : i32] | tensor<[14,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[15,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 15 : i32] | tensor<[1,15,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | 0.54 | nan | +| ttnn.reshape | tensor<[15,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 15 : i32] | tensor<[1,15,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 
1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[15,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [15 : i32, 1 : i32] | tensor<[15,1,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | 0.54 | nan | +| ttnn.reshape | tensor<[15,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [15 : i32, 1 : i32] | tensor<[15,1,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[160,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 5, 'tile<32x32, f32>', 'dram') | shape: [160 : i32, 1 : i32] | tensor<[160,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (5, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[160,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 5, 'tile<32x32, f32>', 'dram') | shape: [160 : i32, 1 : i32] | tensor<[160,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (5, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[160,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 5, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 160 : i32] | tensor<[1,160,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 5, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[160,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 5, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 160 : i32] | tensor<[1,160,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 5, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[160,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (5, 1, 'tile<32x32, f32>', 'dram') | shape: [160 : i32, 1 
: i32, 1 : i32] | tensor<[160,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (5, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[160,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (5, 1, 'tile<32x32, f32>', 'dram') | shape: [160 : i32, 1 : i32, 1 : i32] | tensor<[160,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (5, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[168,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 6, 'tile<32x32, f32>', 'dram') | shape: [168 : i32, 1 : i32] | tensor<[168,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (6, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[168,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 6, 'tile<32x32, f32>', 'dram') | shape: [168 : i32, 1 : i32] | tensor<[168,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (6, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[168,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (6, 1, 'tile<32x32, f32>', 'dram') | shape: [168 : i32, 1 : i32, 1 : i32] | tensor<[168,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (6, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[168,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (6, 1, 'tile<32x32, f32>', 'dram') | shape: [168 : i32, 1 : i32, 1 : i32] | tensor<[168,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (6, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[16,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [16 : i32, 1 : i32] | tensor<[16,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 
'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[16,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [16 : i32, 1 : i32] | tensor<[16,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[16,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [16 : i32, 1 : i32] | tensor<[16,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[16,10,10,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (5, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 10 : i32, 10 : i32] | tensor<[1,16,10,10,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 160 + d1 * 10 + d2, d3), memory_config: (5, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.02 | 42.5 | +| ttnn.reshape | tensor<[16,10,10,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (5, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 10 : i32, 10 : i32] | tensor<[1,16,10,10,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 160 + d1 * 10 + d2, d3), memory_config: (5, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[16,197,197,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 197 + d1, d2), memory_config: (99, 7, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 197 : i32, 197 : i32] | tensor<[1,16,197,197,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3152 + d1 * 197 + d2, d3), memory_config: (99, 7, 'tile<32x32, bf16>', 'dram') | yes | 0.24 | 7.41 | +| ttnn.reshape | tensor<[16,197,197,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 197 + d1, d2), memory_config: (99, 7, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 197 : i32, 197 : 
i32] | tensor<[1,16,197,197,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3152 + d1 * 197 + d2, d3), memory_config: (99, 7, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[16,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [16 : i32, 1 : i32, 1 : i32] | tensor<[16,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[16,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [16 : i32, 1 : i32, 1 : i32] | tensor<[16,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[16,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 1 : i32, 1 : i32] | tensor<[1,16,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.11 | 11.88 | +| ttnn.reshape | tensor<[16,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 1 : i32, 1 : i32] | tensor<[1,16,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[184,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 6, 'tile<32x32, f32>', 'dram') | shape: [184 : i32, 1 : i32] | tensor<[184,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (6, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[184,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 6, 'tile<32x32, f32>', 'dram') | shape: 
[184 : i32, 1 : i32] | tensor<[184,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (6, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[184,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (6, 1, 'tile<32x32, f32>', 'dram') | shape: [184 : i32, 1 : i32, 1 : i32] | tensor<[184,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (6, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[184,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (6, 1, 'tile<32x32, f32>', 'dram') | shape: [184 : i32, 1 : i32, 1 : i32] | tensor<[184,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (6, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1920,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 60, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1920 : i32] | tensor<[1,1920,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 60, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1920,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 60, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1920 : i32] | tensor<[1,1920,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 60, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[192,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 6, 'tile<32x32, f32>', 'dram') | shape: [192 : i32, 1 : i32] | tensor<[192,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (6, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[192,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 6, 'tile<32x32, f32>', 'dram') | shape: [192 : i32, 1 : i32] | tensor<[192,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (6, 1, 'tile<32x32, f32>', 
'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[192,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (6, 1, 'tile<32x32, f32>', 'dram') | shape: [192 : i32, 1 : i32, 1 : i32] | tensor<[192,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (6, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[192,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (6, 1, 'tile<32x32, f32>', 'dram') | shape: [192 : i32, 1 : i32, 1 : i32] | tensor<[192,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (6, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[196,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 7, 'tile<32x32, f32>', 'dram') | shape: [196 : i32, 1 : i32] | tensor<[196,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (7, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[196,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 7, 'tile<32x32, f32>', 'dram') | shape: [196 : i32, 1 : i32] | tensor<[196,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (7, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[196,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (7, 1, 'tile<32x32, f32>', 'dram') | shape: [196 : i32, 1 : i32, 1 : i32] | tensor<[196,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (7, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[196,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (7, 1, 'tile<32x32, f32>', 'dram') | shape: [196 : i32, 1 : i32, 1 : i32] | tensor<[196,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (7, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[19,i32]> | mapping_from: (d0), mapping_to: 
(0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 19 : i32] | tensor<[1,19,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | 0.52 | nan | +| ttnn.reshape | tensor<[19,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 19 : i32] | tensor<[1,19,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[19,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [19 : i32, 1 : i32] | tensor<[19,1,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | -0.1 | nan | +| ttnn.reshape | tensor<[19,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [19 : i32, 1 : i32] | tensor<[19,1,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[19,19,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 19 : i32, 19 : i32] | tensor<[1,19,19,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 19 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.38 | 3.3895313892515355e+38 | +| ttnn.reshape | tensor<[19,19,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 19 : i32, 19 : i32] | tensor<[1,19,19,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 19 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32] | tensor<[1,1,i32]> | mapping_from: 
(d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32] | tensor<[1,1,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32] | tensor<[1,1,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,10,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 10 : i32] | tensor<[1,1,10,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | 1.0 | nan | +| ttnn.reshape | tensor<[1,10,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 10 : i32] | tensor<[1,1,10,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,120,160,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 120 + d1, d2), memory_config: (4, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 120 : i32, 160 : i32] | tensor<[1,1,120,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 120 + d1 * 120 + d2, d3), memory_config: (4, 5, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,120,160,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 120 + d1, d2), memory_config: (4, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 120 : i32, 160 : i32] | tensor<[1,1,120,160,bf16]> | mapping_from: (d0, d1, d2, d3), 
mapping_to: (d0 * 120 + d1 * 120 + d2, d3), memory_config: (4, 5, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1280,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1280 : i32, 1 : i32] | tensor<[1,1280,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1280 + d1, d2), memory_config: (40, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1280,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1280 : i32, 1 : i32] | tensor<[1,1280,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1280 + d1, d2), memory_config: (40, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1280,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1280 + d1, d2), memory_config: (40, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1280 : i32, 1 : i32, 1 : i32] | tensor<[1,1280,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1280 + d1 + d2, d3), memory_config: (40, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.01 | inf | +| ttnn.reshape | tensor<[1,1280,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1280 + d1, d2), memory_config: (40, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1280 : i32, 1 : i32, 1 : i32] | tensor<[1,1280,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1280 + d1 + d2, d3), memory_config: (40, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,12,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 12 : i32] | tensor<[1,1,12,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | 1.0 | nan | +| ttnn.reshape | tensor<[1,12,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 
'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 12 : i32] | tensor<[1,1,12,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,12,16,2,i32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 192 + d1 * 16 + d2, d3), memory_config: (6, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 12 : i32, 16 : i32, 2 : i32] | tensor<[1,1,12,16,2,i32]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 192 + d1 * 192 + d2 * 16 + d3, d4), memory_config: (6, 1, 'tile<32x32, u32>', 'dram') | yes | -0.03 | nan | +| ttnn.reshape | tensor<[1,12,16,2,i32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 192 + d1 * 16 + d2, d3), memory_config: (6, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 12 : i32, 16 : i32, 2 : i32] | tensor<[1,1,12,16,2,i32]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 192 + d1 * 192 + d2 * 16 + d3, d4), memory_config: (6, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,14,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 14 : i32] | tensor<[1,1,14,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | 1.0 | nan | +| ttnn.reshape | tensor<[1,14,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 14 : i32] | tensor<[1,1,14,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,15,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 15 : i32] | tensor<[1,1,15,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') 
| yes | 1.0 | nan | +| ttnn.reshape | tensor<[1,15,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 15 : i32] | tensor<[1,1,15,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 16 : i32] | tensor<[1,1,16,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | 1.0 | nan | +| ttnn.reshape | tensor<[1,16,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 16 : i32] | tensor<[1,1,16,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1920,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 60, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1920 : i32, 1 : i32] | tensor<[1,1920,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1920 + d1, d2), memory_config: (60, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1920,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 60, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1920 : i32, 1 : i32] | tensor<[1,1920,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1920 + d1, d2), memory_config: (60, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1920,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1920 + d1, d2), memory_config: (60, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1920 : i32, 1 : i32, 1 : i32] | tensor<[1,1920,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1920 + d1 + d2, d3), memory_config: 
(60, 1, 'tile<32x32, bf16>', 'dram') | yes | -0.07 | 1.53 | +| ttnn.reshape | tensor<[1,1920,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1920 + d1, d2), memory_config: (60, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1920 : i32, 1 : i32, 1 : i32] | tensor<[1,1920,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1920 + d1 + d2, d3), memory_config: (60, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,192,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 6, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 192 : i32] | tensor<[1,1,192,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 6, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,192,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 6, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 192 : i32] | tensor<[1,1,192,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 6, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,19,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 19 : i32] | tensor<[1,1,19,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | 1.0 | nan | +| ttnn.reshape | tensor<[1,19,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 19 : i32] | tensor<[1,1,19,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,19,19,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 19 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 19 : i32, 19 : i32] | tensor<[1,1,19,19,bf16]> | mapping_from: (d0, d1, 
d2, d3), mapping_to: (d0 * 19 + d1 * 19 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.38 | 3.3895313892515355e+38 | +| ttnn.reshape | tensor<[1,19,19,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 19 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 19 : i32, 19 : i32] | tensor<[1,1,19,19,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 19 + d1 * 19 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,10,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 10 : i32] | tensor<[1,1,1,10,i32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | 1.0 | nan | +| ttnn.reshape | tensor<[1,1,10,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 10 : i32] | tensor<[1,1,1,10,i32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,12,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 12 : i32] | tensor<[1,1,1,12,i32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | 1.0 | nan | +| ttnn.reshape | tensor<[1,1,12,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 12 : i32] | tensor<[1,1,1,12,i32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,14,i32]> | mapping_from: (d0, d1, 
d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 14 : i32] | tensor<[1,1,1,14,i32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | 1.0 | nan | +| ttnn.reshape | tensor<[1,1,14,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 14 : i32] | tensor<[1,1,1,14,i32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,15,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 15 : i32] | tensor<[1,1,1,15,i32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | 1.0 | nan | +| ttnn.reshape | tensor<[1,1,15,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 15 : i32] | tensor<[1,1,1,15,i32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,16,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 16 : i32] | tensor<[1,1,1,16,i32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | 1.0 | nan | +| ttnn.reshape | tensor<[1,1,16,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 16 : i32] | tensor<[1,1,1,16,i32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), 
memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,19,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 19 : i32] | tensor<[1,1,1,19,i32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | 1.0 | nan | +| ttnn.reshape | tensor<[1,1,19,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 19 : i32] | tensor<[1,1,1,19,i32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,201,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 7, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 201 : i32] | tensor<[1,1,1,201,i32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 7, 'tile<32x32, u32>', 'dram') | yes | 1.0 | nan | +| ttnn.reshape | tensor<[1,1,201,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 7, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 201 : i32] | tensor<[1,1,1,201,i32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 7, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,2048,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 64, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 2048 : i32] | tensor<[1,1,1,2048,i32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 64, 'tile<32x32, u32>', 'dram') | yes | 0.24 | nan | +| ttnn.reshape | tensor<[1,1,2048,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 64, 'tile<32x32, u32>', 
'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 2048 : i32] | tensor<[1,1,1,2048,i32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 64, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,256,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 8, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 256 : i32] | tensor<[1,1,1,256,i32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 8, 'tile<32x32, u32>', 'dram') | yes | 1.0 | nan | +| ttnn.reshape | tensor<[1,1,256,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 8, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 256 : i32] | tensor<[1,1,1,256,i32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 8, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,25,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 25 : i32] | tensor<[1,1,1,25,i32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | 1.0 | nan | +| ttnn.reshape | tensor<[1,1,25,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 25 : i32] | tensor<[1,1,1,25,i32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,2,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 2 : i32] | tensor<[1,1,1,2,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | 
+| ttnn.reshape | tensor<[1,1,2,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 2 : i32] | tensor<[1,1,1,2,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,32,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 32 : i32] | tensor<[1,1,1,32,i32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | 1.0 | nan | +| ttnn.reshape | tensor<[1,1,32,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 32 : i32] | tensor<[1,1,1,32,i32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,5,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 5 : i32] | tensor<[1,1,1,5,i32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | 1.0 | nan | +| ttnn.reshape | tensor<[1,1,5,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 5 : i32] | tensor<[1,1,1,5,i32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,6,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 6 : i32] | tensor<[1,1,1,6,i32]> | mapping_from: 
(d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | 1.0 | nan | +| ttnn.reshape | tensor<[1,1,6,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 6 : i32] | tensor<[1,1,1,6,i32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,7,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 7 : i32] | tensor<[1,1,1,7,i32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | 1.0 | nan | +| ttnn.reshape | tensor<[1,1,7,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 7 : i32] | tensor<[1,1,1,7,i32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,8,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 8 : i32] | tensor<[1,1,1,8,i32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | 1.0 | nan | +| ttnn.reshape | tensor<[1,1,8,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 8 : i32] | tensor<[1,1,1,8,i32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,9,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 
'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 9 : i32] | tensor<[1,1,1,9,i32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | 1.0 | nan | +| ttnn.reshape | tensor<[1,1,9,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 9 : i32] | tensor<[1,1,1,9,i32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,201,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 7, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 201 : i32] | tensor<[1,1,201,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 7, 'tile<32x32, u32>', 'dram') | yes | 1.0 | nan | +| ttnn.reshape | tensor<[1,201,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 7, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 201 : i32] | tensor<[1,1,201,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 7, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,2048,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 64, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 2048 : i32] | tensor<[1,1,2048,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 64, 'tile<32x32, u32>', 'dram') | yes | 0.24 | nan | +| ttnn.reshape | tensor<[1,2048,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 64, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 2048 : i32] | tensor<[1,1,2048,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 64, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,23,40,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 23 + d1, 
d2), memory_config: (1, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 23 : i32, 40 : i32, 1 : i32] | tensor<[1,23,40,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 920 + d1 * 40 + d2, d3), memory_config: (29, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,23,40,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 23 + d1, d2), memory_config: (1, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 23 : i32, 40 : i32, 1 : i32] | tensor<[1,23,40,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 920 + d1 * 40 + d2, d3), memory_config: (29, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,2560,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 80, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 2560 : i32, 1 : i32] | tensor<[1,2560,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 2560 + d1, d2), memory_config: (80, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,2560,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 80, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 2560 : i32, 1 : i32] | tensor<[1,2560,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 2560 + d1, d2), memory_config: (80, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,2560,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 2560 + d1, d2), memory_config: (80, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 2560 : i32, 1 : i32, 1 : i32] | tensor<[1,2560,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2560 + d1 + d2, d3), memory_config: (80, 1, 'tile<32x32, bf16>', 'dram') | yes | -0.22 | 2.44 | +| ttnn.reshape | tensor<[1,2560,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 2560 + d1, d2), memory_config: (80, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 2560 : i32, 1 : i32, 1 : i32] | tensor<[1,2560,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: 
(d0 * 2560 + d1 + d2, d3), memory_config: (80, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,256,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 8, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 256 : i32] | tensor<[1,1,256,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 8, 'tile<32x32, u32>', 'dram') | yes | 1.0 | nan | +| ttnn.reshape | tensor<[1,256,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 8, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 256 : i32] | tensor<[1,1,256,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 8, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,25,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 25 : i32] | tensor<[1,1,25,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | 1.0 | nan | +| ttnn.reshape | tensor<[1,25,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 25 : i32] | tensor<[1,1,25,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,2,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 2 : i32] | tensor<[1,1,2,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,2,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 2 : i32] | tensor<[1,1,2,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 
'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,30,40,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 30 + d1, d2), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 30 : i32, 40 : i32] | tensor<[1,1,30,40,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 30 + d1 * 30 + d2, d3), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | yes | 0.66 | 0.48 | +| ttnn.reshape | tensor<[1,30,40,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 30 + d1, d2), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 30 : i32, 40 : i32] | tensor<[1,1,30,40,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 30 + d1 * 30 + d2, d3), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,320,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 320 : i32, 1 : i32] | tensor<[1,320,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 320 + d1, d2), memory_config: (10, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,320,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 320 : i32, 1 : i32] | tensor<[1,320,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 320 + d1, d2), memory_config: (10, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,320,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 320 + d1, d2), memory_config: (10, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 320 : i32, 1 : i32, 1 : i32] | tensor<[1,320,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 320 + d1 + d2, d3), memory_config: (10, 1, 'tile<32x32, bf16>', 'dram') | yes | -0.03 | 2.0120150326822457e+34 | +| ttnn.reshape | tensor<[1,320,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 320 + d1, d2), memory_config: (10, 
1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 320 : i32, 1 : i32, 1 : i32] | tensor<[1,320,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 320 + d1 + d2, d3), memory_config: (10, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 32 : i32] | tensor<[1,1,32,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | 1.0 | nan | +| ttnn.reshape | tensor<[1,32,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 32 : i32] | tensor<[1,1,32,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 32 : i32, 1 : i32] | tensor<[1,32,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,32,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 32 : i32, 1 : i32] | tensor<[1,32,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,128,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (1, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 32 : i32, 128 : i32] | tensor<[1,1,32,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 32 + d1 * 32 + d2, d3), memory_config: (1, 4, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,32,128,bf16]> | mapping_from: (d0, d1, 
d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (1, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 32 : i32, 128 : i32] | tensor<[1,1,32,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 32 + d1 * 32 + d2, d3), memory_config: (1, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 32 : i32, 32 : i32] | tensor<[1,1,32,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 32 + d1 * 32 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,32,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 32 : i32, 32 : i32] | tensor<[1,1,32,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 32 + d1 * 32 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,384,512,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 384 + d1, d2), memory_config: (12, 16, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 384 : i32, 512 : i32] | tensor<[1,1,384,512,i32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 384 + d1 * 384 + d2, d3), memory_config: (12, 16, 'tile<32x32, u32>', 'dram') | yes | 1.0 | nan | +| ttnn.reshape | tensor<[1,384,512,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 384 + d1, d2), memory_config: (12, 16, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 384 : i32, 512 : i32] | tensor<[1,1,384,512,i32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 384 + d1 * 384 + d2, d3), memory_config: (12, 16, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,5,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 5 : 
i32] | tensor<[1,1,5,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | 1.0 | nan | +| ttnn.reshape | tensor<[1,5,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 5 : i32] | tensor<[1,1,5,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,5,16,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 5 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 5 : i32, 1 : i32, 16 : i32] | tensor<[1,5,1,16,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 5 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | yes | 0.22 | 1.67 | +| ttnn.reshape | tensor<[1,5,16,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 5 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 5 : i32, 1 : i32, 16 : i32] | tensor<[1,5,1,16,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 5 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,5,1,16,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 5 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 5 : i32, 1 : i32, 16 : i32, 1 : i32] | tensor<[1,5,1,16,1,f32]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 80 + d1 * 16 + d2 * 16 + d3, d4), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | yes | 0.22 | 1.67 | +| ttnn.reshape | tensor<[1,5,1,16,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 5 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 5 : i32, 1 : i32, 16 : i32, 1 : i32] | tensor<[1,5,1,16,1,f32]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 80 + d1 * 16 + d2 * 16 + d3, d4), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | N/A | 
nan | nan | +| ttnn.reshape | tensor<[1,5,5,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 5 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 5 : i32, 5 : i32] | tensor<[1,1,5,5,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 5 + d1 * 5 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.53 | 3.3895313892515355e+38 | +| ttnn.reshape | tensor<[1,5,5,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 5 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 5 : i32, 5 : i32] | tensor<[1,1,5,5,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 5 + d1 * 5 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,60,80,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 60 + d1, d2), memory_config: (2, 3, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 60 : i32, 80 : i32] | tensor<[1,1,60,80,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 60 + d1 * 60 + d2, d3), memory_config: (2, 3, 'tile<32x32, bf16>', 'dram') | yes | 0.15 | 0.71 | +| ttnn.reshape | tensor<[1,60,80,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 60 + d1, d2), memory_config: (2, 3, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 60 : i32, 80 : i32] | tensor<[1,1,60,80,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 60 + d1 * 60 + d2, d3), memory_config: (2, 3, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,640,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 640 : i32, 1 : i32] | tensor<[1,640,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 640 + d1, d2), memory_config: (20, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,640,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 20, 'tile<32x32, bf16>', 'dram') | 
shape: [1 : i32, 640 : i32, 1 : i32] | tensor<[1,640,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 640 + d1, d2), memory_config: (20, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,640,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 640 + d1, d2), memory_config: (20, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 640 : i32, 1 : i32, 1 : i32] | tensor<[1,640,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 640 + d1 + d2, d3), memory_config: (20, 1, 'tile<32x32, bf16>', 'dram') | yes | -0.01 | 8.56 | +| ttnn.reshape | tensor<[1,640,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 640 + d1, d2), memory_config: (20, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 640 : i32, 1 : i32, 1 : i32] | tensor<[1,640,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 640 + d1 + d2, d3), memory_config: (20, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,64,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 64 : i32, 1 : i32] | tensor<[1,64,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (2, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,64,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 64 : i32, 1 : i32] | tensor<[1,64,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (2, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,6,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 6 : i32] | tensor<[1,1,6,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | 1.0 | nan | +| ttnn.reshape | tensor<[1,6,i32]> | mapping_from: (d0, d1), mapping_to: 
(d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 6 : i32] | tensor<[1,1,6,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,6,6,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 6 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 6 : i32, 6 : i32] | tensor<[1,1,6,6,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6 + d1 * 6 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.48 | 3.3895313892515355e+38 | +| ttnn.reshape | tensor<[1,6,6,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 6 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 6 : i32, 6 : i32] | tensor<[1,1,6,6,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6 + d1 * 6 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,720,1280,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 720 + d1, d2), memory_config: (23, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 720 : i32, 1280 : i32] | tensor<[1,1,720,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 720 + d1 * 720 + d2, d3), memory_config: (23, 40, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,720,1280,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 720 + d1, d2), memory_config: (23, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 720 : i32, 1280 : i32] | tensor<[1,1,720,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 720 + d1 * 720 + d2, d3), memory_config: (23, 40, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,7,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 7 : i32] | tensor<[1,1,7,i32]> | mapping_from: (d0, 
d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | 1.0 | nan | +| ttnn.reshape | tensor<[1,7,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 7 : i32] | tensor<[1,1,7,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,7,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 7 : i32, 64 : i32] | tensor<[1,1,7,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 7 + d1 * 7 + d2, d3), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,7,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 7 : i32, 64 : i32] | tensor<[1,1,7,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 7 + d1 * 7 + d2, d3), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,7,7,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 7 : i32, 7 : i32] | tensor<[1,1,7,7,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 7 + d1 * 7 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.4 | 3.3895313892515355e+38 | +| ttnn.reshape | tensor<[1,7,7,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 7 : i32, 7 : i32] | tensor<[1,1,7,7,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 7 + d1 * 7 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,8,i32]> | mapping_from: (d0, d1), mapping_to: (d0, 
d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 8 : i32] | tensor<[1,1,8,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | 1.0 | nan | +| ttnn.reshape | tensor<[1,8,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 8 : i32] | tensor<[1,1,8,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,960,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 30, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 960 : i32, 1 : i32] | tensor<[1,960,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 960 + d1, d2), memory_config: (30, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,960,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 30, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 960 : i32, 1 : i32] | tensor<[1,960,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 960 + d1, d2), memory_config: (30, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,960,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 960 + d1, d2), memory_config: (30, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 960 : i32, 1 : i32, 1 : i32] | tensor<[1,960,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 960 + d1 + d2, d3), memory_config: (30, 1, 'tile<32x32, bf16>', 'dram') | yes | -0.03 | 0.91 | +| ttnn.reshape | tensor<[1,960,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 960 + d1, d2), memory_config: (30, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 960 : i32, 1 : i32, 1 : i32] | tensor<[1,960,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 960 + d1 + d2, d3), memory_config: (30, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| 
ttnn.reshape | tensor<[1,9,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 9 : i32] | tensor<[1,1,9,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | 1.0 | nan | +| ttnn.reshape | tensor<[1,9,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32, 9 : i32] | tensor<[1,1,9,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[200,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 7, 'tile<32x32, f32>', 'dram') | shape: [200 : i32, 1 : i32] | tensor<[200,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (7, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[200,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 7, 'tile<32x32, f32>', 'dram') | shape: [200 : i32, 1 : i32] | tensor<[200,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (7, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[200,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (7, 1, 'tile<32x32, f32>', 'dram') | shape: [200 : i32, 1 : i32, 1 : i32] | tensor<[200,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (7, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[200,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (7, 1, 'tile<32x32, f32>', 'dram') | shape: [200 : i32, 1 : i32, 1 : i32] | tensor<[200,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (7, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[2048,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 64, 
'tile<32x32, f32>', 'dram') | shape: [2048 : i32, 1 : i32] | tensor<[2048,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (64, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[2048,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 64, 'tile<32x32, f32>', 'dram') | shape: [2048 : i32, 1 : i32] | tensor<[2048,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (64, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[2048,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (64, 1, 'tile<32x32, f32>', 'dram') | shape: [2048 : i32, 1 : i32, 1 : i32] | tensor<[2048,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (64, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[2048,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (64, 1, 'tile<32x32, f32>', 'dram') | shape: [2048 : i32, 1 : i32, 1 : i32] | tensor<[2048,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (64, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[20,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [20 : i32, 1 : i32] | tensor<[20,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[20,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [20 : i32, 1 : i32] | tensor<[20,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[20,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [20 : i32, 1 : i32, 1 : i32] | tensor<[20,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: 
(d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[20,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [20 : i32, 1 : i32, 1 : i32] | tensor<[20,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[23,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [23 : i32, 1 : i32] | tensor<[23,1,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | 0.51 | nan | +| ttnn.reshape | tensor<[23,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [23 : i32, 1 : i32] | tensor<[23,1,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[240,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | shape: [240 : i32, 1 : i32] | tensor<[240,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (8, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[240,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | shape: [240 : i32, 1 : i32] | tensor<[240,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (8, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[240,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (8, 1, 'tile<32x32, f32>', 'dram') | shape: [240 : i32, 1 : i32, 1 : i32] | tensor<[240,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (8, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[240,1,f32]> | mapping_from: (d0, d1), 
mapping_to: (d0, d1), memory_config: (8, 1, 'tile<32x32, f32>', 'dram') | shape: [240 : i32, 1 : i32, 1 : i32] | tensor<[240,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (8, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[24,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [24 : i32, 1 : i32] | tensor<[24,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[24,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [24 : i32, 1 : i32] | tensor<[24,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[24,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [24 : i32, 1 : i32, 1 : i32] | tensor<[24,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[24,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [24 : i32, 1 : i32, 1 : i32] | tensor<[24,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[2560,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 80, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 2560 : i32] | tensor<[1,2560,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 80, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[2560,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 80, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 2560 : i32] | tensor<[1,2560,bf16]> | 
mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 80, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[256,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | shape: [256 : i32, 1 : i32] | tensor<[256,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (8, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[256,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | shape: [256 : i32, 1 : i32] | tensor<[256,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (8, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[256,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (8, 1, 'tile<32x32, f32>', 'dram') | shape: [256 : i32, 1 : i32, 1 : i32] | tensor<[256,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (8, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[256,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (8, 1, 'tile<32x32, f32>', 'dram') | shape: [256 : i32, 1 : i32, 1 : i32] | tensor<[256,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (8, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[272,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 9, 'tile<32x32, f32>', 'dram') | shape: [272 : i32, 1 : i32] | tensor<[272,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (9, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[272,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 9, 'tile<32x32, f32>', 'dram') | shape: [272 : i32, 1 : i32] | tensor<[272,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (9, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | 
tensor<[272,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (9, 1, 'tile<32x32, f32>', 'dram') | shape: [272 : i32, 1 : i32, 1 : i32] | tensor<[272,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (9, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[272,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (9, 1, 'tile<32x32, f32>', 'dram') | shape: [272 : i32, 1 : i32, 1 : i32] | tensor<[272,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (9, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[28,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [28 : i32, 1 : i32] | tensor<[28,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[28,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [28 : i32, 1 : i32] | tensor<[28,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[28,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [28 : i32, 1 : i32, 1 : i32] | tensor<[28,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[28,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [28 : i32, 1 : i32, 1 : i32] | tensor<[28,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[2,196,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 7, 'tile<32x32, u32>', 
'dram') | shape: [2 : i32, 1 : i32, 196 : i32] | tensor<[2,1,196,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 7, 'tile<32x32, u32>', 'dram') | yes | 0.04 | nan | +| ttnn.reshape | tensor<[2,196,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 7, 'tile<32x32, u32>', 'dram') | shape: [2 : i32, 1 : i32, 196 : i32] | tensor<[2,1,196,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 7, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[2,196,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 7, 'tile<32x32, u32>', 'dram') | shape: [2 : i32, 196 : i32, 1 : i32] | tensor<[2,196,1,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 196 + d1, d2), memory_config: (13, 1, 'tile<32x32, u32>', 'dram') | yes | 0.04 | nan | +| ttnn.reshape | tensor<[2,196,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 7, 'tile<32x32, u32>', 'dram') | shape: [2 : i32, 196 : i32, 1 : i32] | tensor<[2,196,1,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 196 + d1, d2), memory_config: (13, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[2,1,7,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [2 : i32, 1 : i32, 1 : i32, 7 : i32] | tensor<[2,1,1,7,i32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | 1.0 | nan | +| ttnn.reshape | tensor<[2,1,7,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [2 : i32, 1 : i32, 1 : i32, 7 : i32] | tensor<[2,1,1,7,i32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[2,7,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 
'tile<32x32, u32>', 'dram') | shape: [2 : i32, 1 : i32, 7 : i32] | tensor<[2,1,7,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | 1.0 | nan | +| ttnn.reshape | tensor<[2,7,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [2 : i32, 1 : i32, 7 : i32] | tensor<[2,1,7,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[320,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 10, 'tile<32x32, f32>', 'dram') | shape: [320 : i32, 1 : i32] | tensor<[320,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (10, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[320,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 10, 'tile<32x32, f32>', 'dram') | shape: [320 : i32, 1 : i32] | tensor<[320,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (10, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[320,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 320 : i32] | tensor<[1,320,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 10, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[320,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 320 : i32] | tensor<[1,320,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 10, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[320,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (10, 1, 'tile<32x32, f32>', 'dram') | shape: [320 : i32, 1 : i32, 1 : i32] | tensor<[320,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, 
d2), memory_config: (10, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[320,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (10, 1, 'tile<32x32, f32>', 'dram') | shape: [320 : i32, 1 : i32, 1 : i32] | tensor<[320,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (10, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[3234,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 102, 'tile<32x32, f32>', 'dram') | shape: [3234 : i32, 1 : i32] | tensor<[3234,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (102, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[3234,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 102, 'tile<32x32, f32>', 'dram') | shape: [3234 : i32, 1 : i32] | tensor<[3234,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (102, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[32,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [32 : i32, 1 : i32] | tensor<[32,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[32,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [32 : i32, 1 : i32] | tensor<[32,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[32,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 32 : i32] | tensor<[1,32,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | 0.25 | nan | +| ttnn.reshape | tensor<[32,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: 
(1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 32 : i32] | tensor<[1,32,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[32,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [32 : i32, 1 : i32, 1 : i32] | tensor<[32,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[32,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [32 : i32, 1 : i32, 1 : i32] | tensor<[32,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[32,32,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 32 : i32] | tensor<[1,32,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[32,32,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 32 : i32] | tensor<[1,32,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[334,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 11, 'tile<32x32, f32>', 'dram') | shape: [334 : i32, 1 : i32] | tensor<[334,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (11, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[334,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 11, 'tile<32x32, f32>', 'dram') | shape: [334 : i32, 1 : i32] | 
tensor<[334,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (11, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[334,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (11, 1, 'tile<32x32, f32>', 'dram') | shape: [334 : i32, 1 : i32, 1 : i32] | tensor<[334,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (11, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[334,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (11, 1, 'tile<32x32, f32>', 'dram') | shape: [334 : i32, 1 : i32, 1 : i32] | tensor<[334,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (11, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[34,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 2, 'tile<32x32, f32>', 'dram') | shape: [34 : i32, 1 : i32] | tensor<[34,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (2, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[34,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 2, 'tile<32x32, f32>', 'dram') | shape: [34 : i32, 1 : i32] | tensor<[34,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (2, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[34,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (2, 1, 'tile<32x32, f32>', 'dram') | shape: [34 : i32, 1 : i32, 1 : i32] | tensor<[34,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (2, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[34,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (2, 1, 'tile<32x32, f32>', 'dram') | shape: [34 : i32, 1 : i32, 1 : i32] | tensor<[34,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (2, 1, 'tile<32x32, 
f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[384,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 12, 'tile<32x32, f32>', 'dram') | shape: [384 : i32, 1 : i32] | tensor<[384,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (12, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[384,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 12, 'tile<32x32, f32>', 'dram') | shape: [384 : i32, 1 : i32] | tensor<[384,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (12, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[384,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (12, 1, 'tile<32x32, f32>', 'dram') | shape: [384 : i32, 1 : i32, 1 : i32] | tensor<[384,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (12, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[384,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (12, 1, 'tile<32x32, f32>', 'dram') | shape: [384 : i32, 1 : i32, 1 : i32] | tensor<[384,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (12, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[3,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [3 : i32, 1 : i32] | tensor<[3,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[3,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [3 : i32, 1 : i32] | tensor<[3,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[3,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, 
f32>', 'dram') | shape: [3 : i32, 1 : i32, 1 : i32] | tensor<[3,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[3,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [3 : i32, 1 : i32, 1 : i32] | tensor<[3,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[3,320,320,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 320 + d1, d2), memory_config: (30, 10, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 3 : i32, 320 : i32, 320 : i32] | tensor<[1,3,320,320,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 960 + d1 * 320 + d2, d3), memory_config: (30, 10, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[40,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 2, 'tile<32x32, f32>', 'dram') | shape: [40 : i32, 1 : i32] | tensor<[40,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (2, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[40,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 2, 'tile<32x32, f32>', 'dram') | shape: [40 : i32, 1 : i32] | tensor<[40,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (2, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[40,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (2, 1, 'tile<32x32, f32>', 'dram') | shape: [40 : i32, 1 : i32, 1 : i32] | tensor<[40,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (2, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[40,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (2, 1, 'tile<32x32, f32>', 'dram') | shape: [40 : i32, 1 : i32, 1 : i32] | 
tensor<[40,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (2, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[462,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 15, 'tile<32x32, f32>', 'dram') | shape: [462 : i32, 1 : i32] | tensor<[462,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (15, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[462,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 15, 'tile<32x32, f32>', 'dram') | shape: [462 : i32, 1 : i32] | tensor<[462,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (15, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[462,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (15, 1, 'tile<32x32, f32>', 'dram') | shape: [462 : i32, 1 : i32, 1 : i32] | tensor<[462,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (15, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[462,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (15, 1, 'tile<32x32, f32>', 'dram') | shape: [462 : i32, 1 : i32, 1 : i32] | tensor<[462,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (15, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[46,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 2, 'tile<32x32, f32>', 'dram') | shape: [46 : i32, 1 : i32] | tensor<[46,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (2, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[46,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 2, 'tile<32x32, f32>', 'dram') | shape: [46 : i32, 1 : i32] | tensor<[46,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (2, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | 
+| ttnn.reshape | tensor<[46,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (2, 1, 'tile<32x32, f32>', 'dram') | shape: [46 : i32, 1 : i32, 1 : i32] | tensor<[46,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (2, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[46,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (2, 1, 'tile<32x32, f32>', 'dram') | shape: [46 : i32, 1 : i32, 1 : i32] | tensor<[46,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (2, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[480,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 15, 'tile<32x32, f32>', 'dram') | shape: [480 : i32, 1 : i32] | tensor<[480,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (15, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[480,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 15, 'tile<32x32, f32>', 'dram') | shape: [480 : i32, 1 : i32] | tensor<[480,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (15, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[480,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (15, 1, 'tile<32x32, f32>', 'dram') | shape: [480 : i32, 1 : i32, 1 : i32] | tensor<[480,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (15, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[480,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (15, 1, 'tile<32x32, f32>', 'dram') | shape: [480 : i32, 1 : i32, 1 : i32] | tensor<[480,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (15, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[512,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: 
(1, 16, 'tile<32x32, f32>', 'dram') | shape: [512 : i32, 1 : i32] | tensor<[512,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (16, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[512,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 16, 'tile<32x32, f32>', 'dram') | shape: [512 : i32, 1 : i32] | tensor<[512,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (16, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[512,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (16, 1, 'tile<32x32, f32>', 'dram') | shape: [512 : i32, 1 : i32, 1 : i32] | tensor<[512,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (16, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[512,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (16, 1, 'tile<32x32, f32>', 'dram') | shape: [512 : i32, 1 : i32, 1 : i32] | tensor<[512,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (16, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[576,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 18, 'tile<32x32, f32>', 'dram') | shape: [576 : i32, 1 : i32] | tensor<[576,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (18, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[576,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 18, 'tile<32x32, f32>', 'dram') | shape: [576 : i32, 1 : i32] | tensor<[576,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (18, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[576,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (18, 1, 'tile<32x32, f32>', 'dram') | shape: [576 : i32, 1 : i32, 1 : i32] | tensor<[576,1,1,f32]> | mapping_from: (d0, d1, d2), 
mapping_to: (d0 + d1, d2), memory_config: (18, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[576,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (18, 1, 'tile<32x32, f32>', 'dram') | shape: [576 : i32, 1 : i32, 1 : i32] | tensor<[576,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (18, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[58,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 2, 'tile<32x32, f32>', 'dram') | shape: [58 : i32, 1 : i32] | tensor<[58,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (2, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[58,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 2, 'tile<32x32, f32>', 'dram') | shape: [58 : i32, 1 : i32] | tensor<[58,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (2, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[58,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (2, 1, 'tile<32x32, f32>', 'dram') | shape: [58 : i32, 1 : i32, 1 : i32] | tensor<[58,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (2, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[58,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (2, 1, 'tile<32x32, f32>', 'dram') | shape: [58 : i32, 1 : i32, 1 : i32] | tensor<[58,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (2, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[5,5,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 5 : i32, 5 : i32] | tensor<[1,5,5,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 5 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.53 | 
3.3895313892515355e+38 | +| ttnn.reshape | tensor<[5,5,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 5 : i32, 5 : i32] | tensor<[1,5,5,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 5 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[640,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 20, 'tile<32x32, f32>', 'dram') | shape: [640 : i32, 1 : i32] | tensor<[640,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (20, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[640,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 20, 'tile<32x32, f32>', 'dram') | shape: [640 : i32, 1 : i32] | tensor<[640,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (20, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[640,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 640 : i32] | tensor<[1,640,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 20, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[640,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 640 : i32] | tensor<[1,640,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 20, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[640,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (20, 1, 'tile<32x32, f32>', 'dram') | shape: [640 : i32, 1 : i32, 1 : i32] | tensor<[640,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (20, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[640,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (20, 
1, 'tile<32x32, f32>', 'dram') | shape: [640 : i32, 1 : i32, 1 : i32] | tensor<[640,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (20, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[64,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 2, 'tile<32x32, f32>', 'dram') | shape: [64 : i32, 1 : i32] | tensor<[64,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (2, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[64,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 2, 'tile<32x32, f32>', 'dram') | shape: [64 : i32, 1 : i32] | tensor<[64,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (2, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[64,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 64 : i32] | tensor<[1,64,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 2, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[64,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 64 : i32] | tensor<[1,64,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 2, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[64,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (2, 1, 'tile<32x32, f32>', 'dram') | shape: [64 : i32, 1 : i32, 1 : i32] | tensor<[64,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (2, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[64,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (2, 1, 'tile<32x32, f32>', 'dram') | shape: [64 : i32, 1 : i32, 1 : i32] | tensor<[64,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: 
(2, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[672,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 21, 'tile<32x32, f32>', 'dram') | shape: [672 : i32, 1 : i32] | tensor<[672,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (21, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[672,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 21, 'tile<32x32, f32>', 'dram') | shape: [672 : i32, 1 : i32] | tensor<[672,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (21, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[672,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (21, 1, 'tile<32x32, f32>', 'dram') | shape: [672 : i32, 1 : i32, 1 : i32] | tensor<[672,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (21, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[672,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (21, 1, 'tile<32x32, f32>', 'dram') | shape: [672 : i32, 1 : i32, 1 : i32] | tensor<[672,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (21, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[68,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 3, 'tile<32x32, f32>', 'dram') | shape: [68 : i32, 1 : i32] | tensor<[68,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[68,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 3, 'tile<32x32, f32>', 'dram') | shape: [68 : i32, 1 : i32] | tensor<[68,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[68,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), 
memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | shape: [68 : i32, 1 : i32, 1 : i32] | tensor<[68,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[68,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | shape: [68 : i32, 1 : i32, 1 : i32] | tensor<[68,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[6,15,15,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 15 + d1, d2), memory_config: (3, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 6 : i32, 15 : i32, 15 : i32] | tensor<[1,6,15,15,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 90 + d1 * 15 + d2, d3), memory_config: (3, 1, 'tile<32x32, bf16>', 'dram') | yes | -0.03 | 25.38 | +| ttnn.reshape | tensor<[6,15,15,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 15 + d1, d2), memory_config: (3, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 6 : i32, 15 : i32, 15 : i32] | tensor<[1,6,15,15,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 90 + d1 * 15 + d2, d3), memory_config: (3, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[6,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 6 : i32, 1 : i32, 1 : i32] | tensor<[1,6,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.14 | 25.12 | +| ttnn.reshape | tensor<[6,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 6 : i32, 1 : i32, 1 : i32] | tensor<[1,6,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6 + d1 + d2, d3), memory_config: (1, 1, 
'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[6,6,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 6 : i32, 6 : i32] | tensor<[1,6,6,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 6 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.48 | 3.3895313892515355e+38 | +| ttnn.reshape | tensor<[6,6,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 6 : i32, 6 : i32] | tensor<[1,6,6,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 6 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[72,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 3, 'tile<32x32, f32>', 'dram') | shape: [72 : i32, 1 : i32] | tensor<[72,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[72,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 3, 'tile<32x32, f32>', 'dram') | shape: [72 : i32, 1 : i32] | tensor<[72,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[72,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | shape: [72 : i32, 1 : i32, 1 : i32] | tensor<[72,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[72,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | shape: [72 : i32, 1 : i32, 1 : i32] | tensor<[72,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[78,f32]> | 
mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 3, 'tile<32x32, f32>', 'dram') | shape: [78 : i32, 1 : i32] | tensor<[78,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[78,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 3, 'tile<32x32, f32>', 'dram') | shape: [78 : i32, 1 : i32] | tensor<[78,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[78,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | shape: [78 : i32, 1 : i32, 1 : i32] | tensor<[78,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[78,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | shape: [78 : i32, 1 : i32, 1 : i32] | tensor<[78,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[7,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 7 : i32] | tensor<[1,7,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | 0.64 | nan | +| ttnn.reshape | tensor<[7,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 7 : i32] | tensor<[1,7,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[7,7,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 7 : i32] | tensor<[1,7,7,bf16]> | mapping_from: 
(d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.4 | 3.3895313892515355e+38 | +| ttnn.reshape | tensor<[7,7,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 7 : i32] | tensor<[1,7,7,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[80,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 3, 'tile<32x32, f32>', 'dram') | shape: [80 : i32, 1 : i32] | tensor<[80,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[80,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 3, 'tile<32x32, f32>', 'dram') | shape: [80 : i32, 1 : i32] | tensor<[80,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[80,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | shape: [80 : i32, 1 : i32, 1 : i32] | tensor<[80,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[80,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | shape: [80 : i32, 1 : i32, 1 : i32] | tensor<[80,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[8,10,10,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (3, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 8 : i32, 10 : i32, 10 : i32] | tensor<[1,8,10,10,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 80 + d1 * 10 + 
d2, d3), memory_config: (3, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.02 | 16.25 | +| ttnn.reshape | tensor<[8,10,10,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (3, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 8 : i32, 10 : i32, 10 : i32] | tensor<[1,8,10,10,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 80 + d1 * 10 + d2, d3), memory_config: (3, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[8,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 8 : i32, 1 : i32, 1 : i32] | tensor<[1,8,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 8 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.28 | 5.47 | +| ttnn.reshape | tensor<[8,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 8 : i32, 1 : i32, 1 : i32] | tensor<[1,8,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 8 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[960,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 30, 'tile<32x32, f32>', 'dram') | shape: [960 : i32, 1 : i32] | tensor<[960,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (30, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[960,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 30, 'tile<32x32, f32>', 'dram') | shape: [960 : i32, 1 : i32] | tensor<[960,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (30, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[960,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 30, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 960 : i32] | tensor<[1,960,bf16]> | mapping_from: (d0, d1), 
mapping_to: (d0, d1), memory_config: (1, 30, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[960,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 30, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 960 : i32] | tensor<[1,960,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 30, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[960,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (30, 1, 'tile<32x32, f32>', 'dram') | shape: [960 : i32, 1 : i32, 1 : i32] | tensor<[960,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (30, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[960,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (30, 1, 'tile<32x32, f32>', 'dram') | shape: [960 : i32, 1 : i32, 1 : i32] | tensor<[960,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (30, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[96,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 3, 'tile<32x32, f32>', 'dram') | shape: [96 : i32, 1 : i32] | tensor<[96,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[96,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 3, 'tile<32x32, f32>', 'dram') | shape: [96 : i32, 1 : i32] | tensor<[96,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[96,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | shape: [96 : i32, 1 : i32, 1 : i32] | tensor<[96,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | 
tensor<[96,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | shape: [96 : i32, 1 : i32, 1 : i32] | tensor<[96,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[98,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | shape: [98 : i32, 1 : i32] | tensor<[98,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[98,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | shape: [98 : i32, 1 : i32] | tensor<[98,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[98,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | shape: [98 : i32, 1 : i32, 1 : i32] | tensor<[98,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[98,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | shape: [98 : i32, 1 : i32, 1 : i32] | tensor<[98,1,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32] | tensor<[1,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32] | tensor<[1,i32]> | mapping_from: 
(d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[100,192,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (4, 6, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 100 : i32, 192 : i32] | tensor<[1,100,192,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 100 + d1, d2), memory_config: (4, 6, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[100,192,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (4, 6, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 100 : i32, 192 : i32] | tensor<[1,100,192,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 100 + d1, d2), memory_config: (4, 6, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[100,1,2048,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (4, 64, 'tile<32x32, bf16>', 'dram') | shape: [100 : i32, 2048 : i32] | tensor<[100,2048,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (4, 64, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[100,1,2048,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (4, 64, 'tile<32x32, bf16>', 'dram') | shape: [100 : i32, 2048 : i32] | tensor<[100,2048,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (4, 64, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[100,1,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (4, 8, 'tile<32x32, bf16>', 'dram') | shape: [100 : i32, 256 : i32] | tensor<[100,256,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (4, 8, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[100,1,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (4, 8, 'tile<32x32, bf16>', 'dram') | shape: [100 : i32, 256 : i32] | tensor<[100,256,bf16]> | mapping_from: 
(d0, d1), mapping_to: (d0, d1), memory_config: (4, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[100,1,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (4, 8, 'tile<32x32, bf16>', 'dram') | shape: [100 : i32, 8 : i32, 32 : i32] | tensor<[100,8,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 8 + d1, d2), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[100,1,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (4, 8, 'tile<32x32, bf16>', 'dram') | shape: [100 : i32, 8 : i32, 32 : i32] | tensor<[100,8,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 8 + d1, d2), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[100,2048,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (4, 64, 'tile<32x32, bf16>', 'dram') | shape: [100 : i32, 1 : i32, 2048 : i32] | tensor<[100,1,2048,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (4, 64, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[100,2048,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (4, 64, 'tile<32x32, bf16>', 'dram') | shape: [100 : i32, 1 : i32, 2048 : i32] | tensor<[100,1,2048,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (4, 64, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[100,256,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (4, 8, 'tile<32x32, bf16>', 'dram') | shape: [100 : i32, 1 : i32, 256 : i32] | tensor<[100,1,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (4, 8, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[100,256,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (4, 8, 'tile<32x32, bf16>', 'dram') | shape: [100 : i32, 1 : i32, 256 : 
i32] | tensor<[100,1,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (4, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[100,4,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (4, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 100 : i32, 4 : i32] | tensor<[1,100,4,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 100 + d1, d2), memory_config: (4, 1, 'tile<32x32, bf16>', 'dram') | yes | -0.06 | 25.25 | +| ttnn.reshape | tensor<[100,4,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (4, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 100 : i32, 4 : i32] | tensor<[1,100,4,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 100 + d1, d2), memory_config: (4, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[100,8,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 8 + d1, d2), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | shape: [100 : i32, 256 : i32] | tensor<[100,256,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (4, 8, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[100,8,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 8 + d1, d2), memory_config: (25, 1, 'tile<32x32, bf16>', 'dram') | shape: [100 : i32, 256 : i32] | tensor<[100,256,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (4, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[100,92,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (4, 3, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 100 : i32, 92 : i32] | tensor<[1,100,92,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 100 + d1, d2), memory_config: (4, 3, 'tile<32x32, bf16>', 'dram') | yes | 0.09 | 51.75 | +| ttnn.reshape | tensor<[100,92,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (4, 3, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 
100 : i32, 92 : i32] | tensor<[1,100,92,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 100 + d1, d2), memory_config: (4, 3, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1024,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1024 : i32, 1 : i32, 1 : i32] | tensor<[1,1024,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 + d2, d3), memory_config: (32, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1024,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1024 : i32, 1 : i32, 1 : i32] | tensor<[1,1024,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 + d2, d3), memory_config: (32, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1024,160,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (32, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1024 : i32, 160 : i32] | tensor<[1,1024,160,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (32, 5, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1024,160,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (32, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1024 : i32, 160 : i32] | tensor<[1,1024,160,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (32, 5, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1024,5120,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (32, 160, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1024 : i32, 5120 : i32] | tensor<[1,1024,5120,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (32, 160, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1024,5120,bf16]> | 
mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (32, 160, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1024 : i32, 5120 : i32] | tensor<[1,1024,5120,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (32, 160, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1024,640,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1024 : i32, 640 : i32] | tensor<[1,1024,640,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1024,640,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1024 : i32, 640 : i32] | tensor<[1,1024,640,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[10,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [10 : i32, 1 : i32] | tensor<[10,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[10,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [10 : i32, 1 : i32] | tensor<[10,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[10,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 10 : i32] | tensor<[1,10,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[10,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 
'tile<32x32, f32>', 'dram') | shape: [1 : i32, 10 : i32] | tensor<[1,10,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[10,250002,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 7813, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 10 : i32, 250002 : i32] | tensor<[1,10,250002,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (1, 7813, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[10,250002,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 7813, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 10 : i32, 250002 : i32] | tensor<[1,10,250002,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (1, 7813, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[10,3072,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 10 : i32, 3072 : i32] | tensor<[1,10,3072,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[10,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 10 : i32, 768 : i32] | tensor<[1,10,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1200,1280,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (38, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1200 : i32, 1280 : i32] | tensor<[1,1200,1280,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1200 + d1, d2), memory_config: (38, 40, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1200,1280,bf16]> | mapping_from: 
(d0, d1), mapping_to: (d0, d1), memory_config: (38, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1200 : i32, 1280 : i32] | tensor<[1,1200,1280,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1200 + d1, d2), memory_config: (38, 40, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1200,320,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (38, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1200 : i32, 320 : i32] | tensor<[1,1200,320,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1200 + d1, d2), memory_config: (38, 10, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1200,320,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (38, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1200 : i32, 320 : i32] | tensor<[1,1200,320,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1200 + d1, d2), memory_config: (38, 10, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[120,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | shape: [120 : i32, 1 : i32] | tensor<[120,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[128,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 128 : i32, 1 : i32, 1 : i32] | tensor<[1,128,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 128 + d1 + d2, d3), memory_config: (4, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[128,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 128 : i32, 1 : i32, 1 : i32] | tensor<[1,128,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 128 + d1 + d2, d3), memory_config: (4, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| 
ttnn.reshape | tensor<[128,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 4, 'tile<32x32, f32>', 'dram') | shape: [128 : i32, 1 : i32] | tensor<[128,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[12,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [12 : i32, 1 : i32] | tensor<[12,1,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | 0.38 | nan | +| ttnn.reshape | tensor<[12,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [12 : i32, 1 : i32] | tensor<[12,1,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[12,10,10,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 12 : i32, 10 : i32, 10 : i32] | tensor<[1,12,10,10,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 120 + d1 * 10 + d2, d3), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[12,10,10,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 12 : i32, 10 : i32, 10 : i32] | tensor<[1,12,10,10,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 120 + d1 * 10 + d2, d3), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[12,10,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (4, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 12 : i32, 10 : i32, 64 : i32] | tensor<[1,12,10,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 120 + d1 * 10 + d2, d3), memory_config: (4, 2, 'tile<32x32, f32>', 'dram') | 
yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[12,10,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (4, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 12 : i32, 10 : i32, 64 : i32] | tensor<[1,12,10,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 120 + d1 * 10 + d2, d3), memory_config: (4, 2, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[12,12,12,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 12 + d1, d2), memory_config: (5, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 12 : i32, 12 : i32, 12 : i32] | tensor<[1,12,12,12,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 144 + d1 * 12 + d2, d3), memory_config: (5, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[12,12,12,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 12 + d1, d2), memory_config: (5, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 12 : i32, 12 : i32, 12 : i32] | tensor<[1,12,12,12,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 144 + d1 * 12 + d2, d3), memory_config: (5, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[12,12,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 12 + d1, d2), memory_config: (5, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 12 : i32, 12 : i32, 64 : i32] | tensor<[1,12,12,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 144 + d1 * 12 + d2, d3), memory_config: (5, 2, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[12,12,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 12 + d1, d2), memory_config: (5, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 12 : i32, 12 : i32, 64 : i32] | tensor<[1,12,12,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 144 + d1 * 12 + d2, d3), memory_config: (5, 2, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[12,14,14,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 14 + d1, 
d2), memory_config: (6, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 12 : i32, 14 : i32, 14 : i32] | tensor<[1,12,14,14,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 168 + d1 * 14 + d2, d3), memory_config: (6, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[12,14,14,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 14 + d1, d2), memory_config: (6, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 12 : i32, 14 : i32, 14 : i32] | tensor<[1,12,14,14,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 168 + d1 * 14 + d2, d3), memory_config: (6, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[12,14,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 14 + d1, d2), memory_config: (6, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 12 : i32, 14 : i32, 64 : i32] | tensor<[1,12,14,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 168 + d1 * 14 + d2, d3), memory_config: (6, 2, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[12,14,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 14 + d1, d2), memory_config: (6, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 12 : i32, 14 : i32, 64 : i32] | tensor<[1,12,14,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 168 + d1 * 14 + d2, d3), memory_config: (6, 2, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[12,16,16,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16 + d1, d2), memory_config: (6, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 12 : i32, 16 : i32, 16 : i32] | tensor<[1,12,16,16,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 192 + d1 * 16 + d2, d3), memory_config: (6, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.01 | +| ttnn.reshape | tensor<[12,16,16,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16 + d1, d2), memory_config: (6, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 12 : i32, 16 : i32, 16 : i32] | 
tensor<[1,12,16,16,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 192 + d1 * 16 + d2, d3), memory_config: (6, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[12,16,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16 + d1, d2), memory_config: (6, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 12 : i32, 16 : i32, 64 : i32] | tensor<[1,12,16,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 192 + d1 * 16 + d2, d3), memory_config: (6, 2, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[12,16,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16 + d1, d2), memory_config: (6, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 12 : i32, 16 : i32, 64 : i32] | tensor<[1,12,16,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 192 + d1 * 16 + d2, d3), memory_config: (6, 2, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[12,197,197,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 197 + d1, d2), memory_config: (74, 7, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 12 : i32, 197 : i32, 197 : i32] | tensor<[1,12,197,197,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2364 + d1 * 197 + d2, d3), memory_config: (74, 7, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[12,197,197,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 197 + d1, d2), memory_config: (74, 7, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 12 : i32, 197 : i32, 197 : i32] | tensor<[1,12,197,197,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2364 + d1 * 197 + d2, d3), memory_config: (74, 7, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[12,197,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 197 + d1, d2), memory_config: (74, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 12 : i32, 197 : i32, 64 : i32] | tensor<[1,12,197,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2364 + d1 * 197 
+ d2, d3), memory_config: (74, 2, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[12,197,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 197 + d1, d2), memory_config: (74, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 12 : i32, 197 : i32, 64 : i32] | tensor<[1,12,197,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2364 + d1 * 197 + d2, d3), memory_config: (74, 2, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[12,25,25,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 25 + d1, d2), memory_config: (10, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 12 : i32, 25 : i32, 25 : i32] | tensor<[1,12,25,25,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 300 + d1 * 25 + d2, d3), memory_config: (10, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[12,25,25,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 25 + d1, d2), memory_config: (10, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 12 : i32, 25 : i32, 25 : i32] | tensor<[1,12,25,25,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 300 + d1 * 25 + d2, d3), memory_config: (10, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[12,25,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 25 + d1, d2), memory_config: (10, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 12 : i32, 25 : i32, 64 : i32] | tensor<[1,12,25,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 300 + d1 * 25 + d2, d3), memory_config: (10, 2, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[12,25,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 25 + d1, d2), memory_config: (10, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 12 : i32, 25 : i32, 64 : i32] | tensor<[1,12,25,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 300 + d1 * 25 + d2, d3), memory_config: (10, 2, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | 
tensor<[12,2,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 12 : i32, 2 : i32] | tensor<[1,12,2,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 12 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.2 | 1.89 | +| ttnn.reshape | tensor<[12,2,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 12 : i32, 2 : i32] | tensor<[1,12,2,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 12 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[12,3072,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 12 : i32, 3072 : i32] | tensor<[1,12,3072,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 12 + d1, d2), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[12,3072,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 12 : i32, 3072 : i32] | tensor<[1,12,3072,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 12 + d1, d2), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[12,50,50,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 50 + d1, d2), memory_config: (19, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 12 : i32, 50 : i32, 50 : i32] | tensor<[1,12,50,50,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 600 + d1 * 50 + d2, d3), memory_config: (19, 2, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[12,50,50,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 50 + d1, d2), memory_config: (19, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 12 : i32, 50 : i32, 50 : i32] | tensor<[1,12,50,50,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 600 + 
d1 * 50 + d2, d3), memory_config: (19, 2, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[12,50,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 50 + d1, d2), memory_config: (19, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 12 : i32, 50 : i32, 64 : i32] | tensor<[1,12,50,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 600 + d1 * 50 + d2, d3), memory_config: (19, 2, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[12,50,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 50 + d1, d2), memory_config: (19, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 12 : i32, 50 : i32, 64 : i32] | tensor<[1,12,50,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 600 + d1 * 50 + d2, d3), memory_config: (19, 2, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[12,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 12 : i32, 768 : i32] | tensor<[1,12,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 12 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[12,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 12 : i32, 768 : i32] | tensor<[1,12,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 12 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[12,7,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (3, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 12 : i32, 7 : i32, 64 : i32] | tensor<[1,12,7,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 84 + d1 * 7 + d2, d3), memory_config: (3, 2, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[12,7,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), 
memory_config: (3, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 12 : i32, 7 : i32, 64 : i32] | tensor<[1,12,7,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 84 + d1 * 7 + d2, d3), memory_config: (3, 2, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[12,7,7,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 12 : i32, 7 : i32, 7 : i32] | tensor<[1,12,7,7,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 84 + d1 * 7 + d2, d3), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[12,7,7,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 12 : i32, 7 : i32, 7 : i32] | tensor<[1,12,7,7,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 84 + d1 * 7 + d2, d3), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[12,9,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (4, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 12 : i32, 9 : i32, 64 : i32] | tensor<[1,12,9,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 108 + d1 * 9 + d2, d3), memory_config: (4, 2, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[12,9,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (4, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 12 : i32, 9 : i32, 64 : i32] | tensor<[1,12,9,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 108 + d1 * 9 + d2, d3), memory_config: (4, 2, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[12,9,9,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 12 : i32, 9 : i32, 9 : i32] | tensor<[1,12,9,9,f32]> | mapping_from: (d0, d1, d2, 
d3), mapping_to: (d0 * 108 + d1 * 9 + d2, d3), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[12,9,9,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 12 : i32, 9 : i32, 9 : i32] | tensor<[1,12,9,9,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 108 + d1 * 9 + d2, d3), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1445,192,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (46, 6, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1445 : i32, 192 : i32] | tensor<[1,1445,192,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1445 + d1, d2), memory_config: (46, 6, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1445,192,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (46, 6, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1445 : i32, 192 : i32] | tensor<[1,1445,192,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1445 + d1, d2), memory_config: (46, 6, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1445,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (46, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1445 : i32, 768 : i32] | tensor<[1,1445,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1445 + d1, d2), memory_config: (46, 24, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1445,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (46, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1445 : i32, 768 : i32] | tensor<[1,1445,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1445 + d1, d2), memory_config: (46, 24, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[14,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, 
u32>', 'dram') | shape: [14 : i32, 1 : i32] | tensor<[14,1,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | 0.39 | nan | +| ttnn.reshape | tensor<[14,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [14 : i32, 1 : i32] | tensor<[14,1,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[14,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 14 : i32] | tensor<[1,14,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | 0.39 | nan | +| ttnn.reshape | tensor<[14,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 14 : i32] | tensor<[1,14,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[14,2048,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 64, 'tile<32x32, bf16>', 'dram') | shape: [2 : i32, 7 : i32, 2048 : i32] | tensor<[2,7,2048,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (1, 64, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[14,2048,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 64, 'tile<32x32, bf16>', 'dram') | shape: [2 : i32, 7 : i32, 2048 : i32] | tensor<[2,7,2048,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (1, 64, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[14,2,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 2 : i32] | tensor<[1,14,2,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 
14 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.08 | 10.25 | +| ttnn.reshape | tensor<[14,2,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 2 : i32] | tensor<[1,14,2,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 14 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[14,3072,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 3072 : i32] | tensor<[1,14,3072,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 14 + d1, d2), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[14,3072,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 3072 : i32] | tensor<[1,14,3072,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 14 + d1, d2), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[14,512,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | shape: [2 : i32, 7 : i32, 512 : i32] | tensor<[2,7,512,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[14,512,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | shape: [2 : i32, 7 : i32, 512 : i32] | tensor<[2,7,512,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[14,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 768 : i32] | tensor<[1,14,768,bf16]> | 
mapping_from: (d0, d1, d2), mapping_to: (d0 * 14 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[14,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 768 : i32] | tensor<[1,14,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 14 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[16384,128,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (512, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16384 : i32, 128 : i32] | tensor<[1,16384,128,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16384 + d1, d2), memory_config: (512, 4, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[16384,128,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (512, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16384 : i32, 128 : i32] | tensor<[1,16384,128,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16384 + d1, d2), memory_config: (512, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[16384,32,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (512, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16384 : i32, 32 : i32] | tensor<[1,16384,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16384 + d1, d2), memory_config: (512, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[16384,32,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (512, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16384 : i32, 32 : i32] | tensor<[1,16384,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16384 + d1, d2), memory_config: (512, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[16,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, 
u32>', 'dram') | shape: [1 : i32, 16 : i32] | tensor<[1,16,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | 0.17 | nan | +| ttnn.reshape | tensor<[16,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 16 : i32] | tensor<[1,16,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[16,19,19,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 19 + d1, d2), memory_config: (10, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 19 : i32, 19 : i32] | tensor<[1,16,19,19,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 304 + d1 * 19 + d2, d3), memory_config: (10, 1, 'tile<32x32, bf16>', 'dram') | yes | -0.01 | inf | +| ttnn.reshape | tensor<[16,19,19,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 19 + d1, d2), memory_config: (10, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 19 : i32, 19 : i32] | tensor<[1,16,19,19,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 304 + d1 * 19 + d2, d3), memory_config: (10, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[16,19,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 19 + d1, d2), memory_config: (10, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 19 : i32, 64 : i32] | tensor<[1,16,19,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 304 + d1 * 19 + d2, d3), memory_config: (10, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[16,19,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 19 + d1, d2), memory_config: (10, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 19 : i32, 64 : i32] | tensor<[1,16,19,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 304 + d1 * 19 + d2, d3), memory_config: (10, 2, 'tile<32x32, bf16>', 'dram') | N/A | 
nan | nan | +| ttnn.reshape | tensor<[16,256,256,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (128, 8, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 16 : i32, 256 : i32, 256 : i32] | tensor<[1,16,256,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 256 + d2, d3), memory_config: (128, 8, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[16,256,256,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (128, 8, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 16 : i32, 256 : i32, 256 : i32] | tensor<[1,16,256,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 256 + d2, d3), memory_config: (128, 8, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[16,256,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (128, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 16 : i32, 256 : i32, 64 : i32] | tensor<[1,16,256,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 256 + d2, d3), memory_config: (128, 2, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[16,256,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (128, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 16 : i32, 256 : i32, 64 : i32] | tensor<[1,16,256,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 256 + d2, d3), memory_config: (128, 2, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[16,3072,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 3072 : i32] | tensor<[1,16,3072,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16 + d1, d2), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[16,3072,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), 
memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 3072 : i32] | tensor<[1,16,3072,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16 + d1, d2), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[16,32,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (16, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 32 : i32, 32 : i32] | tensor<[1,16,32,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 512 + d1 * 32 + d2, d3), memory_config: (16, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[16,32,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (16, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 32 : i32, 32 : i32] | tensor<[1,16,32,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 512 + d1 * 32 + d2, d3), memory_config: (16, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[16,32,96,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (16, 3, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 32 : i32, 96 : i32] | tensor<[1,16,32,96,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 512 + d1 * 32 + d2, d3), memory_config: (16, 3, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[16,32,96,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (16, 3, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 32 : i32, 96 : i32] | tensor<[1,16,32,96,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 512 + d1 * 32 + d2, d3), memory_config: (16, 3, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[16,6,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 6 + d1, d2), memory_config: (3, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 16 : i32, 6 : i32, 64 : i32] | 
tensor<[1,16,6,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 96 + d1 * 6 + d2, d3), memory_config: (3, 2, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[16,6,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 6 + d1, d2), memory_config: (3, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 16 : i32, 6 : i32, 64 : i32] | tensor<[1,16,6,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 96 + d1 * 6 + d2, d3), memory_config: (3, 2, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[16,6,6,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 6 + d1, d2), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 16 : i32, 6 : i32, 6 : i32] | tensor<[1,16,6,6,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 96 + d1 * 6 + d2, d3), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[16,6,6,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 6 + d1, d2), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 16 : i32, 6 : i32, 6 : i32] | tensor<[1,16,6,6,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 96 + d1 * 6 + d2, d3), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[16,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 768 : i32] | tensor<[1,16,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[16,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 768 : i32] | tensor<[1,16,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[16,7,64,f32]> | 
mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (4, 2, 'tile<32x32, f32>', 'dram') | shape: [2 : i32, 8 : i32, 7 : i32, 64 : i32] | tensor<[2,8,7,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 56 + d1 * 7 + d2, d3), memory_config: (4, 2, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[16,7,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (4, 2, 'tile<32x32, f32>', 'dram') | shape: [2 : i32, 8 : i32, 7 : i32, 64 : i32] | tensor<[2,8,7,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 56 + d1 * 7 + d2, d3), memory_config: (4, 2, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[16,7,7,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | shape: [2 : i32, 8 : i32, 7 : i32, 7 : i32] | tensor<[2,8,7,7,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 56 + d1 * 7 + d2, d3), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[16,7,7,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | shape: [2 : i32, 8 : i32, 7 : i32, 7 : i32] | tensor<[2,8,7,7,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 56 + d1 * 7 + d2, d3), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[16,9,128,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (5, 4, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 16 : i32, 9 : i32, 128 : i32] | tensor<[1,16,9,128,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 144 + d1 * 9 + d2, d3), memory_config: (5, 4, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[16,9,128,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (5, 4, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 16 : i32, 9 : i32, 128 : i32] 
| tensor<[1,16,9,128,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 144 + d1 * 9 + d2, d3), memory_config: (5, 4, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[16,9,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (5, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 16 : i32, 9 : i32, 64 : i32] | tensor<[1,16,9,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 144 + d1 * 9 + d2, d3), memory_config: (5, 2, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[16,9,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (5, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 16 : i32, 9 : i32, 64 : i32] | tensor<[1,16,9,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 144 + d1 * 9 + d2, d3), memory_config: (5, 2, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[16,9,9,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (5, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 16 : i32, 9 : i32, 9 : i32] | tensor<[1,16,9,9,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 144 + d1 * 9 + d2, d3), memory_config: (5, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[16,9,9,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (5, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 16 : i32, 9 : i32, 9 : i32] | tensor<[1,16,9,9,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 144 + d1 * 9 + d2, d3), memory_config: (5, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[19200,256,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (600, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 19200 : i32, 256 : i32] | tensor<[1,19200,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 19200 + d1, d2), memory_config: (600, 8, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 
0.0 | +| ttnn.reshape | tensor<[19200,256,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (600, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 19200 : i32, 256 : i32] | tensor<[1,19200,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 19200 + d1, d2), memory_config: (600, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[19200,64,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (600, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 19200 : i32, 64 : i32] | tensor<[1,19200,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 19200 + d1, d2), memory_config: (600, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[19200,64,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (600, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 19200 : i32, 64 : i32] | tensor<[1,19200,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 19200 + d1, d2), memory_config: (600, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[197,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 7, 'tile<32x32, bf16>', 'dram') | shape: [197 : i32, 1 : i32] | tensor<[197,1,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (7, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[197,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 7, 'tile<32x32, bf16>', 'dram') | shape: [197 : i32, 1 : i32] | tensor<[197,1,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (7, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[197,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 7, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 197 : i32] | tensor<[1,197,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 7, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | 
tensor<[197,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 7, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 197 : i32] | tensor<[1,197,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 7, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[197,1024,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (7, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 197 : i32, 1024 : i32] | tensor<[1,197,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 197 + d1, d2), memory_config: (7, 32, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[197,197,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (7, 7, 'tile<32x32, u32>', 'dram') | shape: [38809 : i32] | tensor<[38809,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1213, 'tile<32x32, u32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[197,197,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (7, 7, 'tile<32x32, u32>', 'dram') | shape: [38809 : i32] | tensor<[38809,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1213, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[197,3072,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (7, 96, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 197 : i32, 3072 : i32] | tensor<[1,197,3072,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 197 + d1, d2), memory_config: (7, 96, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[197,3072,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (7, 96, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 197 : i32, 3072 : i32] | tensor<[1,197,3072,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 197 + d1, d2), memory_config: (7, 96, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[197,4096,bf16]> | mapping_from: (d0, d1), 
mapping_to: (d0, d1), memory_config: (7, 128, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 197 : i32, 4096 : i32] | tensor<[1,197,4096,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 197 + d1, d2), memory_config: (7, 128, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[197,4096,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (7, 128, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 197 : i32, 4096 : i32] | tensor<[1,197,4096,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 197 + d1, d2), memory_config: (7, 128, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[197,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (7, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 197 : i32, 768 : i32] | tensor<[1,197,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 197 + d1, d2), memory_config: (7, 24, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[19,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 19 : i32] | tensor<[1,19,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[19,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 19 : i32] | tensor<[1,19,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[19,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [19 : i32, 1 : i32] | tensor<[19,1,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | 0.46 | nan | +| ttnn.reshape | tensor<[19,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | 
shape: [19 : i32, 1 : i32] | tensor<[19,1,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[19,1024,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 19 : i32, 1024 : i32] | tensor<[1,19,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 19 + d1, d2), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[19,1024,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 19 : i32, 1024 : i32] | tensor<[1,19,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 19 + d1, d2), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[19,4096,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 128, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 19 : i32, 4096 : i32] | tensor<[1,19,4096,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 19 + d1, d2), memory_config: (1, 128, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[19,4096,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 128, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 19 : i32, 4096 : i32] | tensor<[1,19,4096,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 19 + d1, d2), memory_config: (1, 128, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32] | tensor<[1,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32] | 
tensor<[1,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32] | tensor<[1,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,100,192,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 100 + d1, d2), memory_config: (4, 6, 'tile<32x32, bf16>', 'dram') | shape: [100 : i32, 192 : i32] | tensor<[100,192,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (4, 6, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,100,192,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 100 + d1, d2), memory_config: (4, 6, 'tile<32x32, bf16>', 'dram') | shape: [100 : i32, 192 : i32] | tensor<[100,192,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (4, 6, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1024,14,14,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 14336 + d1 * 14 + d2, d3), memory_config: (448, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1024 : i32, 196 : i32] | tensor<[1,1024,196,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (32, 7, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1024,14,14,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 14336 + d1 * 14 + d2, d3), memory_config: (448, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1024 : i32, 196 : i32] | tensor<[1,1024,196,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (32, 7, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1024,160,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (32, 5, 
'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1024 : i32, 160 : i32] | tensor<[1,1024,160,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (32, 5, 'tile<32x32, bf16>', 'dram') | yes | 0.0 | 8.56 | +| ttnn.reshape | tensor<[1,1024,160,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (32, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1024 : i32, 160 : i32] | tensor<[1,1024,160,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (32, 5, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1024,160,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (32, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1024 : i32, 5 : i32, 32 : i32] | tensor<[1,1024,5,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 5120 + d1 * 5 + d2, d3), memory_config: (160, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1024,160,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (32, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1024 : i32, 5 : i32, 32 : i32] | tensor<[1,1024,5,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 5120 + d1 * 5 + d2, d3), memory_config: (160, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1024,160,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (32, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 32 : i32, 160 : i32] | tensor<[1,32,32,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 5, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1024,160,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (32, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 32 : i32, 160 : i32] | 
tensor<[1,32,32,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 5, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1024,160,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (32, 5, 'tile<32x32, bf16>', 'dram') | shape: [1024 : i32, 160 : i32] | tensor<[1024,160,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (32, 5, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1024,160,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (32, 5, 'tile<32x32, bf16>', 'dram') | shape: [1024 : i32, 160 : i32] | tensor<[1024,160,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (32, 5, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1024,16,16,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 16 + d2, d3), memory_config: (512, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1024 : i32, 256 : i32] | tensor<[1,1024,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (32, 8, 'tile<32x32, bf16>', 'dram') | yes | 0.5 | 10.06 | +| ttnn.reshape | tensor<[1,1024,16,16,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 16 + d2, d3), memory_config: (512, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1024 : i32, 256 : i32] | tensor<[1,1024,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (32, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1024,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 + d2, d3), memory_config: (32, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1024 : i32] | tensor<[1,1024,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | yes | 0.03 | 3.25 | +| ttnn.reshape | 
tensor<[1,1024,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 + d2, d3), memory_config: (32, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1024 : i32] | tensor<[1,1024,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1024,2560,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (32, 80, 'tile<32x32, bf16>', 'dram') | shape: [1024 : i32, 2560 : i32] | tensor<[1024,2560,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (32, 80, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1024,2560,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (32, 80, 'tile<32x32, bf16>', 'dram') | shape: [1024 : i32, 2560 : i32] | tensor<[1024,2560,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (32, 80, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1024,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (32, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1024 : i32, 16 : i32, 16 : i32] | tensor<[1,1024,16,16,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 16 + d2, d3), memory_config: (512, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.0 | 31.0 | +| ttnn.reshape | tensor<[1,1024,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (32, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1024 : i32, 16 : i32, 16 : i32] | tensor<[1,1024,16,16,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 16 + d2, d3), memory_config: (512, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1024,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (32, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1024 : i32, 256 : 
i32] | tensor<[1,1024,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (32, 8, 'tile<32x32, bf16>', 'dram') | yes | -0.0 | 1.2 | +| ttnn.reshape | tensor<[1,1024,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (32, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1024 : i32, 256 : i32] | tensor<[1,1024,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (32, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1024,5,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 5120 + d1 * 5 + d2, d3), memory_config: (160, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1024 : i32, 160 : i32] | tensor<[1,1024,160,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (32, 5, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1024,5,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 5120 + d1 * 5 + d2, d3), memory_config: (160, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1024 : i32, 160 : i32] | tensor<[1,1024,160,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (32, 5, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1024,640,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1024 : i32, 8 : i32, 80 : i32] | tensor<[1,1024,8,80,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 8192 + d1 * 8 + d2, d3), memory_config: (256, 3, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1024,640,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1024 : i32, 8 : i32, 80 : i32] | tensor<[1,1024,8,80,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 8192 + d1 * 8 + d2, 
d3), memory_config: (256, 3, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1024,640,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1024 : i32, 640 : i32] | tensor<[1,1024,640,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | yes | -0.0 | 15.81 | +| ttnn.reshape | tensor<[1,1024,640,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1024 : i32, 640 : i32] | tensor<[1,1024,640,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1024,640,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 32 : i32, 640 : i32] | tensor<[1,32,32,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1024,640,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 32 : i32, 640 : i32] | tensor<[1,32,32,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1024,640,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | shape: [1024 : i32, 640 : i32] | tensor<[1024,640,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1024,640,bf16]> | mapping_from: 
(d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | shape: [1024 : i32, 640 : i32] | tensor<[1024,640,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1024,8,80,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 8192 + d1 * 8 + d2, d3), memory_config: (256, 3, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1024 : i32, 640 : i32] | tensor<[1,1024,640,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | yes | -0.0 | 5.117527783771926e+37 | +| ttnn.reshape | tensor<[1,1024,8,80,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 8192 + d1 * 8 + d2, d3), memory_config: (256, 3, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1024 : i32, 640 : i32] | tensor<[1,1024,640,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,10,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 10 : i32] | tensor<[1,10,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | 0.95 | nan | +| ttnn.reshape | tensor<[1,10,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 10 : i32] | tensor<[1,10,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,10,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 10 : i32, 16 : i32, 64 : i32] | tensor<[1,10,16,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 160 + d1 * 16 + d2, d3), memory_config: 
(5, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,10,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 10 : i32, 16 : i32, 64 : i32] | tensor<[1,10,16,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 160 + d1 * 16 + d2, d3), memory_config: (5, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,10,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | shape: [10 : i32, 1024 : i32] | tensor<[10,1024,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,10,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | shape: [10 : i32, 1024 : i32] | tensor<[10,1024,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,10,12,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 120 + d1 * 12 + d2, d3), memory_config: (4, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 10 : i32, 768 : i32] | tensor<[1,10,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,10,12,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 120 + d1 * 12 + d2, d3), memory_config: (4, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 10 : i32, 768 : i32] | tensor<[1,10,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,10,12,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 120 + d1 * 12 + d2, d3), memory_config: (4, 2, 
'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 10 : i32, 768 : i32] | tensor<[1,10,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,10,16,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 160 + d1 * 16 + d2, d3), memory_config: (5, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 10 : i32, 1024 : i32] | tensor<[1,10,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,10,16,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 160 + d1 * 16 + d2, d3), memory_config: (5, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 10 : i32, 1024 : i32] | tensor<[1,10,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,10,2048,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (1, 64, 'tile<32x32, bf16>', 'dram') | shape: [10 : i32, 2048 : i32] | tensor<[10,2048,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 64, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,10,2048,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (1, 64, 'tile<32x32, bf16>', 'dram') | shape: [10 : i32, 2048 : i32] | tensor<[10,2048,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 64, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,10,3072,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | shape: [10 : i32, 3072 : i32] | tensor<[10,3072,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | 
tensor<[1,10,3072,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | shape: [10 : i32, 3072 : i32] | tensor<[10,3072,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,10,4096,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (1, 128, 'tile<32x32, bf16>', 'dram') | shape: [10 : i32, 4096 : i32] | tensor<[10,4096,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 128, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,10,4096,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (1, 128, 'tile<32x32, bf16>', 'dram') | shape: [10 : i32, 4096 : i32] | tensor<[10,4096,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 128, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,10,512,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 10 : i32, 8 : i32, 64 : i32] | tensor<[1,10,8,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 80 + d1 * 8 + d2, d3), memory_config: (3, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,10,512,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 10 : i32, 8 : i32, 64 : i32] | tensor<[1,10,8,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 80 + d1 * 8 + d2, d3), memory_config: (3, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,10,512,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | shape: [10 : i32, 512 : i32] | tensor<[10,512,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, 
d1), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,10,512,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | shape: [10 : i32, 512 : i32] | tensor<[10,512,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,10,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 10 : i32, 12 : i32, 64 : i32] | tensor<[1,10,12,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 120 + d1 * 12 + d2, d3), memory_config: (4, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,10,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 10 : i32, 12 : i32, 64 : i32] | tensor<[1,10,12,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 120 + d1 * 12 + d2, d3), memory_config: (4, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,10,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 10 : i32, 12 : i32, 64 : i32] | tensor<[1,10,12,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 120 + d1 * 12 + d2, d3), memory_config: (4, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,10,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [10 : i32, 768 : i32] | tensor<[10,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,10,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), 
memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [10 : i32, 768 : i32] | tensor<[10,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,10,8,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 80 + d1 * 8 + d2, d3), memory_config: (3, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 10 : i32, 512 : i32] | tensor<[1,10,512,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,10,8,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 80 + d1 * 8 + d2, d3), memory_config: (3, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 10 : i32, 512 : i32] | tensor<[1,10,512,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1200,1280,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1200 + d1, d2), memory_config: (38, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1200 : i32, 1280 : i32] | tensor<[1,1200,1280,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1200 + d1, d2), memory_config: (38, 40, 'tile<32x32, bf16>', 'dram') | yes | 0.0 | 3.88 | +| ttnn.reshape | tensor<[1,1200,1280,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1200 + d1, d2), memory_config: (38, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1200 : i32, 1280 : i32] | tensor<[1,1200,1280,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1200 + d1, d2), memory_config: (38, 40, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1200,320,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1200 + d1, d2), memory_config: (38, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1200 : i32, 5 : i32, 64 : i32] | tensor<[1,1200,5,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 
6000 + d1 * 5 + d2, d3), memory_config: (188, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1200,320,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1200 + d1, d2), memory_config: (38, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1200 : i32, 5 : i32, 64 : i32] | tensor<[1,1200,5,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6000 + d1 * 5 + d2, d3), memory_config: (188, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1200,320,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1200 + d1, d2), memory_config: (38, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 30 : i32, 40 : i32, 320 : i32] | tensor<[1,30,40,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1200 + d1 * 40 + d2, d3), memory_config: (38, 10, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1200,320,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1200 + d1, d2), memory_config: (38, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 30 : i32, 40 : i32, 320 : i32] | tensor<[1,30,40,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1200 + d1 * 40 + d2, d3), memory_config: (38, 10, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1200,320,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1200 + d1, d2), memory_config: (38, 10, 'tile<32x32, bf16>', 'dram') | shape: [1200 : i32, 320 : i32] | tensor<[1200,320,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (38, 10, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1200,320,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1200 + d1, d2), memory_config: (38, 10, 'tile<32x32, bf16>', 'dram') | shape: [1200 : i32, 320 : i32] | tensor<[1200,320,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (38, 10, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1200,5,64,bf16]> | 
mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6000 + d1 * 5 + d2, d3), memory_config: (188, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1200 : i32, 320 : i32] | tensor<[1,1200,320,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1200 + d1, d2), memory_config: (38, 10, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1200,5,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6000 + d1 * 5 + d2, d3), memory_config: (188, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1200 : i32, 320 : i32] | tensor<[1,1200,320,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1200 + d1, d2), memory_config: (38, 10, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1280,1200,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1280 + d1, d2), memory_config: (40, 38, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1280 : i32, 30 : i32, 40 : i32] | tensor<[1,1280,30,40,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 38400 + d1 * 30 + d2, d3), memory_config: (1200, 2, 'tile<32x32, bf16>', 'dram') | yes | 0.0 | 29.75 | +| ttnn.reshape | tensor<[1,1280,1200,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1280 + d1, d2), memory_config: (40, 38, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1280 : i32, 30 : i32, 40 : i32] | tensor<[1,1280,30,40,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 38400 + d1 * 30 + d2, d3), memory_config: (1200, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1280,16,16,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 20480 + d1 * 16 + d2, d3), memory_config: (640, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 40 : i32, 256 : i32] | tensor<[1,32,40,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1280 + d1 * 40 + d2, d3), memory_config: (40, 8, 'tile<32x32, bf16>', 'dram') | yes | 0.71 | 46.25 | +| ttnn.reshape | tensor<[1,1280,16,16,bf16]> | mapping_from: (d0, d1, d2, 
d3), mapping_to: (d0 * 20480 + d1 * 16 + d2, d3), memory_config: (640, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 40 : i32, 256 : i32] | tensor<[1,32,40,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1280 + d1 * 40 + d2, d3), memory_config: (40, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1280,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1280 + d1 + d2, d3), memory_config: (40, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1280 : i32] | tensor<[1,1280,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 40, 'tile<32x32, bf16>', 'dram') | yes | -0.02 | 2.95 | +| ttnn.reshape | tensor<[1,1280,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1280 + d1 + d2, d3), memory_config: (40, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1280 : i32] | tensor<[1,1280,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 40, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1280,30,40,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 38400 + d1 * 30 + d2, d3), memory_config: (1200, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1280 : i32, 1200 : i32] | tensor<[1,1280,1200,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1280 + d1, d2), memory_config: (40, 38, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1280,30,40,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 38400 + d1 * 30 + d2, d3), memory_config: (1200, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1280 : i32, 1200 : i32] | tensor<[1,1280,1200,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1280 + d1, d2), memory_config: (40, 38, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1280,320,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1280 + d1, d2), memory_config: (40, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1280 : i32, 320 : 
i32] | tensor<[1,1280,320,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1280 + d1, d2), memory_config: (40, 10, 'tile<32x32, bf16>', 'dram') | yes | -0.0 | 0.55 | +| ttnn.reshape | tensor<[1,1280,320,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1280 + d1, d2), memory_config: (40, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1280 : i32, 320 : i32] | tensor<[1,1280,320,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1280 + d1, d2), memory_config: (40, 10, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1280,32,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 40960 + d1 * 32 + d2, d3), memory_config: (1280, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 40 : i32, 1024 : i32] | tensor<[1,32,40,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1280 + d1 * 40 + d2, d3), memory_config: (40, 32, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1280,32,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 40960 + d1 * 32 + d2, d3), memory_config: (1280, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 40 : i32, 1024 : i32] | tensor<[1,32,40,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1280 + d1 * 40 + d2, d3), memory_config: (40, 32, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1280,8,8,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 10240 + d1 * 8 + d2, d3), memory_config: (320, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 40 : i32, 64 : i32] | tensor<[1,32,40,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1280 + d1 * 40 + d2, d3), memory_config: (40, 2, 'tile<32x32, bf16>', 'dram') | yes | -0.0 | 31.62 | +| ttnn.reshape | tensor<[1,1280,8,8,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 10240 + d1 * 8 + d2, d3), memory_config: (320, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 40 : i32, 64 : i32] | 
tensor<[1,32,40,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1280 + d1 * 40 + d2, d3), memory_config: (40, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,128,128,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 128 : i32, 16384 : i32] | tensor<[1,128,16384,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 128 + d1, d2), memory_config: (4, 512, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,128,128,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 128 : i32, 16384 : i32] | tensor<[1,128,16384,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 128 + d1, d2), memory_config: (4, 512, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,128,15,20,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1920 + d1 * 15 + d2, d3), memory_config: (60, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 128 : i32, 300 : i32] | tensor<[1,128,300,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 128 + d1, d2), memory_config: (4, 10, 'tile<32x32, bf16>', 'dram') | yes | -0.01 | inf | +| ttnn.reshape | tensor<[1,128,15,20,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1920 + d1 * 15 + d2, d3), memory_config: (60, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 128 : i32, 300 : i32] | tensor<[1,128,300,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 128 + d1, d2), memory_config: (4, 10, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,128,16384,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 128 + d1, d2), memory_config: (4, 512, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 128 : i32, 128 : i32, 128 : i32] | tensor<[1,128,128,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: 
(d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 4, 'tile<32x32, bf16>', 'dram') | yes | -0.0 | 39.5 | +| ttnn.reshape | tensor<[1,128,16384,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 128 + d1, d2), memory_config: (4, 512, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 128 : i32, 128 : i32, 128 : i32] | tensor<[1,128,128,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,128,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 128 + d1, d2), memory_config: (4, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 128 : i32, 32 : i32] | tensor<[1,128,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 128 + d1, d2), memory_config: (4, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.02 | 0.31 | +| ttnn.reshape | tensor<[1,128,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 128 + d1, d2), memory_config: (4, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 128 : i32, 32 : i32] | tensor<[1,128,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 128 + d1, d2), memory_config: (4, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,128,4800,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 128 + d1, d2), memory_config: (4, 150, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 128 : i32, 60 : i32, 80 : i32] | tensor<[1,128,60,80,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 7680 + d1 * 60 + d2, d3), memory_config: (240, 3, 'tile<32x32, bf16>', 'dram') | yes | 0.0 | 8.44 | +| ttnn.reshape | tensor<[1,128,4800,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 128 + d1, d2), memory_config: (4, 150, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 128 : i32, 60 : i32, 80 : i32] | tensor<[1,128,60,80,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 7680 + d1 * 60 + d2, d3), memory_config: (240, 3, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan 
| +| ttnn.reshape | tensor<[1,128,60,80,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 7680 + d1 * 60 + d2, d3), memory_config: (240, 3, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 128 : i32, 4800 : i32] | tensor<[1,128,4800,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 128 + d1, d2), memory_config: (4, 150, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,128,60,80,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 7680 + d1 * 60 + d2, d3), memory_config: (240, 3, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 128 : i32, 4800 : i32] | tensor<[1,128,4800,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 128 + d1, d2), memory_config: (4, 150, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,12,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [6 : i32, 2 : i32] | tensor<[6,2,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,12,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [6 : i32, 2 : i32] | tensor<[6,2,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,12,10,10,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 120 + d1 * 10 + d2, d3), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | shape: [12 : i32, 10 : i32, 10 : i32] | tensor<[12,10,10,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,12,10,10,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 120 + d1 * 10 + d2, d3), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | shape: [12 : i32, 10 : i32, 10 : i32] | tensor<[12,10,10,f32]> | mapping_from: (d0, d1, d2), 
mapping_to: (d0 * 10 + d1, d2), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,12,10,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 120 + d1 * 10 + d2, d3), memory_config: (4, 2, 'tile<32x32, f32>', 'dram') | shape: [12 : i32, 10 : i32, 64 : i32] | tensor<[12,10,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (4, 2, 'tile<32x32, f32>', 'dram') | yes | 0.04 | 2.39 | +| ttnn.reshape | tensor<[1,12,10,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 120 + d1 * 10 + d2, d3), memory_config: (4, 2, 'tile<32x32, f32>', 'dram') | shape: [12 : i32, 10 : i32, 64 : i32] | tensor<[12,10,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (4, 2, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,12,128,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 12 + d1, d2), memory_config: (1, 4, 'tile<32x32, bf16>', 'dram') | shape: [12 : i32, 128 : i32] | tensor<[12,128,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 4, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,12,128,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 12 + d1, d2), memory_config: (1, 4, 'tile<32x32, bf16>', 'dram') | shape: [12 : i32, 128 : i32] | tensor<[12,128,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,12,12,12,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 144 + d1 * 12 + d2, d3), memory_config: (5, 1, 'tile<32x32, f32>', 'dram') | shape: [12 : i32, 12 : i32, 12 : i32] | tensor<[12,12,12,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 12 + d1, d2), memory_config: (5, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,12,12,12,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 144 + d1 * 12 + d2, d3), memory_config: 
(5, 1, 'tile<32x32, f32>', 'dram') | shape: [12 : i32, 12 : i32, 12 : i32] | tensor<[12,12,12,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 12 + d1, d2), memory_config: (5, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,12,12,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 144 + d1 * 12 + d2, d3), memory_config: (5, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 12 : i32, 768 : i32] | tensor<[1,12,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 12 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,12,12,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 144 + d1 * 12 + d2, d3), memory_config: (5, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 12 : i32, 768 : i32] | tensor<[1,12,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 12 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,12,12,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 144 + d1 * 12 + d2, d3), memory_config: (5, 2, 'tile<32x32, f32>', 'dram') | shape: [12 : i32, 12 : i32, 64 : i32] | tensor<[12,12,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 12 + d1, d2), memory_config: (5, 2, 'tile<32x32, f32>', 'dram') | yes | 0.09 | 3.74 | +| ttnn.reshape | tensor<[1,12,12,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 144 + d1 * 12 + d2, d3), memory_config: (5, 2, 'tile<32x32, f32>', 'dram') | shape: [12 : i32, 12 : i32, 64 : i32] | tensor<[12,12,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 12 + d1, d2), memory_config: (5, 2, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,12,14,14,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 168 + d1 * 14 + d2, d3), memory_config: (6, 1, 'tile<32x32, f32>', 'dram') | shape: [12 : i32, 14 : i32, 14 : i32] | tensor<[12,14,14,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 14 + 
d1, d2), memory_config: (6, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,12,14,14,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 168 + d1 * 14 + d2, d3), memory_config: (6, 1, 'tile<32x32, f32>', 'dram') | shape: [12 : i32, 14 : i32, 14 : i32] | tensor<[12,14,14,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 14 + d1, d2), memory_config: (6, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,12,14,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 168 + d1 * 14 + d2, d3), memory_config: (6, 2, 'tile<32x32, f32>', 'dram') | shape: [12 : i32, 14 : i32, 64 : i32] | tensor<[12,14,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 14 + d1, d2), memory_config: (6, 2, 'tile<32x32, f32>', 'dram') | yes | 0.03 | 4.21 | +| ttnn.reshape | tensor<[1,12,14,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 168 + d1 * 14 + d2, d3), memory_config: (6, 2, 'tile<32x32, f32>', 'dram') | shape: [12 : i32, 14 : i32, 64 : i32] | tensor<[12,14,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 14 + d1, d2), memory_config: (6, 2, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,12,16,16,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 192 + d1 * 16 + d2, d3), memory_config: (6, 1, 'tile<32x32, f32>', 'dram') | shape: [12 : i32, 16 : i32, 16 : i32] | tensor<[12,16,16,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16 + d1, d2), memory_config: (6, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,12,16,16,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 192 + d1 * 16 + d2, d3), memory_config: (6, 1, 'tile<32x32, f32>', 'dram') | shape: [12 : i32, 16 : i32, 16 : i32] | tensor<[12,16,16,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16 + d1, d2), memory_config: (6, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,12,16,64,f32]> | mapping_from: (d0, d1, d2, d3), 
mapping_to: (d0 * 192 + d1 * 16 + d2, d3), memory_config: (6, 2, 'tile<32x32, f32>', 'dram') | shape: [12 : i32, 16 : i32, 64 : i32] | tensor<[12,16,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16 + d1, d2), memory_config: (6, 2, 'tile<32x32, f32>', 'dram') | yes | 0.02 | 2.41 | +| ttnn.reshape | tensor<[1,12,16,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 192 + d1 * 16 + d2, d3), memory_config: (6, 2, 'tile<32x32, f32>', 'dram') | shape: [12 : i32, 16 : i32, 64 : i32] | tensor<[12,16,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16 + d1, d2), memory_config: (6, 2, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,12,197,197,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2364 + d1 * 197 + d2, d3), memory_config: (74, 7, 'tile<32x32, f32>', 'dram') | shape: [12 : i32, 197 : i32, 197 : i32] | tensor<[12,197,197,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 197 + d1, d2), memory_config: (74, 7, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,12,197,197,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2364 + d1 * 197 + d2, d3), memory_config: (74, 7, 'tile<32x32, f32>', 'dram') | shape: [12 : i32, 197 : i32, 197 : i32] | tensor<[12,197,197,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 197 + d1, d2), memory_config: (74, 7, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,12,197,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2364 + d1 * 197 + d2, d3), memory_config: (74, 2, 'tile<32x32, f32>', 'dram') | shape: [12 : i32, 197 : i32, 64 : i32] | tensor<[12,197,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 197 + d1, d2), memory_config: (74, 2, 'tile<32x32, f32>', 'dram') | yes | 0.08 | 2.5 | +| ttnn.reshape | tensor<[1,12,197,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2364 + d1 * 197 + d2, d3), memory_config: (74, 2, 'tile<32x32, f32>', 'dram') | shape: [12 : i32, 197 : i32, 64 
: i32] | tensor<[12,197,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 197 + d1, d2), memory_config: (74, 2, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,12,1,10,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [12 : i32, 1 : i32, 10 : i32] | tensor<[12,1,10,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.13 | 0.94 | +| ttnn.reshape | tensor<[1,12,1,10,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [12 : i32, 1 : i32, 10 : i32] | tensor<[12,1,10,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,12,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [12 : i32, 1 : i32, 1 : i32] | tensor<[12,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 1.0 | +| ttnn.reshape | tensor<[1,12,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [12 : i32, 1 : i32, 1 : i32] | tensor<[12,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,12,1,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12 + d1 + d2, d3), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [12 : i32, 1 : i32, 64 : i32] | tensor<[12,1,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,12,1,64,bf16]> | 
mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12 + d1 + d2, d3), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [12 : i32, 1 : i32, 64 : i32] | tensor<[12,1,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,12,201,201,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2412 + d1 * 201 + d2, d3), memory_config: (76, 7, 'tile<32x32, bf16>', 'dram') | shape: [12 : i32, 201 : i32, 201 : i32] | tensor<[12,201,201,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 201 + d1, d2), memory_config: (76, 7, 'tile<32x32, bf16>', 'dram') | yes | -0.0 | 1.16 | +| ttnn.reshape | tensor<[1,12,201,201,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2412 + d1 * 201 + d2, d3), memory_config: (76, 7, 'tile<32x32, bf16>', 'dram') | shape: [12 : i32, 201 : i32, 201 : i32] | tensor<[12,201,201,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 201 + d1, d2), memory_config: (76, 7, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,12,201,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2412 + d1 * 201 + d2, d3), memory_config: (76, 2, 'tile<32x32, bf16>', 'dram') | shape: [12 : i32, 201 : i32, 64 : i32] | tensor<[12,201,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 201 + d1, d2), memory_config: (76, 2, 'tile<32x32, bf16>', 'dram') | yes | 0.04 | 12.5 | +| ttnn.reshape | tensor<[1,12,201,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2412 + d1 * 201 + d2, d3), memory_config: (76, 2, 'tile<32x32, bf16>', 'dram') | shape: [12 : i32, 201 : i32, 64 : i32] | tensor<[12,201,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 201 + d1, d2), memory_config: (76, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,12,25,25,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 300 + d1 * 25 + d2, d3), memory_config: (10, 1, 'tile<32x32, 
f32>', 'dram') | shape: [12 : i32, 25 : i32, 25 : i32] | tensor<[12,25,25,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 25 + d1, d2), memory_config: (10, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,12,25,25,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 300 + d1 * 25 + d2, d3), memory_config: (10, 1, 'tile<32x32, f32>', 'dram') | shape: [12 : i32, 25 : i32, 25 : i32] | tensor<[12,25,25,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 25 + d1, d2), memory_config: (10, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,12,25,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 300 + d1 * 25 + d2, d3), memory_config: (10, 2, 'tile<32x32, f32>', 'dram') | shape: [12 : i32, 25 : i32, 64 : i32] | tensor<[12,25,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 25 + d1, d2), memory_config: (10, 2, 'tile<32x32, f32>', 'dram') | yes | 0.03 | 2.38 | +| ttnn.reshape | tensor<[1,12,25,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 300 + d1 * 25 + d2, d3), memory_config: (10, 2, 'tile<32x32, f32>', 'dram') | shape: [12 : i32, 25 : i32, 64 : i32] | tensor<[12,25,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 25 + d1, d2), memory_config: (10, 2, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,12,3072,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 12 + d1, d2), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | shape: [12 : i32, 3072 : i32] | tensor<[12,3072,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,12,3072,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 12 + d1, d2), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | shape: [12 : i32, 3072 : i32] | tensor<[12,3072,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| 
ttnn.reshape | tensor<[1,12,50,50,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 600 + d1 * 50 + d2, d3), memory_config: (19, 2, 'tile<32x32, f32>', 'dram') | shape: [12 : i32, 50 : i32, 50 : i32] | tensor<[12,50,50,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 50 + d1, d2), memory_config: (19, 2, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,12,50,50,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 600 + d1 * 50 + d2, d3), memory_config: (19, 2, 'tile<32x32, f32>', 'dram') | shape: [12 : i32, 50 : i32, 50 : i32] | tensor<[12,50,50,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 50 + d1, d2), memory_config: (19, 2, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,12,50,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 600 + d1 * 50 + d2, d3), memory_config: (19, 2, 'tile<32x32, f32>', 'dram') | shape: [12 : i32, 50 : i32, 64 : i32] | tensor<[12,50,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 50 + d1, d2), memory_config: (19, 2, 'tile<32x32, f32>', 'dram') | yes | 0.08 | 2.3 | +| ttnn.reshape | tensor<[1,12,50,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 600 + d1 * 50 + d2, d3), memory_config: (19, 2, 'tile<32x32, f32>', 'dram') | shape: [12 : i32, 50 : i32, 64 : i32] | tensor<[12,50,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 50 + d1, d2), memory_config: (19, 2, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,12,64,10,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 768 + d1 * 64 + d2, d3), memory_config: (24, 1, 'tile<32x32, f32>', 'dram') | shape: [12 : i32, 64 : i32, 10 : i32] | tensor<[12,64,10,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (24, 1, 'tile<32x32, f32>', 'dram') | yes | -0.01 | 2.58 | +| ttnn.reshape | tensor<[1,12,64,10,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 768 + d1 * 64 + d2, d3), memory_config: (24, 1, 
'tile<32x32, f32>', 'dram') | shape: [12 : i32, 64 : i32, 10 : i32] | tensor<[12,64,10,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (24, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,12,64,12,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 768 + d1 * 64 + d2, d3), memory_config: (24, 1, 'tile<32x32, f32>', 'dram') | shape: [12 : i32, 64 : i32, 12 : i32] | tensor<[12,64,12,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (24, 1, 'tile<32x32, f32>', 'dram') | yes | -0.0 | 3.66 | +| ttnn.reshape | tensor<[1,12,64,12,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 768 + d1 * 64 + d2, d3), memory_config: (24, 1, 'tile<32x32, f32>', 'dram') | shape: [12 : i32, 64 : i32, 12 : i32] | tensor<[12,64,12,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (24, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,12,64,14,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 768 + d1 * 64 + d2, d3), memory_config: (24, 1, 'tile<32x32, f32>', 'dram') | shape: [12 : i32, 64 : i32, 14 : i32] | tensor<[12,64,14,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (24, 1, 'tile<32x32, f32>', 'dram') | yes | 0.02 | 3.83 | +| ttnn.reshape | tensor<[1,12,64,14,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 768 + d1 * 64 + d2, d3), memory_config: (24, 1, 'tile<32x32, f32>', 'dram') | shape: [12 : i32, 64 : i32, 14 : i32] | tensor<[12,64,14,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (24, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,12,64,16,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 768 + d1 * 64 + d2, d3), memory_config: (24, 1, 'tile<32x32, f32>', 'dram') | shape: [12 : i32, 64 : i32, 16 : i32] | tensor<[12,64,16,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, 
d2), memory_config: (24, 1, 'tile<32x32, f32>', 'dram') | yes | -0.01 | 2.85 | +| ttnn.reshape | tensor<[1,12,64,16,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 768 + d1 * 64 + d2, d3), memory_config: (24, 1, 'tile<32x32, f32>', 'dram') | shape: [12 : i32, 64 : i32, 16 : i32] | tensor<[12,64,16,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (24, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,12,64,197,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 768 + d1 * 64 + d2, d3), memory_config: (24, 7, 'tile<32x32, f32>', 'dram') | shape: [12 : i32, 64 : i32, 197 : i32] | tensor<[12,64,197,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (24, 7, 'tile<32x32, f32>', 'dram') | yes | -0.0 | 2.55 | +| ttnn.reshape | tensor<[1,12,64,197,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 768 + d1 * 64 + d2, d3), memory_config: (24, 7, 'tile<32x32, f32>', 'dram') | shape: [12 : i32, 64 : i32, 197 : i32] | tensor<[12,64,197,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (24, 7, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,12,64,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 768 + d1 * 64 + d2, d3), memory_config: (24, 1, 'tile<32x32, bf16>', 'dram') | shape: [12 : i32, 64 : i32, 1 : i32] | tensor<[12,64,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (24, 1, 'tile<32x32, bf16>', 'dram') | yes | -0.05 | 4.3575489382845386e+29 | +| ttnn.reshape | tensor<[1,12,64,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 768 + d1 * 64 + d2, d3), memory_config: (24, 1, 'tile<32x32, bf16>', 'dram') | shape: [12 : i32, 64 : i32, 1 : i32] | tensor<[12,64,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (24, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,12,64,201,bf16]> 
| mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 768 + d1 * 64 + d2, d3), memory_config: (24, 7, 'tile<32x32, bf16>', 'dram') | shape: [12 : i32, 64 : i32, 201 : i32] | tensor<[12,64,201,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (24, 7, 'tile<32x32, bf16>', 'dram') | yes | -0.0 | 14.25 | +| ttnn.reshape | tensor<[1,12,64,201,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 768 + d1 * 64 + d2, d3), memory_config: (24, 7, 'tile<32x32, bf16>', 'dram') | shape: [12 : i32, 64 : i32, 201 : i32] | tensor<[12,64,201,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (24, 7, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,12,64,25,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 768 + d1 * 64 + d2, d3), memory_config: (24, 1, 'tile<32x32, f32>', 'dram') | shape: [12 : i32, 64 : i32, 25 : i32] | tensor<[12,64,25,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (24, 1, 'tile<32x32, f32>', 'dram') | yes | -0.0 | 2.6 | +| ttnn.reshape | tensor<[1,12,64,25,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 768 + d1 * 64 + d2, d3), memory_config: (24, 1, 'tile<32x32, f32>', 'dram') | shape: [12 : i32, 64 : i32, 25 : i32] | tensor<[12,64,25,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (24, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,12,64,50,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 768 + d1 * 64 + d2, d3), memory_config: (24, 2, 'tile<32x32, f32>', 'dram') | shape: [12 : i32, 64 : i32, 50 : i32] | tensor<[12,64,50,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (24, 2, 'tile<32x32, f32>', 'dram') | yes | -0.0 | 1.55 | +| ttnn.reshape | tensor<[1,12,64,50,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 768 + d1 * 64 + d2, d3), memory_config: (24, 2, 'tile<32x32, f32>', 'dram') | shape: 
[12 : i32, 64 : i32, 50 : i32] | tensor<[12,64,50,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (24, 2, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,12,64,7,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 768 + d1 * 64 + d2, d3), memory_config: (24, 1, 'tile<32x32, f32>', 'dram') | shape: [12 : i32, 64 : i32, 7 : i32] | tensor<[12,64,7,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (24, 1, 'tile<32x32, f32>', 'dram') | yes | 0.02 | 3.56 | +| ttnn.reshape | tensor<[1,12,64,7,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 768 + d1 * 64 + d2, d3), memory_config: (24, 1, 'tile<32x32, f32>', 'dram') | shape: [12 : i32, 64 : i32, 7 : i32] | tensor<[12,64,7,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (24, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,12,64,8,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 768 + d1 * 64 + d2, d3), memory_config: (24, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 768 : i32, 8 : i32] | tensor<[1,768,8,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 768 + d1, d2), memory_config: (24, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.01 | inf | +| ttnn.reshape | tensor<[1,12,64,8,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 768 + d1 * 64 + d2, d3), memory_config: (24, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 768 : i32, 8 : i32] | tensor<[1,768,8,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 768 + d1, d2), memory_config: (24, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,12,64,8,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 768 + d1 * 64 + d2, d3), memory_config: (24, 1, 'tile<32x32, bf16>', 'dram') | shape: [12 : i32, 64 : i32, 8 : i32] | tensor<[12,64,8,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (24, 1, 'tile<32x32, 
bf16>', 'dram') | yes | -0.02 | 4.31 | +| ttnn.reshape | tensor<[1,12,64,8,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 768 + d1 * 64 + d2, d3), memory_config: (24, 1, 'tile<32x32, bf16>', 'dram') | shape: [12 : i32, 64 : i32, 8 : i32] | tensor<[12,64,8,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (24, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,12,64,9,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 768 + d1 * 64 + d2, d3), memory_config: (24, 1, 'tile<32x32, f32>', 'dram') | shape: [12 : i32, 64 : i32, 9 : i32] | tensor<[12,64,9,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (24, 1, 'tile<32x32, f32>', 'dram') | yes | 0.01 | 3.5 | +| ttnn.reshape | tensor<[1,12,64,9,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 768 + d1 * 64 + d2, d3), memory_config: (24, 1, 'tile<32x32, f32>', 'dram') | shape: [12 : i32, 64 : i32, 9 : i32] | tensor<[12,64,9,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (24, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,12,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 12 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 12 : i32, 12 : i32, 64 : i32] | tensor<[1,12,12,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 144 + d1 * 12 + d2, d3), memory_config: (5, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,12,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 12 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 12 : i32, 12 : i32, 64 : i32] | tensor<[1,12,12,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 144 + d1 * 12 + d2, d3), memory_config: (5, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,12,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 12 + 
d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [12 : i32, 768 : i32] | tensor<[12,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,12,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 12 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [12 : i32, 768 : i32] | tensor<[12,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,12,7,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 84 + d1 * 7 + d2, d3), memory_config: (3, 2, 'tile<32x32, f32>', 'dram') | shape: [12 : i32, 7 : i32, 64 : i32] | tensor<[12,7,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (3, 2, 'tile<32x32, f32>', 'dram') | yes | 0.07 | 1.8 | +| ttnn.reshape | tensor<[1,12,7,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 84 + d1 * 7 + d2, d3), memory_config: (3, 2, 'tile<32x32, f32>', 'dram') | shape: [12 : i32, 7 : i32, 64 : i32] | tensor<[12,7,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (3, 2, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,12,7,7,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 84 + d1 * 7 + d2, d3), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | shape: [12 : i32, 7 : i32, 7 : i32] | tensor<[12,7,7,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,12,7,7,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 84 + d1 * 7 + d2, d3), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | shape: [12 : i32, 7 : i32, 7 : i32] | tensor<[12,7,7,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | 
nan | +| ttnn.reshape | tensor<[1,12,8,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 96 + d1 * 8 + d2, d3), memory_config: (3, 2, 'tile<32x32, bf16>', 'dram') | shape: [12 : i32, 8 : i32, 64 : i32] | tensor<[12,8,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 8 + d1, d2), memory_config: (3, 2, 'tile<32x32, bf16>', 'dram') | yes | -0.0 | 4.81 | +| ttnn.reshape | tensor<[1,12,8,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 96 + d1 * 8 + d2, d3), memory_config: (3, 2, 'tile<32x32, bf16>', 'dram') | shape: [12 : i32, 8 : i32, 64 : i32] | tensor<[12,8,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 8 + d1, d2), memory_config: (3, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,12,8,8,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 96 + d1 * 8 + d2, d3), memory_config: (3, 1, 'tile<32x32, bf16>', 'dram') | shape: [12 : i32, 8 : i32, 8 : i32] | tensor<[12,8,8,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 8 + d1, d2), memory_config: (3, 1, 'tile<32x32, bf16>', 'dram') | yes | -0.0 | 1.0 | +| ttnn.reshape | tensor<[1,12,8,8,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 96 + d1 * 8 + d2, d3), memory_config: (3, 1, 'tile<32x32, bf16>', 'dram') | shape: [12 : i32, 8 : i32, 8 : i32] | tensor<[12,8,8,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 8 + d1, d2), memory_config: (3, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,12,9,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 108 + d1 * 9 + d2, d3), memory_config: (4, 2, 'tile<32x32, f32>', 'dram') | shape: [12 : i32, 9 : i32, 64 : i32] | tensor<[12,9,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (4, 2, 'tile<32x32, f32>', 'dram') | yes | 0.08 | 4.12 | +| ttnn.reshape | tensor<[1,12,9,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 108 + d1 * 9 + d2, d3), memory_config: (4, 2, 'tile<32x32, f32>', 'dram') | 
shape: [12 : i32, 9 : i32, 64 : i32] | tensor<[12,9,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (4, 2, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,12,9,9,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 108 + d1 * 9 + d2, d3), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | shape: [12 : i32, 9 : i32, 9 : i32] | tensor<[12,9,9,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,12,9,9,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 108 + d1 * 9 + d2, d3), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | shape: [12 : i32, 9 : i32, 9 : i32] | tensor<[12,9,9,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1445,192,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1445 + d1, d2), memory_config: (46, 6, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1445 : i32, 3 : i32, 64 : i32] | tensor<[1,1445,3,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4335 + d1 * 3 + d2, d3), memory_config: (136, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1445,192,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1445 + d1, d2), memory_config: (46, 6, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1445 : i32, 3 : i32, 64 : i32] | tensor<[1,1445,3,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4335 + d1 * 3 + d2, d3), memory_config: (136, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1445,192,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1445 + d1, d2), memory_config: (46, 6, 'tile<32x32, bf16>', 'dram') | shape: [1445 : i32, 192 : i32] | tensor<[1445,192,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (46, 6, 'tile<32x32, bf16>', 
'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1445,192,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1445 + d1, d2), memory_config: (46, 6, 'tile<32x32, bf16>', 'dram') | shape: [1445 : i32, 192 : i32] | tensor<[1445,192,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (46, 6, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1445,3,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4335 + d1 * 3 + d2, d3), memory_config: (136, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1445 : i32, 192 : i32] | tensor<[1,1445,192,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1445 + d1, d2), memory_config: (46, 6, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1445,3,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4335 + d1 * 3 + d2, d3), memory_config: (136, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1445 : i32, 192 : i32] | tensor<[1,1445,192,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1445 + d1, d2), memory_config: (46, 6, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1445,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1445 + d1, d2), memory_config: (46, 24, 'tile<32x32, bf16>', 'dram') | shape: [1445 : i32, 768 : i32] | tensor<[1445,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (46, 24, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1445,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1445 + d1, d2), memory_config: (46, 24, 'tile<32x32, bf16>', 'dram') | shape: [1445 : i32, 768 : i32] | tensor<[1445,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (46, 24, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,14,128,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 14 + d1, d2), memory_config: (1, 4, 'tile<32x32, bf16>', 'dram') | shape: [14 : i32, 128 : i32] | 
tensor<[14,128,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 4, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,14,128,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 14 + d1, d2), memory_config: (1, 4, 'tile<32x32, bf16>', 'dram') | shape: [14 : i32, 128 : i32] | tensor<[14,128,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,14,12,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 168 + d1 * 12 + d2, d3), memory_config: (6, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 768 : i32] | tensor<[1,14,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 14 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,14,12,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 168 + d1 * 12 + d2, d3), memory_config: (6, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 768 : i32] | tensor<[1,14,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 14 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,14,3072,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 14 + d1, d2), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | shape: [14 : i32, 3072 : i32] | tensor<[14,3072,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,14,3072,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 14 + d1, d2), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | shape: [14 : i32, 3072 : i32] | tensor<[14,3072,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,14,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 14 + d1, d2), 
memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 12 : i32, 64 : i32] | tensor<[1,14,12,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 168 + d1 * 12 + d2, d3), memory_config: (6, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,14,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 14 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 14 : i32, 12 : i32, 64 : i32] | tensor<[1,14,12,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 168 + d1 * 12 + d2, d3), memory_config: (6, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,14,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 14 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [14 : i32, 768 : i32] | tensor<[14,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,14,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 14 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [14 : i32, 768 : i32] | tensor<[14,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,15,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 15 : i32] | tensor<[1,15,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | -0.17 | nan | +| ttnn.reshape | tensor<[1,15,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 15 : i32] | tensor<[1,15,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,15,1024,bf16]> | mapping_from: (d0, d1, 
d2), mapping_to: (d0 * 15 + d1, d2), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | shape: [15 : i32, 1024 : i32] | tensor<[15,1024,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,15,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 15 + d1, d2), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | shape: [15 : i32, 1024 : i32] | tensor<[15,1024,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,15,384,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 15 + d1, d2), memory_config: (1, 12, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 15 : i32, 6 : i32, 64 : i32] | tensor<[1,15,6,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 90 + d1 * 6 + d2, d3), memory_config: (3, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,15,384,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 15 + d1, d2), memory_config: (1, 12, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 15 : i32, 6 : i32, 64 : i32] | tensor<[1,15,6,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 90 + d1 * 6 + d2, d3), memory_config: (3, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,15,384,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 15 + d1, d2), memory_config: (1, 12, 'tile<32x32, bf16>', 'dram') | shape: [15 : i32, 384 : i32] | tensor<[15,384,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 12, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,15,384,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 15 + d1, d2), memory_config: (1, 12, 'tile<32x32, bf16>', 'dram') | shape: [15 : i32, 384 : i32] | tensor<[15,384,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 12, 'tile<32x32, bf16>', 'dram') | 
N/A | nan | nan | +| ttnn.reshape | tensor<[1,15,512,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 15 + d1, d2), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | shape: [15 : i32, 512 : i32] | tensor<[15,512,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,15,512,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 15 + d1, d2), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | shape: [15 : i32, 512 : i32] | tensor<[15,512,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,15,6,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 90 + d1 * 6 + d2, d3), memory_config: (3, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 15 : i32, 384 : i32] | tensor<[1,15,384,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 15 + d1, d2), memory_config: (1, 12, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,15,6,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 90 + d1 * 6 + d2, d3), memory_config: (3, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 15 : i32, 384 : i32] | tensor<[1,15,384,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 15 + d1, d2), memory_config: (1, 12, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,160,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 160 + d1, d2), memory_config: (5, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 160 : i32, 32 : i32, 32 : i32] | tensor<[1,160,32,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 5120 + d1 * 32 + d2, d3), memory_config: (160, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.0 | 10.25 | +| ttnn.reshape | tensor<[1,160,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 160 + d1, d2), memory_config: (5, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 160 : i32, 
32 : i32, 32 : i32] | tensor<[1,160,32,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 5120 + d1 * 32 + d2, d3), memory_config: (160, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,160,16,16,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2560 + d1 * 16 + d2, d3), memory_config: (80, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 160 : i32, 256 : i32] | tensor<[1,160,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 160 + d1, d2), memory_config: (5, 8, 'tile<32x32, bf16>', 'dram') | yes | 0.26 | 23.0 | +| ttnn.reshape | tensor<[1,160,16,16,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2560 + d1 * 16 + d2, d3), memory_config: (80, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 160 : i32, 256 : i32] | tensor<[1,160,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 160 + d1, d2), memory_config: (5, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,160,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 160 + d1, d2), memory_config: (5, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 160 : i32, 256 : i32] | tensor<[1,160,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 160 + d1, d2), memory_config: (5, 8, 'tile<32x32, bf16>', 'dram') | yes | 0.0 | 0.83 | +| ttnn.reshape | tensor<[1,160,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 160 + d1, d2), memory_config: (5, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 160 : i32, 256 : i32] | tensor<[1,160,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 160 + d1, d2), memory_config: (5, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,160,32,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 5120 + d1 * 32 + d2, d3), memory_config: (160, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 160 : i32, 1024 : i32] | tensor<[1,160,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 160 + d1, d2), memory_config: 
(5, 32, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,160,32,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 5120 + d1 * 32 + d2, d3), memory_config: (160, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 160 : i32, 1024 : i32] | tensor<[1,160,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 160 + d1, d2), memory_config: (5, 32, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16384,128,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16384 + d1, d2), memory_config: (512, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16384 : i32, 128 : i32] | tensor<[1,16384,128,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16384 + d1, d2), memory_config: (512, 4, 'tile<32x32, bf16>', 'dram') | yes | -0.0 | 8.44 | +| ttnn.reshape | tensor<[1,16384,128,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16384 + d1, d2), memory_config: (512, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16384 : i32, 128 : i32] | tensor<[1,16384,128,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16384 + d1, d2), memory_config: (512, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16384,1,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 + d2, d3), memory_config: (512, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16384 : i32, 32 : i32] | tensor<[1,16384,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16384 + d1, d2), memory_config: (512, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.12 | 0.78 | +| ttnn.reshape | tensor<[1,16384,1,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 + d2, d3), memory_config: (512, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16384 : i32, 32 : i32] | tensor<[1,16384,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16384 + d1, d2), memory_config: (512, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16384,32,bf16]> | 
mapping_from: (d0, d1, d2), mapping_to: (d0 * 16384 + d1, d2), memory_config: (512, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 128 : i32, 128 : i32, 32 : i32] | tensor<[1,128,128,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,16384,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16384 + d1, d2), memory_config: (512, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 128 : i32, 128 : i32, 32 : i32] | tensor<[1,128,128,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 128 + d2, d3), memory_config: (512, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16384,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16384 + d1, d2), memory_config: (512, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16384 : i32, 1 : i32, 32 : i32] | tensor<[1,16384,1,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 + d2, d3), memory_config: (512, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,16384,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16384 + d1, d2), memory_config: (512, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16384 : i32, 1 : i32, 32 : i32] | tensor<[1,16384,1,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 + d2, d3), memory_config: (512, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16384,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16384 + d1, d2), memory_config: (512, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16384 : i32, 32 : i32] | tensor<[1,16384,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16384 + d1, d2), memory_config: (512, 1, 'tile<32x32, bf16>', 'dram') | yes | -0.0 | 3.97 | +| ttnn.reshape | tensor<[1,16384,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16384 + 
d1, d2), memory_config: (512, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16384 : i32, 32 : i32] | tensor<[1,16384,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16384 + d1, d2), memory_config: (512, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16384,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16384 + d1, d2), memory_config: (512, 1, 'tile<32x32, bf16>', 'dram') | shape: [16384 : i32, 32 : i32] | tensor<[16384,32,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (512, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,16384,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16384 + d1, d2), memory_config: (512, 1, 'tile<32x32, bf16>', 'dram') | shape: [16384 : i32, 32 : i32] | tensor<[16384,32,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (512, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16,10,10,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 160 + d1 * 10 + d2, d3), memory_config: (5, 1, 'tile<32x32, bf16>', 'dram') | shape: [16 : i32, 10 : i32, 10 : i32] | tensor<[16,10,10,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (5, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.0 | 949978046398464.0 | +| ttnn.reshape | tensor<[1,16,10,10,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 160 + d1 * 10 + d2, d3), memory_config: (5, 1, 'tile<32x32, bf16>', 'dram') | shape: [16 : i32, 10 : i32, 10 : i32] | tensor<[16,10,10,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (5, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16,10,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 160 + d1 * 10 + d2, d3), memory_config: (5, 2, 'tile<32x32, bf16>', 'dram') | shape: [16 : i32, 10 : i32, 64 : i32] | tensor<[16,10,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 
+ d1, d2), memory_config: (5, 2, 'tile<32x32, bf16>', 'dram') | yes | 0.03 | 0.96 | +| ttnn.reshape | tensor<[1,16,10,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 160 + d1 * 10 + d2, d3), memory_config: (5, 2, 'tile<32x32, bf16>', 'dram') | shape: [16 : i32, 10 : i32, 64 : i32] | tensor<[16,10,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (5, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16,128,9,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2048 + d1 * 128 + d2, d3), memory_config: (64, 1, 'tile<32x32, f32>', 'dram') | shape: [16 : i32, 128 : i32, 9 : i32] | tensor<[16,128,9,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 128 + d1, d2), memory_config: (64, 1, 'tile<32x32, f32>', 'dram') | yes | -0.0 | 2.38 | +| ttnn.reshape | tensor<[1,16,128,9,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2048 + d1 * 128 + d2, d3), memory_config: (64, 1, 'tile<32x32, f32>', 'dram') | shape: [16 : i32, 128 : i32, 9 : i32] | tensor<[16,128,9,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 128 + d1, d2), memory_config: (64, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16,12,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 192 + d1 * 12 + d2, d3), memory_config: (6, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 768 : i32] | tensor<[1,16,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,16,12,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 192 + d1 * 12 + d2, d3), memory_config: (6, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 768 : i32] | tensor<[1,16,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16,16,1280,bf16]> | 
mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 1280 : i32] | tensor<[1,256,1280,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | yes | 0.0 | 12.0 | +| ttnn.reshape | tensor<[1,16,16,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 1280 : i32] | tensor<[1,256,1280,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16,197,197,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3152 + d1 * 197 + d2, d3), memory_config: (99, 7, 'tile<32x32, bf16>', 'dram') | shape: [16 : i32, 197 : i32, 197 : i32] | tensor<[16,197,197,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 197 + d1, d2), memory_config: (99, 7, 'tile<32x32, bf16>', 'dram') | yes | -0.0 | 1.0 | +| ttnn.reshape | tensor<[1,16,197,197,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3152 + d1 * 197 + d2, d3), memory_config: (99, 7, 'tile<32x32, bf16>', 'dram') | shape: [16 : i32, 197 : i32, 197 : i32] | tensor<[16,197,197,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 197 + d1, d2), memory_config: (99, 7, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16,197,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3152 + d1 * 197 + d2, d3), memory_config: (99, 2, 'tile<32x32, bf16>', 'dram') | shape: [16 : i32, 197 : i32, 64 : i32] | tensor<[16,197,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 197 + d1, d2), memory_config: (99, 2, 'tile<32x32, bf16>', 'dram') | yes | 0.07 | 17.75 | +| ttnn.reshape | tensor<[1,16,197,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3152 + d1 * 197 + d2, d3), memory_config: 
(99, 2, 'tile<32x32, bf16>', 'dram') | shape: [16 : i32, 197 : i32, 64 : i32] | tensor<[16,197,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 197 + d1, d2), memory_config: (99, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16,19,19,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 304 + d1 * 19 + d2, d3), memory_config: (10, 1, 'tile<32x32, bf16>', 'dram') | shape: [16 : i32, 19 : i32, 19 : i32] | tensor<[16,19,19,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 19 + d1, d2), memory_config: (10, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.02 | 3.3895313892515355e+38 | +| ttnn.reshape | tensor<[1,16,19,19,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 304 + d1 * 19 + d2, d3), memory_config: (10, 1, 'tile<32x32, bf16>', 'dram') | shape: [16 : i32, 19 : i32, 19 : i32] | tensor<[16,19,19,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 19 + d1, d2), memory_config: (10, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16,19,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 304 + d1 * 19 + d2, d3), memory_config: (10, 2, 'tile<32x32, bf16>', 'dram') | shape: [16 : i32, 19 : i32, 64 : i32] | tensor<[16,19,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 19 + d1, d2), memory_config: (10, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,16,19,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 304 + d1 * 19 + d2, d3), memory_config: (10, 2, 'tile<32x32, bf16>', 'dram') | shape: [16 : i32, 19 : i32, 64 : i32] | tensor<[16,19,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 19 + d1, d2), memory_config: (10, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16,1,10,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [16 : i32, 1 : i32, 10 : i32] | tensor<[16,1,10,bf16]> | mapping_from: 
(d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.22 | 0.67 | +| ttnn.reshape | tensor<[1,16,1,10,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [16 : i32, 1 : i32, 10 : i32] | tensor<[16,1,10,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [16 : i32, 1 : i32, 1 : i32] | tensor<[16,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 1.0 | +| ttnn.reshape | tensor<[1,16,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [16 : i32, 1 : i32, 1 : i32] | tensor<[16,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16,1,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16 + d1 + d2, d3), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [16 : i32, 1 : i32, 64 : i32] | tensor<[16,1,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,16,1,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16 + d1 + d2, d3), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [16 : i32, 1 : i32, 64 : i32] | tensor<[16,1,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16,256,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 
* 256 + d2, d3), memory_config: (128, 8, 'tile<32x32, f32>', 'dram') | shape: [16 : i32, 256 : i32, 256 : i32] | tensor<[16,256,256,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (128, 8, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,16,256,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 256 + d2, d3), memory_config: (128, 8, 'tile<32x32, f32>', 'dram') | shape: [16 : i32, 256 : i32, 256 : i32] | tensor<[16,256,256,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (128, 8, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16,256,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 256 + d2, d3), memory_config: (128, 2, 'tile<32x32, f32>', 'dram') | shape: [16 : i32, 256 : i32, 64 : i32] | tensor<[16,256,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (128, 2, 'tile<32x32, f32>', 'dram') | yes | 0.02 | 3.77 | +| ttnn.reshape | tensor<[1,16,256,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 256 + d2, d3), memory_config: (128, 2, 'tile<32x32, f32>', 'dram') | shape: [16 : i32, 256 : i32, 64 : i32] | tensor<[16,256,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (128, 2, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16,3072,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16 + d1, d2), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | shape: [16 : i32, 3072 : i32] | tensor<[16,3072,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,16,3072,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16 + d1, d2), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | shape: [16 : i32, 3072 : i32] | tensor<[16,3072,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, 
d1), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16,32,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [16 : i32, 1 : i32, 32 : i32] | tensor<[16,1,32,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,16,32,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [16 : i32, 1 : i32, 32 : i32] | tensor<[16,1,32,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16,32,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 512 + d1 * 32 + d2, d3), memory_config: (16, 1, 'tile<32x32, bf16>', 'dram') | shape: [16 : i32, 32 : i32, 32 : i32] | tensor<[16,32,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (16, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,16,32,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 512 + d1 * 32 + d2, d3), memory_config: (16, 1, 'tile<32x32, bf16>', 'dram') | shape: [16 : i32, 32 : i32, 32 : i32] | tensor<[16,32,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (16, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16,32,96,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 512 + d1 * 32 + d2, d3), memory_config: (16, 3, 'tile<32x32, bf16>', 'dram') | shape: [16 : i32, 32 : i32, 96 : i32] | tensor<[16,32,96,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (16, 3, 'tile<32x32, bf16>', 'dram') | yes | 0.01 | 11.12 | +| ttnn.reshape | tensor<[1,16,32,96,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 512 + d1 * 32 + d2, 
d3), memory_config: (16, 3, 'tile<32x32, bf16>', 'dram') | shape: [16 : i32, 32 : i32, 96 : i32] | tensor<[16,32,96,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (16, 3, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16,5,5,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 80 + d1 * 5 + d2, d3), memory_config: (3, 1, 'tile<32x32, bf16>', 'dram') | shape: [16 : i32, 5 : i32, 5 : i32] | tensor<[16,5,5,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 5 + d1, d2), memory_config: (3, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.06 | 1.0 | +| ttnn.reshape | tensor<[1,16,5,5,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 80 + d1 * 5 + d2, d3), memory_config: (3, 1, 'tile<32x32, bf16>', 'dram') | shape: [16 : i32, 5 : i32, 5 : i32] | tensor<[16,5,5,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 5 + d1, d2), memory_config: (3, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16,5,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 80 + d1 * 5 + d2, d3), memory_config: (3, 2, 'tile<32x32, f32>', 'dram') | shape: [16 : i32, 5 : i32, 64 : i32] | tensor<[16,5,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 5 + d1, d2), memory_config: (3, 2, 'tile<32x32, f32>', 'dram') | yes | 0.06 | 15.87 | +| ttnn.reshape | tensor<[1,16,5,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 80 + d1 * 5 + d2, d3), memory_config: (3, 2, 'tile<32x32, f32>', 'dram') | shape: [16 : i32, 5 : i32, 64 : i32] | tensor<[16,5,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 5 + d1, d2), memory_config: (3, 2, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16,64,10,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 64 + d2, d3), memory_config: (32, 1, 'tile<32x32, bf16>', 'dram') | shape: [16 : i32, 64 : i32, 10 : i32] | tensor<[16,64,10,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + 
d1, d2), memory_config: (32, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,16,64,10,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 64 + d2, d3), memory_config: (32, 1, 'tile<32x32, bf16>', 'dram') | shape: [16 : i32, 64 : i32, 10 : i32] | tensor<[16,64,10,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (32, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16,64,197,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 64 + d2, d3), memory_config: (32, 7, 'tile<32x32, bf16>', 'dram') | shape: [16 : i32, 64 : i32, 197 : i32] | tensor<[16,64,197,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (32, 7, 'tile<32x32, bf16>', 'dram') | yes | 0.0 | 38.75 | +| ttnn.reshape | tensor<[1,16,64,197,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 64 + d2, d3), memory_config: (32, 7, 'tile<32x32, bf16>', 'dram') | shape: [16 : i32, 64 : i32, 197 : i32] | tensor<[16,64,197,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (32, 7, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16,64,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 64 + d2, d3), memory_config: (32, 1, 'tile<32x32, bf16>', 'dram') | shape: [16 : i32, 64 : i32, 1 : i32] | tensor<[16,64,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (32, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.0 | inf | +| ttnn.reshape | tensor<[1,16,64,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 64 + d2, d3), memory_config: (32, 1, 'tile<32x32, bf16>', 'dram') | shape: [16 : i32, 64 : i32, 1 : i32] | tensor<[16,64,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (32, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16,64,256,f32]> | 
mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 64 + d2, d3), memory_config: (32, 8, 'tile<32x32, f32>', 'dram') | shape: [16 : i32, 64 : i32, 256 : i32] | tensor<[16,64,256,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (32, 8, 'tile<32x32, f32>', 'dram') | yes | 0.0 | 3.88 | +| ttnn.reshape | tensor<[1,16,64,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 64 + d2, d3), memory_config: (32, 8, 'tile<32x32, f32>', 'dram') | shape: [16 : i32, 64 : i32, 256 : i32] | tensor<[16,64,256,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (32, 8, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16,64,5,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 64 + d2, d3), memory_config: (32, 1, 'tile<32x32, f32>', 'dram') | shape: [16 : i32, 64 : i32, 5 : i32] | tensor<[16,64,5,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (32, 1, 'tile<32x32, f32>', 'dram') | yes | 0.0 | 12.44 | +| ttnn.reshape | tensor<[1,16,64,5,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 64 + d2, d3), memory_config: (32, 1, 'tile<32x32, f32>', 'dram') | shape: [16 : i32, 64 : i32, 5 : i32] | tensor<[16,64,5,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (32, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16,64,6,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 64 + d2, d3), memory_config: (32, 1, 'tile<32x32, f32>', 'dram') | shape: [16 : i32, 64 : i32, 6 : i32] | tensor<[16,64,6,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (32, 1, 'tile<32x32, f32>', 'dram') | yes | 0.0 | 2.51 | +| ttnn.reshape | tensor<[1,16,64,6,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 64 + d2, d3), memory_config: (32, 1, 'tile<32x32, f32>', 'dram') | shape: [16 : i32, 64 
: i32, 6 : i32] | tensor<[16,64,6,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (32, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16,64,9,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 64 + d2, d3), memory_config: (32, 1, 'tile<32x32, f32>', 'dram') | shape: [16 : i32, 64 : i32, 9 : i32] | tensor<[16,64,9,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (32, 1, 'tile<32x32, f32>', 'dram') | yes | -0.01 | 2.99 | +| ttnn.reshape | tensor<[1,16,64,9,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 64 + d2, d3), memory_config: (32, 1, 'tile<32x32, f32>', 'dram') | shape: [16 : i32, 64 : i32, 9 : i32] | tensor<[16,64,9,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (32, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16,6,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 96 + d1 * 6 + d2, d3), memory_config: (3, 2, 'tile<32x32, f32>', 'dram') | shape: [16 : i32, 6 : i32, 64 : i32] | tensor<[16,6,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 6 + d1, d2), memory_config: (3, 2, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,16,6,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 96 + d1 * 6 + d2, d3), memory_config: (3, 2, 'tile<32x32, f32>', 'dram') | shape: [16 : i32, 6 : i32, 64 : i32] | tensor<[16,6,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 6 + d1, d2), memory_config: (3, 2, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16,6,6,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 96 + d1 * 6 + d2, d3), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | shape: [16 : i32, 6 : i32, 6 : i32] | tensor<[16,6,6,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 6 + d1, d2), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| 
ttnn.reshape | tensor<[1,16,6,6,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 96 + d1 * 6 + d2, d3), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | shape: [16 : i32, 6 : i32, 6 : i32] | tensor<[16,6,6,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 6 + d1, d2), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 12 : i32, 64 : i32] | tensor<[1,16,12,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 192 + d1 * 12 + d2, d3), memory_config: (6, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,16,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 12 : i32, 64 : i32] | tensor<[1,16,12,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 192 + d1 * 12 + d2, d3), memory_config: (6, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [16 : i32, 768 : i32] | tensor<[16,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,16,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [16 : i32, 768 : i32] | tensor<[16,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16,9,128,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 144 + d1 * 9 + d2, d3), memory_config: (5, 4, 'tile<32x32, f32>', 'dram') | shape: [16 : i32, 9 : i32, 128 : i32] | 
tensor<[16,9,128,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (5, 4, 'tile<32x32, f32>', 'dram') | yes | 0.05 | 2.07 | +| ttnn.reshape | tensor<[1,16,9,128,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 144 + d1 * 9 + d2, d3), memory_config: (5, 4, 'tile<32x32, f32>', 'dram') | shape: [16 : i32, 9 : i32, 128 : i32] | tensor<[16,9,128,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (5, 4, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16,9,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 144 + d1 * 9 + d2, d3), memory_config: (5, 2, 'tile<32x32, f32>', 'dram') | shape: [16 : i32, 9 : i32, 64 : i32] | tensor<[16,9,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (5, 2, 'tile<32x32, f32>', 'dram') | yes | 0.08 | 2.82 | +| ttnn.reshape | tensor<[1,16,9,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 144 + d1 * 9 + d2, d3), memory_config: (5, 2, 'tile<32x32, f32>', 'dram') | shape: [16 : i32, 9 : i32, 64 : i32] | tensor<[16,9,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (5, 2, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,16,9,9,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 144 + d1 * 9 + d2, d3), memory_config: (5, 1, 'tile<32x32, f32>', 'dram') | shape: [16 : i32, 9 : i32, 9 : i32] | tensor<[16,9,9,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (5, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,16,9,9,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 144 + d1 * 9 + d2, d3), memory_config: (5, 1, 'tile<32x32, f32>', 'dram') | shape: [16 : i32, 9 : i32, 9 : i32] | tensor<[16,9,9,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (5, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | 
tensor<[1,19200,1,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 19200 + d1 + d2, d3), memory_config: (600, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 19200 : i32, 64 : i32] | tensor<[1,19200,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 19200 + d1, d2), memory_config: (600, 2, 'tile<32x32, bf16>', 'dram') | yes | 0.17 | 0.44 | +| ttnn.reshape | tensor<[1,19200,1,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 19200 + d1 + d2, d3), memory_config: (600, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 19200 : i32, 64 : i32] | tensor<[1,19200,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 19200 + d1, d2), memory_config: (600, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,19200,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 19200 + d1, d2), memory_config: (600, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 19200 : i32, 256 : i32] | tensor<[1,19200,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 19200 + d1, d2), memory_config: (600, 8, 'tile<32x32, bf16>', 'dram') | yes | -0.0 | 5.0 | +| ttnn.reshape | tensor<[1,19200,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 19200 + d1, d2), memory_config: (600, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 19200 : i32, 256 : i32] | tensor<[1,19200,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 19200 + d1, d2), memory_config: (600, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,19200,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 19200 + d1, d2), memory_config: (600, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 120 : i32, 160 : i32, 64 : i32] | tensor<[1,120,160,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 19200 + d1 * 160 + d2, d3), memory_config: (600, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,19200,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 19200 + d1, d2), 
memory_config: (600, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 120 : i32, 160 : i32, 64 : i32] | tensor<[1,120,160,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 19200 + d1 * 160 + d2, d3), memory_config: (600, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,19200,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 19200 + d1, d2), memory_config: (600, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 19200 : i32, 1 : i32, 64 : i32] | tensor<[1,19200,1,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 19200 + d1 + d2, d3), memory_config: (600, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,19200,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 19200 + d1, d2), memory_config: (600, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 19200 : i32, 1 : i32, 64 : i32] | tensor<[1,19200,1,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 19200 + d1 + d2, d3), memory_config: (600, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,19200,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 19200 + d1, d2), memory_config: (600, 2, 'tile<32x32, bf16>', 'dram') | shape: [19200 : i32, 64 : i32] | tensor<[19200,64,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (600, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,19200,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 19200 + d1, d2), memory_config: (600, 2, 'tile<32x32, bf16>', 'dram') | shape: [19200 : i32, 64 : i32] | tensor<[19200,64,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (600, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1920,16,16,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 30720 + d1 * 16 + d2, d3), memory_config: (960, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 60 : i32, 256 : i32] | 
tensor<[1,32,60,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1920 + d1 * 60 + d2, d3), memory_config: (60, 8, 'tile<32x32, bf16>', 'dram') | yes | 0.66 | 37.25 | +| ttnn.reshape | tensor<[1,1920,16,16,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 30720 + d1 * 16 + d2, d3), memory_config: (960, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 60 : i32, 256 : i32] | tensor<[1,32,60,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1920 + d1 * 60 + d2, d3), memory_config: (60, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1920,32,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 61440 + d1 * 32 + d2, d3), memory_config: (1920, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 60 : i32, 1024 : i32] | tensor<[1,32,60,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1920 + d1 * 60 + d2, d3), memory_config: (60, 32, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1920,32,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 61440 + d1 * 32 + d2, d3), memory_config: (1920, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 60 : i32, 1024 : i32] | tensor<[1,32,60,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1920 + d1 * 60 + d2, d3), memory_config: (60, 32, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,192,32,42,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6144 + d1 * 32 + d2, d3), memory_config: (192, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 192 : i32, 1344 : i32] | tensor<[1,192,1344,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 192 + d1, d2), memory_config: (6, 42, 'tile<32x32, bf16>', 'dram') | yes | 0.04 | 18.5 | +| ttnn.reshape | tensor<[1,192,32,42,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6144 + d1 * 32 + d2, d3), memory_config: (192, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 192 : i32, 
1344 : i32] | tensor<[1,192,1344,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 192 + d1, d2), memory_config: (6, 42, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,192,4150,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 192 + d1, d2), memory_config: (6, 130, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 192 : i32, 50 : i32, 83 : i32] | tensor<[1,192,50,83,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9600 + d1 * 50 + d2, d3), memory_config: (300, 3, 'tile<32x32, bf16>', 'dram') | yes | -0.0 | 8.38 | +| ttnn.reshape | tensor<[1,192,4150,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 192 + d1, d2), memory_config: (6, 130, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 192 : i32, 50 : i32, 83 : i32] | tensor<[1,192,50,83,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9600 + d1 * 50 + d2, d3), memory_config: (300, 3, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,197,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 197 + d1, d2), memory_config: (7, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 197 : i32, 16 : i32, 64 : i32] | tensor<[1,197,16,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3152 + d1 * 16 + d2, d3), memory_config: (99, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,197,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 197 + d1, d2), memory_config: (7, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 197 : i32, 16 : i32, 64 : i32] | tensor<[1,197,16,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3152 + d1 * 16 + d2, d3), memory_config: (99, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,197,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 197 + d1, d2), memory_config: (7, 32, 'tile<32x32, bf16>', 'dram') | shape: [197 : i32, 1024 : i32] | tensor<[197,1024,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), 
memory_config: (7, 32, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,197,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 197 + d1, d2), memory_config: (7, 32, 'tile<32x32, bf16>', 'dram') | shape: [197 : i32, 1024 : i32] | tensor<[197,1024,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (7, 32, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,197,12,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2364 + d1 * 12 + d2, d3), memory_config: (74, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 197 : i32, 768 : i32] | tensor<[1,197,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 197 + d1, d2), memory_config: (7, 24, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,197,12,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2364 + d1 * 12 + d2, d3), memory_config: (74, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 197 : i32, 768 : i32] | tensor<[1,197,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 197 + d1, d2), memory_config: (7, 24, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,197,16,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3152 + d1 * 16 + d2, d3), memory_config: (99, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 197 : i32, 1024 : i32] | tensor<[1,197,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 197 + d1, d2), memory_config: (7, 32, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,197,16,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 3152 + d1 * 16 + d2, d3), memory_config: (99, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 197 : i32, 1024 : i32] | tensor<[1,197,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 197 + d1, d2), memory_config: (7, 32, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,197,3072,bf16]> | mapping_from: (d0, d1, d2), 
mapping_to: (d0 * 197 + d1, d2), memory_config: (7, 96, 'tile<32x32, bf16>', 'dram') | shape: [197 : i32, 3072 : i32] | tensor<[197,3072,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (7, 96, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,197,3072,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 197 + d1, d2), memory_config: (7, 96, 'tile<32x32, bf16>', 'dram') | shape: [197 : i32, 3072 : i32] | tensor<[197,3072,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (7, 96, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,197,4096,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 197 + d1, d2), memory_config: (7, 128, 'tile<32x32, bf16>', 'dram') | shape: [197 : i32, 4096 : i32] | tensor<[197,4096,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (7, 128, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,197,4096,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 197 + d1, d2), memory_config: (7, 128, 'tile<32x32, bf16>', 'dram') | shape: [197 : i32, 4096 : i32] | tensor<[197,4096,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (7, 128, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,197,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 197 + d1, d2), memory_config: (7, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 197 : i32, 12 : i32, 64 : i32] | tensor<[1,197,12,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2364 + d1 * 12 + d2, d3), memory_config: (74, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,197,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 197 + d1, d2), memory_config: (7, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 197 : i32, 12 : i32, 64 : i32] | tensor<[1,197,12,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2364 + d1 * 12 + d2, d3), memory_config: 
(74, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,197,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 197 + d1, d2), memory_config: (7, 24, 'tile<32x32, bf16>', 'dram') | shape: [197 : i32, 768 : i32] | tensor<[197,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (7, 24, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,197,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 197 + d1, d2), memory_config: (7, 24, 'tile<32x32, bf16>', 'dram') | shape: [197 : i32, 768 : i32] | tensor<[197,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (7, 24, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,19,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 19 : i32] | tensor<[1,19,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | 0.4 | nan | +| ttnn.reshape | tensor<[1,19,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 19 : i32] | tensor<[1,19,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,19,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [19 : i32] | tensor<[19,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,19,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [19 : i32] | tensor<[19,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,19,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 19 
+ d1, d2), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 19 : i32, 16 : i32, 64 : i32] | tensor<[1,19,16,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 304 + d1 * 16 + d2, d3), memory_config: (10, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,19,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 19 + d1, d2), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 19 : i32, 16 : i32, 64 : i32] | tensor<[1,19,16,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 304 + d1 * 16 + d2, d3), memory_config: (10, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,19,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 19 + d1, d2), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 19 : i32, 16 : i32, 64 : i32] | tensor<[1,19,16,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 304 + d1 * 16 + d2, d3), memory_config: (10, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,19,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 19 + d1, d2), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | shape: [19 : i32, 1024 : i32] | tensor<[19,1024,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,19,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 19 + d1, d2), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | shape: [19 : i32, 1024 : i32] | tensor<[19,1024,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,19,256008,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 19 + d1, d2), memory_config: (1, 8001, 'tile<32x32, bf16>', 'dram') | shape: [19 : i32, 256008 : i32] | tensor<[19,256008,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), 
memory_config: (1, 8001, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,19,256008,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 19 + d1, d2), memory_config: (1, 8001, 'tile<32x32, bf16>', 'dram') | shape: [19 : i32, 256008 : i32] | tensor<[19,256008,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 8001, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,19,4096,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 19 + d1, d2), memory_config: (1, 128, 'tile<32x32, bf16>', 'dram') | shape: [19 : i32, 4096 : i32] | tensor<[19,4096,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 128, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,19,4096,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 19 + d1, d2), memory_config: (1, 128, 'tile<32x32, bf16>', 'dram') | shape: [19 : i32, 4096 : i32] | tensor<[19,4096,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 128, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32] | tensor<[1,1,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 1 : i32] | tensor<[1,1,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | 
tensor<[1,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32] | tensor<[1,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32] | tensor<[1,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 16 : i32, 64 : i32] | tensor<[1,1,16,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16 + d1 * 16 + d2, d3), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 16 : i32, 64 : i32] | tensor<[1,1,16,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16 + d1 * 16 + d2, d3), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1024 : i32] | tensor<[1,1024,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: 
(d0 + d1, d2), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1024 : i32] | tensor<[1,1024,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,12,16,i32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12 + d1 * 12 + d2, d3), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 192 : i32] | tensor<[1,192,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 6, 'tile<32x32, u32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,12,16,i32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12 + d1 * 12 + d2, d3), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 192 : i32] | tensor<[1,192,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 6, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,12,16,2,i32]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 192 + d1 * 192 + d2 * 16 + d3, d4), memory_config: (6, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 192 : i32, 2 : i32] | tensor<[1,192,2,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 192 + d1, d2), memory_config: (6, 1, 'tile<32x32, u32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,12,16,2,i32]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 192 + d1 * 192 + d2 * 16 + d3, d4), memory_config: (6, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 192 : i32, 2 : i32] | tensor<[1,192,2,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 192 + d1, d2), memory_config: (6, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,12,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12 + d1 * 12 + d2, d3), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 768 : i32] | tensor<[1,1,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 24, 
'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1,12,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12 + d1 * 12 + d2, d3), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 768 : i32] | tensor<[1,1,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,16384,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 16384 + d2, d3), memory_config: (512, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16384 : i32, 256 : i32] | tensor<[1,16384,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16384 + d1, d2), memory_config: (512, 8, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1,16384,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 16384 + d2, d3), memory_config: (512, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16384 : i32, 256 : i32] | tensor<[1,16384,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16384 + d1, d2), memory_config: (512, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,16384,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 16384 + d2, d3), memory_config: (512, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16384 : i32, 32 : i32] | tensor<[1,16384,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16384 + d1, d2), memory_config: (512, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1,16384,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 16384 + d2, d3), memory_config: (512, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16384 : i32, 32 : i32] | tensor<[1,16384,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 16384 + d1, d2), memory_config: (512, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | 
tensor<[1,1,16,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16 + d1 * 16 + d2, d3), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1024 : i32] | tensor<[1,1,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1,16,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16 + d1 * 16 + d2, d3), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1024 : i32] | tensor<[1,1,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,19200,300,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 19200 + d1 * 19200 + d2, d3), memory_config: (600, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 19200 : i32, 300 : i32] | tensor<[1,19200,300,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 19200 + d1, d2), memory_config: (600, 10, 'tile<32x32, bf16>', 'dram') | yes | 0.0 | 0.23 | +| ttnn.reshape | tensor<[1,1,19200,300,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 19200 + d1 * 19200 + d2, d3), memory_config: (600, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 19200 : i32, 300 : i32] | tensor<[1,19200,300,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 19200 + d1, d2), memory_config: (600, 10, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,19200,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 19200 + d1 * 19200 + d2, d3), memory_config: (600, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 19200 : i32, 64 : i32] | tensor<[1,19200,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 19200 + d1, d2), memory_config: (600, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1,19200,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 19200 + d1 
* 19200 + d2, d3), memory_config: (600, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 19200 : i32, 64 : i32] | tensor<[1,19200,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 19200 + d1, d2), memory_config: (600, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,1,6,4,f32]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 6 + d1 * 6 + d2 * 6 + d3, d4), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 6 : i32, 4 : i32] | tensor<[1,6,4,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 6 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,1,6,4,f32]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 6 + d1 * 6 + d2 * 6 + d3, d4), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 6 : i32, 4 : i32] | tensor<[1,6,4,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 6 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,1,6,91,f32]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 6 + d1 * 6 + d2 * 6 + d3, d4), memory_config: (1, 3, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 6 : i32, 91 : i32] | tensor<[1,6,91,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 6 + d1, d2), memory_config: (1, 3, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,1,1,6,91,f32]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 6 + d1 * 6 + d2 * 6 + d3, d4), memory_config: (1, 3, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 6 : i32, 91 : i32] | tensor<[1,6,91,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 6 + d1, d2), memory_config: (1, 3, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,2048,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 64, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 2048 : i32] | tensor<[1,2048,bf16]> | mapping_from: (d0, d1), 
mapping_to: (d0, d1), memory_config: (1, 64, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1,2048,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 64, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 2048 : i32] | tensor<[1,2048,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 64, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,256,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 256 + d2, d3), memory_config: (8, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 32 : i32] | tensor<[1,256,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1,256,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 256 + d2, d3), memory_config: (8, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 32 : i32] | tensor<[1,256,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,300,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 300 + d1 * 300 + d2, d3), memory_config: (10, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 300 : i32, 64 : i32] | tensor<[1,300,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 300 + d1, d2), memory_config: (10, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1,300,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 300 + d1 * 300 + d2, d3), memory_config: (10, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 300 : i32, 64 : i32] | tensor<[1,300,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 300 + d1, d2), memory_config: (10, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,3072,bf16]> | mapping_from: (d0, d1, d2), mapping_to: 
(d0 + d1, d2), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 3072 : i32] | tensor<[1,3072,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1,3072,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 3072 : i32] | tensor<[1,3072,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,32,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 32 : i32] | tensor<[1,1,32,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1,32,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 32 : i32] | tensor<[1,1,32,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,32,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 32 + d1 * 32 + d2, d3), memory_config: (1, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 256 : i32] | tensor<[1,32,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (1, 8, 'tile<32x32, bf16>', 'dram') | yes | 0.0 | 4.94 | +| ttnn.reshape | tensor<[1,1,32,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 32 + d1 * 32 + d2, d3), memory_config: (1, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 256 : i32] | tensor<[1,32,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (1, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | 
tensor<[1,1,384,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 12, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 6 : i32, 64 : i32] | tensor<[1,1,6,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6 + d1 * 6 + d2, d3), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1,384,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 12, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 6 : i32, 64 : i32] | tensor<[1,1,6,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6 + d1 * 6 + d2, d3), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,384,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 12, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 384 : i32] | tensor<[1,384,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 12, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1,384,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 12, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 384 : i32] | tensor<[1,384,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 12, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,4096,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 128, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 4096 : i32] | tensor<[1,4096,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 128, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1,4096,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 128, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 4096 : i32] | tensor<[1,4096,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 128, 'tile<32x32, bf16>', 'dram') 
| N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,512,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 8 : i32, 64 : i32] | tensor<[1,1,8,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 8 + d1 * 8 + d2, d3), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1,512,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 8 : i32, 64 : i32] | tensor<[1,1,8,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 8 + d1 * 8 + d2, d3), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,512,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 512 : i32] | tensor<[1,512,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1,512,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 512 : i32] | tensor<[1,512,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,64,300,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 64 + d1 * 64 + d2, d3), memory_config: (2, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 300 : i32] | tensor<[1,64,300,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (2, 10, 'tile<32x32, bf16>', 'dram') | yes | 0.0 | 3.83 | +| ttnn.reshape | tensor<[1,1,64,300,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 64 + d1 * 64 + d2, d3), memory_config: (2, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 300 : i32] | 
tensor<[1,64,300,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (2, 10, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,6,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6 + d1 * 6 + d2, d3), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 384 : i32] | tensor<[1,1,384,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 12, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1,6,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6 + d1 * 6 + d2, d3), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 384 : i32] | tensor<[1,1,384,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 12, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 12 : i32, 64 : i32] | tensor<[1,1,12,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12 + d1 * 12 + d2, d3), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 12 : i32, 64 : i32] | tensor<[1,1,12,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 12 + d1 * 12 + d2, d3), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 768 : i32] | tensor<[1,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,7,f32]> | mapping_from: 
(d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 7 : i32] | tensor<[1,1,7,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1,7,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1 : i32, 7 : i32] | tensor<[1,1,7,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,7,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 7 + d1 * 7 + d2, d3), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 7 : i32, 64 : i32] | tensor<[1,1,7,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 7 + d1 * 7 + d2, d3), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1,7,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 7 + d1 * 7 + d2, d3), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 7 : i32, 64 : i32] | tensor<[1,1,7,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 7 + d1 * 7 + d2, d3), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,1,8,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 8 + d1 * 8 + d2, d3), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 512 : i32] | tensor<[1,1,512,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,1,8,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 8 + d1 * 8 + d2, d3), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 512 : i32] | tensor<[1,1,512,bf16]> | mapping_from: (d0, d1, d2), 
mapping_to: (d0 + d1, d2), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,201,12,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2412 + d1 * 12 + d2, d3), memory_config: (76, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 201 : i32, 768 : i32] | tensor<[1,201,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 201 + d1, d2), memory_config: (7, 24, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,201,12,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2412 + d1 * 12 + d2, d3), memory_config: (76, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 201 : i32, 768 : i32] | tensor<[1,201,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 201 + d1, d2), memory_config: (7, 24, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,201,3072,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 201 + d1, d2), memory_config: (7, 96, 'tile<32x32, bf16>', 'dram') | shape: [201 : i32, 3072 : i32] | tensor<[201,3072,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (7, 96, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,201,3072,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 201 + d1, d2), memory_config: (7, 96, 'tile<32x32, bf16>', 'dram') | shape: [201 : i32, 3072 : i32] | tensor<[201,3072,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (7, 96, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,201,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 201 + d1, d2), memory_config: (7, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 201 : i32, 12 : i32, 64 : i32] | tensor<[1,201,12,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2412 + d1 * 12 + d2, d3), memory_config: (76, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,201,768,bf16]> | mapping_from: (d0, d1, d2), 
mapping_to: (d0 * 201 + d1, d2), memory_config: (7, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 201 : i32, 12 : i32, 64 : i32] | tensor<[1,201,12,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2412 + d1 * 12 + d2, d3), memory_config: (76, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,201,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 201 + d1, d2), memory_config: (7, 24, 'tile<32x32, bf16>', 'dram') | shape: [201 : i32, 768 : i32] | tensor<[201,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (7, 24, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,201,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 201 + d1, d2), memory_config: (7, 24, 'tile<32x32, bf16>', 'dram') | shape: [201 : i32, 768 : i32] | tensor<[201,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (7, 24, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,2048,1280,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 2048 + d1, d2), memory_config: (64, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 2048 : i32, 8 : i32, 160 : i32] | tensor<[1,2048,8,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 8 + d2, d3), memory_config: (512, 5, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,2048,1280,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 2048 + d1, d2), memory_config: (64, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 2048 : i32, 8 : i32, 160 : i32] | tensor<[1,2048,8,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 8 + d2, d3), memory_config: (512, 5, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,2048,15,20,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 30720 + d1 * 15 + d2, d3), memory_config: (960, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 2048 : i32, 300 : i32] | 
tensor<[1,2048,300,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 2048 + d1, d2), memory_config: (64, 10, 'tile<32x32, bf16>', 'dram') | yes | -0.03 | 24.62 | +| ttnn.reshape | tensor<[1,2048,15,20,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 30720 + d1 * 15 + d2, d3), memory_config: (960, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 2048 : i32, 300 : i32] | tensor<[1,2048,300,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 2048 + d1, d2), memory_config: (64, 10, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,2048,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2048 + d1 + d2, d3), memory_config: (64, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 2048 : i32] | tensor<[1,2048,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 64, 'tile<32x32, bf16>', 'dram') | yes | 0.01 | inf | +| ttnn.reshape | tensor<[1,2048,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2048 + d1 + d2, d3), memory_config: (64, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 2048 : i32] | tensor<[1,2048,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 64, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,2048,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 2048 + d1, d2), memory_config: (64, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 2048 : i32, 8 : i32, 32 : i32] | tensor<[1,2048,8,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 8 + d2, d3), memory_config: (512, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,2048,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 2048 + d1, d2), memory_config: (64, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 2048 : i32, 8 : i32, 32 : i32] | tensor<[1,2048,8,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 8 + d2, d3), memory_config: (512, 1, 'tile<32x32, bf16>', 
'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,2048,300,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 2048 + d1, d2), memory_config: (64, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 2048 : i32, 15 : i32, 20 : i32] | tensor<[1,2048,15,20,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 30720 + d1 * 15 + d2, d3), memory_config: (960, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.0 | 42.0 | +| ttnn.reshape | tensor<[1,2048,300,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 2048 + d1, d2), memory_config: (64, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 2048 : i32, 15 : i32, 20 : i32] | tensor<[1,2048,15,20,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 30720 + d1 * 15 + d2, d3), memory_config: (960, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,2048,512,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 2048 + d1, d2), memory_config: (64, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 2048 : i32, 512 : i32] | tensor<[1,2048,512,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 2048 + d1, d2), memory_config: (64, 16, 'tile<32x32, bf16>', 'dram') | yes | -0.0 | 3.09 | +| ttnn.reshape | tensor<[1,2048,512,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 2048 + d1, d2), memory_config: (64, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 2048 : i32, 512 : i32] | tensor<[1,2048,512,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 2048 + d1, d2), memory_config: (64, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,2048,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 2048 + d1, d2), memory_config: (64, 24, 'tile<32x32, bf16>', 'dram') | shape: [2048 : i32, 768 : i32] | tensor<[2048,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (64, 24, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,2048,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 2048 + d1, 
d2), memory_config: (64, 24, 'tile<32x32, bf16>', 'dram') | shape: [2048 : i32, 768 : i32] | tensor<[2048,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (64, 24, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,2048,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 2048 + d1, d2), memory_config: (64, 24, 'tile<32x32, bf16>', 'dram') | shape: [2048 : i32, 768 : i32] | tensor<[2048,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (64, 24, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,2048,8,96,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 8 + d2, d3), memory_config: (512, 3, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 2048 : i32, 768 : i32] | tensor<[1,2048,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 2048 + d1, d2), memory_config: (64, 24, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,2048,8,96,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 8 + d2, d3), memory_config: (512, 3, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 2048 : i32, 768 : i32] | tensor<[1,2048,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 2048 + d1, d2), memory_config: (64, 24, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,23,40,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 23 + d1, d2), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 920 : i32] | tensor<[1,920,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 29, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,23,40,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 23 + d1, d2), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 920 : i32] | tensor<[1,920,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 29, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | 
+| ttnn.reshape | tensor<[1,23,40,64,2,f32]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 58880 + d1 * 2560 + d2 * 64 + d3, d4), memory_config: (1840, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 23 : i32, 40 : i32, 128 : i32] | tensor<[1,23,40,128,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 920 + d1 * 40 + d2, d3), memory_config: (29, 4, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,23,40,64,2,f32]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 58880 + d1 * 2560 + d2 * 64 + d3, d4), memory_config: (1840, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 23 : i32, 40 : i32, 128 : i32] | tensor<[1,23,40,128,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 920 + d1 * 40 + d2, d3), memory_config: (29, 4, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,24,10,10,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 240 + d1 * 10 + d2, d3), memory_config: (8, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 6 : i32, 4 : i32, 10 : i32, 10 : i32] | tensor<[1,6,4,10,10,f32]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 240 + d1 * 40 + d2 * 10 + d3, d4), memory_config: (8, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,24,10,10,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 240 + d1 * 10 + d2, d3), memory_config: (8, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 6 : i32, 4 : i32, 10 : i32, 10 : i32] | tensor<[1,6,4,10,10,f32]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 240 + d1 * 40 + d2 * 10 + d3, d4), memory_config: (8, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,24,1,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 24 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 6 : i32, 4 : i32, 1 : i32, 1 : i32] | tensor<[1,6,4,1,1,f32]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 24 + d1 * 4 + d2 + d3, d4), 
memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,24,1,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 24 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 6 : i32, 4 : i32, 1 : i32, 1 : i32] | tensor<[1,6,4,1,1,f32]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 24 + d1 * 4 + d2 + d3, d4), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,24,20,20,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 480 + d1 * 20 + d2, d3), memory_config: (15, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 6 : i32, 4 : i32, 20 : i32, 20 : i32] | tensor<[1,6,4,20,20,f32]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 480 + d1 * 80 + d2 * 20 + d3, d4), memory_config: (15, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,24,20,20,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 480 + d1 * 20 + d2, d3), memory_config: (15, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 6 : i32, 4 : i32, 20 : i32, 20 : i32] | tensor<[1,6,4,20,20,f32]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 480 + d1 * 80 + d2 * 20 + d3, d4), memory_config: (15, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,24,2,2,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 48 + d1 * 2 + d2, d3), memory_config: (2, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 6 : i32, 4 : i32, 2 : i32, 2 : i32] | tensor<[1,6,4,2,2,f32]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 48 + d1 * 8 + d2 * 2 + d3, d4), memory_config: (2, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,24,2,2,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 48 + d1 * 2 + d2, d3), memory_config: (2, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 6 : i32, 4 : i32, 2 : i32, 2 : i32] | tensor<[1,6,4,2,2,f32]> | mapping_from: (d0, d1, d2, d3, d4), 
mapping_to: (d0 * 48 + d1 * 8 + d2 * 2 + d3, d4), memory_config: (2, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,24,3,3,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 72 + d1 * 3 + d2, d3), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 6 : i32, 4 : i32, 3 : i32, 3 : i32] | tensor<[1,6,4,3,3,f32]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 72 + d1 * 12 + d2 * 3 + d3, d4), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,24,3,3,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 72 + d1 * 3 + d2, d3), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 6 : i32, 4 : i32, 3 : i32, 3 : i32] | tensor<[1,6,4,3,3,f32]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 72 + d1 * 12 + d2 * 3 + d3, d4), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,24,5,5,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 120 + d1 * 5 + d2, d3), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 6 : i32, 4 : i32, 5 : i32, 5 : i32] | tensor<[1,6,4,5,5,f32]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 120 + d1 * 20 + d2 * 5 + d3, d4), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,24,5,5,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 120 + d1 * 5 + d2, d3), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 6 : i32, 4 : i32, 5 : i32, 5 : i32] | tensor<[1,6,4,5,5,f32]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 120 + d1 * 20 + d2 * 5 + d3, d4), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,2560,16,16,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 40960 + d1 * 16 + d2, d3), memory_config: (1280, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 80 : i32, 256 : i32] | 
tensor<[1,32,80,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2560 + d1 * 80 + d2, d3), memory_config: (80, 8, 'tile<32x32, bf16>', 'dram') | yes | 0.55 | 44.5 | +| ttnn.reshape | tensor<[1,2560,16,16,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 40960 + d1 * 16 + d2, d3), memory_config: (1280, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 80 : i32, 256 : i32] | tensor<[1,32,80,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2560 + d1 * 80 + d2, d3), memory_config: (80, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,2560,8,8,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 20480 + d1 * 8 + d2, d3), memory_config: (640, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 80 : i32, 64 : i32] | tensor<[1,32,80,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2560 + d1 * 80 + d2, d3), memory_config: (80, 2, 'tile<32x32, bf16>', 'dram') | yes | -0.01 | 30.75 | +| ttnn.reshape | tensor<[1,2560,8,8,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 20480 + d1 * 8 + d2, d3), memory_config: (640, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 80 : i32, 64 : i32] | tensor<[1,32,80,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2560 + d1 * 80 + d2, d3), memory_config: (80, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,256,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 1024 : i32] | tensor<[1,256,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 32, 'tile<32x32, bf16>', 'dram') | yes | 0.0 | 4.81 | +| ttnn.reshape | tensor<[1,256,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 1024 : i32] | tensor<[1,256,1024,bf16]> | 
mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 32, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,256,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 16 : i32, 64 : i32] | tensor<[1,256,16,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 16 + d2, d3), memory_config: (128, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,256,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 16 : i32, 64 : i32] | tensor<[1,256,16,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 16 + d2, d3), memory_config: (128, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,256,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 32 : i32, 32 : i32] | tensor<[1,256,32,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 8192 + d1 * 32 + d2, d3), memory_config: (256, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.0 | 11.06 | +| ttnn.reshape | tensor<[1,256,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 32 : i32, 32 : i32] | tensor<[1,256,32,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 8192 + d1 * 32 + d2, d3), memory_config: (256, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,256,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 32, 'tile<32x32, bf16>', 'dram') | shape: [256 : i32, 1024 : i32] | tensor<[256,1024,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (8, 32, 'tile<32x32, 
bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,256,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 32, 'tile<32x32, bf16>', 'dram') | shape: [256 : i32, 1024 : i32] | tensor<[256,1024,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (8, 32, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,256,120,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 30720 + d1 * 120 + d2, d3), memory_config: (960, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 19200 : i32] | tensor<[1,256,19200,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 600, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,256,120,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 30720 + d1 * 120 + d2, d3), memory_config: (960, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 19200 : i32] | tensor<[1,256,19200,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 600, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,256,1280,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 8 : i32, 160 : i32] | tensor<[1,256,8,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2048 + d1 * 8 + d2, d3), memory_config: (64, 5, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,256,1280,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 8 : i32, 160 : i32] | tensor<[1,256,8,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2048 + d1 * 8 + d2, d3), memory_config: (64, 5, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,256,1280,bf16]> | mapping_from: (d0, d1, d2), 
mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 16 : i32, 1280 : i32] | tensor<[1,16,16,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,256,1280,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 16 : i32, 1280 : i32] | tensor<[1,16,16,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,256,1280,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 8 : i32, 160 : i32] | tensor<[1,256,8,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2048 + d1 * 8 + d2, d3), memory_config: (64, 5, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,256,1280,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | shape: [256 : i32, 1280 : i32] | tensor<[256,1280,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,256,1280,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | shape: [256 : i32, 1280 : i32] | tensor<[256,1280,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,256,160,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 5 : i32, 32 : i32] | 
tensor<[1,256,5,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1280 + d1 * 5 + d2, d3), memory_config: (40, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,256,160,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 5 : i32, 32 : i32] | tensor<[1,256,5,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1280 + d1 * 5 + d2, d3), memory_config: (40, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,256,160,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 5, 'tile<32x32, bf16>', 'dram') | shape: [256 : i32, 160 : i32] | tensor<[256,160,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (8, 5, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,256,160,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 5, 'tile<32x32, bf16>', 'dram') | shape: [256 : i32, 160 : i32] | tensor<[256,160,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (8, 5, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,256,16384,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 512, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 128 : i32, 128 : i32] | tensor<[1,256,128,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 32768 + d1 * 128 + d2, d3), memory_config: (1024, 4, 'tile<32x32, bf16>', 'dram') | yes | -0.0 | 2.91 | +| ttnn.reshape | tensor<[1,256,16384,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 512, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 128 : i32, 128 : i32] | tensor<[1,256,128,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 32768 + d1 * 128 + d2, d3), memory_config: (1024, 4, 'tile<32x32, bf16>', 
'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,256,16,16,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 16 + d2, d3), memory_config: (128, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 256 : i32] | tensor<[1,256,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 8, 'tile<32x32, bf16>', 'dram') | yes | 0.47 | 42.25 | +| ttnn.reshape | tensor<[1,256,16,16,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 16 + d2, d3), memory_config: (128, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 256 : i32] | tensor<[1,256,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,256,16,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 16 + d2, d3), memory_config: (128, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 1024 : i32] | tensor<[1,256,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 32, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,256,16,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 16 + d2, d3), memory_config: (128, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 1024 : i32] | tensor<[1,256,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 32, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,256,19200,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 600, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 120 : i32, 160 : i32] | tensor<[1,256,120,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 30720 + d1 * 120 + d2, d3), memory_config: (960, 5, 'tile<32x32, bf16>', 'dram') | yes | -0.0 | 22.5 | +| ttnn.reshape | tensor<[1,256,19200,bf16]> | 
mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 600, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 120 : i32, 160 : i32] | tensor<[1,256,120,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 30720 + d1 * 120 + d2, d3), memory_config: (960, 5, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,256,23,40,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 5888 + d1 * 23 + d2, d3), memory_config: (184, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 920 : i32] | tensor<[1,256,920,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 29, 'tile<32x32, bf16>', 'dram') | yes | -0.01 | 9.56 | +| ttnn.reshape | tensor<[1,256,23,40,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 5888 + d1 * 23 + d2, d3), memory_config: (184, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 920 : i32] | tensor<[1,256,920,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 29, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,256,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 16 : i32, 256 : i32] | tensor<[1,16,16,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 8, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,256,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 16 : i32, 16 : i32, 256 : i32] | tensor<[1,16,16,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 16 + d2, d3), memory_config: (8, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,256,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: 
(8, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 16 : i32, 16 : i32] | tensor<[1,256,16,16,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 16 + d2, d3), memory_config: (128, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.01 | 38.5 | +| ttnn.reshape | tensor<[1,256,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 16 : i32, 16 : i32] | tensor<[1,256,16,16,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 16 + d2, d3), memory_config: (128, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,256,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 256 : i32] | tensor<[1,256,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 8, 'tile<32x32, bf16>', 'dram') | yes | 0.0 | 19.62 | +| ttnn.reshape | tensor<[1,256,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 256 : i32] | tensor<[1,256,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,256,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 8 : i32, 32 : i32] | tensor<[1,256,8,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2048 + d1 * 8 + d2, d3), memory_config: (64, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,256,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 8 : i32, 32 : i32] | tensor<[1,256,8,32,bf16]> | 
mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2048 + d1 * 8 + d2, d3), memory_config: (64, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,256,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 8, 'tile<32x32, bf16>', 'dram') | shape: [256 : i32, 256 : i32] | tensor<[256,256,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (8, 8, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,256,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 8, 'tile<32x32, bf16>', 'dram') | shape: [256 : i32, 256 : i32] | tensor<[256,256,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (8, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,256,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 1 : i32, 32 : i32] | tensor<[1,256,1,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 + d2, d3), memory_config: (8, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,256,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 1 : i32, 32 : i32] | tensor<[1,256,1,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 + d2, d3), memory_config: (8, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,256,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 1, 'tile<32x32, bf16>', 'dram') | shape: [256 : i32, 32 : i32] | tensor<[256,32,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (8, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,256,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), 
memory_config: (8, 1, 'tile<32x32, bf16>', 'dram') | shape: [256 : i32, 32 : i32] | tensor<[256,32,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (8, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,256,4096,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 128, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 64 : i32, 64 : i32] | tensor<[1,256,64,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 64 + d2, d3), memory_config: (512, 2, 'tile<32x32, bf16>', 'dram') | yes | -0.0 | 26.25 | +| ttnn.reshape | tensor<[1,256,4096,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 128, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 64 : i32, 64 : i32] | tensor<[1,256,64,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 64 + d2, d3), memory_config: (512, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,256,4096,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 128, 'tile<32x32, bf16>', 'dram') | shape: [256 : i32, 4096 : i32] | tensor<[256,4096,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (8, 128, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,256,4096,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 128, 'tile<32x32, bf16>', 'dram') | shape: [256 : i32, 4096 : i32] | tensor<[256,4096,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (8, 128, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,256,5120,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 160, 'tile<32x32, bf16>', 'dram') | shape: [256 : i32, 5120 : i32] | tensor<[256,5120,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (8, 160, 'tile<32x32, 
bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,256,5120,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 160, 'tile<32x32, bf16>', 'dram') | shape: [256 : i32, 5120 : i32] | tensor<[256,5120,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (8, 160, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,256,512,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 16, 'tile<32x32, bf16>', 'dram') | shape: [256 : i32, 512 : i32] | tensor<[256,512,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (8, 16, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,256,512,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 16, 'tile<32x32, bf16>', 'dram') | shape: [256 : i32, 512 : i32] | tensor<[256,512,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (8, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,256,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 2 : i32, 32 : i32] | tensor<[1,256,2,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 512 + d1 * 2 + d2, d3), memory_config: (16, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,256,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 2 : i32, 32 : i32] | tensor<[1,256,2,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 512 + d1 * 2 + d2, d3), memory_config: (16, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,256,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 64 : 
i32] | tensor<[1,256,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 2, 'tile<32x32, bf16>', 'dram') | yes | 0.0 | 0.41 | +| ttnn.reshape | tensor<[1,256,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 64 : i32] | tensor<[1,256,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,256,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 2, 'tile<32x32, bf16>', 'dram') | shape: [256 : i32, 64 : i32] | tensor<[256,64,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (8, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,256,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 2, 'tile<32x32, bf16>', 'dram') | shape: [256 : i32, 64 : i32] | tensor<[256,64,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (8, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,256,64,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 64 + d2, d3), memory_config: (512, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 4096 : i32] | tensor<[1,256,4096,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 128, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,256,64,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 64 + d2, d3), memory_config: (512, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 4096 : i32] | tensor<[1,256,4096,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 128, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,256,768,bf16]> | 
mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 8 : i32, 96 : i32] | tensor<[1,256,8,96,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2048 + d1 * 8 + d2, d3), memory_config: (64, 3, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,256,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 8 : i32, 96 : i32] | tensor<[1,256,8,96,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2048 + d1 * 8 + d2, d3), memory_config: (64, 3, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,256,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 24, 'tile<32x32, bf16>', 'dram') | shape: [256 : i32, 768 : i32] | tensor<[256,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (8, 24, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,256,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 24, 'tile<32x32, bf16>', 'dram') | shape: [256 : i32, 768 : i32] | tensor<[256,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (8, 24, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,256,8,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2048 + d1 * 8 + d2, d3), memory_config: (64, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 1280 : i32] | tensor<[1,256,1280,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,256,8,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2048 + d1 * 8 + d2, d3), memory_config: (64, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 1280 : i32] | 
tensor<[1,256,1280,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,256,8,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2048 + d1 * 8 + d2, d3), memory_config: (64, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 1280 : i32] | tensor<[1,256,1280,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,256,8,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2048 + d1 * 8 + d2, d3), memory_config: (64, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 256 : i32] | tensor<[1,256,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 8, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,256,8,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2048 + d1 * 8 + d2, d3), memory_config: (64, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 256 : i32] | tensor<[1,256,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,25,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 25 : i32] | tensor<[1,25,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,25,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 25 : i32] | tensor<[1,25,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,25,12,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 
300 + d1 * 12 + d2, d3), memory_config: (10, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 25 : i32, 768 : i32] | tensor<[1,25,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 25 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,25,12,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 300 + d1 * 12 + d2, d3), memory_config: (10, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 25 : i32, 768 : i32] | tensor<[1,25,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 25 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,25,3072,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 25 + d1, d2), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | shape: [25 : i32, 3072 : i32] | tensor<[25,3072,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,25,3072,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 25 + d1, d2), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | shape: [25 : i32, 3072 : i32] | tensor<[25,3072,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,25,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 25 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 25 : i32, 12 : i32, 64 : i32] | tensor<[1,25,12,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 300 + d1 * 12 + d2, d3), memory_config: (10, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,25,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 25 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 25 : i32, 12 : i32, 64 : i32] | tensor<[1,25,12,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 300 + d1 * 12 
+ d2, d3), memory_config: (10, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,25,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 25 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [25 : i32, 768 : i32] | tensor<[25,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,25,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 25 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [25 : i32, 768 : i32] | tensor<[25,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,27,27,12,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 729 + d1 * 27 + d2, d3), memory_config: (23, 1, 'tile<32x32, bf16>', 'dram') | shape: [729 : i32, 12 : i32] | tensor<[729,12,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (23, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.08 | 5.66 | +| ttnn.reshape | tensor<[1,27,27,12,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 729 + d1 * 27 + d2, d3), memory_config: (23, 1, 'tile<32x32, bf16>', 'dram') | shape: [729 : i32, 12 : i32] | tensor<[729,12,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (23, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,27,27,16,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 729 + d1 * 27 + d2, d3), memory_config: (23, 1, 'tile<32x32, bf16>', 'dram') | shape: [729 : i32, 16 : i32] | tensor<[729,16,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (23, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.79 | 6.91 | +| ttnn.reshape | tensor<[1,27,27,16,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 729 + d1 * 27 + d2, d3), memory_config: (23, 1, 'tile<32x32, bf16>', 'dram') | shape: [729 : i32, 16 : i32] | 
tensor<[729,16,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (23, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,2,256,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 512 + d1 * 256 + d2, d3), memory_config: (16, 1, 'tile<32x32, bf16>', 'dram') | shape: [2 : i32, 256 : i32, 32 : i32] | tensor<[2,256,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (16, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.18 | 1.77 | +| ttnn.reshape | tensor<[1,2,256,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 512 + d1 * 256 + d2, d3), memory_config: (16, 1, 'tile<32x32, bf16>', 'dram') | shape: [2 : i32, 256 : i32, 32 : i32] | tensor<[2,256,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (16, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,2,300,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 600 + d1 * 300 + d2, d3), memory_config: (19, 2, 'tile<32x32, bf16>', 'dram') | shape: [2 : i32, 300 : i32, 64 : i32] | tensor<[2,300,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 300 + d1, d2), memory_config: (19, 2, 'tile<32x32, bf16>', 'dram') | yes | 0.34 | 1.62 | +| ttnn.reshape | tensor<[1,2,300,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 600 + d1 * 300 + d2, d3), memory_config: (19, 2, 'tile<32x32, bf16>', 'dram') | shape: [2 : i32, 300 : i32, 64 : i32] | tensor<[2,300,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 300 + d1, d2), memory_config: (19, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,2,32,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 64 + d1 * 32 + d2, d3), memory_config: (2, 8, 'tile<32x32, bf16>', 'dram') | shape: [2 : i32, 32 : i32, 256 : i32] | tensor<[2,32,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (2, 8, 'tile<32x32, bf16>', 'dram') | 
yes | 0.0 | 2.94 | +| ttnn.reshape | tensor<[1,2,32,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 64 + d1 * 32 + d2, d3), memory_config: (2, 8, 'tile<32x32, bf16>', 'dram') | shape: [2 : i32, 32 : i32, 256 : i32] | tensor<[2,32,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (2, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,2,4096,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 8192 + d1 * 4096 + d2, d3), memory_config: (256, 8, 'tile<32x32, bf16>', 'dram') | shape: [2 : i32, 4096 : i32, 256 : i32] | tensor<[2,4096,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (256, 8, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,2,4096,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 8192 + d1 * 4096 + d2, d3), memory_config: (256, 8, 'tile<32x32, bf16>', 'dram') | shape: [2 : i32, 4096 : i32, 256 : i32] | tensor<[2,4096,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (256, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,2,4096,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 8192 + d1 * 4096 + d2, d3), memory_config: (256, 1, 'tile<32x32, bf16>', 'dram') | shape: [2 : i32, 4096 : i32, 32 : i32] | tensor<[2,4096,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (256, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.29 | 10.62 | +| ttnn.reshape | tensor<[1,2,4096,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 8192 + d1 * 4096 + d2, d3), memory_config: (256, 1, 'tile<32x32, bf16>', 'dram') | shape: [2 : i32, 4096 : i32, 32 : i32] | tensor<[2,4096,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (256, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,2,4800,300,bf16]> | mapping_from: (d0, 
d1, d2, d3), mapping_to: (d0 * 9600 + d1 * 4800 + d2, d3), memory_config: (300, 10, 'tile<32x32, bf16>', 'dram') | shape: [2 : i32, 4800 : i32, 300 : i32] | tensor<[2,4800,300,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4800 + d1, d2), memory_config: (300, 10, 'tile<32x32, bf16>', 'dram') | yes | 0.0 | 0.59 | +| ttnn.reshape | tensor<[1,2,4800,300,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9600 + d1 * 4800 + d2, d3), memory_config: (300, 10, 'tile<32x32, bf16>', 'dram') | shape: [2 : i32, 4800 : i32, 300 : i32] | tensor<[2,4800,300,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4800 + d1, d2), memory_config: (300, 10, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,2,4800,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9600 + d1 * 4800 + d2, d3), memory_config: (300, 2, 'tile<32x32, bf16>', 'dram') | shape: [2 : i32, 4800 : i32, 64 : i32] | tensor<[2,4800,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4800 + d1, d2), memory_config: (300, 2, 'tile<32x32, bf16>', 'dram') | yes | 0.48 | 7.0 | +| ttnn.reshape | tensor<[1,2,4800,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9600 + d1 * 4800 + d2, d3), memory_config: (300, 2, 'tile<32x32, bf16>', 'dram') | shape: [2 : i32, 4800 : i32, 64 : i32] | tensor<[2,4800,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4800 + d1, d2), memory_config: (300, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,2,64,300,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 128 + d1 * 64 + d2, d3), memory_config: (4, 10, 'tile<32x32, bf16>', 'dram') | shape: [2 : i32, 64 : i32, 300 : i32] | tensor<[2,64,300,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (4, 10, 'tile<32x32, bf16>', 'dram') | yes | -0.01 | 3.08 | +| ttnn.reshape | tensor<[1,2,64,300,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 128 + d1 * 64 + d2, d3), memory_config: (4, 10, 
'tile<32x32, bf16>', 'dram') | shape: [2 : i32, 64 : i32, 300 : i32] | tensor<[2,64,300,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (4, 10, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,300,128,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 300 + d1, d2), memory_config: (10, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 300 : i32, 2 : i32, 64 : i32] | tensor<[1,300,2,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 600 + d1 * 2 + d2, d3), memory_config: (19, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,300,128,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 300 + d1, d2), memory_config: (10, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 300 : i32, 2 : i32, 64 : i32] | tensor<[1,300,2,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 600 + d1 * 2 + d2, d3), memory_config: (19, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,300,128,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 300 + d1, d2), memory_config: (10, 4, 'tile<32x32, bf16>', 'dram') | shape: [300 : i32, 128 : i32] | tensor<[300,128,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (10, 4, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,300,128,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 300 + d1, d2), memory_config: (10, 4, 'tile<32x32, bf16>', 'dram') | shape: [300 : i32, 128 : i32] | tensor<[300,128,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (10, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,300,2048,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 300 + d1, d2), memory_config: (10, 64, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 300 : i32, 2048 : i32] | tensor<[1,300,2048,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 300 + d1, d2), memory_config: (10, 64, 'tile<32x32, 
bf16>', 'dram') | yes | -0.0 | 7.88 | +| ttnn.reshape | tensor<[1,300,2048,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 300 + d1, d2), memory_config: (10, 64, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 300 : i32, 2048 : i32] | tensor<[1,300,2048,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 300 + d1, d2), memory_config: (10, 64, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,300,320,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 300 + d1, d2), memory_config: (10, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 300 : i32, 5 : i32, 64 : i32] | tensor<[1,300,5,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1500 + d1 * 5 + d2, d3), memory_config: (47, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,300,320,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 300 + d1, d2), memory_config: (10, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 300 : i32, 5 : i32, 64 : i32] | tensor<[1,300,5,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1500 + d1 * 5 + d2, d3), memory_config: (47, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,300,320,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 300 + d1, d2), memory_config: (10, 10, 'tile<32x32, bf16>', 'dram') | shape: [300 : i32, 320 : i32] | tensor<[300,320,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (10, 10, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,300,320,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 300 + d1, d2), memory_config: (10, 10, 'tile<32x32, bf16>', 'dram') | shape: [300 : i32, 320 : i32] | tensor<[300,320,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (10, 10, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,300,512,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 300 + d1, d2), memory_config: (10, 16, 'tile<32x32, bf16>', 
'dram') | shape: [1 : i32, 15 : i32, 20 : i32, 512 : i32] | tensor<[1,15,20,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 300 + d1 * 20 + d2, d3), memory_config: (10, 16, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,300,512,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 300 + d1, d2), memory_config: (10, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 15 : i32, 20 : i32, 512 : i32] | tensor<[1,15,20,512,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 300 + d1 * 20 + d2, d3), memory_config: (10, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,300,512,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 300 + d1, d2), memory_config: (10, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 300 : i32, 8 : i32, 64 : i32] | tensor<[1,300,8,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2400 + d1 * 8 + d2, d3), memory_config: (75, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,300,512,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 300 + d1, d2), memory_config: (10, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 300 : i32, 8 : i32, 64 : i32] | tensor<[1,300,8,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2400 + d1 * 8 + d2, d3), memory_config: (75, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,300,512,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 300 + d1, d2), memory_config: (10, 16, 'tile<32x32, bf16>', 'dram') | shape: [300 : i32, 512 : i32] | tensor<[300,512,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (10, 16, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,300,512,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 300 + d1, d2), memory_config: (10, 16, 'tile<32x32, bf16>', 'dram') | shape: [300 : i32, 512 : i32] | tensor<[300,512,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), 
memory_config: (10, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,300,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 300 + d1, d2), memory_config: (10, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 300 : i32, 1 : i32, 64 : i32] | tensor<[1,300,1,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 300 + d1 + d2, d3), memory_config: (10, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,300,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 300 + d1, d2), memory_config: (10, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 300 : i32, 1 : i32, 64 : i32] | tensor<[1,300,1,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 300 + d1 + d2, d3), memory_config: (10, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,300,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 300 + d1, d2), memory_config: (10, 2, 'tile<32x32, bf16>', 'dram') | shape: [300 : i32, 64 : i32] | tensor<[300,64,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (10, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,300,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 300 + d1, d2), memory_config: (10, 2, 'tile<32x32, bf16>', 'dram') | shape: [300 : i32, 64 : i32] | tensor<[300,64,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (10, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,300,8,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2400 + d1 * 8 + d2, d3), memory_config: (75, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 300 : i32, 512 : i32] | tensor<[1,300,512,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 300 + d1, d2), memory_config: (10, 16, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,300,8,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2400 + d1 * 8 + d2, d3), 
memory_config: (75, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 300 : i32, 512 : i32] | tensor<[1,300,512,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 300 + d1, d2), memory_config: (10, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,320,1200,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 320 + d1, d2), memory_config: (10, 38, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 320 : i32, 30 : i32, 40 : i32] | tensor<[1,320,30,40,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9600 + d1 * 30 + d2, d3), memory_config: (300, 2, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,320,1200,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 320 + d1, d2), memory_config: (10, 38, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 320 : i32, 30 : i32, 40 : i32] | tensor<[1,320,30,40,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9600 + d1 * 30 + d2, d3), memory_config: (300, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,320,15,20,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4800 + d1 * 15 + d2, d3), memory_config: (150, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 320 : i32, 300 : i32] | tensor<[1,320,300,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 320 + d1, d2), memory_config: (10, 10, 'tile<32x32, bf16>', 'dram') | yes | 0.0 | inf | +| ttnn.reshape | tensor<[1,320,15,20,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4800 + d1 * 15 + d2, d3), memory_config: (150, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 320 : i32, 300 : i32] | tensor<[1,320,300,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 320 + d1, d2), memory_config: (10, 10, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,320,30,40,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9600 + d1 * 30 + d2, d3), memory_config: (300, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 320 : 
i32, 1200 : i32] | tensor<[1,320,1200,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 320 + d1, d2), memory_config: (10, 38, 'tile<32x32, bf16>', 'dram') | yes | -0.02 | 138.0 | +| ttnn.reshape | tensor<[1,320,30,40,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9600 + d1 * 30 + d2, d3), memory_config: (300, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 320 : i32, 1200 : i32] | tensor<[1,320,1200,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 320 + d1, d2), memory_config: (10, 38, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,320,32,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 10240 + d1 * 32 + d2, d3), memory_config: (320, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 10 : i32, 1024 : i32] | tensor<[1,32,10,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 320 + d1 * 10 + d2, d3), memory_config: (10, 32, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,320,32,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 10240 + d1 * 32 + d2, d3), memory_config: (320, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 10 : i32, 1024 : i32] | tensor<[1,32,10,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 320 + d1 * 10 + d2, d3), memory_config: (10, 32, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,320,64,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 20480 + d1 * 64 + d2, d3), memory_config: (640, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 10 : i32, 4096 : i32] | tensor<[1,32,10,4096,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 320 + d1 * 10 + d2, d3), memory_config: (10, 128, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,320,64,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 20480 + d1 * 64 + d2, d3), memory_config: (640, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 
10 : i32, 4096 : i32] | tensor<[1,32,10,4096,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 320 + d1 * 10 + d2, d3), memory_config: (10, 128, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,10,1024,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 320 + d1 * 10 + d2, d3), memory_config: (10, 32, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 320 : i32, 32 : i32, 32 : i32] | tensor<[1,320,32,32,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 10240 + d1 * 32 + d2, d3), memory_config: (320, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,32,10,1024,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 320 + d1 * 10 + d2, d3), memory_config: (10, 32, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 320 : i32, 32 : i32, 32 : i32] | tensor<[1,320,32,32,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 10240 + d1 * 32 + d2, d3), memory_config: (320, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,10,4096,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 320 + d1 * 10 + d2, d3), memory_config: (10, 128, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 320 : i32, 64 : i32, 64 : i32] | tensor<[1,320,64,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 20480 + d1 * 64 + d2, d3), memory_config: (640, 2, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,32,10,4096,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 320 + d1 * 10 + d2, d3), memory_config: (10, 128, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 320 : i32, 64 : i32, 64 : i32] | tensor<[1,320,64,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 20480 + d1 * 64 + d2, d3), memory_config: (640, 2, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,11008,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (1, 344, 'tile<32x32, bf16>', 'dram') | shape: [32 : i32, 
11008 : i32] | tensor<[32,11008,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 344, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,32,11008,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (1, 344, 'tile<32x32, bf16>', 'dram') | shape: [32 : i32, 11008 : i32] | tensor<[32,11008,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 344, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,128,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 128 + d2, d3), memory_config: (128, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 16384 : i32] | tensor<[1,32,16384,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (1, 512, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,32,128,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 128 + d2, d3), memory_config: (128, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 16384 : i32] | tensor<[1,32,16384,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (1, 512, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,128,32,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 128 + d2, d3), memory_config: (128, 1, 'tile<32x32, f32>', 'dram') | shape: [32 : i32, 128 : i32, 32 : i32] | tensor<[32,128,32,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 128 + d1, d2), memory_config: (128, 1, 'tile<32x32, f32>', 'dram') | yes | 0.0 | 1.97 | +| ttnn.reshape | tensor<[1,32,128,32,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 128 + d2, d3), memory_config: (128, 1, 'tile<32x32, f32>', 'dram') | shape: [32 : i32, 128 : i32, 32 : i32] | tensor<[32,128,32,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 128 + d1, d2), memory_config: (128, 1, 'tile<32x32, f32>', 'dram') 
| N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,1536,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (1, 48, 'tile<32x32, bf16>', 'dram') | shape: [32 : i32, 1536 : i32] | tensor<[32,1536,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 48, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,32,1536,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (1, 48, 'tile<32x32, bf16>', 'dram') | shape: [32 : i32, 1536 : i32] | tensor<[32,1536,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 48, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,16384,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (1, 512, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 128 : i32, 128 : i32] | tensor<[1,32,128,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 128 + d2, d3), memory_config: (128, 4, 'tile<32x32, bf16>', 'dram') | yes | -0.0 | 12.0 | +| ttnn.reshape | tensor<[1,32,16384,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (1, 512, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 128 : i32, 128 : i32] | tensor<[1,32,128,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 128 + d2, d3), memory_config: (128, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,16,16,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 512 + d1 * 16 + d2, d3), memory_config: (16, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 256 : i32] | tensor<[1,32,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (1, 8, 'tile<32x32, bf16>', 'dram') | yes | 0.55 | 72.0 | +| ttnn.reshape | tensor<[1,32,16,16,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 512 + d1 * 16 + d2, d3), memory_config: (16, 1, 
'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 256 : i32] | tensor<[1,32,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (1, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 32 : i32, 1 : i32] | tensor<[1,32,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,32,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 32 : i32, 1 : i32] | tensor<[1,32,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,20,1024,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 640 + d1 * 20 + d2, d3), memory_config: (20, 32, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 640 : i32, 32 : i32, 32 : i32] | tensor<[1,640,32,32,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 20480 + d1 * 32 + d2, d3), memory_config: (640, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,32,20,1024,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 640 + d1 * 20 + d2, d3), memory_config: (20, 32, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 640 : i32, 32 : i32, 32 : i32] | tensor<[1,640,32,32,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 20480 + d1 * 32 + d2, d3), memory_config: (640, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,20,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 640 + d1 * 20 + d2, d3), memory_config: (20, 8, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 640 : i32, 16 : i32, 16 : i32] | tensor<[1,640,16,16,f32]> | mapping_from: (d0, d1, 
d2, d3), mapping_to: (d0 * 10240 + d1 * 16 + d2, d3), memory_config: (320, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,32,20,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 640 + d1 * 20 + d2, d3), memory_config: (20, 8, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 640 : i32, 16 : i32, 16 : i32] | tensor<[1,640,16,16,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 10240 + d1 * 16 + d2, d3), memory_config: (320, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,20,4096,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 640 + d1 * 20 + d2, d3), memory_config: (20, 128, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 640 : i32, 64 : i32, 64 : i32] | tensor<[1,640,64,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 40960 + d1 * 64 + d2, d3), memory_config: (1280, 2, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,32,20,4096,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 640 + d1 * 20 + d2, d3), memory_config: (20, 128, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 640 : i32, 64 : i32, 64 : i32] | tensor<[1,640,64,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 40960 + d1 * 64 + d2, d3), memory_config: (1280, 2, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (1, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 256 : i32] | tensor<[1,32,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (1, 8, 'tile<32x32, bf16>', 'dram') | yes | -0.0 | 0.8 | +| ttnn.reshape | tensor<[1,32,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (1, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 256 : i32] | tensor<[1,32,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (1, 8, 
'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,30,1024,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 960 + d1 * 30 + d2, d3), memory_config: (30, 32, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 960 : i32, 32 : i32, 32 : i32] | tensor<[1,960,32,32,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 30720 + d1 * 32 + d2, d3), memory_config: (960, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,32,30,1024,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 960 + d1 * 30 + d2, d3), memory_config: (30, 32, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 960 : i32, 32 : i32, 32 : i32] | tensor<[1,960,32,32,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 30720 + d1 * 32 + d2, d3), memory_config: (960, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,30,4096,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 960 + d1 * 30 + d2, d3), memory_config: (30, 128, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 960 : i32, 64 : i32, 64 : i32] | tensor<[1,960,64,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 61440 + d1 * 64 + d2, d3), memory_config: (1920, 2, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,32,30,4096,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 960 + d1 * 30 + d2, d3), memory_config: (30, 128, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 960 : i32, 64 : i32, 64 : i32] | tensor<[1,960,64,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 61440 + d1 * 64 + d2, d3), memory_config: (1920, 2, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,32,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 4096 : i32] | tensor<[1,32,4096,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (1, 128, 
'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,32,32,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 4096 : i32] | tensor<[1,32,4096,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (1, 128, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,32,128,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 4, 'tile<32x32, f32>', 'dram') | shape: [32 : i32, 32 : i32, 128 : i32] | tensor<[32,32,128,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (32, 4, 'tile<32x32, f32>', 'dram') | yes | 0.03 | 2.39 | +| ttnn.reshape | tensor<[1,32,32,128,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 4, 'tile<32x32, f32>', 'dram') | shape: [32 : i32, 32 : i32, 128 : i32] | tensor<[32,32,128,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (32, 4, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,32,32,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 1, 'tile<32x32, f32>', 'dram') | shape: [32 : i32, 32 : i32, 32 : i32] | tensor<[32,32,32,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (32, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,32,32,32,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 1, 'tile<32x32, f32>', 'dram') | shape: [32 : i32, 32 : i32, 32 : i32] | tensor<[32,32,32,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (32, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,32,640,bf16]> | mapping_from: (d0, d1, d2, d3), 
mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1024 : i32, 640 : i32] | tensor<[1,1024,640,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | yes | 0.0 | 8.81 | +| ttnn.reshape | tensor<[1,32,32,640,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1024 : i32, 640 : i32] | tensor<[1,1024,640,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (32, 20, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,4096,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (1, 128, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 32 : i32, 128 : i32] | tensor<[1,32,32,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 4, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,32,4096,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (1, 128, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 32 : i32, 128 : i32] | tensor<[1,32,32,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,4096,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (1, 128, 'tile<32x32, bf16>', 'dram') | shape: [32 : i32, 4096 : i32] | tensor<[32,4096,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 128, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,32,4096,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (1, 128, 'tile<32x32, bf16>', 'dram') | shape: [32 : i32, 4096 : i32] | 
tensor<[32,4096,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 128, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,40,1024,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1280 + d1 * 40 + d2, d3), memory_config: (40, 32, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1280 : i32, 32 : i32, 32 : i32] | tensor<[1,1280,32,32,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 40960 + d1 * 32 + d2, d3), memory_config: (1280, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,32,40,1024,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1280 + d1 * 40 + d2, d3), memory_config: (40, 32, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1280 : i32, 32 : i32, 32 : i32] | tensor<[1,1280,32,32,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 40960 + d1 * 32 + d2, d3), memory_config: (1280, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,40,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1280 + d1 * 40 + d2, d3), memory_config: (40, 8, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1280 : i32, 16 : i32, 16 : i32] | tensor<[1,1280,16,16,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 20480 + d1 * 16 + d2, d3), memory_config: (640, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.01 | +| ttnn.reshape | tensor<[1,32,40,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1280 + d1 * 40 + d2, d3), memory_config: (40, 8, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1280 : i32, 16 : i32, 16 : i32] | tensor<[1,1280,16,16,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 20480 + d1 * 16 + d2, d3), memory_config: (640, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,40,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1280 + d1 * 40 + d2, d3), memory_config: (40, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1280 : i32, 8 : i32, 8 : i32] | 
tensor<[1,1280,8,8,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 10240 + d1 * 8 + d2, d3), memory_config: (320, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,32,40,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1280 + d1 * 40 + d2, d3), memory_config: (40, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1280 : i32, 8 : i32, 8 : i32] | tensor<[1,1280,8,8,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 10240 + d1 * 8 + d2, d3), memory_config: (320, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,4608,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (1, 144, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 16 : i32, 3 : i32, 96 : i32] | tensor<[1,32,16,3,96,bf16]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 1536 + d1 * 48 + d2 * 3 + d3, d4), memory_config: (48, 3, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,32,4608,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (1, 144, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 16 : i32, 3 : i32, 96 : i32] | tensor<[1,32,16,3,96,bf16]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 1536 + d1 * 48 + d2 * 3 + d3, d4), memory_config: (48, 3, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,60,1024,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1920 + d1 * 60 + d2, d3), memory_config: (60, 32, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1920 : i32, 32 : i32, 32 : i32] | tensor<[1,1920,32,32,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 61440 + d1 * 32 + d2, d3), memory_config: (1920, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.07 | +| ttnn.reshape | tensor<[1,32,60,1024,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1920 + d1 * 60 + d2, d3), memory_config: (60, 32, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1920 : 
i32, 32 : i32, 32 : i32] | tensor<[1,1920,32,32,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 61440 + d1 * 32 + d2, d3), memory_config: (1920, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,60,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1920 + d1 * 60 + d2, d3), memory_config: (60, 8, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1920 : i32, 16 : i32, 16 : i32] | tensor<[1,1920,16,16,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 30720 + d1 * 16 + d2, d3), memory_config: (960, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.01 | +| ttnn.reshape | tensor<[1,32,60,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1920 + d1 * 60 + d2, d3), memory_config: (60, 8, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 1920 : i32, 16 : i32, 16 : i32] | tensor<[1,1920,16,16,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 30720 + d1 * 16 + d2, d3), memory_config: (960, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,6144,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (1, 192, 'tile<32x32, bf16>', 'dram') | shape: [32 : i32, 6144 : i32] | tensor<[32,6144,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 192, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,32,6144,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (1, 192, 'tile<32x32, bf16>', 'dram') | shape: [32 : i32, 6144 : i32] | tensor<[32,6144,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 192, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,80,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2560 + d1 * 80 + d2, d3), memory_config: (80, 8, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 2560 : i32, 16 : i32, 16 : i32] | tensor<[1,2560,16,16,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 40960 + 
d1 * 16 + d2, d3), memory_config: (1280, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.01 | +| ttnn.reshape | tensor<[1,32,80,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2560 + d1 * 80 + d2, d3), memory_config: (80, 8, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 2560 : i32, 16 : i32, 16 : i32] | tensor<[1,2560,16,16,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 40960 + d1 * 16 + d2, d3), memory_config: (1280, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,32,80,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2560 + d1 * 80 + d2, d3), memory_config: (80, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 2560 : i32, 8 : i32, 8 : i32] | tensor<[1,2560,8,8,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 20480 + d1 * 8 + d2, d3), memory_config: (640, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,32,80,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2560 + d1 * 80 + d2, d3), memory_config: (80, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 2560 : i32, 8 : i32, 8 : i32] | tensor<[1,2560,8,8,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 20480 + d1 * 8 + d2, d3), memory_config: (640, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,3,1445,1445,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4335 + d1 * 1445 + d2, d3), memory_config: (136, 46, 'tile<32x32, f32>', 'dram') | shape: [3 : i32, 1445 : i32, 1445 : i32] | tensor<[3,1445,1445,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1445 + d1, d2), memory_config: (136, 46, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,3,1445,1445,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4335 + d1 * 1445 + d2, d3), memory_config: (136, 46, 'tile<32x32, f32>', 'dram') | shape: [3 : i32, 1445 : i32, 1445 : i32] | tensor<[3,1445,1445,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1445 + d1, d2), 
memory_config: (136, 46, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,3,1445,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4335 + d1 * 1445 + d2, d3), memory_config: (136, 2, 'tile<32x32, f32>', 'dram') | shape: [3 : i32, 1445 : i32, 64 : i32] | tensor<[3,1445,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1445 + d1, d2), memory_config: (136, 2, 'tile<32x32, f32>', 'dram') | yes | 0.14 | 3.83 | +| ttnn.reshape | tensor<[1,3,1445,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4335 + d1 * 1445 + d2, d3), memory_config: (136, 2, 'tile<32x32, f32>', 'dram') | shape: [3 : i32, 1445 : i32, 64 : i32] | tensor<[3,1445,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1445 + d1, d2), memory_config: (136, 2, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,3,64,1445,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 192 + d1 * 64 + d2, d3), memory_config: (6, 46, 'tile<32x32, f32>', 'dram') | shape: [3 : i32, 64 : i32, 1445 : i32] | tensor<[3,64,1445,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (6, 46, 'tile<32x32, f32>', 'dram') | yes | 0.0 | 2.86 | +| ttnn.reshape | tensor<[1,3,64,1445,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 192 + d1 * 64 + d2, d3), memory_config: (6, 46, 'tile<32x32, f32>', 'dram') | shape: [3 : i32, 64 : i32, 1445 : i32] | tensor<[3,64,1445,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (6, 46, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,4096,1280,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (128, 40, 'tile<32x32, bf16>', 'dram') | shape: [4096 : i32, 1280 : i32] | tensor<[4096,1280,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (128, 40, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,4096,1280,bf16]> | mapping_from: (d0, d1, d2), 
mapping_to: (d0 * 4096 + d1, d2), memory_config: (128, 40, 'tile<32x32, bf16>', 'dram') | shape: [4096 : i32, 1280 : i32] | tensor<[4096,1280,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (128, 40, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,4096,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (128, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 4096 : i32, 256 : i32] | tensor<[1,4096,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (128, 8, 'tile<32x32, bf16>', 'dram') | yes | -0.0 | 3.27 | +| ttnn.reshape | tensor<[1,4096,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (128, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 4096 : i32, 256 : i32] | tensor<[1,4096,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (128, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,4096,2,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 8192 + d1 * 2 + d2, d3), memory_config: (256, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 4096 : i32, 64 : i32] | tensor<[1,4096,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (128, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,4096,2,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 8192 + d1 * 2 + d2, d3), memory_config: (256, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 4096 : i32, 64 : i32] | tensor<[1,4096,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (128, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,4096,320,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 4096 : i32, 8 : i32, 40 : i32] | 
tensor<[1,4096,8,40,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 32768 + d1 * 8 + d2, d3), memory_config: (1024, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,4096,320,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 4096 : i32, 8 : i32, 40 : i32] | tensor<[1,4096,8,40,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 32768 + d1 * 8 + d2, d3), memory_config: (1024, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,4096,320,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 64 : i32, 320 : i32] | tensor<[1,64,64,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,4096,320,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 64 : i32, 320 : i32] | tensor<[1,64,64,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,4096,320,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | shape: [4096 : i32, 320 : i32] | tensor<[4096,320,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,4096,320,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | shape: [4096 : i32, 320 : i32] | tensor<[4096,320,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (128, 10, 
'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,4096,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (128, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 4096 : i32, 2 : i32, 32 : i32] | tensor<[1,4096,2,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 8192 + d1 * 2 + d2, d3), memory_config: (256, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,4096,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (128, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 4096 : i32, 2 : i32, 32 : i32] | tensor<[1,4096,2,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 8192 + d1 * 2 + d2, d3), memory_config: (256, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,4096,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (128, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 4096 : i32, 64 : i32] | tensor<[1,4096,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (128, 2, 'tile<32x32, bf16>', 'dram') | yes | 0.0 | 4.56 | +| ttnn.reshape | tensor<[1,4096,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (128, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 4096 : i32, 64 : i32] | tensor<[1,4096,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (128, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,4096,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (128, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 64 : i32, 64 : i32] | tensor<[1,64,64,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,4096,64,bf16]> | 
mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (128, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 64 : i32, 64 : i32] | tensor<[1,64,64,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,4096,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (128, 2, 'tile<32x32, bf16>', 'dram') | shape: [4096 : i32, 64 : i32] | tensor<[4096,64,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (128, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,4096,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (128, 2, 'tile<32x32, bf16>', 'dram') | shape: [4096 : i32, 64 : i32] | tensor<[4096,64,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (128, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,4096,8,40,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 32768 + d1 * 8 + d2, d3), memory_config: (1024, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 4096 : i32, 320 : i32] | tensor<[1,4096,320,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | yes | 0.05 | 2.41 | +| ttnn.reshape | tensor<[1,4096,8,40,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 32768 + d1 * 8 + d2, d3), memory_config: (1024, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 4096 : i32, 320 : i32] | tensor<[1,4096,320,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,4800,128,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4800 + d1, d2), memory_config: (150, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 4800 : i32, 2 : i32, 64 : 
i32] | tensor<[1,4800,2,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9600 + d1 * 2 + d2, d3), memory_config: (300, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,4800,128,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4800 + d1, d2), memory_config: (150, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 4800 : i32, 2 : i32, 64 : i32] | tensor<[1,4800,2,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9600 + d1 * 2 + d2, d3), memory_config: (300, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,4800,128,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4800 + d1, d2), memory_config: (150, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 60 : i32, 80 : i32, 128 : i32] | tensor<[1,60,80,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4800 + d1 * 80 + d2, d3), memory_config: (150, 4, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,4800,128,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4800 + d1, d2), memory_config: (150, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 60 : i32, 80 : i32, 128 : i32] | tensor<[1,60,80,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4800 + d1 * 80 + d2, d3), memory_config: (150, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,4800,128,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4800 + d1, d2), memory_config: (150, 4, 'tile<32x32, bf16>', 'dram') | shape: [4800 : i32, 128 : i32] | tensor<[4800,128,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (150, 4, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,4800,128,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4800 + d1, d2), memory_config: (150, 4, 'tile<32x32, bf16>', 'dram') | shape: [4800 : i32, 128 : i32] | tensor<[4800,128,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (150, 4, 
'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,4800,2,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9600 + d1 * 2 + d2, d3), memory_config: (300, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 4800 : i32, 128 : i32] | tensor<[1,4800,128,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4800 + d1, d2), memory_config: (150, 4, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,4800,2,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9600 + d1 * 2 + d2, d3), memory_config: (300, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 4800 : i32, 128 : i32] | tensor<[1,4800,128,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4800 + d1, d2), memory_config: (150, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,4800,512,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4800 + d1, d2), memory_config: (150, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 4800 : i32, 512 : i32] | tensor<[1,4800,512,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4800 + d1, d2), memory_config: (150, 16, 'tile<32x32, bf16>', 'dram') | yes | 0.0 | 4.0 | +| ttnn.reshape | tensor<[1,4800,512,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4800 + d1, d2), memory_config: (150, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 4800 : i32, 512 : i32] | tensor<[1,4800,512,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4800 + d1, d2), memory_config: (150, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,50,12,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 600 + d1 * 12 + d2, d3), memory_config: (19, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 50 : i32, 768 : i32] | tensor<[1,50,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 50 + d1, d2), memory_config: (2, 24, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,50,12,64,bf16]> | mapping_from: (d0, d1, d2, 
d3), mapping_to: (d0 * 600 + d1 * 12 + d2, d3), memory_config: (19, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 50 : i32, 768 : i32] | tensor<[1,50,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 50 + d1, d2), memory_config: (2, 24, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,50,3072,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 50 + d1, d2), memory_config: (2, 96, 'tile<32x32, bf16>', 'dram') | shape: [50 : i32, 3072 : i32] | tensor<[50,3072,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (2, 96, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,50,3072,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 50 + d1, d2), memory_config: (2, 96, 'tile<32x32, bf16>', 'dram') | shape: [50 : i32, 3072 : i32] | tensor<[50,3072,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (2, 96, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,50,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 50 + d1, d2), memory_config: (2, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 50 : i32, 12 : i32, 64 : i32] | tensor<[1,50,12,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 600 + d1 * 12 + d2, d3), memory_config: (19, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,50,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 50 + d1, d2), memory_config: (2, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 50 : i32, 12 : i32, 64 : i32] | tensor<[1,50,12,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 600 + d1 * 12 + d2, d3), memory_config: (19, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,50,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 50 + d1, d2), memory_config: (2, 24, 'tile<32x32, bf16>', 'dram') | shape: [50 : i32, 768 : i32] | tensor<[50,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), 
memory_config: (2, 24, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,50,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 50 + d1, d2), memory_config: (2, 24, 'tile<32x32, bf16>', 'dram') | shape: [50 : i32, 768 : i32] | tensor<[50,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (2, 24, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,512,128,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 512 + d1, d2), memory_config: (16, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 512 : i32, 128 : i32] | tensor<[1,512,128,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 512 + d1, d2), memory_config: (16, 4, 'tile<32x32, bf16>', 'dram') | yes | -0.0 | 0.32 | +| ttnn.reshape | tensor<[1,512,128,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 512 + d1, d2), memory_config: (16, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 512 : i32, 128 : i32] | tensor<[1,512,128,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 512 + d1, d2), memory_config: (16, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,512,15,20,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 7680 + d1 * 15 + d2, d3), memory_config: (240, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 512 : i32, 300 : i32] | tensor<[1,512,300,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 512 + d1, d2), memory_config: (16, 10, 'tile<32x32, bf16>', 'dram') | yes | -0.0 | 272.0 | +| ttnn.reshape | tensor<[1,512,15,20,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 7680 + d1 * 15 + d2, d3), memory_config: (240, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 512 : i32, 300 : i32] | tensor<[1,512,300,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 512 + d1, d2), memory_config: (16, 10, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,512,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 512 + d1 + d2, 
d3), memory_config: (16, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 512 : i32] | tensor<[1,512,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | yes | -0.02 | 6.66 | +| ttnn.reshape | tensor<[1,512,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 512 + d1 + d2, d3), memory_config: (16, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 512 : i32] | tensor<[1,512,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,512,4800,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 512 + d1, d2), memory_config: (16, 150, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 512 : i32, 60 : i32, 80 : i32] | tensor<[1,512,60,80,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 30720 + d1 * 60 + d2, d3), memory_config: (960, 3, 'tile<32x32, bf16>', 'dram') | yes | 0.0 | 25.12 | +| ttnn.reshape | tensor<[1,512,4800,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 512 + d1, d2), memory_config: (16, 150, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 512 : i32, 60 : i32, 80 : i32] | tensor<[1,512,60,80,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 30720 + d1 * 60 + d2, d3), memory_config: (960, 3, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,512,60,80,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 30720 + d1 * 60 + d2, d3), memory_config: (960, 3, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 512 : i32, 4800 : i32] | tensor<[1,512,4800,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 512 + d1, d2), memory_config: (16, 150, 'tile<32x32, bf16>', 'dram') | yes | 0.0 | inf | +| ttnn.reshape | tensor<[1,512,60,80,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 30720 + d1 * 60 + d2, d3), memory_config: (960, 3, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 512 : i32, 4800 : i32] | tensor<[1,512,4800,bf16]> | 
mapping_from: (d0, d1, d2), mapping_to: (d0 * 512 + d1, d2), memory_config: (16, 150, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,546,10,10,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 5460 + d1 * 10 + d2, d3), memory_config: (171, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 6 : i32, 91 : i32, 10 : i32, 10 : i32] | tensor<[1,6,91,10,10,f32]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 5460 + d1 * 910 + d2 * 10 + d3, d4), memory_config: (171, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,546,10,10,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 5460 + d1 * 10 + d2, d3), memory_config: (171, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 6 : i32, 91 : i32, 10 : i32, 10 : i32] | tensor<[1,6,91,10,10,f32]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 5460 + d1 * 910 + d2 * 10 + d3, d4), memory_config: (171, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,546,1,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 546 + d1 + d2, d3), memory_config: (18, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 6 : i32, 91 : i32, 1 : i32, 1 : i32] | tensor<[1,6,91,1,1,f32]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 546 + d1 * 91 + d2 + d3, d4), memory_config: (18, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,546,1,1,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 546 + d1 + d2, d3), memory_config: (18, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 6 : i32, 91 : i32, 1 : i32, 1 : i32] | tensor<[1,6,91,1,1,f32]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 546 + d1 * 91 + d2 + d3, d4), memory_config: (18, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,546,20,20,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 10920 + d1 * 20 + d2, d3), memory_config: (342, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 6 : i32, 
91 : i32, 20 : i32, 20 : i32] | tensor<[1,6,91,20,20,f32]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 10920 + d1 * 1820 + d2 * 20 + d3, d4), memory_config: (342, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,546,20,20,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 10920 + d1 * 20 + d2, d3), memory_config: (342, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 6 : i32, 91 : i32, 20 : i32, 20 : i32] | tensor<[1,6,91,20,20,f32]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 10920 + d1 * 1820 + d2 * 20 + d3, d4), memory_config: (342, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,546,2,2,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1092 + d1 * 2 + d2, d3), memory_config: (35, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 6 : i32, 91 : i32, 2 : i32, 2 : i32] | tensor<[1,6,91,2,2,f32]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 1092 + d1 * 182 + d2 * 2 + d3, d4), memory_config: (35, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,546,2,2,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1092 + d1 * 2 + d2, d3), memory_config: (35, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 6 : i32, 91 : i32, 2 : i32, 2 : i32] | tensor<[1,6,91,2,2,f32]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 1092 + d1 * 182 + d2 * 2 + d3, d4), memory_config: (35, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,546,3,3,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1638 + d1 * 3 + d2, d3), memory_config: (52, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 6 : i32, 91 : i32, 3 : i32, 3 : i32] | tensor<[1,6,91,3,3,f32]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 1638 + d1 * 273 + d2 * 3 + d3, d4), memory_config: (52, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,546,3,3,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 
1638 + d1 * 3 + d2, d3), memory_config: (52, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 6 : i32, 91 : i32, 3 : i32, 3 : i32] | tensor<[1,6,91,3,3,f32]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 1638 + d1 * 273 + d2 * 3 + d3, d4), memory_config: (52, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,546,5,5,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2730 + d1 * 5 + d2, d3), memory_config: (86, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 6 : i32, 91 : i32, 5 : i32, 5 : i32] | tensor<[1,6,91,5,5,f32]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 2730 + d1 * 455 + d2 * 5 + d3, d4), memory_config: (86, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,546,5,5,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2730 + d1 * 5 + d2, d3), memory_config: (86, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 6 : i32, 91 : i32, 5 : i32, 5 : i32] | tensor<[1,6,91,5,5,f32]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 2730 + d1 * 455 + d2 * 5 + d3, d4), memory_config: (86, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,5,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 5 + d1, d2), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 5 : i32, 1024 : i32] | tensor<[1,5,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 5 + d1, d2), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,5,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 5 + d1, d2), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 5 : i32, 1024 : i32] | tensor<[1,5,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 5 + d1, d2), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,5,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 5 + d1, d2), memory_config: (1, 32, 
'tile<32x32, bf16>', 'dram') | shape: [5 : i32, 1024 : i32] | tensor<[5,1024,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,5,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 5 + d1, d2), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | shape: [5 : i32, 1024 : i32] | tensor<[5,1024,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,5,1024,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 5120 + d1 * 1024 + d2, d3), memory_config: (160, 8, 'tile<32x32, bf16>', 'dram') | shape: [5 : i32, 1024 : i32, 256 : i32] | tensor<[5,1024,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (160, 8, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,5,1024,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 5120 + d1 * 1024 + d2, d3), memory_config: (160, 8, 'tile<32x32, bf16>', 'dram') | shape: [5 : i32, 1024 : i32, 256 : i32] | tensor<[5,1024,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (160, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,5,1024,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 5120 + d1 * 1024 + d2, d3), memory_config: (160, 1, 'tile<32x32, bf16>', 'dram') | shape: [5 : i32, 1024 : i32, 32 : i32] | tensor<[5,1024,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (160, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.04 | 13.62 | +| ttnn.reshape | tensor<[1,5,1024,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 5120 + d1 * 1024 + d2, d3), memory_config: (160, 1, 'tile<32x32, bf16>', 'dram') | shape: [5 : i32, 1024 : i32, 32 : i32] | tensor<[5,1024,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, 
d2), memory_config: (160, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,5,1200,300,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6000 + d1 * 1200 + d2, d3), memory_config: (188, 10, 'tile<32x32, bf16>', 'dram') | shape: [5 : i32, 1200 : i32, 300 : i32] | tensor<[5,1200,300,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1200 + d1, d2), memory_config: (188, 10, 'tile<32x32, bf16>', 'dram') | yes | 0.0 | 0.19 | +| ttnn.reshape | tensor<[1,5,1200,300,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6000 + d1 * 1200 + d2, d3), memory_config: (188, 10, 'tile<32x32, bf16>', 'dram') | shape: [5 : i32, 1200 : i32, 300 : i32] | tensor<[5,1200,300,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1200 + d1, d2), memory_config: (188, 10, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,5,1200,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6000 + d1 * 1200 + d2, d3), memory_config: (188, 2, 'tile<32x32, bf16>', 'dram') | shape: [5 : i32, 1200 : i32, 64 : i32] | tensor<[5,1200,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1200 + d1, d2), memory_config: (188, 2, 'tile<32x32, bf16>', 'dram') | yes | 0.1 | 10.62 | +| ttnn.reshape | tensor<[1,5,1200,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6000 + d1 * 1200 + d2, d3), memory_config: (188, 2, 'tile<32x32, bf16>', 'dram') | shape: [5 : i32, 1200 : i32, 64 : i32] | tensor<[5,1200,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1200 + d1, d2), memory_config: (188, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,5,16,16,2,bf16]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 1280 + d1 * 256 + d2 * 16 + d3, d4), memory_config: (40, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 5 : i32, 16 : i32, 32 : i32] | tensor<[1,5,16,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 80 + d1 * 16 + d2, d3), memory_config: (3, 1, 'tile<32x32, 
bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,5,16,16,2,bf16]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 1280 + d1 * 256 + d2 * 16 + d3, d4), memory_config: (40, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 5 : i32, 16 : i32, 32 : i32] | tensor<[1,5,16,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 80 + d1 * 16 + d2, d3), memory_config: (3, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,5,16,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 80 + d1 * 16 + d2, d3), memory_config: (3, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 5 : i32, 1024 : i32] | tensor<[1,5,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 5 + d1, d2), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,5,16,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 80 + d1 * 16 + d2, d3), memory_config: (3, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 5 : i32, 1024 : i32] | tensor<[1,5,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 5 + d1, d2), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,5,1,16,2,f32]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 80 + d1 * 16 + d2 * 16 + d3, d4), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 5 : i32, 1 : i32, 32 : i32] | tensor<[1,5,1,32,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 5 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,5,1,16,2,f32]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 80 + d1 * 16 + d2 * 16 + d3, d4), memory_config: (3, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 5 : i32, 1 : i32, 32 : i32] | tensor<[1,5,1,32,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 5 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | 
tensor<[1,5,256,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1280 + d1 * 256 + d2, d3), memory_config: (40, 1, 'tile<32x32, bf16>', 'dram') | shape: [5 : i32, 256 : i32, 32 : i32] | tensor<[5,256,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (40, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.04 | 2.12 | +| ttnn.reshape | tensor<[1,5,256,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1280 + d1 * 256 + d2, d3), memory_config: (40, 1, 'tile<32x32, bf16>', 'dram') | shape: [5 : i32, 256 : i32, 32 : i32] | tensor<[5,256,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (40, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,5,300,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1500 + d1 * 300 + d2, d3), memory_config: (47, 2, 'tile<32x32, bf16>', 'dram') | shape: [5 : i32, 300 : i32, 64 : i32] | tensor<[5,300,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 300 + d1, d2), memory_config: (47, 2, 'tile<32x32, bf16>', 'dram') | yes | 0.05 | 1.12 | +| ttnn.reshape | tensor<[1,5,300,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1500 + d1 * 300 + d2, d3), memory_config: (47, 2, 'tile<32x32, bf16>', 'dram') | shape: [5 : i32, 300 : i32, 64 : i32] | tensor<[5,300,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 300 + d1, d2), memory_config: (47, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,5,3072,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 5 + d1, d2), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 5 : i32, 4 : i32, 768 : i32] | tensor<[1,5,4,768,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 20 + d1 * 4 + d2, d3), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,5,3072,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 5 + d1, d2), memory_config: (1, 96, 
'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 5 : i32, 4 : i32, 768 : i32] | tensor<[1,5,4,768,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 20 + d1 * 4 + d2, d3), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,5,32,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 160 + d1 * 32 + d2, d3), memory_config: (5, 8, 'tile<32x32, bf16>', 'dram') | shape: [5 : i32, 32 : i32, 256 : i32] | tensor<[5,32,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (5, 8, 'tile<32x32, bf16>', 'dram') | yes | -0.0 | 4.69 | +| ttnn.reshape | tensor<[1,5,32,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 160 + d1 * 32 + d2, d3), memory_config: (5, 8, 'tile<32x32, bf16>', 'dram') | shape: [5 : i32, 32 : i32, 256 : i32] | tensor<[5,32,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (5, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,5,4096,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 5 + d1, d2), memory_config: (1, 128, 'tile<32x32, bf16>', 'dram') | shape: [5 : i32, 4096 : i32] | tensor<[5,4096,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 128, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,5,4096,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 5 + d1, d2), memory_config: (1, 128, 'tile<32x32, bf16>', 'dram') | shape: [5 : i32, 4096 : i32] | tensor<[5,4096,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 128, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,5,4,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 20 + d1 * 4 + d2, d3), memory_config: (1, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 5 : i32, 4 : i32, 4 : i32, 64 : i32] | tensor<[1,5,4,4,64,bf16]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 80 + d1 * 16 + d2 * 4 + d3, 
d4), memory_config: (3, 2, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,5,4,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 20 + d1 * 4 + d2, d3), memory_config: (1, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 5 : i32, 4 : i32, 4 : i32, 64 : i32] | tensor<[1,5,4,4,64,bf16]> | mapping_from: (d0, d1, d2, d3, d4), mapping_to: (d0 * 80 + d1 * 16 + d2 * 4 + d3, d4), memory_config: (3, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,5,64,300,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 320 + d1 * 64 + d2, d3), memory_config: (10, 10, 'tile<32x32, bf16>', 'dram') | shape: [5 : i32, 64 : i32, 300 : i32] | tensor<[5,64,300,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (10, 10, 'tile<32x32, bf16>', 'dram') | yes | 0.0 | 2.3 | +| ttnn.reshape | tensor<[1,5,64,300,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 320 + d1 * 64 + d2, d3), memory_config: (10, 10, 'tile<32x32, bf16>', 'dram') | shape: [5 : i32, 64 : i32, 300 : i32] | tensor<[5,64,300,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (10, 10, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,640,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 640 + d1, d2), memory_config: (20, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 640 : i32, 32 : i32, 32 : i32] | tensor<[1,640,32,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 20480 + d1 * 32 + d2, d3), memory_config: (640, 1, 'tile<32x32, bf16>', 'dram') | yes | -0.0 | 28.88 | +| ttnn.reshape | tensor<[1,640,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 640 + d1, d2), memory_config: (20, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 640 : i32, 32 : i32, 32 : i32] | tensor<[1,640,32,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 20480 + d1 * 32 + d2, d3), memory_config: (640, 1, 'tile<32x32, bf16>', 
'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,640,160,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 640 + d1, d2), memory_config: (20, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 640 : i32, 160 : i32] | tensor<[1,640,160,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 640 + d1, d2), memory_config: (20, 5, 'tile<32x32, bf16>', 'dram') | yes | 0.0 | 1.23 | +| ttnn.reshape | tensor<[1,640,160,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 640 + d1, d2), memory_config: (20, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 640 : i32, 160 : i32] | tensor<[1,640,160,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 640 + d1, d2), memory_config: (20, 5, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,640,16,16,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 10240 + d1 * 16 + d2, d3), memory_config: (320, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 20 : i32, 256 : i32] | tensor<[1,32,20,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 640 + d1 * 20 + d2, d3), memory_config: (20, 8, 'tile<32x32, bf16>', 'dram') | yes | 0.66 | 18.12 | +| ttnn.reshape | tensor<[1,640,16,16,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 10240 + d1 * 16 + d2, d3), memory_config: (320, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 20 : i32, 256 : i32] | tensor<[1,32,20,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 640 + d1 * 20 + d2, d3), memory_config: (20, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,640,32,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 20480 + d1 * 32 + d2, d3), memory_config: (640, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 20 : i32, 1024 : i32] | tensor<[1,32,20,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 640 + d1 * 20 + d2, d3), memory_config: (20, 32, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | 
tensor<[1,640,32,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 20480 + d1 * 32 + d2, d3), memory_config: (640, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 20 : i32, 1024 : i32] | tensor<[1,32,20,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 640 + d1 * 20 + d2, d3), memory_config: (20, 32, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,640,32,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 20480 + d1 * 32 + d2, d3), memory_config: (640, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 640 : i32, 1024 : i32] | tensor<[1,640,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 640 + d1, d2), memory_config: (20, 32, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,640,32,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 20480 + d1 * 32 + d2, d3), memory_config: (640, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 640 : i32, 1024 : i32] | tensor<[1,640,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 640 + d1, d2), memory_config: (20, 32, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,640,64,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 40960 + d1 * 64 + d2, d3), memory_config: (1280, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 20 : i32, 4096 : i32] | tensor<[1,32,20,4096,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 640 + d1 * 20 + d2, d3), memory_config: (20, 128, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,640,64,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 40960 + d1 * 64 + d2, d3), memory_config: (1280, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 20 : i32, 4096 : i32] | tensor<[1,32,20,4096,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 640 + d1 * 20 + d2, d3), memory_config: (20, 128, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | 
tensor<[1,64,120,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 7680 + d1 * 120 + d2, d3), memory_config: (240, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 19200 : i32] | tensor<[1,64,19200,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (2, 600, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,64,120,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 7680 + d1 * 120 + d2, d3), memory_config: (240, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 19200 : i32] | tensor<[1,64,19200,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (2, 600, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,64,1280,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (2, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 8 : i32, 160 : i32] | tensor<[1,64,8,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 512 + d1 * 8 + d2, d3), memory_config: (16, 5, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,64,1280,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (2, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 8 : i32, 160 : i32] | tensor<[1,64,8,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 512 + d1 * 8 + d2, d3), memory_config: (16, 5, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,64,1280,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (2, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 8 : i32, 8 : i32, 1280 : i32] | tensor<[1,8,8,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 64 + d1 * 8 + d2, d3), memory_config: (2, 40, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,64,1280,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), 
memory_config: (2, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 8 : i32, 8 : i32, 1280 : i32] | tensor<[1,8,8,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 64 + d1 * 8 + d2, d3), memory_config: (2, 40, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,64,1280,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (2, 40, 'tile<32x32, bf16>', 'dram') | shape: [64 : i32, 1280 : i32] | tensor<[64,1280,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (2, 40, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,64,1280,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (2, 40, 'tile<32x32, bf16>', 'dram') | shape: [64 : i32, 1280 : i32] | tensor<[64,1280,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (2, 40, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,64,12,12,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 768 + d1 * 12 + d2, d3), memory_config: (24, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 9216 : i32] | tensor<[1,9216,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 288, 'tile<32x32, bf16>', 'dram') | yes | 0.02 | inf | +| ttnn.reshape | tensor<[1,64,12,12,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 768 + d1 * 12 + d2, d3), memory_config: (24, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 9216 : i32] | tensor<[1,9216,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 288, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,64,15,20,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 960 + d1 * 15 + d2, d3), memory_config: (30, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 300 : i32] | tensor<[1,64,300,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (2, 10, 'tile<32x32, bf16>', 'dram') | 
yes | -0.01 | inf | +| ttnn.reshape | tensor<[1,64,15,20,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 960 + d1 * 15 + d2, d3), memory_config: (30, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 300 : i32] | tensor<[1,64,300,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (2, 10, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,64,16,16,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 16 + d2, d3), memory_config: (32, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 256 : i32] | tensor<[1,64,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (2, 8, 'tile<32x32, bf16>', 'dram') | yes | -0.01 | 2.020426553593072e+38 | +| ttnn.reshape | tensor<[1,64,16,16,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 16 + d2, d3), memory_config: (32, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 256 : i32] | tensor<[1,64,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (2, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,64,19200,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (2, 600, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 120 : i32, 160 : i32] | tensor<[1,64,120,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 7680 + d1 * 120 + d2, d3), memory_config: (240, 5, 'tile<32x32, bf16>', 'dram') | yes | 0.0 | 10.62 | +| ttnn.reshape | tensor<[1,64,19200,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (2, 600, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 120 : i32, 160 : i32] | tensor<[1,64,120,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 7680 + d1 * 120 + d2, d3), memory_config: (240, 5, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,64,1,f32]> | mapping_from: 
(d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (2, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 64 : i32, 1 : i32] | tensor<[1,64,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (2, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,64,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (2, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 64 : i32, 1 : i32] | tensor<[1,64,1,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (2, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,64,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (2, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 256 : i32] | tensor<[1,64,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (2, 8, 'tile<32x32, bf16>', 'dram') | yes | 0.0 | 0.68 | +| ttnn.reshape | tensor<[1,64,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (2, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 256 : i32] | tensor<[1,64,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (2, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,64,4096,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (2, 128, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 64 : i32, 64 : i32] | tensor<[1,64,64,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 2, 'tile<32x32, bf16>', 'dram') | yes | -0.0 | 9.88 | +| ttnn.reshape | tensor<[1,64,4096,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (2, 128, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 64 : i32, 64 : i32] | tensor<[1,64,64,64,bf16]> | mapping_from: (d0, d1, d2, d3), 
mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,64,5120,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (2, 160, 'tile<32x32, bf16>', 'dram') | shape: [64 : i32, 5120 : i32] | tensor<[64,5120,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (2, 160, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,64,5120,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (2, 160, 'tile<32x32, bf16>', 'dram') | shape: [64 : i32, 5120 : i32] | tensor<[64,5120,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (2, 160, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,64,64,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 4096 : i32, 320 : i32] | tensor<[1,4096,320,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | yes | 0.0 | 2.02 | +| ttnn.reshape | tensor<[1,64,64,320,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 4096 : i32, 320 : i32] | tensor<[1,4096,320,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,64,64,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 4096 : i32] | tensor<[1,64,4096,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (2, 128, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,64,64,64,bf16]> | mapping_from: 
(d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 4096 : i32] | tensor<[1,64,4096,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (2, 128, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,64,64,9,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 1, 'tile<32x32, f32>', 'dram') | shape: [64 : i32, 64 : i32, 9 : i32] | tensor<[64,64,9,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (128, 1, 'tile<32x32, f32>', 'dram') | yes | -0.01 | 3.05 | +| ttnn.reshape | tensor<[1,64,64,9,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4096 + d1 * 64 + d2, d3), memory_config: (128, 1, 'tile<32x32, f32>', 'dram') | shape: [64 : i32, 64 : i32, 9 : i32] | tensor<[64,64,9,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (128, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,64,8,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 512 + d1 * 8 + d2, d3), memory_config: (16, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 1280 : i32] | tensor<[1,64,1280,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (2, 40, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,64,8,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 512 + d1 * 8 + d2, d3), memory_config: (16, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 1280 : i32] | tensor<[1,64,1280,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (2, 40, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,64,9,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 576 + d1 * 9 + d2, d3), memory_config: (18, 2, 'tile<32x32, f32>', 'dram') | shape: [64 : i32, 9 : 
i32, 64 : i32] | tensor<[64,9,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (18, 2, 'tile<32x32, f32>', 'dram') | yes | 0.01 | 5.34 | +| ttnn.reshape | tensor<[1,64,9,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 576 + d1 * 9 + d2, d3), memory_config: (18, 2, 'tile<32x32, f32>', 'dram') | shape: [64 : i32, 9 : i32, 64 : i32] | tensor<[64,9,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (18, 2, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,64,9,9,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 576 + d1 * 9 + d2, d3), memory_config: (18, 1, 'tile<32x32, f32>', 'dram') | shape: [64 : i32, 9 : i32, 9 : i32] | tensor<[64,9,9,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (18, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,64,9,9,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 576 + d1 * 9 + d2, d3), memory_config: (18, 1, 'tile<32x32, f32>', 'dram') | shape: [64 : i32, 9 : i32, 9 : i32] | tensor<[64,9,9,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (18, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,6,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 6 : i32] | tensor<[1,6,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | -0.31 | nan | +| ttnn.reshape | tensor<[1,6,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 6 : i32] | tensor<[1,6,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,6,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 6 + d1, d2), memory_config: (1, 32, 'tile<32x32, 
bf16>', 'dram') | shape: [6 : i32, 1024 : i32] | tensor<[6,1024,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,6,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 6 + d1, d2), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | shape: [6 : i32, 1024 : i32] | tensor<[6,1024,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,6,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 6 + d1, d2), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 6 : i32, 16 : i32, 64 : i32] | tensor<[1,6,16,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 96 + d1 * 16 + d2, d3), memory_config: (3, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,6,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 6 + d1, d2), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 6 : i32, 16 : i32, 64 : i32] | tensor<[1,6,16,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 96 + d1 * 16 + d2, d3), memory_config: (3, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,6,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 6 + d1, d2), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | shape: [6 : i32, 1024 : i32] | tensor<[6,1024,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,6,15,15,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 90 + d1 * 15 + d2, d3), memory_config: (3, 1, 'tile<32x32, bf16>', 'dram') | shape: [6 : i32, 15 : i32, 15 : i32] | tensor<[6,15,15,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 15 + d1, d2), memory_config: (3, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.01 | 1.0 | +| ttnn.reshape | 
tensor<[1,6,15,15,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 90 + d1 * 15 + d2, d3), memory_config: (3, 1, 'tile<32x32, bf16>', 'dram') | shape: [6 : i32, 15 : i32, 15 : i32] | tensor<[6,15,15,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 15 + d1, d2), memory_config: (3, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,6,15,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 90 + d1 * 15 + d2, d3), memory_config: (3, 2, 'tile<32x32, bf16>', 'dram') | shape: [6 : i32, 15 : i32, 64 : i32] | tensor<[6,15,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 15 + d1, d2), memory_config: (3, 2, 'tile<32x32, bf16>', 'dram') | yes | 0.03 | 0.94 | +| ttnn.reshape | tensor<[1,6,15,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 90 + d1 * 15 + d2, d3), memory_config: (3, 2, 'tile<32x32, bf16>', 'dram') | shape: [6 : i32, 15 : i32, 64 : i32] | tensor<[6,15,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 15 + d1, d2), memory_config: (3, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,6,16,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 96 + d1 * 16 + d2, d3), memory_config: (3, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 6 : i32, 1024 : i32] | tensor<[1,6,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 6 + d1, d2), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,6,16,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 96 + d1 * 16 + d2, d3), memory_config: (3, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 6 : i32, 1024 : i32] | tensor<[1,6,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 6 + d1, d2), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,6,1,15,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [6 : 
i32, 1 : i32, 15 : i32] | tensor<[6,1,15,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.18 | 0.7 | +| ttnn.reshape | tensor<[1,6,1,15,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [6 : i32, 1 : i32, 15 : i32] | tensor<[6,1,15,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,6,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [6 : i32, 1 : i32, 1 : i32] | tensor<[6,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 1.0 | +| ttnn.reshape | tensor<[1,6,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [6 : i32, 1 : i32, 1 : i32] | tensor<[6,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,6,1,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6 + d1 + d2, d3), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [6 : i32, 1 : i32, 64 : i32] | tensor<[6,1,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,6,1,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 6 + d1 + d2, d3), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [6 : i32, 1 : i32, 64 : i32] | tensor<[6,1,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,6,512,bf16]> | mapping_from: (d0, 
d1, d2), mapping_to: (d0 * 6 + d1, d2), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | shape: [6 : i32, 512 : i32] | tensor<[6,512,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,6,512,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 6 + d1, d2), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | shape: [6 : i32, 512 : i32] | tensor<[6,512,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,6,64,15,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 384 + d1 * 64 + d2, d3), memory_config: (12, 1, 'tile<32x32, bf16>', 'dram') | shape: [6 : i32, 64 : i32, 15 : i32] | tensor<[6,64,15,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (12, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.01 | 12.38 | +| ttnn.reshape | tensor<[1,6,64,15,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 384 + d1 * 64 + d2, d3), memory_config: (12, 1, 'tile<32x32, bf16>', 'dram') | shape: [6 : i32, 64 : i32, 15 : i32] | tensor<[6,64,15,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (12, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,6,64,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 384 + d1 * 64 + d2, d3), memory_config: (12, 1, 'tile<32x32, bf16>', 'dram') | shape: [6 : i32, 64 : i32, 1 : i32] | tensor<[6,64,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (12, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.04 | 5.62 | +| ttnn.reshape | tensor<[1,6,64,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 384 + d1 * 64 + d2, d3), memory_config: (12, 1, 'tile<32x32, bf16>', 'dram') | shape: [6 : i32, 64 : i32, 1 : i32] | tensor<[6,64,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), 
memory_config: (12, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,71,64,7,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4544 + d1 * 64 + d2, d3), memory_config: (142, 1, 'tile<32x32, f32>', 'dram') | shape: [71 : i32, 64 : i32, 7 : i32] | tensor<[71,64,7,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (142, 1, 'tile<32x32, f32>', 'dram') | yes | -0.0 | nan | +| ttnn.reshape | tensor<[1,71,64,7,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4544 + d1 * 64 + d2, d3), memory_config: (142, 1, 'tile<32x32, f32>', 'dram') | shape: [71 : i32, 64 : i32, 7 : i32] | tensor<[71,64,7,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (142, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,71,7,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 497 + d1 * 7 + d2, d3), memory_config: (16, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 71 : i32, 7 : i32, 64 : i32] | tensor<[1,71,7,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 497 + d1 * 7 + d2, d3), memory_config: (16, 2, 'tile<32x32, bf16>', 'dram') | yes | 0.23 | 18.25 | +| ttnn.reshape | tensor<[1,71,7,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 497 + d1 * 7 + d2, d3), memory_config: (16, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 71 : i32, 7 : i32, 64 : i32] | tensor<[1,71,7,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 497 + d1 * 7 + d2, d3), memory_config: (16, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,71,7,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 497 + d1 * 7 + d2, d3), memory_config: (16, 2, 'tile<32x32, f32>', 'dram') | shape: [71 : i32, 7 : i32, 64 : i32] | tensor<[71,7,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (16, 2, 'tile<32x32, f32>', 'dram') | yes | 0.21 | 8.93 | +| ttnn.reshape | 
tensor<[1,71,7,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 497 + d1 * 7 + d2, d3), memory_config: (16, 2, 'tile<32x32, f32>', 'dram') | shape: [71 : i32, 7 : i32, 64 : i32] | tensor<[71,7,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (16, 2, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,71,7,7,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 497 + d1 * 7 + d2, d3), memory_config: (16, 1, 'tile<32x32, f32>', 'dram') | shape: [71 : i32, 7 : i32, 7 : i32] | tensor<[71,7,7,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (16, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,71,7,7,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 497 + d1 * 7 + d2, d3), memory_config: (16, 1, 'tile<32x32, f32>', 'dram') | shape: [71 : i32, 7 : i32, 7 : i32] | tensor<[71,7,7,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (16, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,768,12,16,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9216 + d1 * 12 + d2, d3), memory_config: (288, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 768 : i32, 192 : i32] | tensor<[1,768,192,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 768 + d1, d2), memory_config: (24, 6, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,768,12,16,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9216 + d1 * 12 + d2, d3), memory_config: (288, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 768 : i32, 192 : i32] | tensor<[1,768,192,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 768 + d1, d2), memory_config: (24, 6, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,768,144,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 768 + d1, d2), memory_config: (24, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : 
i32, 768 : i32, 12 : i32, 12 : i32] | tensor<[1,768,12,12,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9216 + d1 * 12 + d2, d3), memory_config: (288, 1, 'tile<32x32, bf16>', 'dram') | yes | -0.0 | 3.03 | +| ttnn.reshape | tensor<[1,768,144,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 768 + d1, d2), memory_config: (24, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 768 : i32, 12 : i32, 12 : i32] | tensor<[1,768,12,12,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 9216 + d1 * 12 + d2, d3), memory_config: (288, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,768,14,14,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 10752 + d1 * 14 + d2, d3), memory_config: (336, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 768 : i32, 196 : i32] | tensor<[1,768,196,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 768 + d1, d2), memory_config: (24, 7, 'tile<32x32, bf16>', 'dram') | yes | 0.0 | 1.7014118346046923e+38 | +| ttnn.reshape | tensor<[1,768,14,14,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 10752 + d1 * 14 + d2, d3), memory_config: (336, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 768 : i32, 196 : i32] | tensor<[1,768,196,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 768 + d1, d2), memory_config: (24, 7, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,768,7,7,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 5376 + d1 * 7 + d2, d3), memory_config: (168, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 768 : i32, 49 : i32] | tensor<[1,768,49,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 768 + d1, d2), memory_config: (24, 2, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,768,7,7,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 5376 + d1 * 7 + d2, d3), memory_config: (168, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 768 : i32, 49 : i32] | tensor<[1,768,49,bf16]> | 
mapping_from: (d0, d1, d2), mapping_to: (d0 * 768 + d1, d2), memory_config: (24, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,768,8,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 768 + d1, d2), memory_config: (24, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 12 : i32, 64 : i32, 8 : i32] | tensor<[1,12,64,8,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 768 + d1 * 64 + d2, d3), memory_config: (24, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.02 | inf | +| ttnn.reshape | tensor<[1,768,8,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 768 + d1, d2), memory_config: (24, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 12 : i32, 64 : i32, 8 : i32] | tensor<[1,12,64,8,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 768 + d1 * 64 + d2, d3), memory_config: (24, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,7,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 7 : i32] | tensor<[1,7,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | -0.3 | nan | +| ttnn.reshape | tensor<[1,7,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 7 : i32] | tensor<[1,7,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,7,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 7 : i32] | tensor<[1,7,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | 1.0 | nan | +| ttnn.reshape | tensor<[1,7,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [1 : i32, 7 : i32] | tensor<[1,7,i32]> | mapping_from: (d0, 
d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,7,12,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 84 + d1 * 12 + d2, d3), memory_config: (3, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 768 : i32] | tensor<[1,7,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,7,12,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 84 + d1 * 12 + d2, d3), memory_config: (3, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 768 : i32] | tensor<[1,7,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,7,18176,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (1, 568, 'tile<32x32, bf16>', 'dram') | shape: [7 : i32, 18176 : i32] | tensor<[7,18176,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 568, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,7,18176,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (1, 568, 'tile<32x32, bf16>', 'dram') | shape: [7 : i32, 18176 : i32] | tensor<[7,18176,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 568, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,7,3072,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | shape: [7 : i32, 3072 : i32] | tensor<[7,3072,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,7,3072,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | shape: [7 : 
i32, 3072 : i32] | tensor<[7,3072,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,7,4544,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (1, 142, 'tile<32x32, bf16>', 'dram') | shape: [7 : i32, 4544 : i32] | tensor<[7,4544,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 142, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,7,4672,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (1, 146, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 73 : i32, 64 : i32] | tensor<[1,7,73,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 511 + d1 * 73 + d2, d3), memory_config: (16, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,7,4672,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (1, 146, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 73 : i32, 64 : i32] | tensor<[1,7,73,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 511 + d1 * 73 + d2, d3), memory_config: (16, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,7,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 768 : i32] | tensor<[1,7,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,7,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 768 : i32] | tensor<[1,7,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,7,768,bf16]> | 
mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [7 : i32, 768 : i32] | tensor<[7,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,7,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [7 : i32, 768 : i32] | tensor<[7,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,7,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 12 : i32, 64 : i32] | tensor<[1,7,12,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 84 + d1 * 12 + d2, d3), memory_config: (3, 2, 'tile<32x32, bf16>', 'dram') | yes | 0.22 | 7.34 | +| ttnn.reshape | tensor<[1,7,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 12 : i32, 64 : i32] | tensor<[1,7,12,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 84 + d1 * 12 + d2, d3), memory_config: (3, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,7,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [7 : i32, 768 : i32] | tensor<[7,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,8,1024,1024,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 8192 + d1 * 1024 + d2, d3), memory_config: (256, 32, 'tile<32x32, f32>', 'dram') | shape: [8 : i32, 1024 : i32, 1024 : i32] | tensor<[8,1024,1024,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + 
d1, d2), memory_config: (256, 32, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,8,1024,1024,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 8192 + d1 * 1024 + d2, d3), memory_config: (256, 32, 'tile<32x32, f32>', 'dram') | shape: [8 : i32, 1024 : i32, 1024 : i32] | tensor<[8,1024,1024,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (256, 32, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,8,1024,80,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 8192 + d1 * 1024 + d2, d3), memory_config: (256, 3, 'tile<32x32, f32>', 'dram') | shape: [8 : i32, 1024 : i32, 80 : i32] | tensor<[8,1024,80,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (256, 3, 'tile<32x32, f32>', 'dram') | yes | 0.11 | 1.72 | +| ttnn.reshape | tensor<[1,8,1024,80,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 8192 + d1 * 1024 + d2, d3), memory_config: (256, 3, 'tile<32x32, f32>', 'dram') | shape: [8 : i32, 1024 : i32, 80 : i32] | tensor<[8,1024,80,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (256, 3, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,8,1024,9,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 8192 + d1 * 1024 + d2, d3), memory_config: (256, 1, 'tile<32x32, f32>', 'dram') | shape: [8 : i32, 1024 : i32, 9 : i32] | tensor<[8,1024,9,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (256, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,8,1024,9,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 8192 + d1 * 1024 + d2, d3), memory_config: (256, 1, 'tile<32x32, f32>', 'dram') | shape: [8 : i32, 1024 : i32, 9 : i32] | tensor<[8,1024,9,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (256, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | 
tensor<[1,8,10,10,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 80 + d1 * 10 + d2, d3), memory_config: (3, 1, 'tile<32x32, bf16>', 'dram') | shape: [8 : i32, 10 : i32, 10 : i32] | tensor<[8,10,10,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (3, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.01 | 1.0 | +| ttnn.reshape | tensor<[1,8,10,10,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 80 + d1 * 10 + d2, d3), memory_config: (3, 1, 'tile<32x32, bf16>', 'dram') | shape: [8 : i32, 10 : i32, 10 : i32] | tensor<[8,10,10,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (3, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,8,10,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 80 + d1 * 10 + d2, d3), memory_config: (3, 2, 'tile<32x32, bf16>', 'dram') | shape: [8 : i32, 10 : i32, 64 : i32] | tensor<[8,10,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (3, 2, 'tile<32x32, bf16>', 'dram') | yes | 0.05 | 0.83 | +| ttnn.reshape | tensor<[1,8,10,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 80 + d1 * 10 + d2, d3), memory_config: (3, 2, 'tile<32x32, bf16>', 'dram') | shape: [8 : i32, 10 : i32, 64 : i32] | tensor<[8,10,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 10 + d1, d2), memory_config: (3, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,8,160,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1280 + d1 * 160 + d2, d3), memory_config: (40, 8, 'tile<32x32, f32>', 'dram') | shape: [8 : i32, 160 : i32, 256 : i32] | tensor<[8,160,256,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 160 + d1, d2), memory_config: (40, 8, 'tile<32x32, f32>', 'dram') | yes | -0.0 | 1.52 | +| ttnn.reshape | tensor<[1,8,160,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1280 + d1 * 160 + d2, d3), memory_config: (40, 8, 'tile<32x32, f32>', 
'dram') | shape: [8 : i32, 160 : i32, 256 : i32] | tensor<[8,160,256,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 160 + d1, d2), memory_config: (40, 8, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,8,160,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1280 + d1 * 160 + d2, d3), memory_config: (40, 2, 'tile<32x32, f32>', 'dram') | shape: [8 : i32, 160 : i32, 64 : i32] | tensor<[8,160,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 160 + d1, d2), memory_config: (40, 2, 'tile<32x32, f32>', 'dram') | yes | -0.01 | 1.0 | +| ttnn.reshape | tensor<[1,8,160,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1280 + d1 * 160 + d2, d3), memory_config: (40, 2, 'tile<32x32, f32>', 'dram') | shape: [8 : i32, 160 : i32, 64 : i32] | tensor<[8,160,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 160 + d1, d2), memory_config: (40, 2, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,8,160,9,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1280 + d1 * 160 + d2, d3), memory_config: (40, 1, 'tile<32x32, f32>', 'dram') | shape: [8 : i32, 160 : i32, 9 : i32] | tensor<[8,160,9,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 160 + d1, d2), memory_config: (40, 1, 'tile<32x32, f32>', 'dram') | yes | -0.0 | 3.71 | +| ttnn.reshape | tensor<[1,8,160,9,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1280 + d1 * 160 + d2, d3), memory_config: (40, 1, 'tile<32x32, f32>', 'dram') | shape: [8 : i32, 160 : i32, 9 : i32] | tensor<[8,160,9,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 160 + d1, d2), memory_config: (40, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,8,1,10,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 8 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [8 : i32, 1 : i32, 10 : i32] | tensor<[8,1,10,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 
'tile<32x32, bf16>', 'dram') | yes | 0.14 | 0.98 | +| ttnn.reshape | tensor<[1,8,1,10,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 8 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [8 : i32, 1 : i32, 10 : i32] | tensor<[8,1,10,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,8,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 8 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [8 : i32, 1 : i32, 1 : i32] | tensor<[8,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 1.0 | +| ttnn.reshape | tensor<[1,8,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 8 + d1 + d2, d3), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [8 : i32, 1 : i32, 1 : i32] | tensor<[8,1,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,8,1,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 8 + d1 + d2, d3), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [8 : i32, 1 : i32, 64 : i32] | tensor<[8,1,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,8,1,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 8 + d1 + d2, d3), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [8 : i32, 1 : i32, 64 : i32] | tensor<[8,1,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,8,1,920,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 8 + d1 + d2, d3), memory_config: (1, 29, 'tile<32x32, bf16>', 'dram') | shape: [8 : i32, 1 : 
i32, 920 : i32] | tensor<[8,1,920,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 29, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.22 | +| ttnn.reshape | tensor<[1,8,1,920,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 8 + d1 + d2, d3), memory_config: (1, 29, 'tile<32x32, bf16>', 'dram') | shape: [8 : i32, 1 : i32, 920 : i32] | tensor<[8,1,920,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (1, 29, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,8,2048,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 2048 + d2, d3), memory_config: (512, 5, 'tile<32x32, bf16>', 'dram') | shape: [8 : i32, 2048 : i32, 160 : i32] | tensor<[8,2048,160,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 2048 + d1, d2), memory_config: (512, 5, 'tile<32x32, bf16>', 'dram') | yes | 0.02 | 1.03 | +| ttnn.reshape | tensor<[1,8,2048,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 2048 + d2, d3), memory_config: (512, 5, 'tile<32x32, bf16>', 'dram') | shape: [8 : i32, 2048 : i32, 160 : i32] | tensor<[8,2048,160,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 2048 + d1, d2), memory_config: (512, 5, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,8,2048,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 2048 + d2, d3), memory_config: (512, 8, 'tile<32x32, bf16>', 'dram') | shape: [8 : i32, 2048 : i32, 256 : i32] | tensor<[8,2048,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 2048 + d1, d2), memory_config: (512, 8, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,8,2048,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 2048 + d2, d3), memory_config: (512, 8, 'tile<32x32, bf16>', 'dram') | shape: [8 : i32, 2048 : i32, 256 : i32] | tensor<[8,2048,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 2048 + 
d1, d2), memory_config: (512, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,8,2048,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 2048 + d2, d3), memory_config: (512, 1, 'tile<32x32, bf16>', 'dram') | shape: [8 : i32, 2048 : i32, 32 : i32] | tensor<[8,2048,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 2048 + d1, d2), memory_config: (512, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.05 | 48.0 | +| ttnn.reshape | tensor<[1,8,2048,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 16384 + d1 * 2048 + d2, d3), memory_config: (512, 1, 'tile<32x32, bf16>', 'dram') | shape: [8 : i32, 2048 : i32, 32 : i32] | tensor<[8,2048,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 2048 + d1, d2), memory_config: (512, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,8,256,160,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2048 + d1 * 256 + d2, d3), memory_config: (64, 5, 'tile<32x32, f32>', 'dram') | shape: [8 : i32, 256 : i32, 160 : i32] | tensor<[8,256,160,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (64, 5, 'tile<32x32, f32>', 'dram') | yes | 0.09 | 1.28 | +| ttnn.reshape | tensor<[1,8,256,160,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2048 + d1 * 256 + d2, d3), memory_config: (64, 5, 'tile<32x32, f32>', 'dram') | shape: [8 : i32, 256 : i32, 160 : i32] | tensor<[8,256,160,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (64, 5, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,8,256,2048,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2048 + d1 * 256 + d2, d3), memory_config: (64, 64, 'tile<32x32, bf16>', 'dram') | shape: [8 : i32, 256 : i32, 2048 : i32] | tensor<[8,256,2048,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (64, 64, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| 
ttnn.reshape | tensor<[1,8,256,2048,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2048 + d1 * 256 + d2, d3), memory_config: (64, 64, 'tile<32x32, bf16>', 'dram') | shape: [8 : i32, 256 : i32, 2048 : i32] | tensor<[8,256,2048,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (64, 64, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,8,256,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2048 + d1 * 256 + d2, d3), memory_config: (64, 8, 'tile<32x32, f32>', 'dram') | shape: [8 : i32, 256 : i32, 256 : i32] | tensor<[8,256,256,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (64, 8, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,8,256,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2048 + d1 * 256 + d2, d3), memory_config: (64, 8, 'tile<32x32, f32>', 'dram') | shape: [8 : i32, 256 : i32, 256 : i32] | tensor<[8,256,256,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (64, 8, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,8,256,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2048 + d1 * 256 + d2, d3), memory_config: (64, 1, 'tile<32x32, bf16>', 'dram') | shape: [8 : i32, 256 : i32, 32 : i32] | tensor<[8,256,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (64, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.02 | 24.5 | +| ttnn.reshape | tensor<[1,8,256,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2048 + d1 * 256 + d2, d3), memory_config: (64, 1, 'tile<32x32, bf16>', 'dram') | shape: [8 : i32, 256 : i32, 32 : i32] | tensor<[8,256,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (64, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,8,256,96,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2048 + d1 * 256 + 
d2, d3), memory_config: (64, 3, 'tile<32x32, bf16>', 'dram') | shape: [8 : i32, 256 : i32, 96 : i32] | tensor<[8,256,96,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (64, 3, 'tile<32x32, bf16>', 'dram') | yes | 0.03 | 3.56 | +| ttnn.reshape | tensor<[1,8,256,96,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2048 + d1 * 256 + d2, d3), memory_config: (64, 3, 'tile<32x32, bf16>', 'dram') | shape: [8 : i32, 256 : i32, 96 : i32] | tensor<[8,256,96,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (64, 3, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,8,256,9,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2048 + d1 * 256 + d2, d3), memory_config: (64, 1, 'tile<32x32, f32>', 'dram') | shape: [8 : i32, 256 : i32, 9 : i32] | tensor<[8,256,9,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (64, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,8,256,9,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2048 + d1 * 256 + d2, d3), memory_config: (64, 1, 'tile<32x32, f32>', 'dram') | shape: [8 : i32, 256 : i32, 9 : i32] | tensor<[8,256,9,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (64, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,8,300,300,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2400 + d1 * 300 + d2, d3), memory_config: (75, 10, 'tile<32x32, bf16>', 'dram') | shape: [8 : i32, 300 : i32, 300 : i32] | tensor<[8,300,300,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 300 + d1, d2), memory_config: (75, 10, 'tile<32x32, bf16>', 'dram') | yes | 0.01 | 1.0 | +| ttnn.reshape | tensor<[1,8,300,300,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2400 + d1 * 300 + d2, d3), memory_config: (75, 10, 'tile<32x32, bf16>', 'dram') | shape: [8 : i32, 300 : i32, 300 : i32] | 
tensor<[8,300,300,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 300 + d1, d2), memory_config: (75, 10, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,8,300,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2400 + d1 * 300 + d2, d3), memory_config: (75, 2, 'tile<32x32, bf16>', 'dram') | shape: [8 : i32, 300 : i32, 64 : i32] | tensor<[8,300,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 300 + d1, d2), memory_config: (75, 2, 'tile<32x32, bf16>', 'dram') | yes | 0.11 | 7.22 | +| ttnn.reshape | tensor<[1,8,300,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2400 + d1 * 300 + d2, d3), memory_config: (75, 2, 'tile<32x32, bf16>', 'dram') | shape: [8 : i32, 300 : i32, 64 : i32] | tensor<[8,300,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 300 + d1, d2), memory_config: (75, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,8,32,2048,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 32 + d2, d3), memory_config: (8, 64, 'tile<32x32, bf16>', 'dram') | shape: [8 : i32, 32 : i32, 2048 : i32] | tensor<[8,32,2048,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (8, 64, 'tile<32x32, bf16>', 'dram') | yes | 0.0 | 4.31 | +| ttnn.reshape | tensor<[1,8,32,2048,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 32 + d2, d3), memory_config: (8, 64, 'tile<32x32, bf16>', 'dram') | shape: [8 : i32, 32 : i32, 2048 : i32] | tensor<[8,32,2048,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (8, 64, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,8,32,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 32 + d2, d3), memory_config: (8, 8, 'tile<32x32, bf16>', 'dram') | shape: [8 : i32, 32 : i32, 256 : i32] | tensor<[8,32,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (8, 8, 
'tile<32x32, bf16>', 'dram') | yes | 0.0 | 10.38 | +| ttnn.reshape | tensor<[1,8,32,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 32 + d2, d3), memory_config: (8, 8, 'tile<32x32, bf16>', 'dram') | shape: [8 : i32, 32 : i32, 256 : i32] | tensor<[8,32,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (8, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,8,4096,4096,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 32768 + d1 * 4096 + d2, d3), memory_config: (1024, 128, 'tile<32x32, f32>', 'dram') | shape: [8 : i32, 4096 : i32, 4096 : i32] | tensor<[8,4096,4096,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (1024, 128, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,8,4096,4096,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 32768 + d1 * 4096 + d2, d3), memory_config: (1024, 128, 'tile<32x32, f32>', 'dram') | shape: [8 : i32, 4096 : i32, 4096 : i32] | tensor<[8,4096,4096,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (1024, 128, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,8,4096,40,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 32768 + d1 * 4096 + d2, d3), memory_config: (1024, 2, 'tile<32x32, f32>', 'dram') | shape: [8 : i32, 4096 : i32, 40 : i32] | tensor<[8,4096,40,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (1024, 2, 'tile<32x32, f32>', 'dram') | yes | 0.1 | 2.5 | +| ttnn.reshape | tensor<[1,8,4096,40,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 32768 + d1 * 4096 + d2, d3), memory_config: (1024, 2, 'tile<32x32, f32>', 'dram') | shape: [8 : i32, 4096 : i32, 40 : i32] | tensor<[8,4096,40,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (1024, 2, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | 
tensor<[1,8,4096,9,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 32768 + d1 * 4096 + d2, d3), memory_config: (1024, 1, 'tile<32x32, f32>', 'dram') | shape: [8 : i32, 4096 : i32, 9 : i32] | tensor<[8,4096,9,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (1024, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,8,4096,9,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 32768 + d1 * 4096 + d2, d3), memory_config: (1024, 1, 'tile<32x32, f32>', 'dram') | shape: [8 : i32, 4096 : i32, 9 : i32] | tensor<[8,4096,9,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (1024, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,8,40,4096,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 320 + d1 * 40 + d2, d3), memory_config: (10, 128, 'tile<32x32, f32>', 'dram') | shape: [8 : i32, 40 : i32, 4096 : i32] | tensor<[8,40,4096,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 40 + d1, d2), memory_config: (10, 128, 'tile<32x32, f32>', 'dram') | yes | -0.0 | 3.01 | +| ttnn.reshape | tensor<[1,8,40,4096,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 320 + d1 * 40 + d2, d3), memory_config: (10, 128, 'tile<32x32, f32>', 'dram') | shape: [8 : i32, 40 : i32, 4096 : i32] | tensor<[8,40,4096,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 40 + d1, d2), memory_config: (10, 128, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,8,40,9,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 320 + d1 * 40 + d2, d3), memory_config: (10, 1, 'tile<32x32, f32>', 'dram') | shape: [8 : i32, 40 : i32, 9 : i32] | tensor<[8,40,9,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 40 + d1, d2), memory_config: (10, 1, 'tile<32x32, f32>', 'dram') | yes | -0.01 | 6.65 | +| ttnn.reshape | tensor<[1,8,40,9,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 320 + d1 * 40 + d2, d3), memory_config: (10, 
1, 'tile<32x32, f32>', 'dram') | shape: [8 : i32, 40 : i32, 9 : i32] | tensor<[8,40,9,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 40 + d1, d2), memory_config: (10, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,8,64,10,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 512 + d1 * 64 + d2, d3), memory_config: (16, 1, 'tile<32x32, bf16>', 'dram') | shape: [8 : i32, 64 : i32, 10 : i32] | tensor<[8,64,10,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (16, 1, 'tile<32x32, bf16>', 'dram') | yes | -0.03 | 8.56 | +| ttnn.reshape | tensor<[1,8,64,10,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 512 + d1 * 64 + d2, d3), memory_config: (16, 1, 'tile<32x32, bf16>', 'dram') | shape: [8 : i32, 64 : i32, 10 : i32] | tensor<[8,64,10,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (16, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,8,64,160,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 512 + d1 * 64 + d2, d3), memory_config: (16, 5, 'tile<32x32, f32>', 'dram') | shape: [8 : i32, 64 : i32, 160 : i32] | tensor<[8,64,160,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (16, 5, 'tile<32x32, f32>', 'dram') | yes | 0.11 | 1.0 | +| ttnn.reshape | tensor<[1,8,64,160,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 512 + d1 * 64 + d2, d3), memory_config: (16, 5, 'tile<32x32, f32>', 'dram') | shape: [8 : i32, 64 : i32, 160 : i32] | tensor<[8,64,160,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (16, 5, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,8,64,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 512 + d1 * 64 + d2, d3), memory_config: (16, 1, 'tile<32x32, bf16>', 'dram') | shape: [8 : i32, 64 : i32, 1 : i32] | tensor<[8,64,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, 
d2), memory_config: (16, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.07 | 7.788445287802241e+34 | +| ttnn.reshape | tensor<[1,8,64,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 512 + d1 * 64 + d2, d3), memory_config: (16, 1, 'tile<32x32, bf16>', 'dram') | shape: [8 : i32, 64 : i32, 1 : i32] | tensor<[8,64,1,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (16, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,8,64,300,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 512 + d1 * 64 + d2, d3), memory_config: (16, 10, 'tile<32x32, bf16>', 'dram') | shape: [8 : i32, 64 : i32, 300 : i32] | tensor<[8,64,300,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (16, 10, 'tile<32x32, bf16>', 'dram') | yes | -0.0 | 6.44 | +| ttnn.reshape | tensor<[1,8,64,300,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 512 + d1 * 64 + d2, d3), memory_config: (16, 10, 'tile<32x32, bf16>', 'dram') | shape: [8 : i32, 64 : i32, 300 : i32] | tensor<[8,64,300,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (16, 10, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,8,64,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 512 + d1 * 64 + d2, d3), memory_config: (16, 2, 'tile<32x32, f32>', 'dram') | shape: [8 : i32, 64 : i32, 64 : i32] | tensor<[8,64,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (16, 2, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,8,64,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 512 + d1 * 64 + d2, d3), memory_config: (16, 2, 'tile<32x32, f32>', 'dram') | shape: [8 : i32, 64 : i32, 64 : i32] | tensor<[8,64,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (16, 2, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,8,64,9,f32]> | 
mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 512 + d1 * 64 + d2, d3), memory_config: (16, 1, 'tile<32x32, f32>', 'dram') | shape: [8 : i32, 64 : i32, 9 : i32] | tensor<[8,64,9,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (16, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,8,64,9,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 512 + d1 * 64 + d2, d3), memory_config: (16, 1, 'tile<32x32, f32>', 'dram') | shape: [8 : i32, 64 : i32, 9 : i32] | tensor<[8,64,9,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (16, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,8,80,1024,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 640 + d1 * 80 + d2, d3), memory_config: (20, 32, 'tile<32x32, f32>', 'dram') | shape: [8 : i32, 80 : i32, 1024 : i32] | tensor<[8,80,1024,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 80 + d1, d2), memory_config: (20, 32, 'tile<32x32, f32>', 'dram') | yes | 0.0 | 2.14 | +| ttnn.reshape | tensor<[1,8,80,1024,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 640 + d1 * 80 + d2, d3), memory_config: (20, 32, 'tile<32x32, f32>', 'dram') | shape: [8 : i32, 80 : i32, 1024 : i32] | tensor<[8,80,1024,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 80 + d1, d2), memory_config: (20, 32, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,8,80,9,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 640 + d1 * 80 + d2, d3), memory_config: (20, 1, 'tile<32x32, f32>', 'dram') | shape: [8 : i32, 80 : i32, 9 : i32] | tensor<[8,80,9,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 80 + d1, d2), memory_config: (20, 1, 'tile<32x32, f32>', 'dram') | yes | 0.01 | 4.12 | +| ttnn.reshape | tensor<[1,8,80,9,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 640 + d1 * 80 + d2, d3), memory_config: (20, 1, 'tile<32x32, f32>', 'dram') | shape: [8 : i32, 80 : i32, 9 : 
i32] | tensor<[8,80,9,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 80 + d1, d2), memory_config: (20, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,8,8,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 64 + d1 * 8 + d2, d3), memory_config: (2, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 1280 : i32] | tensor<[1,64,1280,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (2, 40, 'tile<32x32, bf16>', 'dram') | yes | -0.0 | 12.56 | +| ttnn.reshape | tensor<[1,8,8,1280,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 64 + d1 * 8 + d2, d3), memory_config: (2, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 1280 : i32] | tensor<[1,64,1280,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (2, 40, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,8,9,160,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 72 + d1 * 9 + d2, d3), memory_config: (3, 5, 'tile<32x32, f32>', 'dram') | shape: [8 : i32, 9 : i32, 160 : i32] | tensor<[8,9,160,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (3, 5, 'tile<32x32, f32>', 'dram') | yes | 0.06 | 6.3 | +| ttnn.reshape | tensor<[1,8,9,160,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 72 + d1 * 9 + d2, d3), memory_config: (3, 5, 'tile<32x32, f32>', 'dram') | shape: [8 : i32, 9 : i32, 160 : i32] | tensor<[8,9,160,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (3, 5, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,8,9,40,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 72 + d1 * 9 + d2, d3), memory_config: (3, 2, 'tile<32x32, f32>', 'dram') | shape: [8 : i32, 9 : i32, 40 : i32] | tensor<[8,9,40,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (3, 2, 'tile<32x32, f32>', 'dram') | yes | 0.05 | 5.01 | +| 
ttnn.reshape | tensor<[1,8,9,40,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 72 + d1 * 9 + d2, d3), memory_config: (3, 2, 'tile<32x32, f32>', 'dram') | shape: [8 : i32, 9 : i32, 40 : i32] | tensor<[8,9,40,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (3, 2, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,8,9,80,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 72 + d1 * 9 + d2, d3), memory_config: (3, 3, 'tile<32x32, f32>', 'dram') | shape: [8 : i32, 9 : i32, 80 : i32] | tensor<[8,9,80,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (3, 3, 'tile<32x32, f32>', 'dram') | yes | 0.09 | 8.02 | +| ttnn.reshape | tensor<[1,8,9,80,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 72 + d1 * 9 + d2, d3), memory_config: (3, 3, 'tile<32x32, f32>', 'dram') | shape: [8 : i32, 9 : i32, 80 : i32] | tensor<[8,9,80,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (3, 3, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,920,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 29, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 920 : i32] | tensor<[1,1,1,920,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 29, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,920,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 29, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 1 : i32, 1 : i32, 920 : i32] | tensor<[1,1,1,920,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 + d1 + d2, d3), memory_config: (1, 29, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,960,32,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 30720 + d1 * 32 + d2, d3), memory_config: (960, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 30 : i32, 1024 : 
i32] | tensor<[1,32,30,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 960 + d1 * 30 + d2, d3), memory_config: (30, 32, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,960,32,32,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 30720 + d1 * 32 + d2, d3), memory_config: (960, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 30 : i32, 1024 : i32] | tensor<[1,32,30,1024,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 960 + d1 * 30 + d2, d3), memory_config: (30, 32, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,960,64,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 61440 + d1 * 64 + d2, d3), memory_config: (1920, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 30 : i32, 4096 : i32] | tensor<[1,32,30,4096,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 960 + d1 * 30 + d2, d3), memory_config: (30, 128, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,960,64,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 61440 + d1 * 64 + d2, d3), memory_config: (1920, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 30 : i32, 4096 : i32] | tensor<[1,32,30,4096,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 960 + d1 * 30 + d2, d3), memory_config: (30, 128, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,9,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 9 : i32, 16 : i32, 64 : i32] | tensor<[1,9,16,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 144 + d1 * 16 + d2, d3), memory_config: (5, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,9,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 9 : i32, 16 : i32, 64 : i32] | 
tensor<[1,9,16,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 144 + d1 * 16 + d2, d3), memory_config: (5, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,9,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | shape: [9 : i32, 1024 : i32] | tensor<[9,1024,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,9,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | shape: [9 : i32, 1024 : i32] | tensor<[9,1024,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,9,1280,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (1, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 9 : i32, 8 : i32, 160 : i32] | tensor<[1,9,8,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 72 + d1 * 8 + d2, d3), memory_config: (3, 5, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,9,1280,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (1, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 9 : i32, 8 : i32, 160 : i32] | tensor<[1,9,8,160,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 72 + d1 * 8 + d2, d3), memory_config: (3, 5, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,9,128,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (1, 4, 'tile<32x32, bf16>', 'dram') | shape: [9 : i32, 128 : i32] | tensor<[9,128,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 4, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,9,128,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 
9 + d1, d2), memory_config: (1, 4, 'tile<32x32, bf16>', 'dram') | shape: [9 : i32, 128 : i32] | tensor<[9,128,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,9,12,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 108 + d1 * 12 + d2, d3), memory_config: (4, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 9 : i32, 768 : i32] | tensor<[1,9,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,9,12,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 108 + d1 * 12 + d2, d3), memory_config: (4, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 9 : i32, 768 : i32] | tensor<[1,9,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,9,16384,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (1, 512, 'tile<32x32, bf16>', 'dram') | shape: [9 : i32, 16384 : i32] | tensor<[9,16384,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 512, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[1,9,16384,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (1, 512, 'tile<32x32, bf16>', 'dram') | shape: [9 : i32, 16384 : i32] | tensor<[9,16384,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 512, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,9,16,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 144 + d1 * 16 + d2, d3), memory_config: (5, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 9 : i32, 2048 : i32] | tensor<[1,9,2048,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (1, 64, 'tile<32x32, bf16>', 'dram') | yes | 
1.0 | 0.0 | +| ttnn.reshape | tensor<[1,9,16,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 144 + d1 * 16 + d2, d3), memory_config: (5, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 9 : i32, 2048 : i32] | tensor<[1,9,2048,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (1, 64, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,9,16,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 144 + d1 * 16 + d2, d3), memory_config: (5, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 9 : i32, 1024 : i32] | tensor<[1,9,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,9,16,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 144 + d1 * 16 + d2, d3), memory_config: (5, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 9 : i32, 1024 : i32] | tensor<[1,9,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,9,2048,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (1, 64, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 9 : i32, 16 : i32, 128 : i32] | tensor<[1,9,16,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 144 + d1 * 16 + d2, d3), memory_config: (5, 4, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,9,2048,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (1, 64, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 9 : i32, 16 : i32, 128 : i32] | tensor<[1,9,16,128,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 144 + d1 * 16 + d2, d3), memory_config: (5, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,9,2048,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), 
memory_config: (1, 64, 'tile<32x32, bf16>', 'dram') | shape: [9 : i32, 2048 : i32] | tensor<[9,2048,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 64, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,9,2048,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (1, 64, 'tile<32x32, bf16>', 'dram') | shape: [9 : i32, 2048 : i32] | tensor<[9,2048,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 64, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,9,3072,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | shape: [9 : i32, 3072 : i32] | tensor<[9,3072,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,9,3072,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | shape: [9 : i32, 3072 : i32] | tensor<[9,3072,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,9,320,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (1, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 9 : i32, 8 : i32, 40 : i32] | tensor<[1,9,8,40,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 72 + d1 * 8 + d2, d3), memory_config: (3, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,9,320,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (1, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 9 : i32, 8 : i32, 40 : i32] | tensor<[1,9,8,40,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 72 + d1 * 8 + d2, d3), memory_config: (3, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | 
tensor<[1,9,4096,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (1, 128, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 9 : i32, 64 : i32, 64 : i32] | tensor<[1,9,64,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 576 + d1 * 64 + d2, d3), memory_config: (18, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,9,4096,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (1, 128, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 9 : i32, 64 : i32, 64 : i32] | tensor<[1,9,64,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 576 + d1 * 64 + d2, d3), memory_config: (18, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,9,4096,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (1, 128, 'tile<32x32, bf16>', 'dram') | shape: [9 : i32, 4096 : i32] | tensor<[9,4096,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 128, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,9,4096,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (1, 128, 'tile<32x32, bf16>', 'dram') | shape: [9 : i32, 4096 : i32] | tensor<[9,4096,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 128, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,9,640,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (1, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 9 : i32, 8 : i32, 80 : i32] | tensor<[1,9,8,80,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 72 + d1 * 8 + d2, d3), memory_config: (3, 3, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,9,640,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (1, 20, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 9 : i32, 8 : i32, 80 : i32] | 
tensor<[1,9,8,80,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 72 + d1 * 8 + d2, d3), memory_config: (3, 3, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,9,64,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 576 + d1 * 64 + d2, d3), memory_config: (18, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 9 : i32, 4096 : i32] | tensor<[1,9,4096,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (1, 128, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,9,64,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 576 + d1 * 64 + d2, d3), memory_config: (18, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 9 : i32, 4096 : i32] | tensor<[1,9,4096,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (1, 128, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,9,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 9 : i32, 12 : i32, 64 : i32] | tensor<[1,9,12,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 108 + d1 * 12 + d2, d3), memory_config: (4, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,9,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 9 : i32, 12 : i32, 64 : i32] | tensor<[1,9,12,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 108 + d1 * 12 + d2, d3), memory_config: (4, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,9,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [9 : i32, 768 : i32] | tensor<[9,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| 
ttnn.reshape | tensor<[1,9,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [9 : i32, 768 : i32] | tensor<[9,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[1,9,8192,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (1, 256, 'tile<32x32, bf16>', 'dram') | shape: [9 : i32, 8192 : i32] | tensor<[9,8192,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 256, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[1,9,8192,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (1, 256, 'tile<32x32, bf16>', 'dram') | shape: [9 : i32, 8192 : i32] | tensor<[9,8192,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 256, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[201,3072,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (7, 96, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 201 : i32, 3072 : i32] | tensor<[1,201,3072,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 201 + d1, d2), memory_config: (7, 96, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[201,3072,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (7, 96, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 201 : i32, 3072 : i32] | tensor<[1,201,3072,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 201 + d1, d2), memory_config: (7, 96, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[201,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (7, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 201 : i32, 768 : i32] | tensor<[1,201,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 201 + d1, d2), memory_config: (7, 24, 'tile<32x32, bf16>', 
'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[201,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (7, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 201 : i32, 768 : i32] | tensor<[1,201,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 201 + d1, d2), memory_config: (7, 24, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[2048,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 64, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 2048 : i32, 1 : i32, 1 : i32] | tensor<[1,2048,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2048 + d1 + d2, d3), memory_config: (64, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[2048,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 64, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 2048 : i32, 1 : i32, 1 : i32] | tensor<[1,2048,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2048 + d1 + d2, d3), memory_config: (64, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[2048,1280,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (64, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 2048 : i32, 1280 : i32] | tensor<[1,2048,1280,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 2048 + d1, d2), memory_config: (64, 40, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[2048,1280,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (64, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 2048 : i32, 1280 : i32] | tensor<[1,2048,1280,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 2048 + d1, d2), memory_config: (64, 40, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[2048,256,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (64, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 2048 : i32, 256 : i32] | tensor<[1,2048,256,bf16]> 
| mapping_from: (d0, d1, d2), mapping_to: (d0 * 2048 + d1, d2), memory_config: (64, 8, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[2048,256,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (64, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 2048 : i32, 256 : i32] | tensor<[1,2048,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 2048 + d1, d2), memory_config: (64, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[2048,262,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (64, 9, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 2048 : i32, 262 : i32] | tensor<[1,2048,262,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 2048 + d1, d2), memory_config: (64, 9, 'tile<32x32, bf16>', 'dram') | yes | 0.0 | 32.25 | +| ttnn.reshape | tensor<[2048,262,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (64, 9, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 2048 : i32, 262 : i32] | tensor<[1,2048,262,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 2048 + d1, d2), memory_config: (64, 9, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[2048,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (64, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 2048 : i32, 768 : i32] | tensor<[1,2048,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 2048 + d1, d2), memory_config: (64, 24, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[2048,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (64, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 2048 : i32, 768 : i32] | tensor<[1,2048,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 2048 + d1, d2), memory_config: (64, 24, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[20,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, 
f32>', 'dram') | shape: [20 : i32, 1 : i32] | tensor<[20,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[20,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 20 : i32] | tensor<[1,20,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[20,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 20 : i32] | tensor<[1,20,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[240,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 8, 'tile<32x32, f32>', 'dram') | shape: [240 : i32, 1 : i32] | tensor<[240,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (8, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[256,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 1 : i32, 1 : i32] | tensor<[1,256,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 + d2, d3), memory_config: (8, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[256,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 1 : i32, 1 : i32] | tensor<[1,256,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 + d2, d3), memory_config: (8, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[256,10240,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (8, 320, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 10240 : i32] | tensor<[1,256,10240,bf16]> | 
mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 320, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[256,10240,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (8, 320, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 10240 : i32] | tensor<[1,256,10240,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 320, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[256,1024,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (8, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 1024 : i32] | tensor<[1,256,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 32, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[256,1024,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (8, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 1024 : i32] | tensor<[1,256,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 32, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[256,1280,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 1280 : i32] | tensor<[1,256,1280,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 40, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[256,160,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (8, 5, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 160 : i32] | tensor<[1,256,160,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 5, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[256,160,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (8, 5, 'tile<32x32, 
bf16>', 'dram') | shape: [1 : i32, 256 : i32, 160 : i32] | tensor<[1,256,160,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 5, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[256,256,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (8, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 256 : i32] | tensor<[1,256,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 8, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[256,256,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (8, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 256 : i32] | tensor<[1,256,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[256,2,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (8, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 2 : i32] | tensor<[1,256,2,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 1, 'tile<32x32, bf16>', 'dram') | yes | -0.01 | 2.050957259121257e+35 | +| ttnn.reshape | tensor<[256,2,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (8, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 2 : i32] | tensor<[1,256,2,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[256,32,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (8, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 32 : i32] | tensor<[1,256,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[256,32,bf16]> | mapping_from: (d0, d1), 
mapping_to: (d0, d1), memory_config: (8, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 32 : i32] | tensor<[1,256,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[256,4096,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (8, 128, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 4096 : i32] | tensor<[1,256,4096,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 128, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[256,4096,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (8, 128, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 4096 : i32] | tensor<[1,256,4096,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 128, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[256,512,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (8, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 512 : i32] | tensor<[1,256,512,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 16, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[256,512,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (8, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 512 : i32] | tensor<[1,256,512,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[256,64,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (8, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 64 : i32] | tensor<[1,256,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | 
+| ttnn.reshape | tensor<[256,64,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (8, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 64 : i32] | tensor<[1,256,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[256,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (8, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 768 : i32] | tensor<[1,256,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 24, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[256,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (8, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 256 : i32, 768 : i32] | tensor<[1,256,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (8, 24, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[25,12,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [150 : i32, 2 : i32] | tensor<[150,2,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (5, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[25,12,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [150 : i32, 2 : i32] | tensor<[150,2,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (5, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[25,2,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 25 : i32, 2 : i32] | tensor<[1,25,2,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 25 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | yes | 0.18 | 12.06 | +| ttnn.reshape | 
tensor<[25,2,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 25 : i32, 2 : i32] | tensor<[1,25,2,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 25 + d1, d2), memory_config: (1, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[25,3072,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 25 : i32, 3072 : i32] | tensor<[1,25,3072,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 25 + d1, d2), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[25,3072,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 25 : i32, 3072 : i32] | tensor<[1,25,3072,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 25 + d1, d2), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[25,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 25 : i32, 768 : i32] | tensor<[1,25,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 25 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[25,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 25 : i32, 768 : i32] | tensor<[1,25,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 25 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[27,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [27 : i32, 1 : i32] | tensor<[27,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | 
tensor<[27,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [27 : i32, 1 : i32] | tensor<[27,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[2,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [2 : i32, 1 : i32] | tensor<[2,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[2,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [2 : i32, 1 : i32] | tensor<[2,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[2,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 2 : i32] | tensor<[1,2,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[2,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 2 : i32] | tensor<[1,2,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[2,14,14,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 14 + d1, d2), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [2 : i32, 196 : i32] | tensor<[2,196,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 7, 'tile<32x32, u32>', 'dram') | yes | 0.04 | nan | +| ttnn.reshape | tensor<[2,14,14,i32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 14 + d1, d2), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [2 : i32, 196 : i32] | tensor<[2,196,i32]> | mapping_from: (d0, 
d1), mapping_to: (d0, d1), memory_config: (1, 7, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[2,7,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [2 : i32, 7 : i32] | tensor<[2,7,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | 0.2 | nan | +| ttnn.reshape | tensor<[2,7,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [2 : i32, 7 : i32] | tensor<[2,7,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[2,7,2048,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (1, 64, 'tile<32x32, bf16>', 'dram') | shape: [14 : i32, 2048 : i32] | tensor<[14,2048,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 64, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[2,7,2048,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (1, 64, 'tile<32x32, bf16>', 'dram') | shape: [14 : i32, 2048 : i32] | tensor<[14,2048,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 64, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[2,7,512,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | shape: [14 : i32, 512 : i32] | tensor<[14,512,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[2,7,512,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | shape: [14 : i32, 512 : i32] | tensor<[14,512,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') 
| N/A | nan | nan | +| ttnn.reshape | tensor<[2,7,512,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | shape: [2 : i32, 7 : i32, 8 : i32, 64 : i32] | tensor<[2,7,8,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 56 + d1 * 8 + d2, d3), memory_config: (4, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[2,7,512,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | shape: [2 : i32, 7 : i32, 8 : i32, 64 : i32] | tensor<[2,7,8,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 56 + d1 * 8 + d2, d3), memory_config: (4, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[2,7,8,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 56 + d1 * 8 + d2, d3), memory_config: (4, 2, 'tile<32x32, bf16>', 'dram') | shape: [2 : i32, 7 : i32, 512 : i32] | tensor<[2,7,512,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[2,7,8,64,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 56 + d1 * 8 + d2, d3), memory_config: (4, 2, 'tile<32x32, bf16>', 'dram') | shape: [2 : i32, 7 : i32, 512 : i32] | tensor<[2,7,512,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[2,8,7,7,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 56 + d1 * 7 + d2, d3), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | shape: [16 : i32, 7 : i32, 7 : i32] | tensor<[16,7,7,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[2,8,7,7,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 56 + d1 * 7 + d2, d3), memory_config: (4, 1, 
'tile<32x32, f32>', 'dram') | shape: [16 : i32, 7 : i32, 7 : i32] | tensor<[16,7,7,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (4, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[300,128,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (10, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 300 : i32, 128 : i32] | tensor<[1,300,128,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 300 + d1, d2), memory_config: (10, 4, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[300,128,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (10, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 300 : i32, 128 : i32] | tensor<[1,300,128,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 300 + d1, d2), memory_config: (10, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[300,2048,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (10, 64, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 300 : i32, 2048 : i32] | tensor<[1,300,2048,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 300 + d1, d2), memory_config: (10, 64, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[300,2048,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (10, 64, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 300 : i32, 2048 : i32] | tensor<[1,300,2048,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 300 + d1, d2), memory_config: (10, 64, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[300,320,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (10, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 300 : i32, 320 : i32] | tensor<[1,300,320,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 300 + d1, d2), memory_config: (10, 10, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[300,320,bf16]> | 
mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (10, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 300 : i32, 320 : i32] | tensor<[1,300,320,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 300 + d1, d2), memory_config: (10, 10, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[300,512,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (10, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 300 : i32, 512 : i32] | tensor<[1,300,512,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 300 + d1, d2), memory_config: (10, 16, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[300,512,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (10, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 300 : i32, 512 : i32] | tensor<[1,300,512,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 300 + d1, d2), memory_config: (10, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[300,64,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (10, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 300 : i32, 64 : i32] | tensor<[1,300,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 300 + d1, d2), memory_config: (10, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[300,64,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (10, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 300 : i32, 64 : i32] | tensor<[1,300,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 300 + d1, d2), memory_config: (10, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[30,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [30 : i32, 1 : i32] | tensor<[30,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | 
tensor<[30,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [30 : i32, 1 : i32] | tensor<[30,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[320,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 10, 'tile<32x32, f32>', 'dram') | shape: [320 : i32, 1 : i32] | tensor<[320,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (10, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[3234,1,4,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (102, 1, 'tile<32x32, f32>', 'dram') | shape: [3234 : i32, 4 : i32] | tensor<[3234,4,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (102, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[3234,1,4,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (102, 1, 'tile<32x32, f32>', 'dram') | shape: [3234 : i32, 4 : i32] | tensor<[3234,4,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (102, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[3234,2,2,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 2 + d1, d2), memory_config: (203, 1, 'tile<32x32, f32>', 'dram') | shape: [3234 : i32, 4 : i32] | tensor<[3234,4,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (102, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[3234,2,2,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 2 + d1, d2), memory_config: (203, 1, 'tile<32x32, f32>', 'dram') | shape: [3234 : i32, 4 : i32] | tensor<[3234,4,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (102, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[32,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, 
u32>', 'dram') | shape: [32 : i32, 1 : i32] | tensor<[32,1,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | 0.42 | nan | +| ttnn.reshape | tensor<[32,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [32 : i32, 1 : i32] | tensor<[32,1,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[32,1536,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 48, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 1536 : i32] | tensor<[1,32,1536,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (1, 48, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[32,1536,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 48, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 1536 : i32] | tensor<[1,32,1536,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (1, 48, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[32,32,128,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (32, 4, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 32 : i32, 32 : i32, 128 : i32] | tensor<[1,32,32,128,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 4, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[32,32,128,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (32, 4, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 32 : i32, 32 : i32, 128 : i32] | tensor<[1,32,32,128,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 4, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[32,32,32,f32]> | mapping_from: (d0, d1, 
d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (32, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 32 : i32, 32 : i32, 32 : i32] | tensor<[1,32,32,32,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[32,32,32,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (32, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 32 : i32, 32 : i32, 32 : i32] | tensor<[1,32,32,32,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 1024 + d1 * 32 + d2, d3), memory_config: (32, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[32,4608,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 144, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 4608 : i32] | tensor<[1,32,4608,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (1, 144, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[32,4608,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 144, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 4608 : i32] | tensor<[1,32,4608,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (1, 144, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[32,6144,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 192, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 6144 : i32] | tensor<[1,32,6144,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), memory_config: (1, 192, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[32,6144,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 192, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 32 : i32, 6144 : i32] | tensor<[1,32,6144,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 32 + d1, d2), 
memory_config: (1, 192, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[38809,12,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1213, 1, 'tile<32x32, bf16>', 'dram') | shape: [197 : i32, 197 : i32, 12 : i32] | tensor<[197,197,12,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 197 + d1, d2), memory_config: (1213, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[38809,12,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1213, 1, 'tile<32x32, bf16>', 'dram') | shape: [197 : i32, 197 : i32, 12 : i32] | tensor<[197,197,12,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 197 + d1, d2), memory_config: (1213, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[38809,16,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1213, 1, 'tile<32x32, bf16>', 'dram') | shape: [197 : i32, 197 : i32, 16 : i32] | tensor<[197,197,16,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 197 + d1, d2), memory_config: (1213, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[38809,16,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1213, 1, 'tile<32x32, bf16>', 'dram') | shape: [197 : i32, 197 : i32, 16 : i32] | tensor<[197,197,16,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 197 + d1, d2), memory_config: (1213, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[3,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [3 : i32, 1 : i32] | tensor<[3,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[3,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 3 : i32] | tensor<[1,3,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), 
memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[3,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 3 : i32] | tensor<[1,3,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[3,1445,1445,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1445 + d1, d2), memory_config: (136, 46, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 3 : i32, 1445 : i32, 1445 : i32] | tensor<[1,3,1445,1445,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4335 + d1 * 1445 + d2, d3), memory_config: (136, 46, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[3,1445,1445,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1445 + d1, d2), memory_config: (136, 46, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 3 : i32, 1445 : i32, 1445 : i32] | tensor<[1,3,1445,1445,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4335 + d1 * 1445 + d2, d3), memory_config: (136, 46, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[3,1445,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1445 + d1, d2), memory_config: (136, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 3 : i32, 1445 : i32, 64 : i32] | tensor<[1,3,1445,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4335 + d1 * 1445 + d2, d3), memory_config: (136, 2, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.01 | +| ttnn.reshape | tensor<[3,1445,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1445 + d1, d2), memory_config: (136, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 3 : i32, 1445 : i32, 64 : i32] | tensor<[1,3,1445,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 4335 + d1 * 1445 + d2, d3), memory_config: (136, 2, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[400,12,f32]> | mapping_from: (d0, d1), 
mapping_to: (d0, d1), memory_config: (13, 1, 'tile<32x32, f32>', 'dram') | shape: [2400 : i32, 2 : i32] | tensor<[2400,2,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (75, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[400,12,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (13, 1, 'tile<32x32, f32>', 'dram') | shape: [2400 : i32, 2 : i32] | tensor<[2400,2,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (75, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[4096,2560,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (128, 80, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 4096 : i32, 2560 : i32] | tensor<[1,4096,2560,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (128, 80, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[4096,2560,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (128, 80, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 4096 : i32, 2560 : i32] | tensor<[1,4096,2560,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (128, 80, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[4096,256,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (128, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 4096 : i32, 256 : i32] | tensor<[1,4096,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (128, 8, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[4096,256,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (128, 8, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 4096 : i32, 256 : i32] | tensor<[1,4096,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (128, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | 
tensor<[4096,320,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 4096 : i32, 320 : i32] | tensor<[1,4096,320,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (128, 10, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[4096,64,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (128, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 4096 : i32, 64 : i32] | tensor<[1,4096,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (128, 2, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[4096,64,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (128, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 4096 : i32, 64 : i32] | tensor<[1,4096,64,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (128, 2, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[4800,128,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (150, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 4800 : i32, 128 : i32] | tensor<[1,4800,128,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4800 + d1, d2), memory_config: (150, 4, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[4800,128,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (150, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 4800 : i32, 128 : i32] | tensor<[1,4800,128,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4800 + d1, d2), memory_config: (150, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[4800,512,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (150, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 4800 : i32, 512 : i32] | tensor<[1,4800,512,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4800 + 
d1, d2), memory_config: (150, 16, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[4800,512,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (150, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 4800 : i32, 512 : i32] | tensor<[1,4800,512,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4800 + d1, d2), memory_config: (150, 16, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[480,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 15, 'tile<32x32, f32>', 'dram') | shape: [480 : i32, 1 : i32] | tensor<[480,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (15, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[4,12,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [24 : i32, 2 : i32] | tensor<[24,2,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[4,12,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [24 : i32, 2 : i32] | tensor<[24,2,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[50,3072,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (2, 96, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 50 : i32, 3072 : i32] | tensor<[1,50,3072,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 50 + d1, d2), memory_config: (2, 96, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[50,3072,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (2, 96, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 50 : i32, 3072 : i32] | tensor<[1,50,3072,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 50 + d1, d2), memory_config: (2, 96, 'tile<32x32, bf16>', 
'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[50,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (2, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 50 : i32, 768 : i32] | tensor<[1,50,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 50 + d1, d2), memory_config: (2, 24, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[50,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (2, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 50 : i32, 768 : i32] | tensor<[1,50,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 50 + d1, d2), memory_config: (2, 24, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[512,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 512 : i32, 1 : i32, 1 : i32] | tensor<[1,512,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 512 + d1 + d2, d3), memory_config: (16, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[512,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 16, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 512 : i32, 1 : i32, 1 : i32] | tensor<[1,512,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 512 + d1 + d2, d3), memory_config: (16, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[5,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [5 : i32, 1 : i32] | tensor<[5,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[5,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [5 : i32, 1 : i32] | tensor<[5,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| 
ttnn.reshape | tensor<[5,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 5 : i32] | tensor<[1,5,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[5,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 5 : i32] | tensor<[1,5,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[5,1024,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 5 : i32, 1024 : i32] | tensor<[1,5,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 5 + d1, d2), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[5,4096,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 128, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 5 : i32, 4096 : i32] | tensor<[1,5,4096,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 5 + d1, d2), memory_config: (1, 128, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[5,4096,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 128, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 5 : i32, 4096 : i32] | tensor<[1,5,4096,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 5 + d1, d2), memory_config: (1, 128, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[5,51200,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1600, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 5 : i32, 51200 : i32] | tensor<[1,5,51200,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 5 + d1, d2), memory_config: (1, 1600, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[5,51200,bf16]> | 
mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1600, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 5 : i32, 51200 : i32] | tensor<[1,5,51200,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 5 + d1, d2), memory_config: (1, 1600, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[60,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 2, 'tile<32x32, f32>', 'dram') | shape: [60 : i32, 1 : i32] | tensor<[60,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (2, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[60,f32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 2, 'tile<32x32, f32>', 'dram') | shape: [60 : i32, 1 : i32] | tensor<[60,1,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (2, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[64,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 1 : i32, 1 : i32] | tensor<[1,64,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 64 + d1 + d2, d3), memory_config: (2, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[64,bf16]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 2, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 1 : i32, 1 : i32] | tensor<[1,64,1,1,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 64 + d1 + d2, d3), memory_config: (2, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[64,10240,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (2, 320, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 10240 : i32] | tensor<[1,64,10240,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (2, 320, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[64,10240,bf16]> | mapping_from: (d0, 
d1), mapping_to: (d0, d1), memory_config: (2, 320, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 10240 : i32] | tensor<[1,64,10240,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (2, 320, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[64,1280,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (2, 40, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 64 : i32, 1280 : i32] | tensor<[1,64,1280,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (2, 40, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[64,9,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (18, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 64 : i32, 9 : i32, 64 : i32] | tensor<[1,64,9,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 576 + d1 * 9 + d2, d3), memory_config: (18, 2, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[64,9,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (18, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 64 : i32, 9 : i32, 64 : i32] | tensor<[1,64,9,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 576 + d1 * 9 + d2, d3), memory_config: (18, 2, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[64,9,9,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (18, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 64 : i32, 9 : i32, 9 : i32] | tensor<[1,64,9,9,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 576 + d1 * 9 + d2, d3), memory_config: (18, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[64,9,9,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (18, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 64 : i32, 9 : i32, 9 : i32] | tensor<[1,64,9,9,f32]> | mapping_from: (d0, d1, d2, 
d3), mapping_to: (d0 * 576 + d1 * 9 + d2, d3), memory_config: (18, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[6,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [6 : i32, 1 : i32] | tensor<[6,1,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | 0.08 | nan | +| ttnn.reshape | tensor<[6,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [6 : i32, 1 : i32] | tensor<[6,1,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[6,1024,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 6 : i32, 1024 : i32] | tensor<[1,6,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 6 + d1, d2), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[6,1,100,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 100 + d2, d3), memory_config: (19, 8, 'tile<32x32, bf16>', 'dram') | shape: [6 : i32, 100 : i32, 256 : i32] | tensor<[6,100,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 100 + d1, d2), memory_config: (19, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[6,1,100,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 100 + d1 * 100 + d2, d3), memory_config: (19, 8, 'tile<32x32, bf16>', 'dram') | shape: [600 : i32, 256 : i32] | tensor<[600,256,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (19, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[6,1,256,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 256 + d2, d3), memory_config: (48, 8, 'tile<32x32, bf16>', 'dram') | shape: [6 : i32, 256 : i32, 256 : i32] | 
tensor<[6,256,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (48, 8, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[6,1,256,256,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 256 + d2, d3), memory_config: (48, 8, 'tile<32x32, bf16>', 'dram') | shape: [6 : i32, 256 : i32, 256 : i32] | tensor<[6,256,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (48, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[6,1,256,92,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 256 + d2, d3), memory_config: (48, 3, 'tile<32x32, bf16>', 'dram') | shape: [6 : i32, 256 : i32, 92 : i32] | tensor<[6,256,92,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (48, 3, 'tile<32x32, bf16>', 'dram') | yes | 0.0 | 4.03 | +| ttnn.reshape | tensor<[6,1,256,92,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 256 + d1 * 256 + d2, d3), memory_config: (48, 3, 'tile<32x32, bf16>', 'dram') | shape: [6 : i32, 256 : i32, 92 : i32] | tensor<[6,256,92,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (48, 3, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[729,12,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (23, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 27 : i32, 27 : i32, 12 : i32] | tensor<[1,27,27,12,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 729 + d1 * 27 + d2, d3), memory_config: (23, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[729,12,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (23, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 27 : i32, 27 : i32, 12 : i32] | tensor<[1,27,27,12,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 729 + d1 * 27 + d2, d3), memory_config: (23, 1, 'tile<32x32, bf16>', 'dram') 
| N/A | nan | nan | +| ttnn.reshape | tensor<[729,16,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (23, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 27 : i32, 27 : i32, 16 : i32] | tensor<[1,27,27,16,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 729 + d1 * 27 + d2, d3), memory_config: (23, 1, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[729,16,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (23, 1, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 27 : i32, 27 : i32, 16 : i32] | tensor<[1,27,27,16,bf16]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 729 + d1 * 27 + d2, d3), memory_config: (23, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[7,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [7 : i32, 1 : i32] | tensor<[7,1,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | 0.64 | nan | +| ttnn.reshape | tensor<[7,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [7 : i32, 1 : i32] | tensor<[7,1,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[7,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [7 : i32, 1 : i32] | tensor<[7,1,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | yes | 0.48 | nan | +| ttnn.reshape | tensor<[7,i32]> | mapping_from: (d0), mapping_to: (0, d0), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | shape: [7 : i32, 1 : i32] | tensor<[7,1,i32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, u32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[7,2304,bf16]> | mapping_from: (d0, d1), 
mapping_to: (d0, d1), memory_config: (1, 72, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 2304 : i32] | tensor<[1,7,2304,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (1, 72, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[7,2304,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 72, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 2304 : i32] | tensor<[1,7,2304,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (1, 72, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[7,3072,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 3072 : i32] | tensor<[1,7,3072,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[7,3072,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 3072 : i32] | tensor<[1,7,3072,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[7,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 768 : i32] | tensor<[1,7,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[7,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 7 : i32, 768 : i32] | tensor<[1,7,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 7 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | 
tensor<[8,1024,1024,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (256, 32, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 8 : i32, 1024 : i32, 1024 : i32] | tensor<[1,8,1024,1024,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 8192 + d1 * 1024 + d2, d3), memory_config: (256, 32, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[8,1024,1024,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (256, 32, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 8 : i32, 1024 : i32, 1024 : i32] | tensor<[1,8,1024,1024,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 8192 + d1 * 1024 + d2, d3), memory_config: (256, 32, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[8,1024,80,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (256, 3, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 8 : i32, 1024 : i32, 80 : i32] | tensor<[1,8,1024,80,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 8192 + d1 * 1024 + d2, d3), memory_config: (256, 3, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[8,1024,80,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (256, 3, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 8 : i32, 1024 : i32, 80 : i32] | tensor<[1,8,1024,80,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 8192 + d1 * 1024 + d2, d3), memory_config: (256, 3, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[8,1024,9,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (256, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 8 : i32, 1024 : i32, 9 : i32] | tensor<[1,8,1024,9,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 8192 + d1 * 1024 + d2, d3), memory_config: (256, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[8,1024,9,f32]> | mapping_from: (d0, 
d1, d2), mapping_to: (d0 * 1024 + d1, d2), memory_config: (256, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 8 : i32, 1024 : i32, 9 : i32] | tensor<[1,8,1024,9,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 8192 + d1 * 1024 + d2, d3), memory_config: (256, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[8,256,160,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (64, 5, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 8 : i32, 256 : i32, 160 : i32] | tensor<[1,8,256,160,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2048 + d1 * 256 + d2, d3), memory_config: (64, 5, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[8,256,160,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (64, 5, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 8 : i32, 256 : i32, 160 : i32] | tensor<[1,8,256,160,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2048 + d1 * 256 + d2, d3), memory_config: (64, 5, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[8,256,256,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (64, 8, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 8 : i32, 256 : i32, 256 : i32] | tensor<[1,8,256,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2048 + d1 * 256 + d2, d3), memory_config: (64, 8, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[8,256,256,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (64, 8, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 8 : i32, 256 : i32, 256 : i32] | tensor<[1,8,256,256,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2048 + d1 * 256 + d2, d3), memory_config: (64, 8, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[8,256,9,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (64, 1, 'tile<32x32, 
f32>', 'dram') | shape: [1 : i32, 8 : i32, 256 : i32, 9 : i32] | tensor<[1,8,256,9,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2048 + d1 * 256 + d2, d3), memory_config: (64, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[8,256,9,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (64, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 8 : i32, 256 : i32, 9 : i32] | tensor<[1,8,256,9,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 2048 + d1 * 256 + d2, d3), memory_config: (64, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[8,4096,4096,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (1024, 128, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 8 : i32, 4096 : i32, 4096 : i32] | tensor<[1,8,4096,4096,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 32768 + d1 * 4096 + d2, d3), memory_config: (1024, 128, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[8,4096,4096,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (1024, 128, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 8 : i32, 4096 : i32, 4096 : i32] | tensor<[1,8,4096,4096,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 32768 + d1 * 4096 + d2, d3), memory_config: (1024, 128, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[8,4096,40,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (1024, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 8 : i32, 4096 : i32, 40 : i32] | tensor<[1,8,4096,40,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 32768 + d1 * 4096 + d2, d3), memory_config: (1024, 2, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[8,4096,40,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (1024, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 8 : i32, 
4096 : i32, 40 : i32] | tensor<[1,8,4096,40,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 32768 + d1 * 4096 + d2, d3), memory_config: (1024, 2, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[8,4096,9,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (1024, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 8 : i32, 4096 : i32, 9 : i32] | tensor<[1,8,4096,9,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 32768 + d1 * 4096 + d2, d3), memory_config: (1024, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[8,4096,9,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 4096 + d1, d2), memory_config: (1024, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 8 : i32, 4096 : i32, 9 : i32] | tensor<[1,8,4096,9,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 32768 + d1 * 4096 + d2, d3), memory_config: (1024, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[8,64,160,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (16, 5, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 8 : i32, 64 : i32, 160 : i32] | tensor<[1,8,64,160,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 512 + d1 * 64 + d2, d3), memory_config: (16, 5, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[8,64,160,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (16, 5, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 8 : i32, 64 : i32, 160 : i32] | tensor<[1,8,64,160,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 512 + d1 * 64 + d2, d3), memory_config: (16, 5, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[8,64,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (16, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 8 : i32, 64 : i32, 64 : i32] | tensor<[1,8,64,64,f32]> | mapping_from: (d0, d1, d2, d3), 
mapping_to: (d0 * 512 + d1 * 64 + d2, d3), memory_config: (16, 2, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[8,64,64,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (16, 2, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 8 : i32, 64 : i32, 64 : i32] | tensor<[1,8,64,64,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 512 + d1 * 64 + d2, d3), memory_config: (16, 2, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[8,64,9,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (16, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 8 : i32, 64 : i32, 9 : i32] | tensor<[1,8,64,9,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 512 + d1 * 64 + d2, d3), memory_config: (16, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[8,64,9,f32]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 64 + d1, d2), memory_config: (16, 1, 'tile<32x32, f32>', 'dram') | shape: [1 : i32, 8 : i32, 64 : i32, 9 : i32] | tensor<[1,8,64,9,f32]> | mapping_from: (d0, d1, d2, d3), mapping_to: (d0 * 512 + d1 * 64 + d2, d3), memory_config: (16, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[920,1,2048,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (29, 64, 'tile<32x32, bf16>', 'dram') | shape: [920 : i32, 2048 : i32] | tensor<[920,2048,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (29, 64, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[920,1,2048,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (29, 64, 'tile<32x32, bf16>', 'dram') | shape: [920 : i32, 2048 : i32] | tensor<[920,2048,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (29, 64, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[920,1,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, 
d2), memory_config: (29, 8, 'tile<32x32, bf16>', 'dram') | shape: [920 : i32, 1 : i32, 256 : i32] | tensor<[920,1,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (29, 8, 'tile<32x32, bf16>', 'dram') | yes | 0.0 | 9.25 | +| ttnn.reshape | tensor<[920,1,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (29, 8, 'tile<32x32, bf16>', 'dram') | shape: [920 : i32, 1 : i32, 256 : i32] | tensor<[920,1,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (29, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[920,1,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (29, 8, 'tile<32x32, bf16>', 'dram') | shape: [920 : i32, 256 : i32] | tensor<[920,256,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (29, 8, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[920,1,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (29, 8, 'tile<32x32, bf16>', 'dram') | shape: [920 : i32, 256 : i32] | tensor<[920,256,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (29, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[920,1,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (29, 8, 'tile<32x32, bf16>', 'dram') | shape: [920 : i32, 8 : i32, 32 : i32] | tensor<[920,8,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 8 + d1, d2), memory_config: (230, 1, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[920,1,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (29, 8, 'tile<32x32, bf16>', 'dram') | shape: [920 : i32, 8 : i32, 32 : i32] | tensor<[920,8,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 8 + d1, d2), memory_config: (230, 1, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | 
tensor<[920,2048,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (29, 64, 'tile<32x32, bf16>', 'dram') | shape: [920 : i32, 1 : i32, 2048 : i32] | tensor<[920,1,2048,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (29, 64, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[920,2048,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (29, 64, 'tile<32x32, bf16>', 'dram') | shape: [920 : i32, 1 : i32, 2048 : i32] | tensor<[920,1,2048,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (29, 64, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[920,256,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (29, 8, 'tile<32x32, bf16>', 'dram') | shape: [920 : i32, 1 : i32, 256 : i32] | tensor<[920,1,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (29, 8, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[920,256,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (29, 8, 'tile<32x32, bf16>', 'dram') | shape: [920 : i32, 1 : i32, 256 : i32] | tensor<[920,1,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 + d1, d2), memory_config: (29, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[920,256,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (7360, 8, 'tile<32x32, bf16>', 'dram') | shape: [920 : i32, 256 : i32, 256 : i32] | tensor<[920,256,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (7360, 8, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[920,256,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 + d1, d2), memory_config: (7360, 8, 'tile<32x32, bf16>', 'dram') | shape: [920 : i32, 256 : i32, 256 : i32] | tensor<[920,256,256,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 256 
+ d1, d2), memory_config: (7360, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[920,8,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 8 + d1, d2), memory_config: (230, 1, 'tile<32x32, bf16>', 'dram') | shape: [920 : i32, 256 : i32] | tensor<[920,256,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (29, 8, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[920,8,32,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 8 + d1, d2), memory_config: (230, 1, 'tile<32x32, bf16>', 'dram') | shape: [920 : i32, 256 : i32] | tensor<[920,256,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (29, 8, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[9,1024,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 9 : i32, 1024 : i32] | tensor<[1,9,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[9,1024,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 9 : i32, 1024 : i32] | tensor<[1,9,1024,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (1, 32, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[9,128,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 9 : i32, 128 : i32] | tensor<[1,9,128,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (1, 4, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[9,128,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 4, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 9 : i32, 128 : i32] | tensor<[1,9,128,bf16]> | mapping_from: (d0, d1, d2), 
mapping_to: (d0 * 9 + d1, d2), memory_config: (1, 4, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[9,12,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [54 : i32, 2 : i32] | tensor<[54,2,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (2, 1, 'tile<32x32, f32>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[9,12,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 1, 'tile<32x32, f32>', 'dram') | shape: [54 : i32, 2 : i32] | tensor<[54,2,f32]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (2, 1, 'tile<32x32, f32>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[9,16384,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 512, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 9 : i32, 16384 : i32] | tensor<[1,9,16384,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (1, 512, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[9,16384,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 512, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 9 : i32, 16384 : i32] | tensor<[1,9,16384,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (1, 512, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[9,2048,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 64, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 9 : i32, 2048 : i32] | tensor<[1,9,2048,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (1, 64, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[9,2048,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 64, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 9 : i32, 2048 : i32] | tensor<[1,9,2048,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, 
d2), memory_config: (1, 64, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[9,30000,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 938, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 9 : i32, 30000 : i32] | tensor<[1,9,30000,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (1, 938, 'tile<32x32, bf16>', 'dram') | no | nan | nan | +| ttnn.reshape | tensor<[9,30000,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 938, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 9 : i32, 30000 : i32] | tensor<[1,9,30000,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (1, 938, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[9,3072,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 9 : i32, 3072 : i32] | tensor<[1,9,3072,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[9,3072,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 9 : i32, 3072 : i32] | tensor<[1,9,3072,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (1, 96, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[9,4096,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 128, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 9 : i32, 4096 : i32] | tensor<[1,9,4096,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (1, 128, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[9,4096,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 128, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 9 : i32, 4096 : i32] | tensor<[1,9,4096,bf16]> | 
mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (1, 128, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[9,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 9 : i32, 768 : i32] | tensor<[1,9,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[9,768,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 9 : i32, 768 : i32] | tensor<[1,9,768,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (1, 24, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | +| ttnn.reshape | tensor<[9,8192,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 256, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 9 : i32, 8192 : i32] | tensor<[1,9,8192,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (1, 256, 'tile<32x32, bf16>', 'dram') | yes | 1.0 | 0.0 | +| ttnn.reshape | tensor<[9,8192,bf16]> | mapping_from: (d0, d1), mapping_to: (d0, d1), memory_config: (1, 256, 'tile<32x32, bf16>', 'dram') | shape: [1 : i32, 9 : i32, 8192 : i32] | tensor<[1,9,8192,bf16]> | mapping_from: (d0, d1, d2), mapping_to: (d0 * 9 + d1, d2), memory_config: (1, 256, 'tile<32x32, bf16>', 'dram') | N/A | nan | nan | diff --git a/tests/sweep_framework/sweeps/data_movement/view/view_tt_torch.py b/tests/sweep_framework/sweeps/data_movement/view/view_tt_torch.py new file mode 100644 index 000000000000..241c2fef5dde --- /dev/null +++ b/tests/sweep_framework/sweeps/data_movement/view/view_tt_torch.py @@ -0,0 +1,111 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+ +# SPDX-License-Identifier: Apache-2.0 + +from typing import Optional, Tuple + +import torch +import random +import ttnn + +from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time +from models.utility_functions import torch_random + +TIMEOUT = 10 +# seed for random +random.seed(0) + + +def extract_brackets_content(line): + # Function to extract the content inside brackets + brackets_content = [] + open_brackets = 0 + current_content = "" + + for char in line: + if char == "[": + open_brackets += 1 + if open_brackets > 0: + current_content = "" # Reset content inside the brackets + elif char == "]": + if open_brackets > 0: + brackets_content.append(current_content.strip()) + open_brackets -= 1 + elif open_brackets > 0: + current_content += char + + return brackets_content + + +def parse_md_file_simple_no_regex(file_path): + view_specs = [] + i = 0 + + with open(file_path, "r") as file: + for line in file.readlines(): + parsed_line = line.split("|") + if parsed_line[0] == " ttnn.reshape ": + tensor = parsed_line[2].split("[")[1].split("]")[0] + target = parsed_line[5].split("[")[1].split("]")[0] + tensor_shape = list(map(int, tensor.split(",")[:-1])) + target_shape = list(map(int, target.split(",")[:-1])) + addition = {"shape": tensor_shape, "size": target_shape} + view_specs.append(addition) + return view_specs + + +parameters = { + "nightly": { + "view_specs": parse_md_file_simple_no_regex( + "/home/jvega/work/reshape_host_merge/tt-metal/tests/sweep_framework/sweeps/data_movement/view/tt_torch_trace.md" + ), + "layout": [ttnn.ROW_MAJOR_LAYOUT, ttnn.TILE_LAYOUT], + "dtype": [ttnn.bfloat16, ttnn.float32], + } +} + + +# Invalidate vector is called during the generation phase where each vector will be passed in. +# If invalidated, the vector will still be stored but will be skipped. +# Returns False, None if the vector is valid, and True, str with a reason for invalidation if it is invalid. 
+def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]: + return False, None + + +def run( + view_specs, + layout, + dtype, + *, + device, +): + device.enable_async(False) + + # Extract the shape and new size (target shape) from view_specs + shape = view_specs["shape"] + size = view_specs["size"] # New shape for the view/reshape operation + + # Create a random tensor of the specified shape + tensor = torch_random(shape, -0.1, 0.1, dtype=torch.bfloat16) + + # Apply view using PyTorch's view function to reshape the tensor + torch_output_tensor = tensor.view(*size) + + # Convert the tensor to the ttnn tensor format + ttnn_tensor = ttnn.from_torch(tensor, device=device, layout=layout, dtype=dtype) + + # Measure performance of the reshape operation in ttnn + start_time = start_measuring_time() + + # Apply reshape in ttnn + ttnn_output_tensor = ttnn.reshape(ttnn_tensor, size) + + e2e_perf = stop_measuring_time(start_time) + + # Convert the ttnn tensor back to PyTorch for comparison + ttnn_output_tensor = ttnn.to_torch(ttnn_output_tensor) + + # Compare the results and return performance and accuracy check + result = check_with_pcc(torch_output_tensor, ttnn_output_tensor, 0.999) + + return [result, e2e_perf] diff --git a/ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape.cpp b/ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape.cpp index eddb977d02b0..143349f71313 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape.cpp @@ -246,7 +246,7 @@ ttnn::Shape tiling_reshape_corrector(const ttnn::Shape& shape, const uint32_t ti const int8_t correction_1 =(tile_first_dim - (int)padded[-1] % tile_first_dim) % tile_first_dim; if(rank == 1) { - return ttnn::Shape({1,shape[0]},{32,padded[0]+correction_1}); + return ttnn::Shape({1, shape[0]}, {32, padded[0] + correction_1}); } const int8_t correction_2 =(tile_second_dim - (int)padded[-2] % tile_second_dim) % 
tile_second_dim; switch(rank) From 71cb582f2c2dbb672218419b0e03c58835139690 Mon Sep 17 00:00:00 2001 From: Emmanuel Ferdman Date: Wed, 11 Dec 2024 18:18:13 +0200 Subject: [PATCH 51/59] [skip ci] Update ttnn/tt-metalium docs references after separation (#15844) ### Ticket ### Problem description Small PR - Commit ac4a347549766c0632674763e4c12048f726e253 moved separated ttnn/tt-metalium docs. This PR adjusts sources to changes and fixes a few typos along the way. ### What's changed Describe the approach used to solve the problem. Summarize the changes made and its impact. ### Checklist - [x] Post commit CI passes - [ ] Blackhole Post commit (if applicable) - [ ] Model regression CI testing passes (if applicable) - [ ] Device performance regression CI testing passes (if applicable) - [ ] **(For models and ops writers)** Full [new models](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) tests passes - [ ] New/Existing tests provide coverage for changes Signed-off-by: Emmanuel Ferdman --- CODEOWNERS | 2 +- CONTRIBUTING.md | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/CODEOWNERS b/CODEOWNERS index 3b74d00a0470..b683613df4f7 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -165,7 +165,7 @@ models/perf/benchmarking_utils.py @skhorasganiTT # docs docs/Makefile @tenstorrent/metalium-developers-infra -docs/source/ttnn/dependencies/tt_lib.rst @eyonland @patrickroberts @yan-zaretskiy @ayerofieiev-tt +docs/source/ttnn/ttnn/dependencies/tt_lib.rst @eyonland @patrickroberts @yan-zaretskiy @ayerofieiev-tt docs/source/ttnn/ @eyonland @patrickroberts @yan-zaretskiy @ayerofieiev-tt @razorback3 @dongjin-na # misc diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index e6b3a690fc2f..ae1466f2aba4 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -96,7 +96,7 @@ set up for users. 
Please refer to the [README](README.md) for source installation and environment setup instructions, then please read the [Getting Started -page](docs/source/get_started/get_started.rst). +page](docs/source/tt-metalium/get_started/get_started.rst). ### Setting logger level @@ -112,7 +112,7 @@ TT_METAL_LOGGER_LEVEL=Debug ./build/test/tt_metal/test_add_two_ints ### Building and viewing the documentation locally 1. First, ensure that you have [built the project and activated the Python -environment](docs/source/get_started/get_started.rst), along with any required +environment](docs/source/tt-metalium/get_started/get_started.rst), along with any required `PYTHONPATH` variables. 2. Build the HTML documentation. @@ -300,7 +300,7 @@ running such tests. - To debug the C++ binding file itself: - Ensure the python file you wish to debug is standalone and has a main function. - Run `gdb --args python ` - - Breakpoints can be added for future loaded libraries. For example, to add a breakpoint to `Device` object construtor: + - Breakpoints can be added for future loaded libraries. For example, to add a breakpoint to `Device` object constructor: ``` (gdb) b device.cpp:Device::Device No source file named device.cpp. @@ -353,7 +353,7 @@ TT_METAL_WATCHER=10 ./your_program - If no such error is reported, but the program is hanging, check the watcher log generated in `generated/watcher/watcher.log`. 
There is a legend at the top of the log showing how to interpret it, and a sample portion of a log is shown below: ``` Legend: - Comma separated list specifices waypoint for BRISC,NCRISC,TRISC0,TRISC1,TRISC2 + Comma separated list specifies waypoint for BRISC,NCRISC,TRISC0,TRISC1,TRISC2 I=initialization sequence W=wait (top of spin loop) R=run (entering kernel) @@ -528,7 +528,7 @@ To set up pre-commit on your local machine, follow these steps: on the link to [all post-commit workflows](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml), clicking "Run workflow", selecting your branch, and pressing "Run workflow". - ![Dropdown menu of all post-commit workflows and Run Workflow button](docs/source/_static/all-post-commit-workflows-button.png) + ![Dropdown menu of all post-commit workflows and Run Workflow button](docs/source/common/_static/all-post-commit-workflows-button.png) You can see the status of your CI run by clicking on the specific run you dispatched. 
From 9b367b9c204a3656a107e26f54802981b09b9f12 Mon Sep 17 00:00:00 2001 From: Bryan Wilder Field Lozano Date: Wed, 11 Dec 2024 22:01:23 +0530 Subject: [PATCH 52/59] Improve memory usage in test_bert_batch_dram.py (#15896) --- .../metal_BERT_large_11/tests/test_bert_batch_dram.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/models/demos/metal_BERT_large_11/tests/test_bert_batch_dram.py b/models/demos/metal_BERT_large_11/tests/test_bert_batch_dram.py index f9d6e46dc8cd..050e7b9dce44 100644 --- a/models/demos/metal_BERT_large_11/tests/test_bert_batch_dram.py +++ b/models/demos/metal_BERT_large_11/tests/test_bert_batch_dram.py @@ -221,6 +221,13 @@ def run_bert_question_and_answering_inference( profiler.end("processing_output_to_string") del tt_out + del tt_embedding + del tt_attention_mask + del tt_embedding_inputs + del bert_input + del pytorch_out + if "single_inputs" in locals(): + del single_inputs profiler.print() From 79e0c681b4b6b62477374ffb4f91819d06dc403e Mon Sep 17 00:00:00 2001 From: Austin Ho Date: Tue, 10 Dec 2024 19:02:19 +0000 Subject: [PATCH 53/59] #0: Update global cb, sem apis to take in sub_device_ids to know what to stall on when writing to device --- .../tt_metal/api/test_global_semaphores.cpp | 8 ++-- tt_metal/host_api.hpp | 38 ++++++++++------ .../impl/buffers/global_circular_buffer.cpp | 24 +++++++--- .../impl/buffers/global_circular_buffer.hpp | 10 +++-- tt_metal/impl/buffers/global_semaphore.cpp | 45 +++++++++++++------ tt_metal/impl/buffers/global_semaphore.hpp | 29 +++++++++--- .../tt_metal/global_circular_buffer.hpp | 5 ++- tt_metal/tt_metal.cpp | 21 ++++++--- ttnn/cpp/pybind11/global_circular_buffer.cpp | 26 +++++++++-- ttnn/cpp/pybind11/global_semaphore.cpp | 35 +++++++++++++-- ttnn/cpp/ttnn/global_circular_buffer.cpp | 14 +++--- ttnn/cpp/ttnn/global_circular_buffer.hpp | 6 ++- ttnn/cpp/ttnn/global_semaphore.cpp | 33 +++++++++----- ttnn/cpp/ttnn/global_semaphore.hpp | 15 +++++-- 14 files changed, 225 insertions(+), 84 
deletions(-) diff --git a/tests/tt_metal/tt_metal/api/test_global_semaphores.cpp b/tests/tt_metal/tt_metal/api/test_global_semaphores.cpp index 13d38f4e2ef5..f88e71efda9b 100644 --- a/tests/tt_metal/tt_metal/api/test_global_semaphores.cpp +++ b/tests/tt_metal/tt_metal/api/test_global_semaphores.cpp @@ -21,7 +21,7 @@ TEST_F(DispatchFixture, InitializeGlobalSemaphores) { uint32_t initial_value = 1; auto global_semaphore = tt::tt_metal::CreateGlobalSemaphore(device, cores, initial_value); auto address = global_semaphore->address(); - + Synchronize(device); for (const auto& core : cores_vec) { auto sem_vals = tt::llrt::read_hex_vec_from_core( device->id(), device->worker_core_from_logical_core(core), address, sizeof(uint32_t)); @@ -33,7 +33,7 @@ TEST_F(DispatchFixture, InitializeGlobalSemaphores) { uint32_t initial_value = 2; auto global_semaphore = tt::tt_metal::CreateGlobalSemaphore(device, cores, initial_value); auto address = global_semaphore->address(); - + Synchronize(device); for (const auto& core : cores_vec) { auto sem_vals = tt::llrt::read_hex_vec_from_core( device->id(), device->worker_core_from_logical_core(core), address, sizeof(uint32_t)); @@ -61,6 +61,7 @@ TEST_F(DispatchFixture, CreateMultipleGlobalSemaphoresOnSameCore) { global_semaphores.push_back(tt::tt_metal::CreateGlobalSemaphore(device, cores[i], initial_values[i])); addresses.push_back(global_semaphores[i]->address()); } + Synchronize(device); for (size_t i = 0; i < cores.size(); i++) { const auto& address = addresses[i]; const auto& initial_value = initial_values[i]; @@ -88,7 +89,7 @@ TEST_F(DispatchFixture, ResetGlobalSemaphores) { std::vector overwrite_value = {2}; auto global_semaphore = tt::tt_metal::CreateGlobalSemaphore(device, cores, initial_value); auto address = global_semaphore->address(); - + Synchronize(device); for (const auto& core : cores_vec) { auto sem_vals = tt::llrt::read_hex_vec_from_core( device->id(), device->worker_core_from_logical_core(core), address, sizeof(uint32_t)); 
@@ -104,6 +105,7 @@ TEST_F(DispatchFixture, ResetGlobalSemaphores) { EXPECT_EQ(sem_vals[0], overwrite_value[0]); } global_semaphore->reset_semaphore_value(); + Synchronize(device); for (const auto& core : cores_vec) { auto sem_vals = tt::llrt::read_hex_vec_from_core( device->id(), device->worker_core_from_logical_core(core), address, sizeof(uint32_t)); diff --git a/tt_metal/host_api.hpp b/tt_metal/host_api.hpp index be3b0e5fcadd..8a9dfcb20033 100644 --- a/tt_metal/host_api.hpp +++ b/tt_metal/host_api.hpp @@ -299,16 +299,21 @@ uint32_t CreateSemaphore( * * Return value: std::shared_ptr * - * | Argument | Description | Type | Valid Range | Required | - * |---------------|------------------------------------------------------|-----------------------------------------------------------|--------------|----------| - * | device | The device to create the semaphore on | Device * | | Yes | - * | cores | Range of the Tensix co-ordinates using the semaphore | const CoreRangeSet & | | Yes | - * | initial_value | Initial value of the semaphore | uint32_t | | Yes | - * | buffer_type | Buffer type to store the semaphore | BufferType | L1 types | No | + * | Argument | Description | Type | Valid Range | Required | + * |----------------|--------------------------------------------------------|-----------------------------------------------------------|--------------|----------| + * | device | The device to create the semaphore on | Device * | | Yes | + * | cores | Range of the Tensix co-ordinates using the semaphore | const CoreRangeSet & | | Yes | + * | initial_value | Initial value of the semaphore | uint32_t | | Yes | + * | buffer_type | Buffer type to store the semaphore | BufferType | L1 types | No | + * | sub_device_ids | Sub-device ids to wait on before writing the semaphore | tt::stl::Span | | No | */ // clang-format on std::shared_ptr CreateGlobalSemaphore( - Device* device, const CoreRangeSet& cores, uint32_t initial_value, BufferType buffer_type = BufferType::L1); + 
Device* device, + const CoreRangeSet& cores, + uint32_t initial_value, + BufferType buffer_type = BufferType::L1, + tt::stl::Span sub_device_ids = {}); // clang-format off /** @@ -317,16 +322,21 @@ std::shared_ptr CreateGlobalSemaphore( * * Return value: std::shared_ptr * - * | Argument | Description | Type | Valid Range | Required | - * |---------------|------------------------------------------------------|-----------------------------------------------------------|--------------|----------| - * | device | The device to create the semaphore on | Device * | | Yes | - * | cores | Range of the Tensix co-ordinates using the semaphore | CoreRangeSet && | | Yes | - * | initial_value | Initial value of the semaphore | uint32_t | | Yes | - * | buffer_type | Buffer type to store the semaphore | BufferType | L1 types | No | + * | Argument | Description | Type | Valid Range | Required | + * |----------------|--------------------------------------------------------|-----------------------------------------------------------|--------------|----------| + * | device | The device to create the semaphore on | Device * | | Yes | + * | cores | Range of the Tensix co-ordinates using the semaphore | CoreRangeSet && | | Yes | + * | initial_value | Initial value of the semaphore | uint32_t | | Yes | + * | buffer_type | Buffer type to store the semaphore | BufferType | L1 types | No | + * | sub_device_ids | Sub-device ids to wait on before writing the semaphore | tt::stl::Span | | No | */ // clang-format on std::shared_ptr CreateGlobalSemaphore( - Device* device, CoreRangeSet&& cores, uint32_t initial_value, BufferType buffer_type = BufferType::L1); + Device* device, + CoreRangeSet&& cores, + uint32_t initial_value, + BufferType buffer_type = BufferType::L1, + tt::stl::Span sub_device_ids = {}); // clang-format off /** diff --git a/tt_metal/impl/buffers/global_circular_buffer.cpp b/tt_metal/impl/buffers/global_circular_buffer.cpp index df8df656ac36..094670b2a301 100644 --- 
a/tt_metal/impl/buffers/global_circular_buffer.cpp +++ b/tt_metal/impl/buffers/global_circular_buffer.cpp @@ -27,7 +27,8 @@ GlobalCircularBuffer::GlobalCircularBuffer( Device* device, const std::unordered_map& sender_receiver_core_mapping, uint32_t size, - BufferType buffer_type) : + BufferType buffer_type, + tt::stl::Span sub_device_ids) : device_(device), sender_receiver_core_mapping_(sender_receiver_core_mapping), size_(size) { TT_FATAL(this->device_ != nullptr, "Device cannot be null"); uint32_t num_sender_cores = sender_receiver_core_mapping.size(); @@ -46,10 +47,11 @@ GlobalCircularBuffer::GlobalCircularBuffer( TT_FATAL(num_receiver_cores == this->receiver_cores_.num_cores(), "Duplicate receiver cores found"); this->all_cores_ = this->sender_cores_.merge(this->receiver_cores_); TT_FATAL(this->all_cores_.num_cores() == num_sender_cores + num_receiver_cores, "Duplicate cores found"); - this->setup_cb_buffers(buffer_type, max_num_receivers_per_sender); + this->setup_cb_buffers(buffer_type, max_num_receivers_per_sender, sub_device_ids); } -void GlobalCircularBuffer::setup_cb_buffers(BufferType buffer_type, uint32_t max_num_receivers_per_sender) { +void GlobalCircularBuffer::setup_cb_buffers( + BufferType buffer_type, uint32_t max_num_receivers_per_sender, tt::stl::Span sub_device_ids) { TT_FATAL( buffer_type == BufferType::L1 or buffer_type == BufferType::L1_SMALL, "Global circular buffer can only be created for L1 buffer types"); @@ -123,12 +125,18 @@ void GlobalCircularBuffer::setup_cb_buffers(BufferType buffer_type, uint32_t max } } - // Blocking write of cb config to buffer + // Write the config buffer to the device + // Only block for the slow dispatch case if (this->device_->using_slow_dispatch()) { detail::WriteToBuffer(*this->cb_config_buffer_, cb_config_host_buffer); tt::Cluster::instance().l1_barrier(this->device_->id()); } else { - EnqueueWriteBuffer(this->device_->command_queue(), this->cb_config_buffer_, cb_config_host_buffer.data(), true); + 
EnqueueWriteBuffer( + this->device_->command_queue(), + this->cb_config_buffer_, + cb_config_host_buffer.data(), + false, + sub_device_ids); } } @@ -136,8 +144,10 @@ std::shared_ptr GlobalCircularBuffer::create( Device* device, const std::unordered_map& sender_receiver_core_mapping, uint32_t size, - BufferType buffer_type) { - return std::make_unique(device, sender_receiver_core_mapping, size, buffer_type); + BufferType buffer_type, + tt::stl::Span sub_device_ids) { + return std::make_shared( + device, sender_receiver_core_mapping, size, buffer_type, sub_device_ids); } const Buffer& GlobalCircularBuffer::cb_buffer() const { return *this->cb_buffer_; } diff --git a/tt_metal/impl/buffers/global_circular_buffer.hpp b/tt_metal/impl/buffers/global_circular_buffer.hpp index d18ed91e0c47..ca0c56da71f2 100644 --- a/tt_metal/impl/buffers/global_circular_buffer.hpp +++ b/tt_metal/impl/buffers/global_circular_buffer.hpp @@ -9,6 +9,7 @@ #include "tt_metal/common/core_coord.hpp" #include "tt_metal/impl/buffers/buffer_constants.hpp" +#include "tt_metal/impl/sub_device/sub_device_types.hpp" #include "tt_metal/llrt/hal.hpp" namespace tt::tt_metal { @@ -30,7 +31,8 @@ class GlobalCircularBuffer { Device* device, const std::unordered_map& sender_receiver_core_mapping, uint32_t size, - BufferType buffer_type); + BufferType buffer_type, + tt::stl::Span sub_device_ids); GlobalCircularBuffer(const GlobalCircularBuffer&) = default; GlobalCircularBuffer& operator=(const GlobalCircularBuffer&) = default; @@ -42,7 +44,8 @@ class GlobalCircularBuffer { Device* device, const std::unordered_map& sender_receiver_core_mapping, uint32_t size, - BufferType buffer_type = BufferType::L1); + BufferType buffer_type = BufferType::L1, + tt::stl::Span sub_device_ids = {}); const Buffer& cb_buffer() const; @@ -57,7 +60,8 @@ class GlobalCircularBuffer { const auto attribute_values() const { return std::make_tuple(this->sender_receiver_core_mapping_, this->size_); } private: - void 
setup_cb_buffers(BufferType buffer_type, uint32_t max_num_receivers_per_sender); + void setup_cb_buffers( + BufferType buffer_type, uint32_t max_num_receivers_per_sender, tt::stl::Span sub_device_ids); // GlobalCircularBuffer is implemented as a wrapper around a sharded buffer // This can be updated in the future to be its own container with optimized dispatch functions diff --git a/tt_metal/impl/buffers/global_semaphore.cpp b/tt_metal/impl/buffers/global_semaphore.cpp index f080ab23b065..57ef080d0f74 100644 --- a/tt_metal/impl/buffers/global_semaphore.cpp +++ b/tt_metal/impl/buffers/global_semaphore.cpp @@ -20,17 +20,26 @@ namespace tt::tt_metal { GlobalSemaphore::GlobalSemaphore( - Device* device, const CoreRangeSet& cores, uint32_t initial_value, BufferType buffer_type) : + Device* device, + const CoreRangeSet& cores, + uint32_t initial_value, + BufferType buffer_type, + tt::stl::Span sub_device_ids) : device_(device), cores_(cores), initial_value_(initial_value) { - this->setup_buffer(buffer_type); + this->setup_buffer(buffer_type, sub_device_ids); } -GlobalSemaphore::GlobalSemaphore(Device* device, CoreRangeSet&& cores, uint32_t initial_value, BufferType buffer_type) : +GlobalSemaphore::GlobalSemaphore( + Device* device, + CoreRangeSet&& cores, + uint32_t initial_value, + BufferType buffer_type, + tt::stl::Span sub_device_ids) : device_(device), cores_(std::move(cores)), initial_value_(initial_value) { - this->setup_buffer(buffer_type); + this->setup_buffer(buffer_type, sub_device_ids); } -void GlobalSemaphore::setup_buffer(BufferType buffer_type) { +void GlobalSemaphore::setup_buffer(BufferType buffer_type, tt::stl::Span sub_device_ids) { TT_FATAL( buffer_type == BufferType::L1 or buffer_type == BufferType::L1_SMALL, "Global semaphore can only be created for L1 buffer types"); @@ -50,29 +59,39 @@ void GlobalSemaphore::setup_buffer(BufferType buffer_type) { std::nullopt); this->host_buffer_ = std::vector(num_cores, this->initial_value_); - 
this->reset_semaphore_value(); + this->reset_semaphore_value(sub_device_ids); } std::shared_ptr GlobalSemaphore::create( - Device* device, const CoreRangeSet& cores, uint32_t initial_value, BufferType buffer_type) { - return std::make_unique(device, cores, initial_value, buffer_type); + Device* device, + const CoreRangeSet& cores, + uint32_t initial_value, + BufferType buffer_type, + tt::stl::Span sub_device_ids) { + return std::make_shared(device, cores, initial_value, buffer_type, sub_device_ids); } std::shared_ptr GlobalSemaphore::create( - Device* device, CoreRangeSet&& cores, uint32_t initial_value, BufferType buffer_type) { - return std::make_unique(device, std::move(cores), initial_value, buffer_type); + Device* device, + CoreRangeSet&& cores, + uint32_t initial_value, + BufferType buffer_type, + tt::stl::Span sub_device_ids) { + return std::make_shared(device, std::move(cores), initial_value, buffer_type, sub_device_ids); } Device* GlobalSemaphore::device() const { return device_; } DeviceAddr GlobalSemaphore::address() const { return buffer_->address(); } -void GlobalSemaphore::reset_semaphore_value() { - // Blocking write of semaphore value to buffer +void GlobalSemaphore::reset_semaphore_value(tt::stl::Span sub_device_ids) { + // Write the initial value to the semaphore to the device + // Only block for the slow dispatch case if (this->device_->using_slow_dispatch()) { detail::WriteToBuffer(*this->buffer_, this->host_buffer_); tt::Cluster::instance().l1_barrier(this->device_->id()); } else { - EnqueueWriteBuffer(this->device_->command_queue(), this->buffer_, this->host_buffer_.data(), true); + EnqueueWriteBuffer( + this->device_->command_queue(), this->buffer_, this->host_buffer_.data(), false, sub_device_ids); } } diff --git a/tt_metal/impl/buffers/global_semaphore.hpp b/tt_metal/impl/buffers/global_semaphore.hpp index f6d657998f8e..0d912b2f9ac6 100644 --- a/tt_metal/impl/buffers/global_semaphore.hpp +++ b/tt_metal/impl/buffers/global_semaphore.hpp @@ 
-9,6 +9,7 @@ #include "tt_metal/common/core_coord.hpp" #include "tt_metal/impl/buffers/buffer_constants.hpp" +#include "tt_metal/impl/sub_device/sub_device_types.hpp" #include "tt_metal/llrt/hal.hpp" namespace tt::tt_metal { @@ -21,10 +22,18 @@ class Device; class GlobalSemaphore { public: GlobalSemaphore( - Device* device, const CoreRangeSet& cores, uint32_t initial_value, BufferType buffer_type = BufferType::L1); + Device* device, + const CoreRangeSet& cores, + uint32_t initial_value, + BufferType buffer_type = BufferType::L1, + tt::stl::Span sub_device_ids = {}); GlobalSemaphore( - Device* device, CoreRangeSet&& cores, uint32_t initial_value, BufferType buffer_type = BufferType::L1); + Device* device, + CoreRangeSet&& cores, + uint32_t initial_value, + BufferType buffer_type = BufferType::L1, + tt::stl::Span sub_device_ids = {}); GlobalSemaphore(const GlobalSemaphore&) = default; GlobalSemaphore& operator=(const GlobalSemaphore&) = default; @@ -33,22 +42,30 @@ class GlobalSemaphore { GlobalSemaphore& operator=(GlobalSemaphore&&) noexcept = default; static std::shared_ptr create( - Device* device, const CoreRangeSet& cores, uint32_t initial_value, BufferType buffer_type = BufferType::L1); + Device* device, + const CoreRangeSet& cores, + uint32_t initial_value, + BufferType buffer_type = BufferType::L1, + tt::stl::Span sub_device_ids = {}); static std::shared_ptr create( - Device* device, CoreRangeSet&& cores, uint32_t initial_value, BufferType buffer_type = BufferType::L1); + Device* device, + CoreRangeSet&& cores, + uint32_t initial_value, + BufferType buffer_type = BufferType::L1, + tt::stl::Span sub_device_ids = {}); Device* device() const; DeviceAddr address() const; - void reset_semaphore_value(); + void reset_semaphore_value(tt::stl::Span sub_device_ids = {}); static constexpr auto attribute_names = std::forward_as_tuple("cores", "initial_value"); const auto attribute_values() const { return std::make_tuple(this->cores_, this->initial_value_); } private: - 
void setup_buffer(BufferType buffer_type); + void setup_buffer(BufferType buffer_type, tt::stl::Span sub_device_ids); // GlobalSemaphore is implemented as a wrapper around a sharded buffer // This can be updated in the future to be its own container with optimized dispatch functions diff --git a/tt_metal/include/tt_metal/global_circular_buffer.hpp b/tt_metal/include/tt_metal/global_circular_buffer.hpp index 776296a589a4..3c19ee7a07bf 100644 --- a/tt_metal/include/tt_metal/global_circular_buffer.hpp +++ b/tt_metal/include/tt_metal/global_circular_buffer.hpp @@ -22,13 +22,16 @@ namespace experimental { * @param sender_receiver_core_mapping The mapping of remote sender to remote receiver cores for the circular buffer. * @param size Size of the global circular buffer per core in bytes. * @param buffer_type Buffer type to store the global circular buffer. Can only be an L1 buffer type. + * @param sub_device_ids Sub-device IDs to wait on before writing the global circular buffer config to device. Defaults + * to waiting on all sub-devices. * @return Handle to the allocated global circular buffer. 
*/ std::shared_ptr CreateGlobalCircularBuffer( Device* device, const std::unordered_map& sender_receiver_core_mapping, uint32_t size, - BufferType buffer_type = BufferType::L1); + BufferType buffer_type = BufferType::L1, + tt::stl::Span sub_device_ids = {}); } // namespace experimental diff --git a/tt_metal/tt_metal.cpp b/tt_metal/tt_metal.cpp index e55dc27384fe..b5402a3ac05a 100644 --- a/tt_metal/tt_metal.cpp +++ b/tt_metal/tt_metal.cpp @@ -1175,13 +1175,21 @@ uint32_t CreateSemaphore( } std::shared_ptr CreateGlobalSemaphore( - Device* device, const CoreRangeSet& cores, uint32_t initial_value, BufferType buffer_type) { - return GlobalSemaphore::create(device, cores, initial_value, buffer_type); + Device* device, + const CoreRangeSet& cores, + uint32_t initial_value, + BufferType buffer_type, + tt::stl::Span sub_device_ids) { + return GlobalSemaphore::create(device, cores, initial_value, buffer_type, sub_device_ids); } std::shared_ptr CreateGlobalSemaphore( - Device* device, CoreRangeSet&& cores, uint32_t initial_value, BufferType buffer_type) { - return GlobalSemaphore::create(device, std::move(cores), initial_value, buffer_type); + Device* device, + CoreRangeSet&& cores, + uint32_t initial_value, + BufferType buffer_type, + tt::stl::Span sub_device_ids) { + return GlobalSemaphore::create(device, std::move(cores), initial_value, buffer_type, sub_device_ids); } std::shared_ptr CreateBuffer(const InterleavedBufferConfig& config) { @@ -1384,8 +1392,9 @@ std::shared_ptr CreateGlobalCircularBuffer( Device* device, const std::unordered_map& sender_receiver_core_mapping, uint32_t size, - BufferType buffer_type) { - return GlobalCircularBuffer::create(device, sender_receiver_core_mapping, size, buffer_type); + BufferType buffer_type, + tt::stl::Span sub_device_ids) { + return GlobalCircularBuffer::create(device, sender_receiver_core_mapping, size, buffer_type, sub_device_ids); } CBHandle CreateCircularBuffer( diff --git a/ttnn/cpp/pybind11/global_circular_buffer.cpp 
b/ttnn/cpp/pybind11/global_circular_buffer.cpp index f736ee99781a..4c21941b73c2 100644 --- a/ttnn/cpp/pybind11/global_circular_buffer.cpp +++ b/ttnn/cpp/pybind11/global_circular_buffer.cpp @@ -19,12 +19,19 @@ void py_module(py::module& module) { // Single Device APIs module.def( "create_global_circular_buffer", - py::overload_cast&, uint32_t, BufferType>( - &create_global_circular_buffer), + [](Device* device, + const std::unordered_map& sender_receiver_core_mapping, + uint32_t size, + BufferType buffer_type, + const std::vector& sub_device_ids) { + return ttnn::global_circular_buffer::create_global_circular_buffer( + device, sender_receiver_core_mapping, size, buffer_type, sub_device_ids); + }, py::arg("device"), py::arg("sender_receiver_core_mapping"), py::arg("size"), py::arg("buffer_type") = tt::tt_metal::BufferType::L1, + py::arg("sub_device_ids") = std::vector(), R"doc( Create a GlobalCircularBuffer Object on a single device. @@ -33,17 +40,26 @@ void py_module(py::module& module) { sender_receiver_core_mapping (dict): The mapping of remote sender to remote receiver cores for the circular buffer. size (int): Size of the global circular buffer per core in bytes. buffer_type (BufferType): The type of buffer to use for the global circular buffer. + sub_device_ids (List[ttnn.SubDeviceIds]): Sub-device IDs to wait on before writing the global circular buffer config to device. + Defaults to waiting on all sub-devices. 
)doc"); // Multi Device APIs module.def( "create_global_circular_buffer", - py::overload_cast&, uint32_t, BufferType>( - &create_global_circular_buffer), + [](MeshDevice* mesh_device, + const std::unordered_map& sender_receiver_core_mapping, + uint32_t size, + BufferType buffer_type, + const std::vector& sub_device_ids) { + return ttnn::global_circular_buffer::create_global_circular_buffer( + mesh_device, sender_receiver_core_mapping, size, buffer_type, sub_device_ids); + }, py::arg("mesh_device"), py::arg("sender_receiver_core_mapping"), py::arg("size"), py::arg("buffer_type") = tt::tt_metal::BufferType::L1, + py::arg("sub_device_ids") = std::vector(), R"doc( Create a GlobalCircularBuffer Object on a single device. @@ -52,6 +68,8 @@ void py_module(py::module& module) { sender_receiver_core_mapping (dict): The mapping of remote sender to remote receiver cores for the circular buffer. size (int): Size of the global circular buffer per core in bytes. buffer_type (BufferType): The type of buffer to use for the global circular buffer. + sub_device_ids (List[ttnn.SubDeviceIds]): Sub-device IDs to wait on before writing the global circular buffer config to device. + Defaults to waiting on all sub-devices. 
)doc"); } diff --git a/ttnn/cpp/pybind11/global_semaphore.cpp b/ttnn/cpp/pybind11/global_semaphore.cpp index 79a97de58df3..f6e44cb34191 100644 --- a/ttnn/cpp/pybind11/global_semaphore.cpp +++ b/ttnn/cpp/pybind11/global_semaphore.cpp @@ -19,11 +19,19 @@ void py_module(py::module& module) { // Single Device APIs module.def( "create_global_semaphore", - py::overload_cast(&create_global_semaphore), + [](Device* device, + const CoreRangeSet& cores, + uint32_t initial_value, + BufferType buffer_type, + const std::vector& sub_device_ids) { + return ttnn::global_semaphore::create_global_semaphore( + device, cores, initial_value, buffer_type, sub_device_ids); + }, py::arg("device"), py::arg("cores"), py::arg("initial_value"), py::arg("buffer_type") = tt::tt_metal::BufferType::L1, + py::arg("sub_device_ids") = std::vector(), R"doc( Create a GlobalSemaphore Object on a single device. @@ -32,6 +40,8 @@ void py_module(py::module& module) { cores (CoreRangeSet): The cores on which the global semaphore will be used for synchronization. initial_value (int): The initial value of the global semaphore. buffer_type (BufferType): The type of buffer to use for the global semaphore. + sub_device_ids (List[ttnn.SubDeviceIds]): Sub-device IDs to wait on before writing the global semaphore value to device. + Defaults to waiting on all sub-devices. )doc"); module.def( @@ -47,23 +57,35 @@ void py_module(py::module& module) { module.def( "reset_global_semaphore_value", - py::overload_cast&>(&reset_global_semaphore_value), + py::overload_cast&, const std::vector&>( + &reset_global_semaphore_value), py::arg("global_semaphore"), + py::arg("sub_device_ids") = std::vector(), R"doc( Reset the value of the global semaphore. Args: global_semaphore (GlobalSemaphore): The global semaphore object. + sub_device_ids (List[ttnn.SubDeviceIds]): Sub-device IDs to wait on before writing the global semaphore value to device. + Defaults to waiting on all sub-devices. 
)doc"); // Multi Device APIs module.def( "create_global_semaphore", - py::overload_cast(&create_global_semaphore), + [](MeshDevice* mesh_device, + const CoreRangeSet& cores, + uint32_t initial_value, + BufferType buffer_type, + const std::vector& sub_device_ids) { + return ttnn::global_semaphore::create_global_semaphore( + mesh_device, cores, initial_value, buffer_type, sub_device_ids); + }, py::arg("mesh_device"), py::arg("cores"), py::arg("initial_value"), py::arg("buffer_type") = tt::tt_metal::BufferType::L1, + py::arg("sub_device_ids") = std::vector(), R"doc( Create a GlobalSemaphore Object on a single device. @@ -72,6 +94,8 @@ void py_module(py::module& module) { cores (CoreRangeSet): The cores on which the global semaphore will be used for synchronization. initial_value (int): The initial value of the global semaphore. buffer_type (BufferType): The type of buffer to use for the global semaphore. + sub_device_ids (List[ttnn.SubDeviceIds]): Sub-device IDs to wait on before writing the global semaphore value to device. + Defaults to waiting on all sub-devices. )doc"); module.def( @@ -87,13 +111,16 @@ void py_module(py::module& module) { module.def( "reset_global_semaphore_value", - py::overload_cast(&reset_global_semaphore_value), + py::overload_cast&>( + &reset_global_semaphore_value), py::arg("global_semaphore"), + py::arg("sub_device_ids") = std::vector(), R"doc( Reset the value of the global semaphore. Args: global_semaphore (GlobalSemaphore): The global semaphore object. + sub_device_ids (List[ttnn.SubDeviceIds]): Sub-device IDs to wait on before writing the global semaphore value to device. 
)doc"); } diff --git a/ttnn/cpp/ttnn/global_circular_buffer.cpp b/ttnn/cpp/ttnn/global_circular_buffer.cpp index 7c5967fa3c2c..76cc9df3e9f5 100644 --- a/ttnn/cpp/ttnn/global_circular_buffer.cpp +++ b/ttnn/cpp/ttnn/global_circular_buffer.cpp @@ -21,12 +21,13 @@ std::shared_ptr create_global_circular_buffer( Device* device, const std::unordered_map& sender_receiver_core_mapping, uint32_t size, - BufferType buffer_type) { + BufferType buffer_type, + tt::stl::Span sub_device_ids) { std::shared_ptr global_cb; device->push_work( - [device, &sender_receiver_core_mapping, size, buffer_type, &global_cb]() { + [device, &sender_receiver_core_mapping, size, buffer_type, sub_device_ids, &global_cb]() { global_cb = tt::tt_metal::v1::experimental::CreateGlobalCircularBuffer( - device, sender_receiver_core_mapping, size, buffer_type); + device, sender_receiver_core_mapping, size, buffer_type, sub_device_ids); }, /*blocking=*/true); return global_cb; @@ -36,15 +37,16 @@ MultiDeviceGlobalCircularBuffer create_global_circular_buffer( MeshDevice* mesh_device, const std::unordered_map& sender_receiver_core_mapping, uint32_t size, - BufferType buffer_type) { + BufferType buffer_type, + tt::stl::Span sub_device_ids) { MultiDeviceGlobalCircularBuffer multi_device_global_cb(mesh_device); const auto& devices = mesh_device->get_devices(); for (uint32_t i = 0; i < devices.size(); ++i) { auto* device = devices[i]; auto& global_cb = multi_device_global_cb.global_circular_buffers[i]; - device->push_work([device, &sender_receiver_core_mapping, size, buffer_type, &global_cb]() { + device->push_work([device, &sender_receiver_core_mapping, size, buffer_type, sub_device_ids, &global_cb]() { global_cb = tt::tt_metal::v1::experimental::CreateGlobalCircularBuffer( - device, sender_receiver_core_mapping, size, buffer_type); + device, sender_receiver_core_mapping, size, buffer_type, sub_device_ids); }); } for (auto* device : devices) { diff --git a/ttnn/cpp/ttnn/global_circular_buffer.hpp 
b/ttnn/cpp/ttnn/global_circular_buffer.hpp index bb84ce3a7ab3..39f18a0a63de 100644 --- a/ttnn/cpp/ttnn/global_circular_buffer.hpp +++ b/ttnn/cpp/ttnn/global_circular_buffer.hpp @@ -20,13 +20,15 @@ std::shared_ptr create_global_circular_buffer( Device* device, const std::unordered_map& sender_receiver_core_mapping, uint32_t size, - BufferType buffer_type = BufferType::L1); + BufferType buffer_type = BufferType::L1, + tt::stl::Span sub_device_ids = {}); // Multi Device APIs MultiDeviceGlobalCircularBuffer create_global_circular_buffer( MeshDevice* mesh_device, const std::unordered_map& sender_receiver_core_mapping, uint32_t size, - BufferType buffer_type = BufferType::L1); + BufferType buffer_type = BufferType::L1, + tt::stl::Span sub_device_ids = {}); } // namespace ttnn::global_circular_buffer diff --git a/ttnn/cpp/ttnn/global_semaphore.cpp b/ttnn/cpp/ttnn/global_semaphore.cpp index da1ebf8f0f02..a74a4b350ccb 100644 --- a/ttnn/cpp/ttnn/global_semaphore.cpp +++ b/ttnn/cpp/ttnn/global_semaphore.cpp @@ -5,8 +5,9 @@ #include "global_semaphore.hpp" #include -#include "tt_metal/impl/buffers/global_semaphore.hpp" #include "tt_metal/host_api.hpp" +#include "tt_metal/impl/buffers/global_semaphore.hpp" +#include "tt_metal/tt_stl/span.hpp" namespace ttnn::global_semaphore { @@ -18,11 +19,15 @@ MultiDeviceGlobalSemaphore::MultiDeviceGlobalSemaphore(MeshDevice* mesh_device) } std::shared_ptr create_global_semaphore( - Device* device, const CoreRangeSet& cores, uint32_t initial_value, BufferType buffer_type) { + Device* device, + const CoreRangeSet& cores, + uint32_t initial_value, + BufferType buffer_type, + tt::stl::Span sub_device_ids) { std::shared_ptr global_semaphore = nullptr; device->push_work( - [device, &cores, initial_value, buffer_type, &global_semaphore] { - global_semaphore = GlobalSemaphore::create(device, cores, initial_value, buffer_type); + [device, &cores, initial_value, buffer_type, sub_device_ids, &global_semaphore] { + global_semaphore = 
GlobalSemaphore::create(device, cores, initial_value, buffer_type, sub_device_ids); }, /*blocking=*/true); return global_semaphore; @@ -35,20 +40,25 @@ DeviceAddr get_global_semaphore_address(const std::shared_ptr& return address; } -void reset_global_semaphore_value(const std::shared_ptr& global_semaphore) { +void reset_global_semaphore_value( + const std::shared_ptr& global_semaphore, const std::vector& sub_device_ids) { auto* device = global_semaphore->device(); - device->push_work([global_semaphore] { global_semaphore->reset_semaphore_value(); }); + device->push_work([global_semaphore, sub_device_ids] { global_semaphore->reset_semaphore_value(sub_device_ids); }); } MultiDeviceGlobalSemaphore create_global_semaphore( - MeshDevice* mesh_device, const CoreRangeSet& cores, uint32_t initial_value, BufferType buffer_type) { + MeshDevice* mesh_device, + const CoreRangeSet& cores, + uint32_t initial_value, + BufferType buffer_type, + tt::stl::Span sub_device_ids) { MultiDeviceGlobalSemaphore multi_device_global_semaphore(mesh_device); const auto& devices = mesh_device->get_devices(); for (uint32_t i = 0; i < devices.size(); ++i) { auto* device = devices[i]; auto& global_semaphore = multi_device_global_semaphore.global_semaphores[i]; - device->push_work([device, &cores, initial_value, buffer_type, &global_semaphore] { - global_semaphore = GlobalSemaphore::create(device, cores, initial_value, buffer_type); + device->push_work([device, &cores, initial_value, buffer_type, sub_device_ids, &global_semaphore] { + global_semaphore = GlobalSemaphore::create(device, cores, initial_value, buffer_type, sub_device_ids); }); } for (auto device : devices) { @@ -71,9 +81,10 @@ std::vector get_global_semaphore_address(const MultiDeviceGlobalSema return addresses; } -void reset_global_semaphore_value(const MultiDeviceGlobalSemaphore& global_semaphore) { +void reset_global_semaphore_value( + const MultiDeviceGlobalSemaphore& global_semaphore, const std::vector& sub_device_ids) { for 
(const auto& global_semaphore : global_semaphore.global_semaphores) { - reset_global_semaphore_value(global_semaphore); + reset_global_semaphore_value(global_semaphore, sub_device_ids); } } diff --git a/ttnn/cpp/ttnn/global_semaphore.hpp b/ttnn/cpp/ttnn/global_semaphore.hpp index 70f56fb4b4cb..b04cda2dd274 100644 --- a/ttnn/cpp/ttnn/global_semaphore.hpp +++ b/ttnn/cpp/ttnn/global_semaphore.hpp @@ -17,17 +17,24 @@ struct MultiDeviceGlobalSemaphore { // Single Device APIs std::shared_ptr create_global_semaphore( - Device* device, const CoreRangeSet& cores, uint32_t initial_value, BufferType buffer_type = BufferType::L1); + Device* device, + const CoreRangeSet& cores, + uint32_t initial_value, + BufferType buffer_type = BufferType::L1, + tt::stl::Span sub_device_ids = {}); DeviceAddr get_global_semaphore_address(const std::shared_ptr& global_semaphore); -void reset_global_semaphore_value(const std::shared_ptr& global_semaphore); +void reset_global_semaphore_value( + const std::shared_ptr& global_semaphore, const std::vector& sub_device_ids = {}); // Multi Device APIs MultiDeviceGlobalSemaphore create_global_semaphore( MeshDevice* mesh_device, const CoreRangeSet& cores, uint32_t initial_value, - BufferType buffer_type = BufferType::L1); + BufferType buffer_type = BufferType::L1, + tt::stl::Span sub_device_ids = {}); std::vector get_global_semaphore_address(const MultiDeviceGlobalSemaphore& global_semaphore); -void reset_global_semaphore_value(const MultiDeviceGlobalSemaphore& global_semaphore); +void reset_global_semaphore_value( + const MultiDeviceGlobalSemaphore& global_semaphore, const std::vector& sub_device_ids = {}); } // namespace ttnn::global_semaphore From fc04d9d546c46c3fa840fdeb03900bf8ce4f5a2d Mon Sep 17 00:00:00 2001 From: Oleg Milyutin Date: Wed, 11 Dec 2024 12:04:45 -0500 Subject: [PATCH 54/59] #0: Re-order reshape and device upload in test_tilize_zero_padding_channels_last.cpp (#15913) ### Problem description 
[PR](https://github.com/tenstorrent/tt-metal/pull/15671/files#diff-4f512870412ccb31b911cf600798f5b502963f64bf93e04f39e13fe7eb9b8737) possibly introduced a failure in the test. ### What's changed Re-shape now happens on the host, before uploading a tensor on device. ### Checklist - [X] [Post commit CI passes](https://github.com/tenstorrent/tt-metal/actions/runs/12279520776) - [X] Manual test on a grayskull machine - reproduced the issue and confirmed the fix resolves it. --- .../tt_eager/ops/test_tilize_zero_padding_channels_last.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/tt_eager/ops/test_tilize_zero_padding_channels_last.cpp b/tests/tt_eager/ops/test_tilize_zero_padding_channels_last.cpp index a94093f8ac6c..e565c4d80269 100644 --- a/tests/tt_eager/ops/test_tilize_zero_padding_channels_last.cpp +++ b/tests/tt_eager/ops/test_tilize_zero_padding_channels_last.cpp @@ -37,8 +37,9 @@ int main(int argc, char** argv) { //////////////////////////////////////////////////////////////////////////// ttnn::SimpleShape shape{1, 32, 61, 32}; // Allocates a DRAM buffer on device populated with values specified by initialize - Tensor a = ttnn::arange(/*start=*/0, /*stop=*/shape.volume(), /*step=*/1, DataType::BFLOAT16, std::ref(*device)) - .reshape(shape); + Tensor a = ttnn::arange(/*start=*/0, /*stop=*/shape.volume(), /*step=*/1, DataType::BFLOAT16) + .reshape(shape) + .to(device); Tensor b = ttnn::tilize_with_zero_padding(a); Tensor c = b.cpu(); //////////////////////////////////////////////////////////////////////////// From 7a51bb70e522ed900df513a3bf104730c4df89b8 Mon Sep 17 00:00:00 2001 From: Raymond Kim Date: Wed, 11 Dec 2024 17:06:51 +0000 Subject: [PATCH 55/59] #0: [skip ci] Go back to smaller set of SD tests because we're not ready for the others (in fact, we re-enabled a crazy set --- .../fast-dispatch-full-regressions-and-models-impl.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/.github/workflows/fast-dispatch-full-regressions-and-models-impl.yaml b/.github/workflows/fast-dispatch-full-regressions-and-models-impl.yaml index 0af646345b18..ea49525bedf5 100644 --- a/.github/workflows/fast-dispatch-full-regressions-and-models-impl.yaml +++ b/.github/workflows/fast-dispatch-full-regressions-and-models-impl.yaml @@ -150,7 +150,7 @@ jobs: matrix: test-config: - model: "stable_diffusion" - cmd: pytest --timeout 900 -n auto tests/nightly/single_card/stable_diffusion + cmd: ./tests/scripts/single_card/nightly/run_wh_b0_unstable.sh - model: "mamba 1" cmd: pytest --timeout 900 -n auto tests/nightly/single_card/mamba --splits 6 --group 1 - model: "mamba 2" From 638a9ff6d0c055e11ffef7cb6f2d96f199a11eed Mon Sep 17 00:00:00 2001 From: Raymond Kim Date: Wed, 11 Dec 2024 17:13:03 +0000 Subject: [PATCH 56/59] #0: [skip ci] Revert "#0: [skip ci] Go back to smaller set of SD tests because we're not ready for the others (in fact, we re-enabled a crazy set" I was too hasty. This reverts commit 7a51bb70e522ed900df513a3bf104730c4df89b8. 
--- .../fast-dispatch-full-regressions-and-models-impl.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/fast-dispatch-full-regressions-and-models-impl.yaml b/.github/workflows/fast-dispatch-full-regressions-and-models-impl.yaml index ea49525bedf5..0af646345b18 100644 --- a/.github/workflows/fast-dispatch-full-regressions-and-models-impl.yaml +++ b/.github/workflows/fast-dispatch-full-regressions-and-models-impl.yaml @@ -150,7 +150,7 @@ jobs: matrix: test-config: - model: "stable_diffusion" - cmd: ./tests/scripts/single_card/nightly/run_wh_b0_unstable.sh + cmd: pytest --timeout 900 -n auto tests/nightly/single_card/stable_diffusion - model: "mamba 1" cmd: pytest --timeout 900 -n auto tests/nightly/single_card/mamba --splits 6 --group 1 - model: "mamba 2" From 142379a0e28f2daba5e8f05df3936bc8a565de89 Mon Sep 17 00:00:00 2001 From: Salar Hosseini Date: Wed, 11 Dec 2024 17:09:09 +0000 Subject: [PATCH 57/59] [Llama70b] Fix minor bug with assertion for valid prompt lengths Signed-off-by: Salar Hosseini --- models/demos/t3000/llama2_70b/tt/generator_vllm.py | 2 +- models/demos/t3000/llama2_70b/tt/llama_model_optimized.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/models/demos/t3000/llama2_70b/tt/generator_vllm.py b/models/demos/t3000/llama2_70b/tt/generator_vllm.py index a65850844c8f..9538ace45b14 100644 --- a/models/demos/t3000/llama2_70b/tt/generator_vllm.py +++ b/models/demos/t3000/llama2_70b/tt/generator_vllm.py @@ -20,7 +20,7 @@ def input_processor_for_llama70b(ctx: InputContext, inputs: Union[DecoderOnlyInputs, EncoderDecoderInputs]): prompt_len = len(inputs.get("prompt_token_ids")) - if prompt_len >= 32768: + if prompt_len > 32768: raise ValueError( f"TT LLama70b does not yet support prompts longer than 32768 tokens (received prompt with {prompt_len} tokens)" ) diff --git a/models/demos/t3000/llama2_70b/tt/llama_model_optimized.py b/models/demos/t3000/llama2_70b/tt/llama_model_optimized.py index 
e81f3fc881d4..c821a40173fd 100644 --- a/models/demos/t3000/llama2_70b/tt/llama_model_optimized.py +++ b/models/demos/t3000/llama2_70b/tt/llama_model_optimized.py @@ -178,8 +178,8 @@ def validate_input_shape(self, inp_ids, mode): if mode == "prefill": assert ( - seq_len < self.model_config["MAX_PREFILL_SEQ_LEN"] - ), f"Prefill only supports seq_len < {self.model_config['MAX_PREFILL_SEQ_LEN']}" + seq_len <= self.model_config["MAX_PREFILL_SEQ_LEN"] + ), f"Prefill only supports seq_len <= {self.model_config['MAX_PREFILL_SEQ_LEN']}" def prepare_inputs(self, inp_ids, start_pos, valid_seq_len=None, mode="decode", page_table=None): """ From 29c8ea06a7a1c04bcced0429acadc1a2e7c56481 Mon Sep 17 00:00:00 2001 From: Bryan Wilder Field Lozano Date: Wed, 11 Dec 2024 22:44:46 +0530 Subject: [PATCH 58/59] Optimize loading of falcon model checkpoints (#15898) --- models/demos/ttnn_falcon7b/tests/test_falcon_attention.py | 2 +- models/demos/ttnn_falcon7b/tests/test_falcon_decoder.py | 2 +- models/demos/ttnn_falcon7b/tests/test_falcon_mlp.py | 2 +- .../demos/ttnn_falcon7b/tests/test_falcon_rotary_embedding.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/models/demos/ttnn_falcon7b/tests/test_falcon_attention.py b/models/demos/ttnn_falcon7b/tests/test_falcon_attention.py index 89512cca80f1..8cd2c7aa4834 100644 --- a/models/demos/ttnn_falcon7b/tests/test_falcon_attention.py +++ b/models/demos/ttnn_falcon7b/tests/test_falcon_attention.py @@ -33,7 +33,7 @@ def get_model_prefix(layer_index: int = 0): @pytest.fixture(scope="module") def torch_model(): hugging_face_reference_model = transformers.FalconForCausalLM.from_pretrained( - PRETRAINED_MODEL_NAME, low_cpu_mem_usage=True + PRETRAINED_MODEL_NAME, low_cpu_mem_usage=True, device_map="auto" ).eval() state_dict = hugging_face_reference_model.state_dict() filtered_state_dict = strip_state_dict_prefix(state_dict, get_model_prefix()) diff --git a/models/demos/ttnn_falcon7b/tests/test_falcon_decoder.py 
b/models/demos/ttnn_falcon7b/tests/test_falcon_decoder.py index 243a2a64a798..5e2a769f0eef 100644 --- a/models/demos/ttnn_falcon7b/tests/test_falcon_decoder.py +++ b/models/demos/ttnn_falcon7b/tests/test_falcon_decoder.py @@ -33,7 +33,7 @@ def get_model_prefix(layer_index: int = 0): @pytest.fixture(scope="module") def torch_model(): hugging_face_reference_model = transformers.FalconForCausalLM.from_pretrained( - PRETRAINED_MODEL_NAME, low_cpu_mem_usage=True + PRETRAINED_MODEL_NAME, low_cpu_mem_usage=True, device_map="auto" ).eval() state_dict = hugging_face_reference_model.state_dict() mlp_state_dict = strip_state_dict_prefix(state_dict, get_model_prefix()) diff --git a/models/demos/ttnn_falcon7b/tests/test_falcon_mlp.py b/models/demos/ttnn_falcon7b/tests/test_falcon_mlp.py index 7ff91a2b76d9..ef22acba48cc 100644 --- a/models/demos/ttnn_falcon7b/tests/test_falcon_mlp.py +++ b/models/demos/ttnn_falcon7b/tests/test_falcon_mlp.py @@ -25,7 +25,7 @@ def get_model_prefix(layer_index: int = 0): @pytest.fixture(scope="module") def torch_model(): hugging_face_reference_model = transformers.FalconForCausalLM.from_pretrained( - PRETRAINED_MODEL_NAME, low_cpu_mem_usage=True + PRETRAINED_MODEL_NAME, low_cpu_mem_usage=True, device_map="auto" ).eval() state_dict = hugging_face_reference_model.state_dict() mlp_state_dict = strip_state_dict_prefix(state_dict, get_model_prefix()) diff --git a/models/demos/ttnn_falcon7b/tests/test_falcon_rotary_embedding.py b/models/demos/ttnn_falcon7b/tests/test_falcon_rotary_embedding.py index 29a50fd1bf8b..c90bad66da17 100644 --- a/models/demos/ttnn_falcon7b/tests/test_falcon_rotary_embedding.py +++ b/models/demos/ttnn_falcon7b/tests/test_falcon_rotary_embedding.py @@ -29,7 +29,7 @@ def get_model_prefix(layer_index: int = 0): @pytest.fixture(scope="module") def torch_model(): hugging_face_reference_model = transformers.FalconForCausalLM.from_pretrained( - PRETRAINED_MODEL_NAME, low_cpu_mem_usage=True + PRETRAINED_MODEL_NAME, 
low_cpu_mem_usage=True, device_map="auto" ).eval() state_dict = hugging_face_reference_model.state_dict() filtered_state_dict = strip_state_dict_prefix(state_dict, get_model_prefix()) From 4d8eb7073ef27f506bab81cd937f8a0dd207c25f Mon Sep 17 00:00:00 2001 From: Dimitri Gnidash <119051828+dimitri-tenstorrent@users.noreply.github.com> Date: Wed, 11 Dec 2024 13:18:23 -0500 Subject: [PATCH 59/59] Fix the docker image tag specified for smoke tests in Release Docker images workflow (#15842) ### Ticket - https://github.com/tenstorrent/tt-metal/issues/15843 ### Problem description The smoke tests should mention not the latest tag but the tag of the version being built. ### What's changed Add {{inputs.version}} to the tag being used to run Smoke tests. Also modified Docker Run Action to take a new parameter called `docker_image_tag` that allows to specify the image. ### Checklist - [x] Post commit CI passes https://github.com/tenstorrent/tt-metal/actions/runs/12264928767 - [x] Blackhole Post commit (if applicable) https://github.com/tenstorrent/tt-metal/actions/runs/12264932641 - [ ] Model regression CI testing passes (if applicable) - [ ] Device performance regression CI testing passes (if applicable) - [ ] **(For models and ops writers)** Full [new models](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) tests passes - [ ] New/Existing tests provide coverage for changes --- .github/actions/docker-run/action.yml | 4 ++++ .github/actions/generate-docker-tag/action.yml | 15 +++++++++++---- .github/workflows/publish-release-image.yaml | 1 + 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/.github/actions/docker-run/action.yml b/.github/actions/docker-run/action.yml index 7d6041996037..e1715827eb22 100644 --- a/.github/actions/docker-run/action.yml +++ b/.github/actions/docker-run/action.yml @@ -9,6 +9,9 @@ inputs: description: 'Docker image architecture' required: false default: tt-metalium/ubuntu-20.04-amd64 + docker_version: 
+ description: 'Specify version for the Docker image tag to use.' + required: false docker_username: description: docker login username required: true @@ -38,6 +41,7 @@ runs: uses: ./.github/actions/generate-docker-tag with: image: ${{ inputs.docker_os_arch }} + version: ${{ inputs.docker_version }} - name: Set shell: bash run: | diff --git a/.github/actions/generate-docker-tag/action.yml b/.github/actions/generate-docker-tag/action.yml index 4c8c0a1f3e41..1ba5a1afd6a6 100644 --- a/.github/actions/generate-docker-tag/action.yml +++ b/.github/actions/generate-docker-tag/action.yml @@ -9,17 +9,24 @@ inputs: description: 'Docker image to run commands in - follows os-arch format' required: false default: ubuntu-20.04-amd64 - + version: + description: 'Docker image version' + required: false runs: using: "composite" steps: - name: Determine Docker Tag shell: bash run: | - if [[ "${GITHUB_REF_NAME}" == "main" ]]; then - echo "IMAGE_TAG=latest" >> $GITHUB_ENV + # If the version was provided use it, otherwise, determine what the version should be. + if [ "${{ inputs.version }}" != "" ]; then + echo "IMAGE_TAG=${{ inputs.version }}" >> $GITHUB_ENV else - echo "IMAGE_TAG=dev-${GITHUB_REF_NAME//\//-}" >> $GITHUB_ENV + if [[ "${GITHUB_REF_NAME}" == "main" ]]; then + echo "IMAGE_TAG=latest" >> $GITHUB_ENV + else + echo "IMAGE_TAG=dev-${GITHUB_REF_NAME//\//-}" >> $GITHUB_ENV + fi fi - name: Determine Full Docker Image Tag shell: bash diff --git a/.github/workflows/publish-release-image.yaml b/.github/workflows/publish-release-image.yaml index f0bff5779764..64f8a2f3d291 100644 --- a/.github/workflows/publish-release-image.yaml +++ b/.github/workflows/publish-release-image.yaml @@ -91,6 +91,7 @@ jobs: uses: ./.github/actions/docker-run with: docker_os_arch: tt-metalium-${{ matrix.os }}-amd64-release/${{ matrix.test_group.arch }} + docker_version: ${{ inputs.version }} docker_password: ${{ secrets.GITHUB_TOKEN }} run_args: | ${{ matrix.test_group.cmd }}