From 7b95079dcd4ae066b736c6ea7b2d9f60ad56ad5a Mon Sep 17 00:00:00 2001
From: Bill Teng <135061747+TT-billteng@users.noreply.github.com>
Date: Mon, 14 Oct 2024 15:18:55 -0700
Subject: [PATCH] Revert "Ncvetkovic/0 bh llk test coverage dst acc" (#13784)

Revert "Ncvetkovic/0 bh llk test coverage dst acc (#13293)"

This reverts commit e58dd71d398cd2cac591e42840978c570c4a9a41.
---
 .../eltwise_copy_block_matmul_partials.cpp    |   6 +-
 .../test_kernels/compute/reconfig.cpp         |  21 +-
 .../test_kernels/compute/reduce_h.cpp         |  43 +-
 .../test_kernels/compute/reduce_hw.cpp        |  43 +-
 .../test_kernels/compute/reduce_w.cpp         |  43 +-
 .../test_copy_block_matmul_partials.cpp       | 186 +++----
 .../unit_tests/compute/test_reconfig.cpp      | 261 +++++----
 .../unit_tests/compute/test_reduce.cpp        | 365 ++++++------
 .../compute/test_untilize_tilize.cpp          | 224 +++-----
 .../compute/matmul/test_matmul_X_tile.cpp     | 520 +++++++++---------
 10 files changed, 764 insertions(+), 948 deletions(-)
diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy_block_matmul_partials.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy_block_matmul_partials.cpp
index dcd0b9f8f8b..62e58295873 100644
--- a/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy_block_matmul_partials.cpp
+++ b/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy_block_matmul_partials.cpp
@@ -9,6 +9,8 @@
 #include "compute_kernel_api/eltwise_unary/eltwise_unary.h"
 #include "compute_kernel_api.h"
 
+#define START_IN_TILE_ID              (0)
+#define START_DST_TILE_ID             (0)
 namespace NAMESPACE {
 void MAIN {
 
@@ -31,9 +33,9 @@ void MAIN {
         cb_reserve_back(out_cb_id, num_single_transfer);
 
         // Copy num_single_transfer tiles from in_cb to DEST
-        copy_block_matmul_partials(in_cb_id, 0, 0, num_single_transfer);
+        copy_block_matmul_partials(in_cb_id, START_IN_TILE_ID, START_DST_TILE_ID, num_single_transfer);
         // Pack num_single_transfer tiles to out_cb
-        matmul_pack_tile(0, out_cb_id, num_single_transfer);
+        matmul_pack_tile(START_DST_TILE_ID, out_cb_id, num_single_transfer);
 
         // Release DEST reg marking compute/pack complete
         release_dst();
diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/reconfig.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/reconfig.cpp
index c3706439495..efbb497fa8a 100644
--- a/tests/tt_metal/tt_metal/test_kernels/compute/reconfig.cpp
+++ b/tests/tt_metal/tt_metal/test_kernels/compute/reconfig.cpp
@@ -2,14 +2,16 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
-#include <cstdint>
-
 #include "compute_kernel_api/eltwise_binary.h"
+#include <cstdint>
 #include "compute_kernel_api/eltwise_unary/sfpu_split_includes.h"
 #include "compute_kernel_api/tile_move_copy.h"
 #include "compute_kernel_api/pack.h"
 #include "compute_kernel_api/reconfig_data_format.h"
 
+#define START_IN_TILE_ID              (0)
+#define START_DST_TILE_ID             (0)
+
 namespace NAMESPACE {
 void MAIN {
     uint32_t num_tiles = get_arg_val<uint32_t>(0);
@@ -18,7 +20,7 @@ void MAIN {
     constexpr auto cb_in0 = tt::CB::c_in0; // Bfp8_b
     constexpr auto cb_in1 = tt::CB::c_in1; // Bfp16_b
     constexpr auto cb_in2 = tt::CB::c_in2; // Bfp16_b
-    constexpr auto cb_out0 = tt::CB::c_out0; // Fp32
+    constexpr auto cb_out0 = tt::CB::c_out0; // Bfp16_b
     constexpr auto cb_out1 = tt::CB::c_out1; // Bfp8_b
 
 
@@ -39,17 +41,10 @@ void MAIN {
         // data inside CB_0, 2nd one inits it to Bfp16_b
         // which is inside CB_2
         copy_tile_init();
-        // This call will test copy_tile_to_dst_init_short as well
         copy_tile_to_dst_init_short_with_dt(cb_in0, cb_in2);
 
         cb_wait_front(cb_in2, ublock_size_tiles);
-#if (BLOCK_COPY == 1)
-        for (uint32_t u_cnt = 0; u_cnt < ublock_size_tiles; u_cnt++) {
-            copy_tile(cb_in2, 0, 0);
-        }
-#elif (BLOCK_COPY == 0)
-        copy_block_matmul_partials(cb_in2, 0, 0, ublock_size_tiles);
-#endif
+        copy_block_matmul_partials(cb_in2, START_IN_TILE_ID, START_DST_TILE_ID, ublock_size_tiles);
         cb_pop_front(cb_in2, ublock_size_tiles);
 
         // -------------------- Addition with acc -----------------------------
@@ -89,7 +84,7 @@ void MAIN {
         pack_reconfig_l1_acc(true);
 #endif
         // Configured already for CB_16, Bfp16_b
-        matmul_pack_tile(0, cb_out0, ublock_size_tiles);
+        matmul_pack_tile(START_DST_TILE_ID, cb_out0, ublock_size_tiles);
         // Reconfig for CB_17, Bfp8_b, then pack to CB_17
 #if (EXPLICIT_RECONFIG == 1)
         // Indices for old_output, new_output
@@ -101,7 +96,7 @@ void MAIN {
         // Not testing for L1 accumulation
         pack_reconfig_l1_acc(false);
 
-        matmul_pack_tile(0, cb_out1, ublock_size_tiles);
+        matmul_pack_tile(START_DST_TILE_ID, cb_out1, ublock_size_tiles);
         release_dst();
 
         cb_pop_front(cb_in0, ublock_size_tiles);
diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/reduce_h.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/reduce_h.cpp
index ac87cd4169d..c77a6a5ec58 100644
--- a/tests/tt_metal/tt_metal/test_kernels/compute/reduce_h.cpp
+++ b/tests/tt_metal/tt_metal/test_kernels/compute/reduce_h.cpp
@@ -4,40 +4,10 @@
 
 #include <cstdint>
 
-#include "compute_kernel_api/reduce.h"
-
-/* This dummy initialization function is called prior to reduce_init to ensure proper
- * initialization of the HW and to test reduce_init_short/reduce_init_delta calls.
- *
- * - If SHORT_INIT is defined, this function provides API calls
- *   which initialize the HW properly when supplemented with reduce_init_short or
- *   reduce_init_delta (note that these two inits are the same except for the "at_start"
- *   argument; reference reduce.h for more details).
- * - If SHORT_INIT is not defined, only the PACK configuration function is called with
- *   a negative value of the defined "at_start" template argument because full reduce_init
- *   provides other API calls.
- *
- * If "at_start = 1", the value that is passed to llk_pack_reduce_config_v2 is 0.
- * If "at_start = 0", the value that is passed to llk_pack_reduce_config_v2 is 1.
- *
- * After dummy_init is called, the proper reduce init call will be invoked with the defined
- * value of the argument, not the negated value. This will ensure that the "at_start"
- * argument is tested. Reference llk_pack_reduce_config_v2 for more details.
- */
-template<bool at_start, PoolType reduce_type = REDUCE_OP, ReduceDim reduce_dim = REDUCE_DIM>
-ALWI void dummy_init(uint32_t icb = 0, uint32_t icb_scaler = 1, uint32_t ocb = 16)
-{
-#ifdef SHORT_INIT
-    UNPACK(( llk_unpack_AB_hw_configure_disaggregated<DST_ACCUM_MODE>(icb, icb_scaler) ));
-
-    MATH(( llk_math_pack_sync_init<DST_ACCUM_MODE>() ));
-    MATH(( llk_math_hw_configure_disaggregated() ));
+#include "debug/dprint.h"
 
-    PACK(( llk_pack_init() ));
-    PACK(( llk_pack_dest_init<false, DST_ACCUM_MODE>() ));
-#endif
-    PACK(( llk_pack_reduce_config_v2<reduce_dim, !at_start, false, DST_ACCUM_MODE>(ocb) ));
-}
+#include "compute_kernel_api/reduce.h"
+#include "compute_kernel_api/eltwise_binary.h"
 
 namespace NAMESPACE {
 void MAIN {
@@ -45,12 +15,11 @@ void MAIN {
     constexpr uint32_t Ht = get_compile_time_arg_val(0);
     constexpr uint32_t Wt = get_compile_time_arg_val(1);
     constexpr uint32_t NC = get_compile_time_arg_val(2);
-    constexpr bool at_start = get_compile_time_arg_val(3);
-    dummy_init<at_start>(tt::CB::c_in0, tt::CB::c_in2);
 #ifndef SHORT_INIT
-    reduce_init<at_start>(tt::CB::c_in0, tt::CB::c_in2);
+    reduce_init<true>(tt::CB::c_in0, tt::CB::c_in2);
 #else
-    reduce_init_delta<at_start>(tt::CB::c_out0, tt::CB::c_in0, tt::CB::c_in2);
+    binary_op_init_common(tt::CB::c_in0, tt::CB::c_in2, tt::CB::c_out0);
+    reduce_init_delta<false>(tt::CB::c_out0, tt::CB::c_in0, tt::CB::c_in2);
 #endif
 
     cb_wait_front(tt::CB::c_in2, 1); // scaler tile from the reader
diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/reduce_hw.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/reduce_hw.cpp
index 905c2da51e5..9e6ab5c086c 100644
--- a/tests/tt_metal/tt_metal/test_kernels/compute/reduce_hw.cpp
+++ b/tests/tt_metal/tt_metal/test_kernels/compute/reduce_hw.cpp
@@ -4,40 +4,10 @@
 
 #include <cstdint>
 
-#include "compute_kernel_api/reduce.h"
-
-/* This dummy initialization function is called prior to reduce_init to ensure proper
- * initialization of the HW and to test reduce_init_short/reduce_init_delta calls.
- *
- * - If SHORT_INIT is defined, this function provides API calls
- *   which initialize the HW properly when supplemented with reduce_init_short or
- *   reduce_init_delta (note that these two inits are the same except for the "at_start"
- *   argument; reference reduce.h for more details).
- * - If SHORT_INIT is not defined, only the PACK configuration function is called with
- *   a negative value of the defined "at_start" template argument because full reduce_init
- *   provides other API calls.
- *
- * If "at_start = 1", the value that is passed to llk_pack_reduce_config_v2 is 0.
- * If "at_start = 0", the value that is passed to llk_pack_reduce_config_v2 is 1.
- *
- * After dummy_init is called, the proper reduce init call will be invoked with the defined
- * value of the argument, not the negated value. This will ensure that the "at_start"
- * argument is tested. Reference llk_pack_reduce_config_v2 for more details.
- */
-template<bool at_start, PoolType reduce_type = REDUCE_OP, ReduceDim reduce_dim = REDUCE_DIM>
-ALWI void dummy_init(uint32_t icb = 0, uint32_t icb_scaler = 1, uint32_t ocb = 16)
-{
-#ifdef SHORT_INIT
-    UNPACK(( llk_unpack_AB_hw_configure_disaggregated<DST_ACCUM_MODE>(icb, icb_scaler) ));
-
-    MATH(( llk_math_pack_sync_init<DST_ACCUM_MODE>() ));
-    MATH(( llk_math_hw_configure_disaggregated() ));
+#include "debug/dprint.h"
 
-    PACK(( llk_pack_init() ));
-    PACK(( llk_pack_dest_init<false, DST_ACCUM_MODE>() ));
-#endif
-    PACK(( llk_pack_reduce_config_v2<reduce_dim, !at_start, false, DST_ACCUM_MODE>(ocb) ));
-}
+#include "compute_kernel_api/reduce.h"
+#include "compute_kernel_api/eltwise_binary.h"
 
 namespace NAMESPACE {
 void MAIN {
@@ -45,12 +15,11 @@ void MAIN {
     constexpr uint32_t Ht = get_compile_time_arg_val(0);
     constexpr uint32_t Wt = get_compile_time_arg_val(1);
     constexpr uint32_t NC = get_compile_time_arg_val(2);
-    constexpr bool at_start = get_compile_time_arg_val(3);
-    dummy_init<at_start>(tt::CB::c_in0, tt::CB::c_in2);
 #ifndef SHORT_INIT
-    reduce_init<at_start>(tt::CB::c_in0, tt::CB::c_in2);
+    reduce_init<true>(tt::CB::c_in0, tt::CB::c_in2);
 #else
-    reduce_init_delta<at_start>(tt::CB::c_out0, tt::CB::c_in0, tt::CB::c_in2);
+    binary_op_init_common(tt::CB::c_in0, tt::CB::c_in2, tt::CB::c_out0);
+    reduce_init_delta<false>(tt::CB::c_out0, tt::CB::c_in0, tt::CB::c_in2);
 #endif
 
     cb_wait_front(tt::CB::c_in2, 1); // scaler tile from the reader
diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/reduce_w.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/reduce_w.cpp
index e413429e8a3..ededadfb0b2 100644
--- a/tests/tt_metal/tt_metal/test_kernels/compute/reduce_w.cpp
+++ b/tests/tt_metal/tt_metal/test_kernels/compute/reduce_w.cpp
@@ -4,40 +4,10 @@
 
 #include <cstdint>
 
-#include "compute_kernel_api/reduce.h"
-
-/* This dummy initialization function is called prior to reduce_init to ensure proper
- * initialization of the HW and to test reduce_init_short/reduce_init_delta calls.
- *
- * - If SHORT_INIT is defined, this function provides API calls
- *   which initialize the HW properly when supplemented with reduce_init_short or
- *   reduce_init_delta (note that these two inits are the same except for the "at_start"
- *   argument; reference reduce.h for more details).
- * - If SHORT_INIT is not defined, only the PACK configuration function is called with
- *   a negative value of the defined "at_start" template argument because full reduce_init
- *   provides other API calls.
- *
- * If "at_start = 1", the value that is passed to llk_pack_reduce_config_v2 is 0.
- * If "at_start = 0", the value that is passed to llk_pack_reduce_config_v2 is 1.
- *
- * After dummy_init is called, the proper reduce init call will be invoked with the defined
- * value of the argument, not the negated value. This will ensure that the "at_start"
- * argument is tested. Reference llk_pack_reduce_config_v2 for more details.
- */
-template<bool at_start, PoolType reduce_type = REDUCE_OP, ReduceDim reduce_dim = REDUCE_DIM>
-ALWI void dummy_init(uint32_t icb = 0, uint32_t icb_scaler = 1, uint32_t ocb = 16)
-{
-#ifdef SHORT_INIT
-    UNPACK(( llk_unpack_AB_hw_configure_disaggregated<DST_ACCUM_MODE>(icb, icb_scaler) ));
-
-    MATH(( llk_math_pack_sync_init<DST_ACCUM_MODE>() ));
-    MATH(( llk_math_hw_configure_disaggregated() ));
+#include "debug/dprint.h"
 
-    PACK(( llk_pack_init() ));
-    PACK(( llk_pack_dest_init<false, DST_ACCUM_MODE>() ));
-#endif
-    PACK(( llk_pack_reduce_config_v2<reduce_dim, !at_start, false, DST_ACCUM_MODE>(ocb) ));
-}
+#include "compute_kernel_api/reduce.h"
+#include "compute_kernel_api/eltwise_binary.h"
 
 namespace NAMESPACE {
 void MAIN {
@@ -45,12 +15,11 @@ void MAIN {
     constexpr uint32_t Ht = get_compile_time_arg_val(0);
     constexpr uint32_t Wt = get_compile_time_arg_val(1);
     constexpr uint32_t NC = get_compile_time_arg_val(2);
-    constexpr bool at_start = get_compile_time_arg_val(3);
-    dummy_init<at_start>(tt::CB::c_in0, tt::CB::c_in2);
 #ifndef SHORT_INIT
-    reduce_init<at_start>(tt::CB::c_in0, tt::CB::c_in2);
+    reduce_init<true>(tt::CB::c_in0, tt::CB::c_in2);
 #else
-    reduce_init_delta<at_start>(tt::CB::c_out0, tt::CB::c_in0, tt::CB::c_in2);
+    binary_op_init_common(tt::CB::c_in0, tt::CB::c_in2, tt::CB::c_out0);
+    reduce_init_delta<false>(tt::CB::c_out0, tt::CB::c_in0, tt::CB::c_in2);
 #endif
 
     cb_wait_front(tt::CB::c_in2, 1); // scaler tile from the reader
diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_copy_block_matmul_partials.cpp b/tests/tt_metal/tt_metal/unit_tests/compute/test_copy_block_matmul_partials.cpp
index cb9f7bbaf69..5cd7d0f4f24 100644
--- a/tests/tt_metal/tt_metal/unit_tests/compute/test_copy_block_matmul_partials.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/compute/test_copy_block_matmul_partials.cpp
@@ -3,30 +3,25 @@
 // SPDX-License-Identifier: Apache-2.0
 
 #include "device_fixture.hpp"
-#include "tt_metal/test_utils/stimulus.hpp"
-
 
 using namespace tt;
-using namespace tt::test_utils;
 
 namespace unit_tests::compute::matmul_partials {
 
 struct CopyBlockMatmulPartialsConfig {
-    uint32_t single_tile_size = 2 * 32 * 32;
-    uint32_t num_tiles = 1;
-    // *_ublock defines no. of tiles finished with single LLK API call:
-    uint32_t reader_ublock = 1;
-    uint32_t writer_ublock = 1;
-    uint32_t compute_ublock = 1;
-    uint32_t src0_cb_index = 0;
-    uint32_t ouput_cb_index = 16;
-    // Whether or not we want the result to be stored in DST in FP32:
-    bool fp32_dest_acc_en = false;
-    // Whether or not to sync full/half DST between MATH and PACK:
-    bool dst_full_sync_en = false;
+    uint32_t single_tile_size;
+    uint32_t num_tiles;
+    uint32_t reader_ublock;
+    uint32_t writer_ublock;
+    uint32_t compute_ublock;
+    uint32_t src0_cb_index;
+    uint32_t ouput_cb_index;
+    bool dst_full_sync_en;
 };
 
 void run_single_core_copy_block_matmul_partials(tt_metal::Device* device, const CopyBlockMatmulPartialsConfig& test_config) {
+
+
     ////////////////////////////////////////////////////////////////////////////
     //                      Application Setup
     ////////////////////////////////////////////////////////////////////////////
@@ -54,24 +49,14 @@ void run_single_core_copy_block_matmul_partials(tt_metal::Device* device, const
 
     uint32_t src0_cb_index = test_config.src0_cb_index;
     uint32_t num_input_tiles = test_config.reader_ublock;
-
     tt_metal::CircularBufferConfig cb_src0_config = tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size, {{src0_cb_index, tt::DataFormat::Float16_b}})
         .set_page_size(src0_cb_index, single_tile_size);
-
-    if (test_config.fp32_dest_acc_en) {
-        cb_src0_config = tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size, {{src0_cb_index, tt::DataFormat::Float32}})
-            .set_page_size(src0_cb_index, single_tile_size);
-    }
     auto cb_src0 = tt_metal::CreateCircularBuffer(program, core, cb_src0_config);
 
     uint32_t ouput_cb_index = test_config.ouput_cb_index;
     uint32_t num_output_tiles = test_config.writer_ublock;
     tt_metal::CircularBufferConfig cb_output_config = tt_metal::CircularBufferConfig(num_output_tiles * single_tile_size, {{ouput_cb_index, tt::DataFormat::Float16_b}})
         .set_page_size(ouput_cb_index, single_tile_size);
-    if (test_config.fp32_dest_acc_en) {
-        cb_output_config = tt_metal::CircularBufferConfig(num_output_tiles * single_tile_size, {{ouput_cb_index, tt::DataFormat::Float32}})
-            .set_page_size(ouput_cb_index, single_tile_size);
-    }
     auto cb_output = tt_metal::CreateCircularBuffer(program, core, cb_output_config);
 
     auto unary_reader_kernel = tt_metal::CreateKernel(
@@ -93,36 +78,21 @@ void run_single_core_copy_block_matmul_partials(tt_metal::Device* device, const
         uint(ouput_cb_index) // Output CB idx
     };
 
-    std::map<string, string> defines;
-    if (test_config.fp32_dest_acc_en) {
-        defines["DST_ACCUM_MODE"] = "1";
-    }
     auto eltwise_unary_kernel = tt_metal::CreateKernel(
         program,
         "tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy_block_matmul_partials.cpp",
         core,
-        tt_metal::ComputeConfig{.fp32_dest_acc_en = test_config.fp32_dest_acc_en,
-                                .dst_full_sync_en = test_config.dst_full_sync_en,
-                                .compile_args = compute_kernel_args,
-                                .defines = defines}
+        tt_metal::ComputeConfig{.dst_full_sync_en = test_config.dst_full_sync_en,
+                                .compile_args = compute_kernel_args}
     );
 
+
     ////////////////////////////////////////////////////////////////////////////
     //                      Execute Application
     ////////////////////////////////////////////////////////////////////////////
-    std::vector<uint32_t> src_vec = create_random_vector_of_bfloat16(
-        dram_buffer_size, 100, 0);
-
-    if (test_config.fp32_dest_acc_en) {
-        auto src_vec_float = generate_uniform_random_vector<float>(
-            -100, 100, dram_buffer_size/sizeof(float), 0);
-        for (auto i = 0; i < src_vec.size(); i++) {
-            std::memcpy(&src_vec[i], &src_vec_float[i], sizeof(float));
-            src_vec[i] &= 0xFFFFE000;
-        }
-    }
-
-    tt_metal::detail::WriteToBuffer(src_dram_buffer_bf16, src_vec);
+    std::vector<uint32_t> src_vec_bf16 = create_random_vector_of_bfloat16(
+        dram_buffer_size, 100, std::chrono::system_clock::now().time_since_epoch().count());
+    tt_metal::detail::WriteToBuffer(src_dram_buffer_bf16, src_vec_bf16);
 
     tt_metal::SetRuntimeArgs(
         program,
@@ -153,71 +123,97 @@ void run_single_core_copy_block_matmul_partials(tt_metal::Device* device, const
     std::vector<uint32_t> result_vec_bf16;
     tt_metal::detail::ReadFromBuffer(dst_dram_buffer, result_vec_bf16);
 
+
     ////////////////////////////////////////////////////////////////////////////
     //                      Validation & Teardown
     ////////////////////////////////////////////////////////////////////////////
-    EXPECT_EQ(src_vec.size(), result_vec_bf16.size());
-    EXPECT_EQ(src_vec, result_vec_bf16);
+    EXPECT_EQ(src_vec_bf16.size(), result_vec_bf16.size());
+    EXPECT_EQ(src_vec_bf16, result_vec_bf16);
+
+
 }
 } // namespace unit_tests::compute::matmul_partials
 
 ////////////////////////////////////////////////////////////////////////////
-//                             Test Description
+//                             Tests
 // ------------------------------------------------------------------------
-// These tests aim to cover usage of these API calls:
+// These tests aim to cover usage of these calls:
 // - copy_block_matmul_partials
 // - matmul_pack_tile
+//
+// Tests which contain a string in RXWYCZ format in their name cover
+// different scenarios in reader/writer/compute kernel usage. Letters
+// R, W and C represent reader, writer and compute kernel, respectively,
+// while the numbers X, Y and Z represent how many tiles will a kernel
+// move in a single loop iteration. This is important because depending
+// on these numbers, synchronization points are met at different places.
+// Since there can be a maximum of 8 32-by-32 tiles in DEST reg when using
+// half of it (for MATH/PACK sync purporses), highest bandwidth is achieved
+// when all three parameters are 8. It's also possible to enforce MATH/PACK
+// serialization by telling writer to wait for a single tile to be avail-
+// able in output CB.
+//
 ////////////////////////////////////////////////////////////////////////////
+TEST_F(DeviceFixture, ComputeCopyBlockMatmulPartialsR8W8C8) {
+    for (bool dst_full_sync_en : {true, false}) {
+        unit_tests::compute::matmul_partials::CopyBlockMatmulPartialsConfig test_config = {
+            .single_tile_size = 2 * 1024,
+            .num_tiles = 8,
+            .reader_ublock = 8,
+            .writer_ublock = 8,
+            .compute_ublock = 8,
+            .src0_cb_index = 0,
+            .ouput_cb_index = 16,
+            .dst_full_sync_en = dst_full_sync_en
+        };
+        unit_tests::compute::matmul_partials::run_single_core_copy_block_matmul_partials(this->devices_.at(0), test_config);
+    }
+}
 
-TEST_F(DeviceFixture, ComputeCopyBlockSingle) {
-    for (bool fp32_dest_acc_en : {true, false}) {
-        // FP32 dest acc not possible for GS
-        if ((fp32_dest_acc_en == true) && (this->arch_ == tt::ARCH::GRAYSKULL)) continue;
-        for (bool dst_full_sync_en : {true, false}) {
-            log_info(LogTest, "FP32DestAcc = {}, DstSyncFull = {}", fp32_dest_acc_en, dst_full_sync_en);
-            unit_tests::compute::matmul_partials::CopyBlockMatmulPartialsConfig test_config = {
-                .num_tiles = 8,
-                .fp32_dest_acc_en = fp32_dest_acc_en,
-                .dst_full_sync_en = dst_full_sync_en
-            };
-            unit_tests::compute::matmul_partials::run_single_core_copy_block_matmul_partials(this->devices_.at(0), test_config);
-        }
+TEST_F(DeviceFixture, ComputeCopyBlockMatmulPartialsR8W8C1) {
+    for (bool dst_full_sync_en : {true, false}) {
+        unit_tests::compute::matmul_partials::CopyBlockMatmulPartialsConfig test_config = {
+            .single_tile_size = 2 * 1024,
+            .num_tiles = 8,
+            .reader_ublock = 8,
+            .writer_ublock = 8,
+            .compute_ublock = 1,
+            .src0_cb_index = 0,
+            .ouput_cb_index = 16,
+            .dst_full_sync_en = dst_full_sync_en
+        };
+        unit_tests::compute::matmul_partials::run_single_core_copy_block_matmul_partials(this->devices_.at(0), test_config);
     }
 }
-TEST_F(DeviceFixture, ComputeCopyBlockMultiple) {
-    for (bool fp32_dest_acc_en : {true, false}) {
-        // FP32 dest acc not possible for GS
-        if ((fp32_dest_acc_en == true) && (this->arch_ == tt::ARCH::GRAYSKULL)) continue;
-        for (bool dst_full_sync_en : {true, false}) {
-            log_info(LogTest, "FP32DestAcc = {}, DstSyncFull = {}", fp32_dest_acc_en, dst_full_sync_en);
-            unit_tests::compute::matmul_partials::CopyBlockMatmulPartialsConfig test_config = {
-                .num_tiles = 8,
-                .reader_ublock = 8,
-                .writer_ublock = 8,
-                .compute_ublock = 8,
-                .fp32_dest_acc_en = fp32_dest_acc_en,
-                .dst_full_sync_en = dst_full_sync_en
-            };
-            unit_tests::compute::matmul_partials::run_single_core_copy_block_matmul_partials(this->devices_.at(0), test_config);
-        }
+
+TEST_F(DeviceFixture, ComputeCopyBlockMatmulPartialsR8W1C1) {
+    for (bool dst_full_sync_en : {true, false}) {
+        unit_tests::compute::matmul_partials::CopyBlockMatmulPartialsConfig test_config = {
+            .single_tile_size = 2 * 1024,
+            .num_tiles = 8,
+            .reader_ublock = 8,
+            .writer_ublock = 1,
+            .compute_ublock = 1,
+            .src0_cb_index = 0,
+            .ouput_cb_index = 16,
+            .dst_full_sync_en = dst_full_sync_en
+        };
+        unit_tests::compute::matmul_partials::run_single_core_copy_block_matmul_partials(this->devices_.at(0), test_config);
     }
 }
 
-TEST_F(DeviceFixture, ComputeCopyBlockComputeBottleneck) {
-    for (bool fp32_dest_acc_en : {true, false}) {
-        // FP32 dest acc not possible for GS
-        if ((fp32_dest_acc_en == true) && (this->arch_ == tt::ARCH::GRAYSKULL)) continue;
-        for (bool dst_full_sync_en : {true, false}) {
-            log_info(LogTest, "FP32DestAcc = {}, DstSyncFull = {}", fp32_dest_acc_en, dst_full_sync_en);
-                unit_tests::compute::matmul_partials::CopyBlockMatmulPartialsConfig test_config = {
-                .num_tiles = 8,
-                .reader_ublock = 8,
-                .writer_ublock = 8,
-                .compute_ublock = 1,
-                .fp32_dest_acc_en = fp32_dest_acc_en,
-                .dst_full_sync_en = dst_full_sync_en
-            };
-            unit_tests::compute::matmul_partials::run_single_core_copy_block_matmul_partials(this->devices_.at(0), test_config);
-        }
+TEST_F(DeviceFixture, ComputeCopyBlockMatmulPartialsR1W1C1) {
+    for (bool dst_full_sync_en : {true, false}) {
+        unit_tests::compute::matmul_partials::CopyBlockMatmulPartialsConfig test_config = {
+            .single_tile_size = 2 * 1024,
+            .num_tiles = 1,
+            .reader_ublock = 1,
+            .writer_ublock = 1,
+            .compute_ublock = 1,
+            .src0_cb_index = 0,
+            .ouput_cb_index = 16,
+            .dst_full_sync_en = dst_full_sync_en
+        };
+        unit_tests::compute::matmul_partials::run_single_core_copy_block_matmul_partials(this->devices_.at(0), test_config);
     }
 }
diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_reconfig.cpp b/tests/tt_metal/tt_metal/unit_tests/compute/test_reconfig.cpp
index 95c902c026f..9d48d09ccaa 100644
--- a/tests/tt_metal/tt_metal/unit_tests/compute/test_reconfig.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/compute/test_reconfig.cpp
@@ -2,7 +2,6 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 
-#include <variant>
 #include "device_fixture.hpp"
 #include "tt_metal/common/bfloat8.hpp"
 #include "tt_metal/test_utils/comparison.hpp"
@@ -14,58 +13,38 @@ namespace unit_tests::compute::reconfig {
 
 struct ReconfigConfig {
     size_t num_tiles = 0;
-    // Number of tiles finished with single LLK API call:
     size_t ublock_size_tiles = 0;
-    // Reconfig LLK API calls can either explicitly or implicitly take previous
-    // CB indices; which version of the call is used is defined by this flag:
     bool explicit_reconfig = false;
-    // Some reconfig calls are joined for SrcA/B; whether split or joined calls
-    // are used is defined with this flag:
     bool split_src_reconfig = false;
-    // This flag defines whether regular packing to L1 is used, or the one
-    // where the result is accumulated with the previous value:
     bool l1_acc = false;
-    // Whether or not we want the result to be stored in DST in FP32 and/or
-    // accumulated with previous DST value is controlled with this flag:
-    bool fp32_dest_acc_en = false;
-    // Whether to test with copy_tile or copy_block_matmul_partials is contro-
-    // lled with this flag:
-    bool block_copy = true;
-    // Whether or not to sync full/half DST between MATH and PACK:
     bool dst_full_sync_en = false;
 };
 
-using VariantVectorType = std::variant<std::vector<float>, std::vector<bfloat16>>;
-
 /// @brief Does Dramx3 --> Reader --> CB --> Add with acc --> CB --> Writer --> Dram
 /// @param device
 /// @param test_config - Configuration of the test -- see struct
 /// @return
 bool single_core_reconfig(tt_metal::Device* device, const ReconfigConfig& test_config) {
+    bool pass = true;
+
 
     ////////////////////////////////////////////////////////////////////////////
     //                      Application Setup
     ////////////////////////////////////////////////////////////////////////////
-    bool pass = true;
     uint32_t in0_id = 0;
     uint32_t in1_id = 1;
     uint32_t in2_id = 2;
     uint32_t out0_id = 16;
     uint32_t out1_id = 17;
     static float out0_result_old = 0;
-    // Since golden is not perfect, some corner cases for these values will
-    // make the tests fail. However, this is a representative example since
-    // it utilizes the full BFP16 presicion and range:
-    float in0_val = 1.0;
-    float in1_val = 127.0;
-    float in2_val = 0.0078125;
-    uint32_t single_tile_size_fp32 = 4 * 32 * 32;              // Single 32x32 tile size for Float32
-    uint32_t single_tile_size_bfp16b = 2 * 32 * 32;            // Single 32x32 tile size for Float16_b
-    uint32_t single_tile_size_bfp8b = 1 * 32 * 32 + 64;        // Single 32x32 tile size for Bfp8_b
-    uint32_t single_tile_size_out0 = test_config.fp32_dest_acc_en ? single_tile_size_fp32 : single_tile_size_bfp16b;
+    // Since golden is not perfect, don't change these values much
+    float in0_val = 1.8601;
+    float in1_val = 0.0003;
+    float in2_val = 2.03456;
+    uint32_t single_tile_size_bfp16b = 2 * 32 * 32;              // Single 32x32 tile size for Float16_b
+    uint32_t single_tile_size_bfp8b = 1 * 32 * 32 + 64;          // Single 32x32 tile size for Bfp8_b
     const size_t dram_buffer_size_bfp16b = test_config.num_tiles * single_tile_size_bfp16b;
     const size_t dram_buffer_size_bfp8b = test_config.num_tiles * single_tile_size_bfp8b;
-    const size_t dram_buffer_size_out0 = test_config.num_tiles * single_tile_size_out0;
 
     CoreCoord core = {0, 0};
     tt_metal::Program program = tt_metal::CreateProgram();
@@ -76,9 +55,6 @@ bool single_core_reconfig(tt_metal::Device* device, const ReconfigConfig& test_c
     tt::tt_metal::InterleavedBufferConfig dram_config_bfp8b{
         .device = device, .size = dram_buffer_size_bfp8b, .page_size = dram_buffer_size_bfp8b, .buffer_type = tt::tt_metal::BufferType::DRAM};
 
-    tt::tt_metal::InterleavedBufferConfig dram_config_out0{
-        .device = device, .size = dram_buffer_size_out0, .page_size = dram_buffer_size_out0, .buffer_type = tt::tt_metal::BufferType::DRAM};
-
     // This will be srcB in Bfp8_b
     auto input0_dram_buffer = CreateBuffer(dram_config_bfp8b);
     uint32_t input0_dram_byte_address = input0_dram_buffer->address();
@@ -94,8 +70,8 @@ bool single_core_reconfig(tt_metal::Device* device, const ReconfigConfig& test_c
     uint32_t input2_dram_byte_address = input2_dram_buffer->address();
     auto input2_dram_noc_xy = input2_dram_buffer->noc_coordinates();
 
-    // This will be Output0 in Float32 or Float16_b depending on fp32_dest_acc_en
-    auto output0_dram_buffer = CreateBuffer(dram_config_out0);
+    // This will be Output0 in Float16_b
+    auto output0_dram_buffer = CreateBuffer(dram_config_bfp16b);
     uint32_t output0_dram_byte_address = output0_dram_buffer->address();
     auto output0_dram_noc_xy = output0_dram_buffer->noc_coordinates();
 
@@ -120,8 +96,8 @@ bool single_core_reconfig(tt_metal::Device* device, const ReconfigConfig& test_c
     auto l1_input2_cb = tt_metal::CreateCircularBuffer(program, core, l1_input2_cb_config);
 
     tt_metal::CircularBufferConfig l1_output0_cb_config =
-        tt_metal::CircularBufferConfig(dram_buffer_size_out0, {{out0_id, (test_config.fp32_dest_acc_en ? tt::DataFormat::Float32 : tt::DataFormat::Float16_b)}})
-            .set_page_size(out0_id, single_tile_size_out0);
+        tt_metal::CircularBufferConfig(dram_buffer_size_bfp16b, {{out0_id, tt::DataFormat::Float16_b}})
+            .set_page_size(out0_id, single_tile_size_bfp16b);
     auto l1_output0_cb = tt_metal::CreateCircularBuffer(program, core, l1_output0_cb_config);
 
     tt_metal::CircularBufferConfig l1_output1_cb_config =
@@ -132,12 +108,22 @@ bool single_core_reconfig(tt_metal::Device* device, const ReconfigConfig& test_c
     vector<uint32_t> compute_kernel_args = {};
     std::map<string, string> defines;
 
-
-    defines["DST_ACCUM_MODE"] = "1"; // Needed always in order for reader kernel to load data from CB2
-    defines["EXPLICIT_RECONFIG"] = test_config.explicit_reconfig ? "1" : "0";
-    defines["SPLIT_SRC_RECONFIG"] = test_config.split_src_reconfig ? "1" : "0";
-    defines["BLOCK_COPY"] = test_config.block_copy ? "1" : "0";
-    defines["L1_ACC"] = test_config.l1_acc ? "1" : "0";
+    defines["DST_ACCUM_MODE"] = "1";
+    if (test_config.explicit_reconfig) {
+        defines["EXPLICIT_RECONFIG"] = "1";
+    } else {
+        defines["EXPLICIT_RECONFIG"] = "0";
+    }
+    if (test_config.split_src_reconfig) {
+        defines["SPLIT_SRC_RECONFIG"] = "1";
+    } else {
+        defines["SPLIT_SRC_RECONFIG"] = "0";
+    }
+    if (test_config.l1_acc) {
+        defines["L1_ACC"] = "1";
+    } else {
+        defines["L1_ACC"] = "0";
+    }
 
     auto reader_kernel = tt_metal::CreateKernel(
         program,
@@ -157,11 +143,8 @@ bool single_core_reconfig(tt_metal::Device* device, const ReconfigConfig& test_c
         program,
         "tests/tt_metal/tt_metal/test_kernels/compute/reconfig.cpp",
         core,
-        tt_metal::ComputeConfig{
-            .fp32_dest_acc_en = test_config.fp32_dest_acc_en,
-            .dst_full_sync_en = test_config.dst_full_sync_en,
-            .compile_args = compute_kernel_args,
-            .defines = defines});
+        tt_metal::ComputeConfig{.dst_full_sync_en = test_config.dst_full_sync_en,
+                                .compile_args = compute_kernel_args, .defines = defines});
 
     SetRuntimeArgs(
         program,
@@ -172,6 +155,7 @@ bool single_core_reconfig(tt_metal::Device* device, const ReconfigConfig& test_c
             uint32_t(test_config.ublock_size_tiles),
         });
 
+
     ////////////////////////////////////////////////////////////////////////////
     //                      Stimulus Generation
     ////////////////////////////////////////////////////////////////////////////
@@ -191,6 +175,7 @@ bool single_core_reconfig(tt_metal::Device* device, const ReconfigConfig& test_c
             dram_buffer_size_bfp16b,
             in2_val);
 
+
     ////////////////////////////////////////////////////////////////////////////
     //                      Golden Generation
     ////////////////////////////////////////////////////////////////////////////
@@ -202,44 +187,21 @@ bool single_core_reconfig(tt_metal::Device* device, const ReconfigConfig& test_c
     // 19 bits since that's the width of srcA/B/FPU. This is why it's
     // float32 in golden. As for golden1, it should be Bfp8_b in the end,
     // but since there's no available conversion from Float16_b to Bfp8_b,
-    // it remains in float and is then converted to Bfp8_b.
+    // it's left in float and then converted to Bfp8_b.
     std::vector<float> temp_golden(input1.size());
-
-    // It's tricky to make a variable-type vector, so create two for each case
-    // of fp32_dest_acc_en, fp32 when true, fp16 when false
-    std::vector<float> golden0_fp32(input1.size());
-    std::vector<bfloat16> golden0_bfp16(input1.size());
-    // This vector will hold unpacked Bfp8 result:
+    std::vector<bfloat16> golden0(input1.size());
     std::vector<float> golden1(input1.size());
-    // This vector will hold packed fp16_b/fp32 result:
-    std::vector<uint32_t> packed_golden0(input1.size());
     for (auto i = 0; i < temp_golden.size(); i++) {
-        // Do temp = SrcA + SrcB:
         temp_golden[i] = input1[i].to_float() + bfloat16(input0[i]).to_float();
-        // Do temp + DST, store in out0 vector depending on fp32_dest_acc_en:
-        if (test_config.fp32_dest_acc_en) {
-            golden0_fp32[i] = temp_golden[i] + input2[i].to_float();
-        } else {
-            golden0_bfp16[i] = bfloat16(temp_golden[i] + input2[i].to_float());
-        }
-        // Do out1 = temp + DST:
+        golden0[i] = bfloat16(temp_golden[i] + input2[i].to_float());
         golden1[i] = bfloat16(temp_golden[i] + input2[i].to_float()).to_float();
-        // Do out0[bfp16] = temp + L1, this makes sense only if not fp32_dest_acc_en:
-        if (test_config.l1_acc && !test_config.fp32_dest_acc_en) {
-            golden0_bfp16[i] = bfloat16(golden0_bfp16[i].to_float() + out0_result_old);
+        if (test_config.l1_acc) {
+            golden0[i] = bfloat16(golden0[i].to_float() + out0_result_old);
         } else {
-            out0_result_old = golden0_bfp16[i].to_float();
-        }
-        // Cast float32 to "packed "uint32 out0 vector if fp32_dest_acc_en:
-        if (test_config.fp32_dest_acc_en) {
-            packed_golden0[i] = std::bit_cast<uint32_t>(golden0_fp32[i]);
+            out0_result_old = golden0[i].to_float();
         }
     }
-    // Pack out0 vector if not fp32_dest_acc_en:
-    if (!test_config.fp32_dest_acc_en) {
-        packed_golden0 = pack_vector<uint32_t, bfloat16>(golden0_bfp16);
-    }
-    // Pack out1 vector:
+    std::vector<uint32_t> packed_golden0 = pack_vector<uint32_t, bfloat16>(golden0);
     std::vector<uint32_t> packed_golden1 = pack_fp32_vec_as_bfp8_tiles(golden1, true, false);
 
     // ////////////////////////////////////////////////////////////////////////////
@@ -297,67 +259,95 @@ bool single_core_reconfig(tt_metal::Device* device, const ReconfigConfig& test_c
         dest0_buffer_data,
         packed_golden0,
         [&](const bfloat16& a, const bfloat16& b) {
-            return is_close(a, b, 0.0155f);
+            return is_close(a, b, 0.015f);
         });
     pass &= is_close_packed_vectors<bfloat16, uint32_t>(
         dest1_buffer_data,
         packed_golden1,
         [&](const bfloat16& a, const bfloat16& b) {
-            return is_close(a, b, 0.0155);
+            return is_close(a, b, 0.015f);
         });
 
     return pass;
 }
 }  // namespace unit_tests::compute::binary
 
-////////////////////////////////////////////////////////////////////////////
-//                             Test Description
-// ------------------------------------------------------------------------
-// These tests aim to cover usage of these API calls:
-// - copy_tile_init
-// - copy_tile_to_dst_init_short
-// - copy_tile_to_dst_init_short_with_dt
-// - unpack_reconfig_data_format
-// - unpack_reconfig_data_format_srca
-// - unpack_reconfig_data_format_srcb
-// - pack_reconfig_l1_acc
-////////////////////////////////////////////////////////////////////////////
-
-TEST_F(DeviceFixture, TileCopyReconfigExplicitSplitDstAcc) {
+TEST_F(DeviceFixture, TileCopyReconfigExplicitSplit) {
     auto arch = this->arch_;
     if (arch == tt::ARCH::GRAYSKULL) {
         GTEST_SKIP();
     }
-    for (bool explicit_reconfig : {true, false}) {
-        for (bool split_src_reconfig : {true, false}) {
-            for (bool fp32_dest_acc_en : {true, false}) {
-                for (bool block_copy : {true, false}) {
-                    for (bool dst_full_sync_en : {true, false}) {
-                        log_info(LogTest, "Block Copy = {}, "
-                                        "Explicit = {}, "
-                                        "Split = {}, "
-                                        "FP32DestAcc = {}"
-                                        "DstSyncFull = {}.",
-                                        block_copy,
-                                        explicit_reconfig,
-                                        split_src_reconfig,
-                                        fp32_dest_acc_en,
-                                        dst_full_sync_en);
-                        unit_tests::compute::reconfig::ReconfigConfig test_config = {
-                            .num_tiles = 1,
-                            .ublock_size_tiles = 1,
-                            .explicit_reconfig = explicit_reconfig,
-                            .split_src_reconfig = split_src_reconfig,
-                            .fp32_dest_acc_en = fp32_dest_acc_en,
-                            .block_copy = block_copy,
-                            .dst_full_sync_en = dst_full_sync_en
-                        };
-                        for (unsigned int id = 0; id < num_devices_; id++) {
-                            ASSERT_TRUE(unit_tests::compute::reconfig::single_core_reconfig(devices_.at(id), test_config));
-                        }
-                    }
-                }
-            }
+
+    for (bool dst_full_sync_en : {true, false}) {
+        unit_tests::compute::reconfig::ReconfigConfig test_config = {
+            .num_tiles = 1,
+            .ublock_size_tiles = 1,
+            .explicit_reconfig = true,
+            .split_src_reconfig = true,
+            .dst_full_sync_en = dst_full_sync_en
+        };
+        for (unsigned int id = 0; id < num_devices_; id++) {
+            ASSERT_TRUE(unit_tests::compute::reconfig::single_core_reconfig(devices_.at(id), test_config));
+        }
+    }
+}
+
+TEST_F(DeviceFixture, TileCopyReconfigExplicitJoined) {
+    auto arch = this->arch_;
+    if (arch == tt::ARCH::GRAYSKULL) {
+        GTEST_SKIP();
+    }
+
+    for (bool dst_full_sync_en : {true, false}) {
+        unit_tests::compute::reconfig::ReconfigConfig test_config = {
+            .num_tiles = 1,
+            .ublock_size_tiles = 1,
+            .explicit_reconfig = true,
+            .split_src_reconfig = false,
+            .dst_full_sync_en = dst_full_sync_en
+        };
+        for (unsigned int id = 0; id < num_devices_; id++) {
+            ASSERT_TRUE(unit_tests::compute::reconfig::single_core_reconfig(devices_.at(id), test_config));
+        }
+    }
+}
+
+TEST_F(DeviceFixture, TileCopyReconfigImplicitSplit) {
+    auto arch = this->arch_;
+    if (arch == tt::ARCH::GRAYSKULL) {
+        GTEST_SKIP();
+    }
+
+    for (bool dst_full_sync_en : {true, false}) {
+        unit_tests::compute::reconfig::ReconfigConfig test_config = {
+            .num_tiles = 1,
+            .ublock_size_tiles = 1,
+            .explicit_reconfig = false,
+            .split_src_reconfig = true,
+            .dst_full_sync_en = dst_full_sync_en
+        };
+        for (unsigned int id = 0; id < num_devices_; id++) {
+            ASSERT_TRUE(unit_tests::compute::reconfig::single_core_reconfig(devices_.at(id), test_config));
+        }
+    }
+}
+
+TEST_F(DeviceFixture, TileCopyReconfigImplicitJoined) {
+    auto arch = this->arch_;
+    if (arch == tt::ARCH::GRAYSKULL) {
+        GTEST_SKIP();
+    }
+
+    for (bool dst_full_sync_en : {true, false}) {
+        unit_tests::compute::reconfig::ReconfigConfig test_config = {
+            .num_tiles = 1,
+            .ublock_size_tiles = 1,
+            .explicit_reconfig = false,
+            .split_src_reconfig = false,
+            .dst_full_sync_en = dst_full_sync_en
+        };
+        for (unsigned int id = 0; id < num_devices_; id++) {
+            ASSERT_TRUE(unit_tests::compute::reconfig::single_core_reconfig(devices_.at(id), test_config));
         }
     }
 }
@@ -367,17 +357,20 @@ TEST_F(DeviceFixture, TileCopyReconfigL1Acc) {
     if (arch == tt::ARCH::GRAYSKULL) {
         GTEST_SKIP();
     }
-    for (bool l1_acc : {true, false}) {
-        for (bool dst_full_sync_en : {true, false}) {
-            log_info(LogTest, "L1 accumulation is {}, DstSyncFull = {}", l1_acc ? "on." : "off.", dst_full_sync_en);
-            unit_tests::compute::reconfig::ReconfigConfig test_config = {
-                .num_tiles = 1,
-                .ublock_size_tiles = 1,
-                .dst_full_sync_en = dst_full_sync_en
-            };
-            for (unsigned int id = 0; id < num_devices_; id++) {
-                ASSERT_TRUE(unit_tests::compute::reconfig::single_core_reconfig(devices_.at(id), test_config));
-            }
+
+    for (bool dst_full_sync_en : {true, false}) {
+        unit_tests::compute::reconfig::ReconfigConfig test_config = {
+            .num_tiles = 1,
+            .ublock_size_tiles = 1,
+            .dst_full_sync_en = dst_full_sync_en
+        };
+        for (unsigned int id = 0; id < num_devices_; id++) {
+            test_config.l1_acc = false;
+            ASSERT_TRUE(unit_tests::compute::reconfig::single_core_reconfig(devices_.at(id), test_config));
+            log_info(LogTest, "Passed without L1 accumulation");
+            test_config.l1_acc = true;
+            ASSERT_TRUE(unit_tests::compute::reconfig::single_core_reconfig(devices_.at(id), test_config));
+            log_info(LogTest, "Passed with L1 accumulation");
         }
     }
 }
diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_reduce.cpp b/tests/tt_metal/tt_metal/unit_tests/compute/test_reduce.cpp
index ba147a19ed5..c12dfb809be 100644
--- a/tests/tt_metal/tt_metal/unit_tests/compute/test_reduce.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/compute/test_reduce.cpp
@@ -51,11 +51,8 @@ struct ReduceConfig {
     std::function<std::vector<uint16_t>(const std::vector<uint16_t>&, const std::vector<uint32_t>&, float, uint8_t, bool)> golden_function;
     std::vector<uint32_t> result_shape;
     bool math_only_reduce = false;
-    // Whether or not we want the result to be stored in DST in FP32:
     bool fp32_dest_acc_en = false;
-    // Whether or not to sync full/half DST between MATH and PACK:
     bool dst_full_sync_en = false;
-    bool at_start = false;
     MathFidelity math_fidelity = MathFidelity::HiFi4;
 };
 
@@ -294,7 +291,6 @@ void run_single_core_reduce_program(tt_metal::Device* device, const ReduceConfig
         uint(Ht),
         uint(Wt),
         uint(NC),
-        test_config.at_start,
     };
 
     std::map<string, string> reduce_defines = {
@@ -370,13 +366,6 @@ void run_single_core_reduce_program(tt_metal::Device* device, const ReduceConfig
         log_error(LogTest, "Failure position={}", argfail);
 
     EXPECT_TRUE(pass);
-    log_info(LogTest, "MathFid = {}, ReduceType = {}, FP32DestAcc = {}, DstSyncFull = {}, at_start = {}",
-        test_config.math_fidelity,
-        test_config.reduce_type,
-        test_config.fp32_dest_acc_en,
-        test_config.dst_full_sync_en,
-        test_config.at_start
-    );
 }
 
 } // namespace unit_tests::compute::reduce
@@ -396,25 +385,23 @@ TEST_F(DeviceFixture, ComputeReduceH) {
         for (uint8_t reduce_type = uint8_t(ReduceType::SUM); reduce_type <= uint8_t(ReduceType::MAX); reduce_type++) {
             for (bool fp32_dest_acc_en : {true, false}) {
                 for (bool dst_full_sync_en : {true, false}) {
-                    for (bool at_start : {true, false}) {
-                        ReduceConfig test_config = {
-                            .shape = shape,
-                            .reduce_dim = ReduceDim::H,
-                            .reduce_type = ReduceType(reduce_type),
-                            .data_gen_rand_max = 10.0f,
-                            .data_gen_seed = std::chrono::system_clock::now().time_since_epoch().count(),
-                            .data_gen_offset = -10.0f,
-                            .atol = 1e-2f,
-                            .rtol = 0.08f,
-                            .golden_function = unit_tests::compute::gold_reduce_h,
-                            .result_shape = result_shape,
-                            .fp32_dest_acc_en = fp32_dest_acc_en,
-                            .dst_full_sync_en = dst_full_sync_en,
-                            .at_start = at_start,
-                            .math_fidelity = MathFidelity(math_fid),
-                        };
-                        run_single_core_reduce_program(this->devices_.at(0), test_config);
-                    }
+                    log_info(LogTest, "MathFid = {}, ReduceType = {}, FP32DestAcc = {}, DstSyncFull = {}", math_fid, reduce_type, fp32_dest_acc_en, dst_full_sync_en);
+                    ReduceConfig test_config = {
+                        .shape = shape,
+                        .reduce_dim = ReduceDim::H,
+                        .reduce_type = ReduceType(reduce_type),
+                        .data_gen_rand_max = 10.0f,
+                        .data_gen_seed = std::chrono::system_clock::now().time_since_epoch().count(),
+                        .data_gen_offset = -10.0f,
+                        .atol = 1e-2f,
+                        .rtol = 0.08f,
+                        .golden_function = unit_tests::compute::gold_reduce_h,
+                        .result_shape = result_shape,
+                        .fp32_dest_acc_en = fp32_dest_acc_en,
+                        .dst_full_sync_en = dst_full_sync_en,
+                        .math_fidelity = MathFidelity(math_fid),
+                    };
+                    run_single_core_reduce_program(this->devices_.at(0), test_config);
                 }
             }
         }
@@ -429,27 +416,25 @@ TEST_F(DeviceFixture, ComputeReduceW) {
         if (math_fid == 1) continue;
         for (uint8_t reduce_type = uint8_t(ReduceType::SUM); reduce_type <= uint8_t(ReduceType::MAX); reduce_type++) {
             for (bool fp32_dest_acc_en : {true, false}) {
-                if ((fp32_dest_acc_en == true) && (this->arch_ == tt::ARCH::GRAYSKULL)) continue;
                 for (bool dst_full_sync_en : {true, false}) {
-                    for (bool at_start : {true, false}) {
-                        ReduceConfig test_config = {
-                            .shape = shape,
-                            .reduce_dim = ReduceDim::W,
-                            .reduce_type = ReduceType(reduce_type),
-                            .data_gen_rand_max = 10.0f,
-                            .data_gen_seed = std::chrono::system_clock::now().time_since_epoch().count(),
-                            .data_gen_offset = -10.0f,
-                            .atol = 1e-2f,
-                            .rtol = 0.08f,
-                            .golden_function = unit_tests::compute::gold_reduce_w,
-                            .result_shape = result_shape,
-                            .fp32_dest_acc_en = fp32_dest_acc_en,
-                            .dst_full_sync_en = dst_full_sync_en,
-                            .at_start = at_start,
-                            .math_fidelity = MathFidelity(math_fid),
-                        };
-                        run_single_core_reduce_program(this->devices_.at(0), test_config);
-                    }
+                    if ((fp32_dest_acc_en == true) && (this->arch_ == tt::ARCH::GRAYSKULL)) continue;
+                    log_info(LogTest, "MathFid = {}, ReduceType = {}, FP32DestAcc = {}, DstSyncFull = {}", math_fid, reduce_type, fp32_dest_acc_en, dst_full_sync_en);
+                    ReduceConfig test_config = {
+                        .shape = shape,
+                        .reduce_dim = ReduceDim::W,
+                        .reduce_type = ReduceType(reduce_type),
+                        .data_gen_rand_max = 10.0f,
+                        .data_gen_seed = std::chrono::system_clock::now().time_since_epoch().count(),
+                        .data_gen_offset = -10.0f,
+                        .atol = 1e-2f,
+                        .rtol = 0.08f,
+                        .golden_function = unit_tests::compute::gold_reduce_w,
+                        .result_shape = result_shape,
+                        .fp32_dest_acc_en = fp32_dest_acc_en,
+                        .dst_full_sync_en = dst_full_sync_en,
+                        .math_fidelity = MathFidelity(math_fid),
+                    };
+                    run_single_core_reduce_program(this->devices_.at(0), test_config);
                 }
             }
         }
@@ -464,28 +449,26 @@ TEST_F(DeviceFixture, ComputeReduceHW) {
         if (math_fid == 1) continue;
         for (uint8_t reduce_type = uint8_t(ReduceType::SUM); reduce_type <= uint8_t(ReduceType::MAX); reduce_type++) {
             for (bool fp32_dest_acc_en : {true, false}) {
-                // Currently fp32 dest unsupported with reduce scalar
-                if (fp32_dest_acc_en) continue;
                 for (bool dst_full_sync_en : {true, false}) {
-                    for (bool at_start : {true, false}) {
-                        ReduceConfig test_config = {
-                            .shape = shape,
-                            .reduce_dim = ReduceDim::HW,
-                            .reduce_type = ReduceType(reduce_type),
-                            .data_gen_rand_max = 10.0f,
-                            .data_gen_seed = std::chrono::system_clock::now().time_since_epoch().count(),
-                            .data_gen_offset = -10.0f,
-                            .atol = 1e-2f,
-                            .rtol = 0.08f,
-                            .golden_function = unit_tests::compute::gold_reduce_hw,
-                            .result_shape = result_shape,
-                            .fp32_dest_acc_en = fp32_dest_acc_en,
-                            .dst_full_sync_en = dst_full_sync_en,
-                            .at_start = at_start,
-                            .math_fidelity = MathFidelity(math_fid)
-                        };
-                        run_single_core_reduce_program(this->devices_.at(0), test_config);
-                    }
+                    // Currently fp32 dest unsupported with reduce scalar
+                    if (fp32_dest_acc_en) continue;
+                    log_info(LogTest, "MathFid = {}, ReduceType = {}, FP32DestAcc = {}, DstSyncFull = {}", math_fid, reduce_type, fp32_dest_acc_en, dst_full_sync_en);
+                    ReduceConfig test_config = {
+                        .shape = shape,
+                        .reduce_dim = ReduceDim::HW,
+                        .reduce_type = ReduceType(reduce_type),
+                        .data_gen_rand_max = 10.0f,
+                        .data_gen_seed = std::chrono::system_clock::now().time_since_epoch().count(),
+                        .data_gen_offset = -10.0f,
+                        .atol = 1e-2f,
+                        .rtol = 0.08f,
+                        .golden_function = unit_tests::compute::gold_reduce_hw,
+                        .result_shape = result_shape,
+                        .fp32_dest_acc_en = fp32_dest_acc_en,
+                        .dst_full_sync_en = dst_full_sync_en,
+                        .math_fidelity = MathFidelity(math_fid)
+                    };
+                    run_single_core_reduce_program(this->devices_.at(0), test_config);
                 }
             }
         }
@@ -505,26 +488,24 @@ TEST_F(DeviceFixture, ComputeReduceHMathOnly) {
         for (uint8_t reduce_type = uint8_t(ReduceType::SUM); reduce_type <= uint8_t(ReduceType::MAX); reduce_type++) {
             for (bool fp32_dest_acc_en : {true, false}) {
                 for (bool dst_full_sync_en : {true, false}) {
-                    for (bool at_start : {true, false}) {
-                        ReduceConfig test_config = {
-                            .shape = shape,
-                            .reduce_dim = ReduceDim::H,
-                            .reduce_type = ReduceType(reduce_type),
-                            .data_gen_rand_max = 10.0f,
-                            .data_gen_seed = std::chrono::system_clock::now().time_since_epoch().count(),
-                            .data_gen_offset = -10.0f,
-                            .atol = 1e-2f,
-                            .rtol = 0.08f,
-                            .golden_function = unit_tests::compute::gold_reduce_h,
-                            .result_shape = result_shape,
-                            .math_only_reduce = true,
-                            .fp32_dest_acc_en = fp32_dest_acc_en,
-                            .dst_full_sync_en = dst_full_sync_en,
-                            .at_start = at_start,
-                            .math_fidelity = MathFidelity(math_fid)
-                        };
-                        run_single_core_reduce_program(this->devices_.at(0), test_config);
-                    }
+                    log_info(LogTest, "MathFid = {}, ReduceType = {}, FP32DestAcc = {}, DstSyncFull = {}", math_fid, reduce_type, fp32_dest_acc_en, dst_full_sync_en);
+                    ReduceConfig test_config = {
+                        .shape = shape,
+                        .reduce_dim = ReduceDim::H,
+                        .reduce_type = ReduceType(reduce_type),
+                        .data_gen_rand_max = 10.0f,
+                        .data_gen_seed = std::chrono::system_clock::now().time_since_epoch().count(),
+                        .data_gen_offset = -10.0f,
+                        .atol = 1e-2f,
+                        .rtol = 0.08f,
+                        .golden_function = unit_tests::compute::gold_reduce_h,
+                        .result_shape = result_shape,
+                        .math_only_reduce = true,
+                        .fp32_dest_acc_en = fp32_dest_acc_en,
+                        .dst_full_sync_en = dst_full_sync_en,
+                        .math_fidelity = MathFidelity(math_fid)
+                    };
+                    run_single_core_reduce_program(this->devices_.at(0), test_config);
                 }
             }
         }
@@ -539,28 +520,26 @@ TEST_F(DeviceFixture, ComputeReduceWMathOnly) {
         if (math_fid == 1) continue;
         for (uint8_t reduce_type = uint8_t(ReduceType::SUM); reduce_type <= uint8_t(ReduceType::MAX); reduce_type++) {
             for (bool fp32_dest_acc_en : {true, false}) {
-                if ((fp32_dest_acc_en == true) && (this->arch_ == tt::ARCH::GRAYSKULL)) continue;
                 for (bool dst_full_sync_en : {true, false}) {
-                    for (bool at_start : {true, false}) {
-                        ReduceConfig test_config = {
-                            .shape = shape,
-                            .reduce_dim = ReduceDim::W,
-                            .reduce_type = ReduceType(reduce_type),
-                            .data_gen_rand_max = 10.0f,
-                            .data_gen_seed = std::chrono::system_clock::now().time_since_epoch().count(),
-                            .data_gen_offset = -10.0f,
-                            .atol = 1e-2f,
-                            .rtol = 0.08f,
-                            .golden_function = unit_tests::compute::gold_reduce_w,
-                            .result_shape = result_shape,
-                            .math_only_reduce = true,
-                            .fp32_dest_acc_en = fp32_dest_acc_en,
-                            .dst_full_sync_en = dst_full_sync_en,
-                            .at_start = at_start,
-                            .math_fidelity = MathFidelity(math_fid)
-                        };
-                        run_single_core_reduce_program(this->devices_.at(0), test_config);
-                    }
+                    if ((fp32_dest_acc_en == true) && (this->arch_ == tt::ARCH::GRAYSKULL)) continue;
+                    log_info(LogTest, "MathFid = {}, ReduceType = {}, FP32DestAcc = {}, DstSyncFull = {}", math_fid, reduce_type, fp32_dest_acc_en, dst_full_sync_en);
+                    ReduceConfig test_config = {
+                        .shape = shape,
+                        .reduce_dim = ReduceDim::W,
+                        .reduce_type = ReduceType(reduce_type),
+                        .data_gen_rand_max = 10.0f,
+                        .data_gen_seed = std::chrono::system_clock::now().time_since_epoch().count(),
+                        .data_gen_offset = -10.0f,
+                        .atol = 1e-2f,
+                        .rtol = 0.08f,
+                        .golden_function = unit_tests::compute::gold_reduce_w,
+                        .result_shape = result_shape,
+                        .math_only_reduce = true,
+                        .fp32_dest_acc_en = fp32_dest_acc_en,
+                        .dst_full_sync_en = dst_full_sync_en,
+                        .math_fidelity = MathFidelity(math_fid)
+                    };
+                    run_single_core_reduce_program(this->devices_.at(0), test_config);
                 }
             }
         }
@@ -575,29 +554,27 @@ TEST_F(DeviceFixture, ComputeReduceHWMathOnly) {
         if (math_fid == 1) continue;
         for (uint8_t reduce_type = uint8_t(ReduceType::SUM); reduce_type <= uint8_t(ReduceType::MAX); reduce_type++) {
             for (bool fp32_dest_acc_en : {true, false}) {
-                // Currently fp32 dest unsupported with reduce scalar
-                if (fp32_dest_acc_en) continue;
                 for (bool dst_full_sync_en : {true, false}) {
-                    for (bool at_start : {true, false}) {
-                        ReduceConfig test_config = {
-                            .shape = shape,
-                            .reduce_dim = ReduceDim::HW,
-                            .reduce_type = ReduceType(reduce_type),
-                            .data_gen_rand_max = 10.0f,
-                            .data_gen_seed = std::chrono::system_clock::now().time_since_epoch().count(),
-                            .data_gen_offset = -10.0f,
-                            .atol = 1e-2f,
-                            .rtol = 0.08f,
-                            .golden_function = unit_tests::compute::gold_reduce_hw,
-                            .result_shape = result_shape,
-                            .math_only_reduce = true,
-                            .fp32_dest_acc_en = fp32_dest_acc_en,
-                            .dst_full_sync_en = dst_full_sync_en,
-                            .at_start = at_start,
-                            .math_fidelity = MathFidelity(math_fid)
-                        };
-                        run_single_core_reduce_program(this->devices_.at(0), test_config);
-                    }
+                    // Currently fp32 dest unsupported with reduce scalar
+                    if (fp32_dest_acc_en) continue;
+                    log_info(LogTest, "MathFid = {}, ReduceType = {}, FP32DestAcc = {}, DstSyncFull = {}", math_fid, reduce_type, fp32_dest_acc_en, dst_full_sync_en);
+                    ReduceConfig test_config = {
+                        .shape = shape,
+                        .reduce_dim = ReduceDim::HW,
+                        .reduce_type = ReduceType(reduce_type),
+                        .data_gen_rand_max = 10.0f,
+                        .data_gen_seed = std::chrono::system_clock::now().time_since_epoch().count(),
+                        .data_gen_offset = -10.0f,
+                        .atol = 1e-2f,
+                        .rtol = 0.08f,
+                        .golden_function = unit_tests::compute::gold_reduce_hw,
+                        .result_shape = result_shape,
+                        .math_only_reduce = true,
+                        .fp32_dest_acc_en = fp32_dest_acc_en,
+                        .dst_full_sync_en = dst_full_sync_en,
+                        .math_fidelity = MathFidelity(math_fid)
+                    };
+                    run_single_core_reduce_program(this->devices_.at(0), test_config);
                 }
             }
         }
@@ -617,26 +594,24 @@ TEST_F(DeviceFixture, ComputeReduceHShortInit) {
         for (uint8_t reduce_type = uint8_t(ReduceType::SUM); reduce_type <= uint8_t(ReduceType::MAX); reduce_type++) {
             for (bool fp32_dest_acc_en : {true, false}) {
                 for (bool dst_full_sync_en : {true, false}) {
-                    for (bool at_start : {true, false}) {
-                        ReduceConfig test_config = {
-                            .short_init = true,
-                            .shape = shape,
-                            .reduce_dim = ReduceDim::H,
-                            .reduce_type = ReduceType(reduce_type),
-                            .data_gen_rand_max = 10.0f,
-                            .data_gen_seed = std::chrono::system_clock::now().time_since_epoch().count(),
-                            .data_gen_offset = -10.0f,
-                            .atol = 1e-2f,
-                            .rtol = 0.08f,
-                            .golden_function = unit_tests::compute::gold_reduce_h,
-                            .result_shape = result_shape,
-                            .fp32_dest_acc_en = fp32_dest_acc_en,
-                            .dst_full_sync_en = dst_full_sync_en,
-                            .at_start = at_start,
-                            .math_fidelity = MathFidelity(math_fid)
-                        };
-                        run_single_core_reduce_program(this->devices_.at(0), test_config);
-                    }
+                    log_info(LogTest, "MathFid = {}, ReduceType = {}, FP32DestAcc = {}, DstSyncFull = {}", math_fid, reduce_type, fp32_dest_acc_en, dst_full_sync_en);
+                    ReduceConfig test_config = {
+                        .short_init = true,
+                        .shape = shape,
+                        .reduce_dim = ReduceDim::H,
+                        .reduce_type = ReduceType(reduce_type),
+                        .data_gen_rand_max = 10.0f,
+                        .data_gen_seed = std::chrono::system_clock::now().time_since_epoch().count(),
+                        .data_gen_offset = -10.0f,
+                        .atol = 1e-2f,
+                        .rtol = 0.08f,
+                        .golden_function = unit_tests::compute::gold_reduce_h,
+                        .result_shape = result_shape,
+                        .fp32_dest_acc_en = fp32_dest_acc_en,
+                        .dst_full_sync_en = dst_full_sync_en,
+                        .math_fidelity = MathFidelity(math_fid)
+                    };
+                    run_single_core_reduce_program(this->devices_.at(0), test_config);
                 }
             }
         }
@@ -651,28 +626,26 @@ TEST_F(DeviceFixture, ComputeReduceWShortInit) {
         if (math_fid == 1) continue;
         for (uint8_t reduce_type = uint8_t(ReduceType::SUM); reduce_type <= uint8_t(ReduceType::MAX); reduce_type++) {
             for (bool fp32_dest_acc_en : {true, false}) {
-                if ((fp32_dest_acc_en == true) && (this->arch_ == tt::ARCH::GRAYSKULL)) continue;
                 for (bool dst_full_sync_en : {true, false}) {
-                    for (bool at_start : {true, false}) {
-                        ReduceConfig test_config = {
-                            .short_init = true,
-                            .shape = shape,
-                            .reduce_dim = ReduceDim::W,
-                            .reduce_type = ReduceType(reduce_type),
-                            .data_gen_rand_max = 10.0f,
-                            .data_gen_seed = std::chrono::system_clock::now().time_since_epoch().count(),
-                            .data_gen_offset = -10.0f,
-                            .atol = 1e-2f,
-                            .rtol = 0.08f,
-                            .golden_function = unit_tests::compute::gold_reduce_w,
-                            .result_shape = result_shape,
-                            .fp32_dest_acc_en = fp32_dest_acc_en,
-                            .dst_full_sync_en = dst_full_sync_en,
-                            .at_start = at_start,
-                            .math_fidelity = MathFidelity(math_fid)
-                        };
-                        run_single_core_reduce_program(this->devices_.at(0), test_config);
-                    }
+                    if ((fp32_dest_acc_en == true) && (this->arch_ == tt::ARCH::GRAYSKULL)) continue;
+                    log_info(LogTest, "MathFid = {}, ReduceType = {}, FP32DestAcc = {}, DstSyncFull = {}", math_fid, reduce_type, fp32_dest_acc_en, dst_full_sync_en);
+                    ReduceConfig test_config = {
+                        .short_init = true,
+                        .shape = shape,
+                        .reduce_dim = ReduceDim::W,
+                        .reduce_type = ReduceType(reduce_type),
+                        .data_gen_rand_max = 10.0f,
+                        .data_gen_seed = std::chrono::system_clock::now().time_since_epoch().count(),
+                        .data_gen_offset = -10.0f,
+                        .atol = 1e-2f,
+                        .rtol = 0.08f,
+                        .golden_function = unit_tests::compute::gold_reduce_w,
+                        .result_shape = result_shape,
+                        .fp32_dest_acc_en = fp32_dest_acc_en,
+                        .dst_full_sync_en = dst_full_sync_en,
+                        .math_fidelity = MathFidelity(math_fid)
+                    };
+                    run_single_core_reduce_program(this->devices_.at(0), test_config);
                 }
             }
         }
@@ -687,29 +660,27 @@ TEST_F(DeviceFixture, ComputeReduceHWShortInit) {
         if (math_fid == 1) continue;
         for (uint8_t reduce_type = uint8_t(ReduceType::SUM); reduce_type <= uint8_t(ReduceType::MAX); reduce_type++) {
             for (bool fp32_dest_acc_en : {true, false}) {
-                // Currently fp32 dest unsupported with reduce scalar
-                if (fp32_dest_acc_en) continue;
                 for (bool dst_full_sync_en : {true, false}) {
-                    for (bool at_start : {true, false}) {
-                        ReduceConfig test_config = {
-                            .short_init = true,
-                            .shape = shape,
-                            .reduce_dim = ReduceDim::HW,
-                            .reduce_type = ReduceType(reduce_type),
-                            .data_gen_rand_max = 10.0f,
-                            .data_gen_seed = std::chrono::system_clock::now().time_since_epoch().count(),
-                            .data_gen_offset = -10.0f,
-                            .atol = 1e-2f,
-                            .rtol = 0.08f,
-                            .golden_function = unit_tests::compute::gold_reduce_hw,
-                            .result_shape = result_shape,
-                            .fp32_dest_acc_en = fp32_dest_acc_en,
-                            .dst_full_sync_en = dst_full_sync_en,
-                            .at_start = at_start,
-                            .math_fidelity = MathFidelity(math_fid)
-                        };
-                        run_single_core_reduce_program(this->devices_.at(0), test_config);
-                    }
+                    // Currently fp32 dest unsupported with reduce scalar
+                    if (fp32_dest_acc_en) continue;
+                    log_info(LogTest, "MathFid = {}, ReduceType = {}, FP32DestAcc = {}, DstSyncFull = {}", math_fid, reduce_type, fp32_dest_acc_en, dst_full_sync_en);
+                    ReduceConfig test_config = {
+                        .short_init = true,
+                        .shape = shape,
+                        .reduce_dim = ReduceDim::HW,
+                        .reduce_type = ReduceType(reduce_type),
+                        .data_gen_rand_max = 10.0f,
+                        .data_gen_seed = std::chrono::system_clock::now().time_since_epoch().count(),
+                        .data_gen_offset = -10.0f,
+                        .atol = 1e-2f,
+                        .rtol = 0.08f,
+                        .golden_function = unit_tests::compute::gold_reduce_hw,
+                        .result_shape = result_shape,
+                        .fp32_dest_acc_en = fp32_dest_acc_en,
+                        .dst_full_sync_en = dst_full_sync_en,
+                        .math_fidelity = MathFidelity(math_fid)
+                    };
+                    run_single_core_reduce_program(this->devices_.at(0), test_config);
                 }
             }
         }
diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_untilize_tilize.cpp b/tests/tt_metal/tt_metal/unit_tests/compute/test_untilize_tilize.cpp
index 91071283a51..c96a44be6d2 100644
--- a/tests/tt_metal/tt_metal/unit_tests/compute/test_untilize_tilize.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/compute/test_untilize_tilize.cpp
@@ -20,6 +20,7 @@
 
 using namespace tt;
 using namespace tt::test_utils;
+using namespace tt::test_utils::df;
 using namespace tt::tt_metal;
 
 namespace unit_tests::compute::tilize {
@@ -41,21 +42,13 @@ using GoldenFunc = std::variant<
     std::function<std::vector<uint32_t>(const std::vector<uint32_t>&, const std::vector<uint32_t>&, const GoldenConfig &config)> >;
 
 struct TestConfig {
-    // Whether or not to use *_init_short LLK API calls:
     bool short_init = false;
-    // Whether or not to sync full/half DST between MATH and PACK:
     bool dst_full_sync_en = false;
-    // Whether or not we want the result to be stored in DST in FP32 is
-    // controlled with this flag:
-    bool fp32_dest_acc_en = false;
     uint32_t input_single_tile_size;
     uint32_t output_single_tile_size;
-    // Block height in tiles:
     uint32_t num_tiles_r;
-    // Block width in tiles:
     uint32_t num_tiles_c;
     uint32_t num_faces_per_tile = 4;
-    // Face height in datums:
     uint32_t face_r_dim = 16;
     std::optional<UntilizeType> untilize_type = std::nullopt;
     std::optional<TilizeType> tilize_type = std::nullopt;
@@ -68,6 +61,8 @@ void run_single_core_tilize_program(tt_metal::Device* device, const TestConfig&
     CoreCoord core = {0, 0};
 
     uint32_t num_tiles = test_config.num_tiles_r * test_config.num_tiles_c;
+    log_info(tt::LogTest, "Running test for num_tiles_r = {}, num_tiles_c = {}", test_config.num_tiles_r, test_config.num_tiles_c);
+
     uint32_t input_dram_buffer_size = test_config.input_single_tile_size * num_tiles;
     uint32_t output_dram_buffer_size = test_config.output_single_tile_size * num_tiles;
 
@@ -118,9 +113,7 @@ void run_single_core_tilize_program(tt_metal::Device* device, const TestConfig&
 
     uint32_t ouput_cb_index = 16; // output operands start at index 16
     uint32_t num_output_tiles = num_tiles;
-    tt_metal::CircularBufferConfig cb_output_config = tt_metal::CircularBufferConfig(
-        num_output_tiles * test_config.output_single_tile_size,
-        {{ouput_cb_index, test_config.fp32_dest_acc_en ? tt::DataFormat::Float32 : tt::DataFormat::Float16_b}})
+    tt_metal::CircularBufferConfig cb_output_config = tt_metal::CircularBufferConfig(num_output_tiles * test_config.output_single_tile_size, {{ouput_cb_index, tt::DataFormat::Float16_b}})
         .set_page_size(ouput_cb_index, test_config.output_single_tile_size);
     auto cb_output = tt_metal::CreateCircularBuffer(program, core, cb_output_config);
 
@@ -130,7 +123,7 @@ void run_single_core_tilize_program(tt_metal::Device* device, const TestConfig&
     } else if(test_config.tilize_type.has_value() && test_config.tilize_type == TilizeType::UNPACK_A_B) {
         reader_kernel_path = "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_binary.cpp";
     } else {
-        reader_kernel_path = "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_push_n.cpp";
+        reader_kernel_path = "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_push_4.cpp";
     }
 
     auto reader_kernel = tt_metal::CreateKernel(
@@ -168,22 +161,17 @@ void run_single_core_tilize_program(tt_metal::Device* device, const TestConfig&
 
     std::map<string, string> defines = {};
 
-    if (test_config.short_init) {
+    if (test_config.short_init)
+    {
         defines["SHORT_INIT"] = "1";
     }
-    if (test_config.fp32_dest_acc_en) {
-        defines["DST_ACCUM_MODE"] = "1";
-    }
 
     auto eltwise_unary_kernel = tt_metal::CreateKernel(
         program,
         compute_kernel,
         core,
-        tt_metal::ComputeConfig{
-            .fp32_dest_acc_en = test_config.fp32_dest_acc_en,
-            .dst_full_sync_en = test_config.dst_full_sync_en,
-            .compile_args = compute_kernel_args,
-            .defines = defines}
+        tt_metal::ComputeConfig{.dst_full_sync_en = test_config.dst_full_sync_en,
+                                .compile_args = compute_kernel_args, .defines = defines}
     );
 
     std::vector<uint32_t> src0_vec = create_arange_vector_of_bfloat16(input_dram_buffer_size, false);
@@ -217,10 +205,7 @@ void run_single_core_tilize_program(tt_metal::Device* device, const TestConfig&
             {dram_buffer_src0_addr,
             (std::uint32_t)dram_src0_noc_xy.x,
             (std::uint32_t)dram_src0_noc_xy.y,
-            num_tiles,
-            src0_cb_index,
-            test_config.num_tiles_c,
-            false
+            num_tiles
             });
     }
 
@@ -260,23 +245,12 @@ void run_single_core_tilize_program(tt_metal::Device* device, const TestConfig&
         }
     }, test_config.golden_function);
 
-
-    if(test_config.fp32_dest_acc_en) {
-        vector<bfloat16> golden_unpacked = unpack_vector<bfloat16, uint32_t>(golden);
-        // Increasing the size since from BFP16 two times, since storing is in FP32
-        golden.resize(golden.size() * 2);
-        for (auto i = 0; i < golden_unpacked.size(); i++) {
-            // Cast float32 to "packed "uint32 golden vector if fp32_dest_acc_en:
-            golden[i] = std::bit_cast<uint32_t>(golden_unpacked[i].to_float());
-        }
-    }
-
     if(test_config.tilize_type.has_value() && test_config.tilize_type == TilizeType::UNPACK_A_B) {
         pass &= (golden.size() == result_vec.size());
-        pass &= is_close_packed_vectors<bfloat16, uint32_t>(
+        pass &= is_close_packed_vectors<tt::test_utils::df::bfloat16, uint32_t>(
             result_vec,
             golden,
-            [&](const bfloat16& a, const bfloat16& b) {
+            [&](const tt::test_utils::df::bfloat16& a, const tt::test_utils::df::bfloat16& b) {
                 return is_close(a, b, 0.01f);
             });
 
@@ -287,17 +261,12 @@ void run_single_core_tilize_program(tt_metal::Device* device, const TestConfig&
 
     if (not pass){
         std::cout << "GOLDEN "  << std::endl;
-        print_vector(unpack_vector<bfloat16, uint32_t>(golden));
+        print_vector(unpack_vector<tt::test_utils::df::bfloat16, uint32_t>(golden));
         std::cout << "RESULTS "  << std::endl;
-        print_vector(unpack_vector<bfloat16, uint32_t>(result_vec));
+        print_vector(unpack_vector<tt::test_utils::df::bfloat16, uint32_t>(result_vec));
     }
     ASSERT_TRUE(pass);
-    log_info(tt::LogTest, "Done running test with: num_tiles_r = {}, num_tiles_c = {}, FP32_DestAcc = {}, DstSyncFull = {}, pass = {}",
-            test_config.num_tiles_r,
-            test_config.num_tiles_c,
-            test_config.fp32_dest_acc_en,
-            test_config.dst_full_sync_en,
-            pass);
+    log_info(tt::LogTest, "Done running test for num_tiles_r = {}, num_tiles_c = {}, pass = {}", test_config.num_tiles_r, test_config.num_tiles_c, pass);
 }
 
 } // namespace unit_tests::compute::tilize
@@ -307,24 +276,19 @@ Following tests are for Unpack Tilize
 ***************************************/
 
 TEST_F(DeviceFixture, ComputeUnpackTilize) {
-    vector<vector<uint32_t> > num_tiles = {{1, 1}, {1, 2}, {2, 1}, {1, 4}, {2, 2}, {4, 1}};
+    vector<vector<uint32_t> > num_tiles = {{1, 4}, {2, 2}, {4, 1}};
     for(auto num_tile : num_tiles) {
-        for (bool fp32_dest_acc_en : {true, false}) {
-            // FP32 dest acc not possible for GS and unpack_tilize hangs on BH -> tt-metal/#13640
-            if ((fp32_dest_acc_en == true) && (this->arch_ != tt::ARCH::WORMHOLE_B0)) continue;
-            for (bool dst_full_sync_en : {true, false}) {
-                unit_tests::compute::tilize::TestConfig test_config = {
-                    .dst_full_sync_en = dst_full_sync_en,
-                    .fp32_dest_acc_en = fp32_dest_acc_en,
-                    .input_single_tile_size = 2 * 1024,
-                    .output_single_tile_size = 1024 * (fp32_dest_acc_en ? 4 : 2),
-                    .num_tiles_r = num_tile[0],
-                    .num_tiles_c = num_tile[1],
-                    .tilize_type = unit_tests::compute::tilize::TilizeType::UNPACK_A,
-                    .golden_function = unit_tests::compute::gold_standard_tilize
-                };
-                unit_tests::compute::tilize::run_single_core_tilize_program(this->devices_.at(0), test_config);
-            }
+        for (bool dst_full_sync_en : {true, false}) {
+            unit_tests::compute::tilize::TestConfig test_config = {
+                .dst_full_sync_en = dst_full_sync_en,
+                .input_single_tile_size = 2 * 1024,
+                .output_single_tile_size = 2 * 1024,
+                .num_tiles_r = num_tile[0],
+                .num_tiles_c = num_tile[1],
+                .tilize_type = unit_tests::compute::tilize::TilizeType::UNPACK_A,
+                .golden_function = unit_tests::compute::gold_standard_tilize
+            };
+            unit_tests::compute::tilize::run_single_core_tilize_program(this->devices_.at(0), test_config);
         }
     }
 }
@@ -334,6 +298,7 @@ TEST_F(DeviceFixture, ComputeUnpackTilizeA_B) {
     if (arch == tt::ARCH::GRAYSKULL) {
         GTEST_SKIP();
     }
+
     for (bool dst_full_sync_en : {true, false}) {
         unit_tests::compute::tilize::TestConfig test_config = {
             .dst_full_sync_en = dst_full_sync_en,
@@ -349,25 +314,20 @@ TEST_F(DeviceFixture, ComputeUnpackTilizeA_B) {
 }
 
 TEST_F(DeviceFixture, ComputeUnpackTilizeShortInit) {
-    vector<vector<uint32_t> > num_tiles = {{1, 1}, {1, 2}, {2, 1}, {1, 4}, {2, 2}, {4, 1}};
+    vector<vector<uint32_t> > num_tiles = {{1, 4}, {2, 2}, {4, 1}};
     for(auto num_tile : num_tiles) {
-        for (bool fp32_dest_acc_en : {true, false}) {
-            // FP32 dest acc not possible for GS and unpack_tilize hangs on BH -> tt-metal/#13640
-            if ((fp32_dest_acc_en == true) && (this->arch_ != tt::ARCH::WORMHOLE_B0)) continue;
-            for (bool dst_full_sync_en : {true, false}) {
+        for (bool dst_full_sync_en : {true, false}) {
             unit_tests::compute::tilize::TestConfig test_config = {
                 .short_init = true,
                 .dst_full_sync_en = dst_full_sync_en,
-                .fp32_dest_acc_en = fp32_dest_acc_en,
                 .input_single_tile_size = 2 * 1024,
-                .output_single_tile_size = 1024 * (fp32_dest_acc_en ? 4 : 2),
+                .output_single_tile_size = 2 * 1024,
                 .num_tiles_r = num_tile[0],
                 .num_tiles_c = num_tile[1],
                 .tilize_type = unit_tests::compute::tilize::TilizeType::UNPACK_A,
                 .golden_function = unit_tests::compute::gold_standard_tilize
             };
             unit_tests::compute::tilize::run_single_core_tilize_program(this->devices_.at(0), test_config);
-            }
         }
     }
 }
@@ -377,48 +337,38 @@ Following tests are for Unpack Untilize
 ***************************************/
 
 TEST_F(DeviceFixture, ComputeUnpackUntilize) {
-    vector<vector<uint32_t> > num_tiles = {{1, 1}, {1, 2}, {2, 1}, {1, 4}, {2, 2}, {4, 1}};
+    vector<vector<uint32_t> > num_tiles = {{1, 4}, {2, 2}, {4, 1}};
     for(auto num_tile : num_tiles) {
-        for (bool fp32_dest_acc_en : {true, false}) {
-            // FP32 dest acc not possible for GS
-            if ((fp32_dest_acc_en == true) && (this->arch_ == tt::ARCH::GRAYSKULL)) continue;
-            for (bool dst_full_sync_en : {true, false}) {
-                unit_tests::compute::tilize::TestConfig test_config = {
-                    .dst_full_sync_en = dst_full_sync_en,
-                    .fp32_dest_acc_en = fp32_dest_acc_en,
-                    .input_single_tile_size = 2 * 1024,
-                    .output_single_tile_size = 1024 * (fp32_dest_acc_en ? 4 : 2),
-                    .num_tiles_r = num_tile[0],
-                    .num_tiles_c = num_tile[1],
-                    .untilize_type = unit_tests::compute::tilize::UntilizeType::UNPACK,
-                    .golden_function = unit_tests::compute::gold_standard_untilize
-                };
-                unit_tests::compute::tilize::run_single_core_tilize_program(this->devices_.at(0), test_config);
-            }
+        for (bool dst_full_sync_en : {true, false}) {
+            unit_tests::compute::tilize::TestConfig test_config = {
+                .dst_full_sync_en = dst_full_sync_en,
+                .input_single_tile_size = 2 * 1024,
+                .output_single_tile_size = 2 * 1024,
+                .num_tiles_r = num_tile[0],
+                .num_tiles_c = num_tile[1],
+                .untilize_type = unit_tests::compute::tilize::UntilizeType::UNPACK,
+                .golden_function = unit_tests::compute::gold_standard_untilize
+            };
+            unit_tests::compute::tilize::run_single_core_tilize_program(this->devices_.at(0), test_config);
         }
     }
 }
 
 TEST_F(DeviceFixture, ComputeUnpackUntilizeShortInit) {
-    vector<vector<uint32_t> > num_tiles = {{1, 1}, {1, 2}, {2, 1}, {1, 4}, {2, 2}, {4, 1}};
+    vector<vector<uint32_t> > num_tiles = {{1, 4}, {2, 2}, {4, 1}};
     for(auto num_tile : num_tiles) {
-        for (bool fp32_dest_acc_en : {true, false}) {
-            // FP32 dest acc not possible for GS
-            if ((fp32_dest_acc_en == true) && (this->arch_ == tt::ARCH::GRAYSKULL)) continue;
-            for (bool dst_full_sync_en : {true, false}) {
-                unit_tests::compute::tilize::TestConfig test_config = {
-                    .short_init = true,
-                    .dst_full_sync_en = dst_full_sync_en,
-                    .fp32_dest_acc_en = fp32_dest_acc_en,
-                    .input_single_tile_size = 2 * 1024,
-                    .output_single_tile_size = 1024 * (fp32_dest_acc_en ? 4 : 2),
-                    .num_tiles_r = num_tile[0],
-                    .num_tiles_c = num_tile[1],
-                    .untilize_type = unit_tests::compute::tilize::UntilizeType::UNPACK,
-                    .golden_function = unit_tests::compute::gold_standard_untilize
-                };
-                unit_tests::compute::tilize::run_single_core_tilize_program(this->devices_.at(0), test_config);
-            }
+        for (bool dst_full_sync_en : {true, false}) {
+            unit_tests::compute::tilize::TestConfig test_config = {
+                .short_init = true,
+                .dst_full_sync_en = dst_full_sync_en,
+                .input_single_tile_size = 2 * 1024,
+                .output_single_tile_size = 2 * 1024,
+                .num_tiles_r = num_tile[0],
+                .num_tiles_c = num_tile[1],
+                .untilize_type = unit_tests::compute::tilize::UntilizeType::UNPACK,
+                .golden_function = unit_tests::compute::gold_standard_untilize
+            };
+            unit_tests::compute::tilize::run_single_core_tilize_program(this->devices_.at(0), test_config);
         }
     }
 }
@@ -427,54 +377,44 @@ TEST_F(DeviceFixture, ComputeUnpackUntilizeShortInit) {
 Following tests are for pack untilize
 ***************************************/
 TEST_F(DeviceFixture, ComputePackUntilize) {
-    vector<vector<uint32_t> > num_tiles = {{1, 1}, {1, 2}, {2, 1}, {1, 4}, {2, 2}, {4, 1}};
+    vector<vector<uint32_t> > num_tiles = {{1, 4}, {2, 2}, {4, 1}};
     for(auto num_tile : num_tiles) {
-        for (bool fp32_dest_acc_en : {true, false}) {
-            // FP32 dest acc not possible for GS
-            if ((fp32_dest_acc_en == true) && (this->arch_ == tt::ARCH::GRAYSKULL)) continue;
-            for (bool dst_full_sync_en : {true, false}) {
-                unit_tests::compute::tilize::TestConfig test_config = {
-                    .dst_full_sync_en = dst_full_sync_en,
-                    .fp32_dest_acc_en = fp32_dest_acc_en,
-                    .input_single_tile_size = 2 * 1024,
-                    .output_single_tile_size = 1024 * (fp32_dest_acc_en ? 4 : 2),
-                    .num_tiles_r = num_tile[0],
-                    .num_tiles_c = num_tile[1],
-                    .untilize_type = unit_tests::compute::tilize::UntilizeType::PACK,
-                    .golden_function = unit_tests::compute::gold_standard_untilize
-                };
-                unit_tests::compute::tilize::run_single_core_tilize_program(this->devices_.at(0), test_config);
-            }
+        for (bool dst_full_sync_en : {true, false}) {
+            unit_tests::compute::tilize::TestConfig test_config = {
+                .dst_full_sync_en = dst_full_sync_en,
+                .input_single_tile_size = 2 * 1024,
+                .output_single_tile_size = 2 * 1024,
+                .num_tiles_r = num_tile[0],
+                .num_tiles_c = num_tile[1],
+                .untilize_type = unit_tests::compute::tilize::UntilizeType::PACK,
+                .golden_function = unit_tests::compute::gold_standard_untilize
+            };
+            unit_tests::compute::tilize::run_single_core_tilize_program(this->devices_.at(0), test_config);
         }
     }
 }
 
 TEST_F(DeviceFixture, ComputePackUntilizeShortInit) {
-    vector<vector<uint32_t> > num_tiles = {{1, 1}, {1, 2}, {2, 1}, {1, 4}, {2, 2}, {4, 1}};
+    vector<vector<uint32_t> > num_tiles = {{1, 4}, {2, 2}, {4, 1}};
     for(auto num_tile : num_tiles) {
-        for (bool fp32_dest_acc_en : {true, false}) {
-            // FP32 dest acc not possible for GS
-            if ((fp32_dest_acc_en == true) && (this->arch_ == tt::ARCH::GRAYSKULL)) continue;
-            for (bool dst_full_sync_en : {true, false}) {
-                unit_tests::compute::tilize::TestConfig test_config = {
-                    .short_init = true,
-                    .dst_full_sync_en = dst_full_sync_en,
-                    .fp32_dest_acc_en = fp32_dest_acc_en,
-                    .input_single_tile_size = 2 * 1024,
-                    .output_single_tile_size = 1024 * (fp32_dest_acc_en ? 4 : 2),
-                    .num_tiles_r = num_tile[0],
-                    .num_tiles_c = num_tile[1],
-                    .untilize_type = unit_tests::compute::tilize::UntilizeType::PACK,
-                    .golden_function = unit_tests::compute::gold_standard_untilize
-                };
-                unit_tests::compute::tilize::run_single_core_tilize_program(this->devices_.at(0), test_config);
-            }
+        for (bool dst_full_sync_en : {true, false}) {
+            unit_tests::compute::tilize::TestConfig test_config = {
+                .short_init = true,
+                .dst_full_sync_en = dst_full_sync_en,
+                .input_single_tile_size = 2 * 1024,
+                .output_single_tile_size = 2 * 1024,
+                .num_tiles_r = num_tile[0],
+                .num_tiles_c = num_tile[1],
+                .untilize_type = unit_tests::compute::tilize::UntilizeType::PACK,
+                .golden_function = unit_tests::compute::gold_standard_untilize
+            };
+            unit_tests::compute::tilize::run_single_core_tilize_program(this->devices_.at(0), test_config);
         }
     }
 }
 
 TEST_F(DeviceFixture, ComputePackUntilizeDst) {
-    vector<vector<uint32_t> > num_tiles = {{1, 1}, {1, 2}, {2, 1}, {1, 4}, {2, 2}, {4, 1}};
+    vector<vector<uint32_t> > num_tiles = {{1, 4}, {2, 2}, {4, 1}};
     for(auto num_tile : num_tiles) {
         for (bool dst_full_sync_en : {true, false}) {
             unit_tests::compute::tilize::TestConfig test_config = {
diff --git a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_X_tile.cpp b/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_X_tile.cpp
index 8a57be5afc7..1ef579e6ed5 100644
--- a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_X_tile.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_X_tile.cpp
@@ -19,25 +19,14 @@
 
 using namespace tt;
 using namespace tt::test_utils;
-namespace unit_tests_common::matmul::test_matmul_X_tile{
 
-struct MatmulTileStimuli {
-    vector<bfloat16> t; // Raw tensor values
-    vector<uint32_t> a; // Activations
-    vector<uint32_t> w; // Weights
-};
+namespace unit_tests_common::matmul::test_matmul_X_tile{
 
 struct MatmulTileConfig {
     uint32_t M, K, N;
-    // Whether or not to add matmul result with bias:
     bool with_bias = false;
-    // Whether or not to use *_init_short LLK API calls:
     bool test_init_short = false;
-    // Whether or not to use *_with_dt LLK API init calls:
     bool with_dt = true;
-    // Whether or not we want the result to be stored in DST in FP32:
-    bool fp32_dest_acc_en = false;
-    // Whether or not to sync full/half DST between MATH and PACK:
     bool dst_full_sync_en = false;
     string reader_kernel;
     string compute_kernel;
@@ -45,31 +34,6 @@ struct MatmulTileConfig {
     MathFidelity math_fidelity = MathFidelity::HiFi4;
 };
 
-void create_test_stimuli(MatmulTileStimuli &stimuli, uint32_t M, uint32_t K, uint32_t N) {
-    SHAPE shape = {1, 1, M * 32, K * 32};
-    tt::deprecated::Tensor<bfloat16> tensor = tt::deprecated::initialize_tensor<bfloat16>(
-        shape,
-        tt::deprecated::Initialize::RANDOM,
-        100,
-        std::chrono::system_clock::now().time_since_epoch().count()
-    );
-    stimuli.t = tensor.get_values();
-
-    auto activations_tilized = test_utils::tilize(tensor.get_values(), M * 32, K * 32);
-    auto activations_tile_layout = convert_to_tile_layout(activations_tilized);
-    auto activations = pack_bfloat16_vec_into_uint32_vec(activations_tile_layout);
-    auto activations_tile_transposed = transpose_tiles(activations, M, K, 1);
-    stimuli.a = activations_tile_transposed;
-
-    auto identity = create_identity_matrix(K * 32, N * 32, std::min(K, N) * 32);
-    auto identity_tilized = test_utils::tilize(identity, K * 32, N * 32);
-    auto weights_tile_layout = convert_to_tile_layout(identity_tilized);
-    auto weights = pack_bfloat16_vec_into_uint32_vec(weights_tile_layout);
-    stimuli.w = weights;
-
-}
-
-// This function creates bit masks to model math fidelity phases. This will mask the result only.
 void set_math_fid_masks(uint16_t &math_fid_mask, MathFidelity math_fidelity = MathFidelity::HiFi4) {
     auto arch = get_arch_from_string(get_env_arch_name());
     switch (math_fidelity) {
@@ -81,9 +45,11 @@ void set_math_fid_masks(uint16_t &math_fid_mask, MathFidelity math_fidelity = Ma
     }
 }
 
-void matmul_tile(CommonFixture *fixture, tt_metal::Device *device, const MatmulTileConfig &cfg, vector<uint32_t> activations, vector<uint32_t> weights, vector<bfloat16> tensor_vals){
+bool matmul_tile(CommonFixture *fixture, tt_metal::Device *device, const MatmulTileConfig &cfg, vector<uint32_t> activations, vector<std::seed_seq::result_type> weights, deprecated::Tensor<bfloat16> tensor){
+    bool pass = true;
 
     tt_metal::Program program = tt_metal::CreateProgram();
+
     CoreCoord core = {0, 0};
 
     // num_tile == M == N == K in the case of multi_tile, conveniently they were all the same!!
@@ -91,29 +57,25 @@ void matmul_tile(CommonFixture *fixture, tt_metal::Device *device, const MatmulT
     uint32_t M = cfg.M;
     uint32_t K = cfg.K;
     uint32_t N = cfg.N;
-    uint32_t num_tiles = M * K; // only if M = K = N
-    uint32_t single_tile_size_fp32 = 4 * 32 * 32;   // Single 32x32 tile size for Float32
-    uint32_t single_tile_size_bfp16b = 2 * 32 * 32; // Single 32x32 tile size for Float16_b / Uint16
-    uint32_t single_tile_size_out0 = cfg.fp32_dest_acc_en ? single_tile_size_fp32 : single_tile_size_bfp16b;
-    const size_t dram_buffer_size_bfp16b = num_tiles * single_tile_size_bfp16b;
-    const size_t dram_buffer_size_out0 = num_tiles * single_tile_size_out0;
-
-    tt_metal::InterleavedBufferConfig input_dram_config{
-                .device=device,
-                .size = dram_buffer_size_bfp16b,
-                .page_size = dram_buffer_size_bfp16b,
-                .buffer_type = tt_metal::BufferType::DRAM
-    };
-    tt_metal::InterleavedBufferConfig output_dram_config{
+    uint32_t single_tile_size = 2 * 1024;
+    uint32_t num_tiles = M * K;             // only if M = K = N
+    uint32_t dram_buffer_size = single_tile_size * num_tiles;
+    // for multi_tile case buffer size will vary depending on M, N, K
+    // uint32_t dram_buffer_size_act = single_tile_size * M * K; // num_tiles of FP16_B, hard-coded in the reader/writer kernels
+    // uint32_t dram_buffer_size_weights = single_tile_size * K * N; // num_tiles of FP16_B, hard-coded in the reader/writer kernels
+    // uint32_t dram_buffer_size_out = single_tile_size * M * N; // num_tiles of FP16_B, hard-coded in the reader/writer kernels
+
+
+    tt_metal::InterleavedBufferConfig dram_config{
                 .device=device,
-                .size = dram_buffer_size_out0,
-                .page_size = dram_buffer_size_out0,
+                .size = dram_buffer_size,
+                .page_size = dram_buffer_size,
                 .buffer_type = tt_metal::BufferType::DRAM
     };
 
-    auto src0_dram_buffer = CreateBuffer(input_dram_config);
-    auto src1_dram_buffer = CreateBuffer(input_dram_config);
-    auto dst_dram_buffer = CreateBuffer(output_dram_config);
+    auto src0_dram_buffer = CreateBuffer(dram_config);
+    auto src1_dram_buffer = CreateBuffer(dram_config);
+    auto dst_dram_buffer = CreateBuffer(dram_config);
 
     uint32_t num_input_tiles = 2 * M;
 
@@ -122,13 +84,13 @@ void matmul_tile(CommonFixture *fixture, tt_metal::Device *device, const MatmulT
     auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates();
 
     uint32_t src0_cb_index = 0;
-    tt_metal::CircularBufferConfig cb_src0_config = tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size_bfp16b, {{src0_cb_index, tt::DataFormat::Float16_b}})
-        .set_page_size(src0_cb_index, single_tile_size_bfp16b);
+    tt_metal::CircularBufferConfig cb_src0_config = tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size, {{src0_cb_index, tt::DataFormat::Float16_b}})
+        .set_page_size(src0_cb_index, single_tile_size);
     auto cb_src0 = tt_metal::CreateCircularBuffer(program, core, cb_src0_config);
 
     uint32_t src1_cb_index = 1;
-    tt_metal::CircularBufferConfig cb_src1_config = tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size_bfp16b, {{src1_cb_index, tt::DataFormat::Float16_b}})
-        .set_page_size(src1_cb_index, single_tile_size_bfp16b);
+    tt_metal::CircularBufferConfig cb_src1_config = tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size, {{src1_cb_index, tt::DataFormat::Float16_b}})
+        .set_page_size(src1_cb_index, single_tile_size);
     auto cb_src1 = tt_metal::CreateCircularBuffer(program, core, cb_src1_config);
 
     std::shared_ptr<tt_metal::Buffer> src2_dram_buffer;
@@ -136,24 +98,24 @@ void matmul_tile(CommonFixture *fixture, tt_metal::Device *device, const MatmulT
     if (cfg.with_bias) { // with_bias only when M, N, or K > 1
         tt_metal::InterleavedBufferConfig bias_config{
                     .device=device,
-                    .size = single_tile_size_bfp16b * N,
-                    .page_size = single_tile_size_bfp16b * N,
+                    .size = single_tile_size * N,
+                    .page_size = single_tile_size * N,
                     .buffer_type = tt_metal::BufferType::DRAM
         };
         src2_dram_buffer = CreateBuffer(bias_config);
 
         uint32_t src2_cb_index = 2;
-        tt_metal::CircularBufferConfig cb_src2_config = tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size_bfp16b, {{src2_cb_index, tt::DataFormat::Float16_b}})
-            .set_page_size(src2_cb_index, single_tile_size_bfp16b);
+        tt_metal::CircularBufferConfig cb_src2_config = tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size, {{src2_cb_index, tt::DataFormat::Float16_b}})
+            .set_page_size(src2_cb_index, single_tile_size);
         auto cb_src2 = tt_metal::CreateCircularBuffer(program, core, cb_src2_config);
-    } else if (cfg.test_init_short) { // This will be dummy input in uint16_t
+    } else if (cfg.test_init_short) {// This will be dummy input in uint16_t
         uint32_t in2_id = 2;
         uint32_t out1_id = 17;
 
         tt_metal::InterleavedBufferConfig dummy_config{
                     .device=device,
-                    .size = single_tile_size_bfp16b * N,
-                    .page_size = single_tile_size_bfp16b * N,
+                    .size = single_tile_size * N,
+                    .page_size = single_tile_size * N,
                     .buffer_type = tt_metal::BufferType::DRAM
         };
 
@@ -164,13 +126,13 @@ void matmul_tile(CommonFixture *fixture, tt_metal::Device *device, const MatmulT
         dst1_dram_buffer = CreateBuffer(dummy_config);
 
         tt_metal::CircularBufferConfig cb_src2_config =
-        tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size_bfp16b, {{in2_id, tt::DataFormat::UInt16}})
-            .set_page_size(in2_id, single_tile_size_bfp16b);
+        tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size, {{in2_id, tt::DataFormat::UInt16}})
+            .set_page_size(in2_id, single_tile_size);
         auto cb_src2 = tt_metal::CreateCircularBuffer(program, core, cb_src2_config);
 
         tt_metal::CircularBufferConfig cb_dst1_config =
-        tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size_bfp16b, {{out1_id, tt::DataFormat::UInt16}})
-            .set_page_size(out1_id, single_tile_size_bfp16b);
+        tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size, {{out1_id, tt::DataFormat::UInt16}})
+            .set_page_size(out1_id, single_tile_size);
         auto cb_dst1 = tt_metal::CreateCircularBuffer(program, core, cb_dst1_config);
     }
 
@@ -179,14 +141,14 @@ void matmul_tile(CommonFixture *fixture, tt_metal::Device *device, const MatmulT
     if (cfg.M > 1 || cfg.N > 1 || cfg.K > 1){
         uint32_t intermediate_cb_index = 24;
         std::map<uint8_t, tt::DataFormat> partials_and_out_data_format_spec = {
-            {ouput_cb_index, (cfg.fp32_dest_acc_en ? tt::DataFormat::Float32 : tt::DataFormat::Float16_b)},
-            {intermediate_cb_index, (cfg.fp32_dest_acc_en ? tt::DataFormat::Float32 : tt::DataFormat::Float16_b)}
+            {ouput_cb_index, tt::DataFormat::Float16_b},
+            {intermediate_cb_index, tt::DataFormat::Float16_b}
         };
 
         CoreRangeSet cores(std::set<CoreRange>{CoreRange(core, core)});
-        tt_metal::CircularBufferConfig cb_output_config = tt_metal::CircularBufferConfig(dram_buffer_size_out0, partials_and_out_data_format_spec)
-            .set_page_size(ouput_cb_index, single_tile_size_out0)
-            .set_page_size(intermediate_cb_index, single_tile_size_out0);
+        tt_metal::CircularBufferConfig cb_output_config = tt_metal::CircularBufferConfig(num_tiles * single_tile_size, partials_and_out_data_format_spec)
+            .set_page_size(ouput_cb_index, single_tile_size)
+            .set_page_size(intermediate_cb_index, single_tile_size);
         auto cb_output = tt_metal::CreateCircularBuffer(program, core, cb_output_config);
 
         reader_l1_args = {
@@ -199,15 +161,14 @@ void matmul_tile(CommonFixture *fixture, tt_metal::Device *device, const MatmulT
             (std::uint32_t)K,
             (std::uint32_t)M,
             (std::uint32_t)N,
-            (std::uint32_t)(M * single_tile_size_bfp16b),
-            (std::uint32_t)(N * single_tile_size_bfp16b),
+            (std::uint32_t)(M * single_tile_size),
+            (std::uint32_t)(N * single_tile_size),
             cfg.with_bias
         };
     } else {
         uint32_t num_output_tiles = 2;
-        tt_metal::CircularBufferConfig cb_output_config = tt_metal::CircularBufferConfig(num_output_tiles * single_tile_size_out0,
-        {{ouput_cb_index, (cfg.fp32_dest_acc_en ? tt::DataFormat::Float32 : tt::DataFormat::Float16_b)}})
-            .set_page_size(ouput_cb_index, single_tile_size_out0);
+        tt_metal::CircularBufferConfig cb_output_config = tt_metal::CircularBufferConfig(num_output_tiles * single_tile_size, {{ouput_cb_index, tt::DataFormat::Float16_b}})
+            .set_page_size(ouput_cb_index, single_tile_size);
         auto cb_output = tt_metal::CreateCircularBuffer(program, core, cb_output_config);
 
         reader_l1_args = {
@@ -220,17 +181,23 @@ void matmul_tile(CommonFixture *fixture, tt_metal::Device *device, const MatmulT
             1,
             1,
             1,
-            1 * single_tile_size_bfp16b,
-            1 * single_tile_size_bfp16b
+            1 * single_tile_size,
+            1 * single_tile_size
         };
     }
 
     std::map<string, string> compute_defines;
 
-    compute_defines["WITH_DT"] = cfg.with_dt ? "1" : "0";
-    compute_defines["TEST_INIT_SHORT"] = cfg.test_init_short ? "1" : "0";
-    if (cfg.fp32_dest_acc_en)
-        compute_defines["DST_ACCUM_MODE"] = "1";
+    if (cfg.with_dt) {
+        compute_defines["WITH_DT"] = "1";
+    } else {
+        compute_defines["WITH_DT"] = "0";
+    }
+    if (cfg.test_init_short) {
+        compute_defines["TEST_INIT_SHORT"] = "1";
+    } else {
+        compute_defines["TEST_INIT_SHORT"] = "0";
+    }
 
     auto mm_reader_kernel = tt_metal::CreateKernel(
         program,
@@ -248,12 +215,11 @@ void matmul_tile(CommonFixture *fixture, tt_metal::Device *device, const MatmulT
         program,
         cfg.compute_kernel,
         core,
-        tt_metal::ComputeConfig{
-            .math_fidelity = cfg.math_fidelity,
-            .fp32_dest_acc_en = cfg.fp32_dest_acc_en,
-            .dst_full_sync_en = cfg.dst_full_sync_en,
-            .compile_args = cfg.compute_kernel_args,
-            .defines = compute_defines});
+        tt_metal::ComputeConfig{.math_fidelity = cfg.math_fidelity,
+                                .dst_full_sync_en = cfg.dst_full_sync_en,
+                                .compile_args = cfg.compute_kernel_args,
+                                .defines = compute_defines}
+    );
 
     fixture->WriteBuffer(device, src0_dram_buffer, activations);
     fixture->WriteBuffer(device, src1_dram_buffer, weights);
@@ -268,10 +234,10 @@ void matmul_tile(CommonFixture *fixture, tt_metal::Device *device, const MatmulT
             (std::uint32_t)dram_src2_noc_xy.x,
             (std::uint32_t)dram_src2_noc_xy.y,
             (std::uint32_t)N,
-            (std::uint32_t)(N * single_tile_size_bfp16b)
+            (std::uint32_t)(N * single_tile_size)
         };
 
-        for (uint32_t arg : bias_args) {
+        for (uint32_t arg: bias_args) {
             reader_l1_args.push_back(arg);
         }
     }
@@ -293,30 +259,27 @@ void matmul_tile(CommonFixture *fixture, tt_metal::Device *device, const MatmulT
 
     fixture->RunProgram(device, program);
 
-    // This is tilized result, will not be modified
-    std::vector<uint32_t> result_vec;
+    vector<uint32_t> result_vec;
     fixture->ReadBuffer(device, dst_dram_buffer, result_vec);
 
-    std::vector<bfloat16> golden = tensor_vals;
-    std::vector<bfloat16> golden_tilized = test_utils::tilize(golden, M*32, N*32);
-    std::vector<bfloat16> golden_tilized_single = convert_to_tile_layout(golden_tilized);
+    auto result_bfp16 = unpack_uint32_vec_into_bfloat16_vec(result_vec);
+    auto result_flat_layout = convert_to_flat_layout(result_bfp16);
+    auto golden = tensor.get_values();
+    auto result_untilized = test_utils::untilize(result_flat_layout, M*32, N*32);
 
-    std::vector<uint32_t> golden_packed(golden_tilized_single.size());
     uint16_t math_fid_mask = 0xFFFF;
     set_math_fid_masks(math_fid_mask, cfg.math_fidelity);
-    for (auto i = 0; i < golden_tilized.size(); i++) {
-        golden_tilized_single[i] = bfloat16(golden_tilized_single[i].to_uint16() & math_fid_mask);
-        if (cfg.fp32_dest_acc_en) {
-            golden_packed[i] = std::bit_cast<uint32_t>(golden_tilized_single[i].to_float());
-        }
+    // If we're testing LoFi/HiFi2 we generate matching golden (trunc LSB).
+    // Note that this will work only for multiplying with identity matrix
+    for (auto i = 0; i < golden.size(); i++) {
+        golden[i] = bfloat16(golden[i].to_uint16() & math_fid_mask);
     }
-    if (!cfg.fp32_dest_acc_en) {
-        golden_packed = pack_bfloat16_vec_into_uint32_vec(golden_tilized_single);
+    if (cfg.M > 1 || cfg.N > 1 || cfg.K > 1){
+        pass &= (golden == result_untilized);
+    } else {
+        pass &= (golden == result_flat_layout); // src1 is all 0's
     }
 
-    EXPECT_EQ(golden_packed.size(), result_vec.size());
-    EXPECT_EQ(golden_packed, result_vec);
-
     DeallocateBuffer(*src0_dram_buffer);
     DeallocateBuffer(*src1_dram_buffer);
     if (cfg.with_bias || cfg.test_init_short) {
@@ -326,176 +289,225 @@ void matmul_tile(CommonFixture *fixture, tt_metal::Device *device, const MatmulT
         DeallocateBuffer(*src2_dram_buffer);
     }
     DeallocateBuffer(*dst_dram_buffer);
-
-    tt::log_info(tt::LogTest, "Math Fidelity = {}, FP32_DestAcc = {}, DstSyncFull = {}",
-        cfg.math_fidelity,
-        cfg.fp32_dest_acc_en,
-        cfg.dst_full_sync_en
-    );
+    return pass;
 }
 } // namespace unit_tests_common::matmul::test_matmul_X_tile
 
-using namespace tt::test_utils;
-using namespace unit_tests_common::matmul::test_matmul_X_tile;
-
-/* matmul_config.compute_kernel_args = {
-    // block_tile_dim, within block, how many tiles are on the K dim
-    // dst_tile_rows
-    // dst_tile_cols
-    // block_cnt, across blocks, how many tiles are on the K dim
-    // in0_block_tile_cnt, M * block_tile_dim
-    // in1_block_tile_cnt, N * block_tile_dim
-    // out_block_tile_cnt
-}
-*/
-
 TEST_F(CommonFixture, MatmulSingleTile){
-    for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) {
-        if (i == 1) continue;
-        for (bool fp32_dest_acc_en : {true, false}) {
-            if ((fp32_dest_acc_en == true) && (this->arch_ == tt::ARCH::GRAYSKULL)) continue;
-            for (bool dst_full_sync_en : {true, false}) {
-                MatmulTileConfig matmul_config = {
-                    .M = 1, .K = 1, .N = 1,
-                    .fp32_dest_acc_en = fp32_dest_acc_en,
-                    .dst_full_sync_en = dst_full_sync_en,
-                    .reader_kernel = "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_blocked.cpp",
-                    .compute_kernel = "tests/tt_metal/tt_metal/test_kernels/compute/matmul.cpp",
-                    .compute_kernel_args = {1, 1, 1, 1, 1, 1, 1},
-                    .math_fidelity = MathFidelity(i)
-                };
-                MatmulTileStimuli stimuli;
-                create_test_stimuli(stimuli, 1, 1, 1);
-
-                for(unsigned int id = 0; id < devices_.size(); id++){
-                    matmul_tile(this, devices_.at(id), matmul_config, stimuli.a, stimuli.w, stimuli.t);
-                }
+    for (bool dst_full_sync_en : {true, false}) {
+        for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) {
+            if (i == 1) continue;
+            unit_tests_common::matmul::test_matmul_X_tile::MatmulTileConfig matmul_config = {
+                .M = 1, .K = 1, .N = 1,
+                .dst_full_sync_en = dst_full_sync_en,
+                .reader_kernel = "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_blocked.cpp",
+                .compute_kernel = "tests/tt_metal/tt_metal/test_kernels/compute/matmul.cpp",
+                .compute_kernel_args = {
+                    1, // block_tile_dim
+                    1, // dst_tile_rows
+                    1, // dst_tile_cols
+                    1, // block_cnt
+                    1, // in0_block_tile_cnt
+                    1, // in1_block_tile_cnt
+                    1 // out_block_tile_cnt
+                },
+                .math_fidelity = MathFidelity(i)
+            };
+            SHAPE shape = {1, 1, 32, 32};
+            tt::log_info(tt::LogTest, "Math Fidelity = {}", i);
+            tt::deprecated::Tensor<bfloat16> tensor = tt::deprecated::initialize_tensor<bfloat16>(shape, tt::deprecated::Initialize::RANDOM, 100, std::chrono::system_clock::now().time_since_epoch().count());
+            auto activations_tile_layout = convert_to_tile_layout(tensor.get_values());
+            auto activations = pack_bfloat16_vec_into_uint32_vec(activations_tile_layout);
+
+            auto identity = create_identity_matrix(32, 32, 32); //bfloat16 32x32 identity
+            auto weights_tile_layout = convert_to_tile_layout(identity);
+            auto weights = pack_bfloat16_vec_into_uint32_vec(weights_tile_layout);
+
+            for(unsigned int id = 0; id < devices_.size(); id++){
+                ASSERT_TRUE(unit_tests_common::matmul::test_matmul_X_tile::matmul_tile(this, devices_.at(id), matmul_config, activations, weights, tensor));
             }
         }
     }
 }
 
 TEST_F(CommonFixture, MatmulMultiTile){
-    for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) {
-        if (i == 1) continue;
-        for (bool fp32_dest_acc_en : {true, false}) {
-            if ((fp32_dest_acc_en == true) && (this->arch_ == tt::ARCH::GRAYSKULL)) continue;
-            for (bool dst_full_sync_en : {true, false}) {
-                uint32_t M = fp32_dest_acc_en ? 2 : 4;
-                uint32_t N = fp32_dest_acc_en ? 2 : 4;
-                uint32_t K = fp32_dest_acc_en ? 2 : 4;
-                MatmulTileConfig matmul_config = {
-                    .M = M, .K = K, .N = N,
-                    .fp32_dest_acc_en = fp32_dest_acc_en,
-                    .dst_full_sync_en = dst_full_sync_en,
-                    .reader_kernel = "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_with_bias_blocked.cpp",
-                    .compute_kernel = "tests/tt_metal/tt_metal/test_kernels/compute/matmul_with_bias.cpp",
-                    .compute_kernel_args = {1, M, N, K, M, N, (M * N), matmul_config.with_bias},
-                    .math_fidelity = MathFidelity(i)
-                };
-                MatmulTileStimuli stimuli;
-                create_test_stimuli(stimuli, M, K, N);
-
-                for(unsigned int id = 0; id < devices_.size(); id++){
-                    matmul_tile(this, devices_.at(id), matmul_config, stimuli.a, stimuli.w, stimuli.t);
-                    log_info(LogTest, "Multi tile with no bias passed");
-                    matmul_config.with_bias = true;
-                    matmul_tile(this, devices_.at(id), matmul_config, stimuli.a, stimuli.w, stimuli.t);
-                    log_info(LogTest, "Multi tile with bias passed");
-                }
+    for (bool dst_full_sync_en : {true, false}) {
+        for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) {
+            if (i == 1) continue;
+            uint32_t M = 4;
+            uint32_t N = 4;
+            uint32_t K = 4;
+            unit_tests_common::matmul::test_matmul_X_tile::MatmulTileConfig matmul_config = {
+                .M = M, .K = K, .N = N,
+                .dst_full_sync_en = dst_full_sync_en,
+                .reader_kernel = "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_with_bias_blocked.cpp",
+                .compute_kernel = "tests/tt_metal/tt_metal/test_kernels/compute/matmul_with_bias.cpp",
+                .compute_kernel_args = {
+                    1, // block_tile_dim, within block, how many tiles are on the K dim
+                    M, // dst_tile_rows
+                    N, // dst_tile_cols
+                    K, // block_cnt, across blocks, how many tiles are on the K dim
+                    M, // in0_block_tile_cnt, M * block_tile_dim
+                    N, // in1_block_tile_cnt,  N * block_tile_dim
+                    (M * N), // out_block_tile_cnt
+                    matmul_config.with_bias // whether or not to use bias
+                },
+                .math_fidelity = MathFidelity(i)
+            };
+            tt::log_info(tt::LogTest, "Math Fidelity = {}", i);
+            SHAPE shape = {1, 1, M * 32, K * 32};
+            tt::deprecated::Tensor<bfloat16> tensor = tt::deprecated::initialize_tensor<bfloat16>(shape, tt::deprecated::Initialize::RANDOM, 100, std::chrono::system_clock::now().time_since_epoch().count());
+            auto activations_tilized = test_utils::tilize(tensor.get_values(), M * 32, K * 32);
+            auto activations_tile_layout = convert_to_tile_layout(activations_tilized);
+            auto activations = pack_bfloat16_vec_into_uint32_vec(activations_tile_layout);
+            auto activations_tile_transposed = transpose_tiles(activations, M, K, 1);
+
+            auto identity = create_identity_matrix(K * 32, N * 32, std::min(K, N) * 32); //bfloat16 32x32 identity
+            auto identity_tilized = test_utils::tilize(identity, K * 32, N * 32);
+            auto weights_tile_layout = convert_to_tile_layout(identity_tilized);
+            auto weights = pack_bfloat16_vec_into_uint32_vec(weights_tile_layout);
+
+            for(unsigned int id = 0; id < devices_.size(); id++){
+                ASSERT_TRUE(unit_tests_common::matmul::test_matmul_X_tile::matmul_tile(this, devices_.at(id), matmul_config, activations_tile_transposed, weights, tensor));
+                log_info(LogTest, "Multi tile with no bias passed");
+                matmul_config.with_bias = true;
+                ASSERT_TRUE(unit_tests_common::matmul::test_matmul_X_tile::matmul_tile(this, devices_.at(id), matmul_config, activations_tile_transposed, weights, tensor));
+                log_info(LogTest, "Multi tile with bias passed");
             }
         }
     }
 }
 
 TEST_F(CommonFixture, MatmulBlock){
-    for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) {
-        if (i == 1) continue;
-        for (bool fp32_dest_acc_en : {true, false}) {
-            if ((fp32_dest_acc_en == true) && (this->arch_ == tt::ARCH::GRAYSKULL)) continue;
-            for (bool dst_full_sync_en : {true, false}) {
-                uint32_t M = fp32_dest_acc_en ? 2 : 4;
-                uint32_t N = fp32_dest_acc_en ? 2 : 4;
-                uint32_t K = fp32_dest_acc_en ? 2 : 4;
-                MatmulTileConfig matmul_config = {
-                    .M = M, .K = K, .N = N,
-                    .test_init_short = false,
-                    .with_dt = false,
-                    .fp32_dest_acc_en = fp32_dest_acc_en,
-                    .dst_full_sync_en = dst_full_sync_en,
-                    .reader_kernel = "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_with_bias_blocked.cpp",
-                    .compute_kernel = "tests/tt_metal/tt_metal/test_kernels/compute/matmul_block.cpp",
-                    .compute_kernel_args = {1, M, N, K, M, N, (M * N)},
-                    .math_fidelity = MathFidelity(i)
-                };
-                MatmulTileStimuli stimuli;
-                create_test_stimuli(stimuli, M, K, N);
-
-                for(unsigned int id = 0; id < devices_.size(); id++){
-                    matmul_tile(this, devices_.at(id), matmul_config, stimuli.a, stimuli.w, stimuli.t);
-                }
+    for (bool dst_full_sync_en : {true, false}) {
+        for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) {
+            if (i == 1) continue;
+            uint32_t M = 4;
+            uint32_t N = 4;
+            uint32_t K = 4;
+            unit_tests_common::matmul::test_matmul_X_tile::MatmulTileConfig matmul_config = {
+                .M = M, .K = K, .N = N,
+                .test_init_short = false,
+                .with_dt = false,
+                .dst_full_sync_en = dst_full_sync_en,
+                .reader_kernel = "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_with_bias_blocked.cpp",
+                .compute_kernel = "tests/tt_metal/tt_metal/test_kernels/compute/matmul_block.cpp",
+                .compute_kernel_args = {
+                    1, // block_tile_dim, within block, how many tiles are on the K dim
+                    M, // dst_tile_rows
+                    N, // dst_tile_cols
+                    K, // block_cnt, across blocks, how many tiles are on the K dim
+                    M, // in0_block_tile_cnt, M * block_tile_dim
+                    N, // in1_block_tile_cnt,  N * block_tile_dim
+                    (M * N), // out_block_tile_cnt
+                },
+                .math_fidelity = MathFidelity(i)
+            };
+            tt::log_info(tt::LogTest, "Math Fidelity = {}", i);
+            SHAPE shape = {1, 1, M * 32, K * 32};
+            tt::deprecated::Tensor<bfloat16> tensor = tt::deprecated::initialize_tensor<bfloat16>(shape, tt::deprecated::Initialize::RANDOM, 100, std::chrono::system_clock::now().time_since_epoch().count());
+            auto activations_tilized = test_utils::tilize(tensor.get_values(), M * 32, K * 32);
+            auto activations_tile_layout = convert_to_tile_layout(activations_tilized);
+            auto activations = pack_bfloat16_vec_into_uint32_vec(activations_tile_layout);
+            auto activations_tile_transposed = transpose_tiles(activations, M, K, 1);
+
+            auto identity = create_identity_matrix(K * 32, N * 32, std::min(K, N) * 32); //bfloat16 32x32 identity
+            auto identity_tilized = test_utils::tilize(identity, K * 32, N * 32);
+            auto weights_tile_layout = convert_to_tile_layout(identity_tilized);
+            auto weights = pack_bfloat16_vec_into_uint32_vec(weights_tile_layout);
+
+            for(unsigned int id = 0; id < devices_.size(); id++){
+                ASSERT_TRUE(unit_tests_common::matmul::test_matmul_X_tile::matmul_tile(this, devices_.at(id), matmul_config, activations_tile_transposed, weights, tensor));
             }
         }
     }
 }
 
 TEST_F(CommonFixture, MatmulBlockInitShort){
-    for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) {
-        if (i == 1) continue;
-        for (bool fp32_dest_acc_en : {true, false}) {
-            if ((fp32_dest_acc_en == true) && (this->arch_ == tt::ARCH::GRAYSKULL)) continue;
-            for (bool dst_full_sync_en : {true, false}) {
-                uint32_t M = fp32_dest_acc_en ? 2 : 4;
-                uint32_t N = fp32_dest_acc_en ? 2 : 4;
-                uint32_t K = fp32_dest_acc_en ? 2 : 4;
-                MatmulTileConfig matmul_config = {
-                    .M = M, .K = K, .N = N,
-                    .test_init_short = true,
-                    .with_dt = false,
-                    .fp32_dest_acc_en = fp32_dest_acc_en,
-                    .dst_full_sync_en = dst_full_sync_en,
-                    .reader_kernel = "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_with_bias_blocked.cpp",
-                    .compute_kernel = "tests/tt_metal/tt_metal/test_kernels/compute/matmul_block.cpp",
-                    .compute_kernel_args = {1, M, N, K, M, N, (M * N)},
-                    .math_fidelity = MathFidelity(i)
-                };
-                MatmulTileStimuli stimuli;
-                create_test_stimuli(stimuli, M, K, N);
-
-                for(unsigned int id = 0; id < devices_.size(); id++){
-                    matmul_tile(this, devices_.at(id), matmul_config, stimuli.a, stimuli.w, stimuli.t);
-                }
+    for (bool dst_full_sync_en : {true, false}) {
+        for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) {
+            if (i == 1) continue;
+            uint32_t M = 4;
+            uint32_t N = 4;
+            uint32_t K = 4;
+            unit_tests_common::matmul::test_matmul_X_tile::MatmulTileConfig matmul_config = {
+                .M = M, .K = K, .N = N,
+                .test_init_short = true,
+                .with_dt = false,
+                .dst_full_sync_en = dst_full_sync_en,
+                .reader_kernel = "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_with_bias_blocked.cpp",
+                .compute_kernel = "tests/tt_metal/tt_metal/test_kernels/compute/matmul_block.cpp",
+                .compute_kernel_args = {
+                    1, // block_tile_dim, within block, how many tiles are on the K dim
+                    M, // dst_tile_rows
+                    N, // dst_tile_cols
+                    K, // block_cnt, across blocks, how many tiles are on the K dim
+                    M, // in0_block_tile_cnt, M * block_tile_dim
+                    N, // in1_block_tile_cnt,  N * block_tile_dim
+                    (M * N), // out_block_tile_cnt
+                },
+                .math_fidelity = MathFidelity(i)
+            };
+            tt::log_info(tt::LogTest, "Math Fidelity = {}", i);
+            SHAPE shape = {1, 1, M * 32, K * 32};
+            tt::deprecated::Tensor<bfloat16> tensor = tt::deprecated::initialize_tensor<bfloat16>(shape, tt::deprecated::Initialize::RANDOM, 100, std::chrono::system_clock::now().time_since_epoch().count());
+            auto activations_tilized = test_utils::tilize(tensor.get_values(), M * 32, K * 32);
+            auto activations_tile_layout = convert_to_tile_layout(activations_tilized);
+            auto activations = pack_bfloat16_vec_into_uint32_vec(activations_tile_layout);
+            auto activations_tile_transposed = transpose_tiles(activations, M, K, 1);
+
+            auto identity = create_identity_matrix(K * 32, N * 32, std::min(K, N) * 32); //bfloat16 32x32 identity
+            auto identity_tilized = test_utils::tilize(identity, K * 32, N * 32);
+            auto weights_tile_layout = convert_to_tile_layout(identity_tilized);
+            auto weights = pack_bfloat16_vec_into_uint32_vec(weights_tile_layout);
+
+            for(unsigned int id = 0; id < devices_.size(); id++){
+                ASSERT_TRUE(unit_tests_common::matmul::test_matmul_X_tile::matmul_tile(this, devices_.at(id), matmul_config, activations_tile_transposed, weights, tensor));
             }
         }
     }
 }
 
 TEST_F(CommonFixture, MatmulBlockInitShortWithDt){
-    for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) {
-        if (i == 1) continue;
-        for (bool fp32_dest_acc_en : {true, false}) {
-            if ((fp32_dest_acc_en == true) && (this->arch_ == tt::ARCH::GRAYSKULL)) continue;
-            for (bool dst_full_sync_en : {true, false}) {
-                uint32_t M = fp32_dest_acc_en ? 2 : 4;
-                uint32_t N = fp32_dest_acc_en ? 2 : 4;
-                uint32_t K = fp32_dest_acc_en ? 2 : 4;
-                MatmulTileConfig matmul_config = {
-                    .M = M, .K = K, .N = N,
-                    .test_init_short = true,
-                    .with_dt = true,
-                    .fp32_dest_acc_en = fp32_dest_acc_en,
-                    .dst_full_sync_en = dst_full_sync_en,
-                    .reader_kernel = "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_with_bias_blocked.cpp",
-                    .compute_kernel = "tests/tt_metal/tt_metal/test_kernels/compute/matmul_block.cpp",
-                    .compute_kernel_args = {1, M, N, K, M, N, (M * N)},
-                    .math_fidelity = MathFidelity(i)
-                };
-                MatmulTileStimuli stimuli;
-                create_test_stimuli(stimuli, M, K, N);
-
-                for(unsigned int id = 0; id < devices_.size(); id++){
-                    matmul_tile(this, devices_.at(id), matmul_config, stimuli.a, stimuli.w, stimuli.t);
-                }
+    for (bool dst_full_sync_en : {true, false}) {
+        for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) {
+            if (i == 1) continue;
+            uint32_t M = 4;
+            uint32_t N = 4;
+            uint32_t K = 4;
+            unit_tests_common::matmul::test_matmul_X_tile::MatmulTileConfig matmul_config = {
+                .M = M, .K = K, .N = N,
+                .test_init_short = true,
+                .with_dt = true,
+                .dst_full_sync_en = dst_full_sync_en,
+                .reader_kernel = "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_with_bias_blocked.cpp",
+                .compute_kernel = "tests/tt_metal/tt_metal/test_kernels/compute/matmul_block.cpp",
+                .compute_kernel_args = {
+                    1, // block_tile_dim, within block, how many tiles are on the K dim
+                    M, // dst_tile_rows
+                    N, // dst_tile_cols
+                    K, // block_cnt, across blocks, how many tiles are on the K dim
+                    M, // in0_block_tile_cnt, M * block_tile_dim
+                    N, // in1_block_tile_cnt,  N * block_tile_dim
+                    (M * N), // out_block_tile_cnt
+                },
+                .math_fidelity = MathFidelity(i)
+            };
+            tt::log_info(tt::LogTest, "Math Fidelity = {}", i);
+            SHAPE shape = {1, 1, M * 32, K * 32};
+            tt::deprecated::Tensor<bfloat16> tensor = tt::deprecated::initialize_tensor<bfloat16>(shape, tt::deprecated::Initialize::RANDOM, 100, std::chrono::system_clock::now().time_since_epoch().count());
+            auto activations_tilized = test_utils::tilize(tensor.get_values(), M * 32, K * 32);
+            auto activations_tile_layout = convert_to_tile_layout(activations_tilized);
+            auto activations = pack_bfloat16_vec_into_uint32_vec(activations_tile_layout);
+            auto activations_tile_transposed = transpose_tiles(activations, M, K, 1);
+
+            auto identity = create_identity_matrix(K * 32, N * 32, std::min(K, N) * 32); //bfloat16 32x32 identity
+            auto identity_tilized = test_utils::tilize(identity, K * 32, N * 32);
+            auto weights_tile_layout = convert_to_tile_layout(identity_tilized);
+            auto weights = pack_bfloat16_vec_into_uint32_vec(weights_tile_layout);
+
+            for(unsigned int id = 0; id < devices_.size(); id++){
+                ASSERT_TRUE(unit_tests_common::matmul::test_matmul_X_tile::matmul_tile(this, devices_.at(id), matmul_config, activations_tile_transposed, weights, tensor));
             }
         }
     }