diff --git a/tests/tt_metal/tt_metal/test_3x3conv_as_matmul_large_block.cpp b/tests/tt_metal/tt_metal/test_3x3conv_as_matmul_large_block.cpp index 0413f01f6a2..b647008c843 100644 --- a/tests/tt_metal/tt_metal/test_3x3conv_as_matmul_large_block.cpp +++ b/tests/tt_metal/tt_metal/test_3x3conv_as_matmul_large_block.cpp @@ -200,7 +200,7 @@ int main(int argc, char **argv) { auto generic_binary_reader_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/generic_binary_reader_blocked.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/generic_binary_reader_blocked.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default}); @@ -218,7 +218,7 @@ int main(int argc, char **argv) { auto unary_writer_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/writer_unswizzle.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unswizzle.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); @@ -251,7 +251,7 @@ int main(int argc, char **argv) { auto mm_kernel = tt_metal::CreateComputeKernel( program, - "tt_metal/kernels/compute/matmul_large_block_zm.cpp", + "tests/tt_metal/tt_metal/test_kernels/compute/matmul_large_block_zm.cpp", core, tt_metal::ComputeConfig{.compile_args = compute_kernel_args} ); diff --git a/tests/tt_metal/tt_metal/test_add_two_ints.cpp b/tests/tt_metal/tt_metal/test_add_two_ints.cpp index b4524d82c47..6da67a50792 100644 --- a/tests/tt_metal/tt_metal/test_add_two_ints.cpp +++ b/tests/tt_metal/tt_metal/test_add_two_ints.cpp @@ -40,7 +40,7 @@ int main(int argc, char **argv) { std::vector second_runtime_args = {303, 606}; tt_metal::KernelID add_two_ints_kernel = tt_metal::CreateDataMovementKernel( - program, "tt_metal/kernels/riscv_draft/add_two_ints.cpp", core, + program, "tests/tt_metal/tt_metal/test_kernels/misc/add_two_ints.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); //////////////////////////////////////////////////////////////////////////// diff --git a/tests/tt_metal/tt_metal/test_bcast.cpp b/tests/tt_metal/tt_metal/test_bcast.cpp index f905e891c87..eb5cadd3ff9 100644 --- a/tests/tt_metal/tt_metal/test_bcast.cpp +++ b/tests/tt_metal/tt_metal/test_bcast.cpp @@ -28,16 +28,16 @@ const char* get_reader_name(bool multibank, BcastDim::Enum bcast_dim) { TT_ASSERT(multibank && "Only multibank is supported correctly."); if (bcast_dim == BcastDim::H) { return multibank ? - "tt_metal/kernels/dataflow/reader_bcast_h_8bank.cpp" : - "tt_metal/kernels/dataflow/reader_bcast_h.cpp"; + "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_h_8bank.cpp" : + "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_h.cpp"; } else if (bcast_dim == BcastDim::W) { return multibank ? - "tt_metal/kernels/dataflow/reader_bcast_w_8bank.cpp" : - "tt_metal/kernels/dataflow/reader_bcast_w.cpp"; + "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_w_8bank.cpp" : + "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_w.cpp"; } if (bcast_dim == BcastDim::HW) { return multibank ? 
- "tt_metal/kernels/dataflow/reader_bcast_hw_8bank.cpp" : - "tt_metal/kernels/dataflow/reader_binary_diff_lengths.cpp"; + "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_hw_8bank.cpp" : + "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_binary_diff_lengths.cpp"; } TT_ASSERT(false && "Unexpected bcast_dim!"); return ""; @@ -45,9 +45,9 @@ const char* get_reader_name(bool multibank, BcastDim::Enum bcast_dim) { const char* get_compute_name(BcastDim::Enum bcast_dim) { switch (bcast_dim) { - case BcastDim::H: return "tt_metal/kernels/compute/bcast_h.cpp"; - case BcastDim::W: return "tt_metal/kernels/compute/bcast_w.cpp"; - case BcastDim::HW: return "tt_metal/kernels/compute/bcast_hw.cpp"; + case BcastDim::H: return "tests/tt_metal/tt_metal/test_kernels/compute/bcast_h.cpp"; + case BcastDim::W: return "tests/tt_metal/tt_metal/test_kernels/compute/bcast_w.cpp"; + case BcastDim::HW: return "tests/tt_metal/tt_metal/test_kernels/compute/bcast_hw.cpp"; default: TT_ASSERT(false && "Unexpected bcast_dim!"); } return ""; @@ -214,8 +214,8 @@ int main(int argc, char **argv) { auto unary_writer_kernel = tt_metal::CreateDataMovementKernel( program, - multibank ? "tt_metal/kernels/dataflow/writer_unary_8bank.cpp" - : "tt_metal/kernels/dataflow/writer_unary.cpp", + multibank ? "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary_8bank.cpp" + : "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); diff --git a/tests/tt_metal/tt_metal/test_bmm.cpp b/tests/tt_metal/tt_metal/test_bmm.cpp index f4d42cca314..effdf8c57fd 100644 --- a/tests/tt_metal/tt_metal/test_bmm.cpp +++ b/tests/tt_metal/tt_metal/test_bmm.cpp @@ -77,13 +77,13 @@ int main(int argc, char **argv) { std::vector writer_compile_time_args = {(uint32_t)dst_is_dram}; auto reader = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/reader_bmm_8bank.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bmm_8bank.cpp", core, DataMovementConfig{.processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default, .compile_args = reader_compile_time_args}); auto writer = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/writer_bmm_8bank.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_bmm_8bank.cpp", core, DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default, .compile_args = writer_compile_time_args}); @@ -96,7 +96,7 @@ int main(int argc, char **argv) { auto eltwise_binary_kernel = tt_metal::CreateComputeKernel( program, - "tt_metal/kernels/compute/bmm.cpp", + "tests/tt_metal/tt_metal/test_kernels/compute/bmm.cpp", core, tt_metal::ComputeConfig{.compile_args = compute_kernel_args} ); diff --git a/tests/tt_metal/tt_metal/test_compile_args.cpp b/tests/tt_metal/tt_metal/test_compile_args.cpp index 096f4171cea..b842da46505 100644 --- a/tests/tt_metal/tt_metal/test_compile_args.cpp +++ b/tests/tt_metal/tt_metal/test_compile_args.cpp @@ -35,13 +35,13 @@ bool test_compile_args(std::vector compile_args_vec, int device_id) { tt_metal::KernelID unary_reader_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/test_compile_args.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/test_compile_args.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default, .compile_args = compile_args_vec}); 
tt_metal::KernelID unary_writer_kernel = tt_metal::CreateDataMovementKernel( - program, "tt_metal/kernels/dataflow/blank.cpp", + program, "tests/tt_metal/tt_metal/test_kernels/dataflow/blank.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); @@ -50,7 +50,7 @@ bool test_compile_args(std::vector compile_args_vec, int device_id) { }; auto eltwise_unary_kernel = tt_metal::CreateComputeKernel( - program, "tt_metal/kernels/compute/blank.cpp", + program, "tests/tt_metal/tt_metal/test_kernels/compute/blank.cpp", core, tt_metal::ComputeConfig{.compile_args = compute_args}); //////////////////////////////////////////////////////////////////////////// diff --git a/tests/tt_metal/tt_metal/test_compile_program.cpp b/tests/tt_metal/tt_metal/test_compile_program.cpp index cd658ef13ee..e0f43f8f2d0 100644 --- a/tests/tt_metal/tt_metal/test_compile_program.cpp +++ b/tests/tt_metal/tt_metal/test_compile_program.cpp @@ -114,13 +114,13 @@ Program create_program(Device *device, const ProgramAttributes &program_attribut auto unary_reader_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/reader_unary_push_4.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_push_4.cpp", core, tt_metal::DataMovementConfig{.processor = program_attributes.reader_processor, .noc = program_attributes.reader_noc}); auto unary_writer_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/writer_unary.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp", core, tt_metal::DataMovementConfig{.processor = program_attributes.writer_processor, .noc = program_attributes.writer_noc}); @@ -130,7 +130,7 @@ Program create_program(Device *device, const ProgramAttributes &program_attribut auto eltwise_unary_kernel = tt_metal::CreateComputeKernel( program, - "tt_metal/kernels/compute/eltwise_copy_3m.cpp", + "tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy_3m.cpp", core, tt_metal::ComputeConfig{ .math_fidelity = program_attributes.math_fidelity, diff --git a/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp b/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp index 0dbed1a3a72..c674bf1936e 100644 --- a/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp +++ b/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp @@ -86,13 +86,13 @@ int main(int argc, char **argv) { auto unary_reader_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/reader_unary_push_4.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_push_4.cpp", core, DataMovementConfig{.processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default}); auto unary_writer_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/writer_unary.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp", core, DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); @@ -102,7 +102,7 @@ int main(int argc, char **argv) { auto eltwise_unary_kernel = tt_metal::CreateComputeKernel( program, - "tt_metal/kernels/compute/eltwise_copy_3m.cpp", + "tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy_3m.cpp", core, tt_metal::ComputeConfig{.compile_args = compute_kernel_args} ); diff --git a/tests/tt_metal/tt_metal/test_core_range_set.cpp b/tests/tt_metal/tt_metal/test_core_range_set.cpp index a02ba4f6555..7b44f1ea4ba 100644 --- 
a/tests/tt_metal/tt_metal/test_core_range_set.cpp +++ b/tests/tt_metal/tt_metal/test_core_range_set.cpp @@ -105,13 +105,13 @@ bool test_program_specified_with_core_range_set(tt_metal::Device *device, tt_met auto unary_reader_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/reader_unary_push_4.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_push_4.cpp", core_range_set, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default}); auto unary_writer_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/writer_unary.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp", core_range_set, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); @@ -122,7 +122,7 @@ bool test_program_specified_with_core_range_set(tt_metal::Device *device, tt_met auto eltwise_unary_kernel = tt_metal::CreateComputeKernel( program, - "tt_metal/kernels/compute/eltwise_copy_3m.cpp", + "tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy_3m.cpp", core_range_set, tt_metal::ComputeConfig{.compile_args = compute_kernel_args} ); diff --git a/tests/tt_metal/tt_metal/test_datacopy.cpp b/tests/tt_metal/tt_metal/test_datacopy.cpp index ba077f7475a..003e4b0f157 100644 --- a/tests/tt_metal/tt_metal/test_datacopy.cpp +++ b/tests/tt_metal/tt_metal/test_datacopy.cpp @@ -77,13 +77,13 @@ int main(int argc, char **argv) { auto unary_reader_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/reader_unary_push_4.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_push_4.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default}); auto unary_writer_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/writer_unary.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); @@ -93,7 +93,7 @@ int main(int argc, char **argv) { auto eltwise_unary_kernel = tt_metal::CreateComputeKernel( program, - "tt_metal/kernels/compute/eltwise_copy_3m.cpp", + "tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy_3m.cpp", core, tt_metal::ComputeConfig{.compile_args = compute_kernel_args} ); diff --git a/tests/tt_metal/tt_metal/test_datacopy_bfp8b.cpp b/tests/tt_metal/tt_metal/test_datacopy_bfp8b.cpp index 001519e5b4d..e784f731266 100644 --- a/tests/tt_metal/tt_metal/test_datacopy_bfp8b.cpp +++ b/tests/tt_metal/tt_metal/test_datacopy_bfp8b.cpp @@ -69,13 +69,13 @@ int main(int argc, char **argv) { auto unary_reader_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/reader_unary.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default}); auto unary_writer_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/writer_unary.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); @@ -85,7 +85,7 @@ int main(int argc, char **argv) { auto eltwise_unary_kernel = 
tt_metal::CreateComputeKernel( program, - "tt_metal/kernels/compute/eltwise_copy_3m.cpp", + "tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy_3m.cpp", core, tt_metal::ComputeConfig{.compile_args = compute_kernel_args} ); diff --git a/tests/tt_metal/tt_metal/test_datacopy_multi_core_multi_dram.cpp b/tests/tt_metal/tt_metal/test_datacopy_multi_core_multi_dram.cpp index 8da5cd41a4d..89d1380ef96 100644 --- a/tests/tt_metal/tt_metal/test_datacopy_multi_core_multi_dram.cpp +++ b/tests/tt_metal/tt_metal/test_datacopy_multi_core_multi_dram.cpp @@ -141,13 +141,13 @@ std::tuple create_pro auto reader_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/reader_copy_tile_layout.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_copy_tile_layout.cpp", all_cores, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default}); auto writer_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/writer_copy_tile_layout.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_copy_tile_layout.cpp", all_cores, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); @@ -158,7 +158,7 @@ std::tuple create_pro auto compute_kernel = tt_metal::CreateComputeKernel( program, - "tt_metal/kernels/compute/eltwise_copy_block.cpp", + "tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy_block.cpp", all_cores, tt_metal::ComputeConfig{.compile_args = compute_kernel_args} ); diff --git a/tests/tt_metal/tt_metal/test_datacopy_output_in_l1.cpp b/tests/tt_metal/tt_metal/test_datacopy_output_in_l1.cpp index f5b2c12a7c6..4ab7de7cc0a 100644 --- a/tests/tt_metal/tt_metal/test_datacopy_output_in_l1.cpp +++ b/tests/tt_metal/tt_metal/test_datacopy_output_in_l1.cpp @@ -75,13 +75,13 @@ int main(int argc, char **argv) { auto unary_reader_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/reader_unary_push_4.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_push_4.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default}); auto unary_writer_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/writer_unary.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); @@ -91,7 +91,7 @@ int main(int argc, char **argv) { auto eltwise_unary_kernel = tt_metal::CreateComputeKernel( program, - "tt_metal/kernels/compute/eltwise_copy_3m.cpp", + "tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy_3m.cpp", core, tt_metal::ComputeConfig{.compile_args = compute_kernel_args} ); diff --git a/tests/tt_metal/tt_metal/test_dataflow_cb.cpp b/tests/tt_metal/tt_metal/test_dataflow_cb.cpp index 494e24bb9f4..4aaf60897e7 100644 --- a/tests/tt_metal/tt_metal/test_dataflow_cb.cpp +++ b/tests/tt_metal/tt_metal/test_dataflow_cb.cpp @@ -80,13 +80,13 @@ int main(int argc, char **argv) { auto reader_cb_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/reader_cb_test.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_cb_test.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default, .compile_args = reader_cb_kernel_args}); auto 
writer_cb_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/writer_cb_test.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_cb_test.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default, .compile_args = writer_cb_kernel_args}); diff --git a/tests/tt_metal/tt_metal/test_dram_copy_sticks_multi_core.cpp b/tests/tt_metal/tt_metal/test_dram_copy_sticks_multi_core.cpp index b6d6d0bb3bf..7ceaa114d74 100644 --- a/tests/tt_metal/tt_metal/test_dram_copy_sticks_multi_core.cpp +++ b/tests/tt_metal/tt_metal/test_dram_copy_sticks_multi_core.cpp @@ -74,7 +74,7 @@ int main(int argc, char **argv) { } auto unary_reader_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/dram_copy_sticks.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy_sticks.cpp", all_cores, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default}); diff --git a/tests/tt_metal/tt_metal/test_dram_loopback_multi_core.cpp b/tests/tt_metal/tt_metal/test_dram_loopback_multi_core.cpp index 790678fd0a3..93bd8b33719 100644 --- a/tests/tt_metal/tt_metal/test_dram_loopback_multi_core.cpp +++ b/tests/tt_metal/tt_metal/test_dram_loopback_multi_core.cpp @@ -85,14 +85,14 @@ int main(int argc, char **argv) { // Loader (producer kernel) running on BRISC on logical core {0, 0} auto producer_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/dram_loader_sync.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/dram_loader_sync.cpp", loader_logical_core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); // Writer (consumer kernel) running on NCRISC on logical core {0, 1} auto consumer_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/remote_read_remote_write_sync.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/remote_read_remote_write_sync.cpp", writer_logical_core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default}); diff --git a/tests/tt_metal/tt_metal/test_dram_loopback_multi_core_db.cpp b/tests/tt_metal/tt_metal/test_dram_loopback_multi_core_db.cpp index ebd1d23e235..2a961b09686 100644 --- a/tests/tt_metal/tt_metal/test_dram_loopback_multi_core_db.cpp +++ b/tests/tt_metal/tt_metal/test_dram_loopback_multi_core_db.cpp @@ -99,14 +99,14 @@ int main(int argc, char **argv) { // Loader (producer kernel) running on BRISC on logical core {0, 0} auto producer_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/dram_loader_sync_db.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/dram_loader_sync_db.cpp", loader_logical_core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); // Writer (consumer kernel) running on NCRISC on logical core {0, 1} auto consumer_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/remote_read_remote_write_sync_db.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/remote_read_remote_write_sync_db.cpp", writer_logical_core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default}); diff --git a/tests/tt_metal/tt_metal/test_dram_loopback_single_core.cpp 
b/tests/tt_metal/tt_metal/test_dram_loopback_single_core.cpp index 1a1b61b5954..a040fe95937 100644 --- a/tests/tt_metal/tt_metal/test_dram_loopback_single_core.cpp +++ b/tests/tt_metal/tt_metal/test_dram_loopback_single_core.cpp @@ -57,7 +57,7 @@ int main(int argc, char **argv) { auto dram_copy_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/dram_copy.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); diff --git a/tests/tt_metal/tt_metal/test_dram_loopback_single_core_db.cpp b/tests/tt_metal/tt_metal/test_dram_loopback_single_core_db.cpp index 0059141c2ef..a0581c2b352 100644 --- a/tests/tt_metal/tt_metal/test_dram_loopback_single_core_db.cpp +++ b/tests/tt_metal/tt_metal/test_dram_loopback_single_core_db.cpp @@ -59,7 +59,7 @@ int main(int argc, char **argv) { auto dram_copy_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/dram_copy_db.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy_db.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); diff --git a/tests/tt_metal/tt_metal/test_dram_to_l1_multicast.cpp b/tests/tt_metal/tt_metal/test_dram_to_l1_multicast.cpp index 34d7e41e02b..727fbb6c5f3 100644 --- a/tests/tt_metal/tt_metal/test_dram_to_l1_multicast.cpp +++ b/tests/tt_metal/tt_metal/test_dram_to_l1_multicast.cpp @@ -77,7 +77,7 @@ int main(int argc, char **argv) { log_info(LogTest, "End = {}, {}", core_end_physical.x, core_end_physical.y); auto mcast_reader_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/dram_to_l1_multicast.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/dram_to_l1_multicast.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default}); diff --git a/tests/tt_metal/tt_metal/test_dram_to_l1_multicast_loopback_src.cpp b/tests/tt_metal/tt_metal/test_dram_to_l1_multicast_loopback_src.cpp index fb0f3b0fbad..f4a0ba6d9c3 100644 --- a/tests/tt_metal/tt_metal/test_dram_to_l1_multicast_loopback_src.cpp +++ b/tests/tt_metal/tt_metal/test_dram_to_l1_multicast_loopback_src.cpp @@ -73,7 +73,7 @@ int main(int argc, char **argv) { log_info(LogTest, "End = {}, {}", core_end_physical.x, core_end_physical.y); auto mcast_reader_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/dram_to_l1_multicast_include_src.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/dram_to_l1_multicast_include_src.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default}); diff --git a/tests/tt_metal/tt_metal/test_eltwise_binary.cpp b/tests/tt_metal/tt_metal/test_eltwise_binary.cpp index 93512dcc988..36129982b4d 100644 --- a/tests/tt_metal/tt_metal/test_eltwise_binary.cpp +++ b/tests/tt_metal/tt_metal/test_eltwise_binary.cpp @@ -107,15 +107,15 @@ int main(int argc, char** argv) { auto binary_reader_kernel = tt_metal::CreateDataMovementKernel( program, - multibank ? "tt_metal/kernels/dataflow/reader_dual_8bank.cpp" - : "tt_metal/kernels/dataflow/reader_binary_diff_lengths.cpp", + multibank ? 
"tests/tt_metal/tt_metal/test_kernels/dataflow/reader_dual_8bank.cpp" + : "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_binary_diff_lengths.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default}); auto unary_writer_kernel = tt_metal::CreateDataMovementKernel( program, - multibank ? "tt_metal/kernels/dataflow/writer_unary_8bank.cpp" - : "tt_metal/kernels/dataflow/writer_unary.cpp", + multibank ? "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary_8bank.cpp" + : "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); @@ -130,7 +130,7 @@ int main(int argc, char** argv) { }; auto eltwise_binary_kernel = tt_metal::CreateComputeKernel( program, - "tt_metal/kernels/compute/eltwise_binary.cpp", + "tests/tt_metal/tt_metal/test_kernels/compute/eltwise_binary.cpp", core, tt_metal::ComputeConfig{.compile_args = compute_kernel_args, .defines = binary_defines}); diff --git a/tests/tt_metal/tt_metal/test_flatten.cpp b/tests/tt_metal/tt_metal/test_flatten.cpp index c8bcfc46c61..461095cfac2 100644 --- a/tests/tt_metal/tt_metal/test_flatten.cpp +++ b/tests/tt_metal/tt_metal/test_flatten.cpp @@ -122,13 +122,13 @@ int main(int argc, char **argv) { auto flatten_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/flatten.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/flatten.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default}); auto unary_writer_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/writer_unary.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); @@ -138,7 +138,7 @@ int main(int argc, char **argv) { auto eltwise_unary_kernel = tt_metal::CreateComputeKernel( program, - "tt_metal/kernels/compute/eltwise_copy.cpp", + "tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy.cpp", core, tt_metal::ComputeConfig{.compile_args = compute_kernel_args} ); diff --git a/tests/tt_metal/tt_metal/test_generic_binary_reader_matmul_large_block.cpp b/tests/tt_metal/tt_metal/test_generic_binary_reader_matmul_large_block.cpp index 1f142a68ed4..facce24c0dd 100644 --- a/tests/tt_metal/tt_metal/test_generic_binary_reader_matmul_large_block.cpp +++ b/tests/tt_metal/tt_metal/test_generic_binary_reader_matmul_large_block.cpp @@ -238,7 +238,7 @@ int main(int argc, char **argv) { auto generic_binary_reader_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/generic_binary_reader_blocked.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/generic_binary_reader_blocked.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default}); @@ -256,7 +256,7 @@ int main(int argc, char **argv) { auto unary_writer_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/writer_unswizzle.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unswizzle.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); @@ -289,7 +289,7 @@ int main(int argc, char **argv) { auto mm_kernel = 
tt_metal::CreateComputeKernel( program, - "tt_metal/kernels/compute/matmul_large_block_zm.cpp", + "tests/tt_metal/tt_metal/test_kernels/compute/matmul_large_block_zm.cpp", core, tt_metal::ComputeConfig{.compile_args = compute_kernel_args} ); diff --git a/tests/tt_metal/tt_metal/test_graph_interpreter.cpp b/tests/tt_metal/tt_metal/test_graph_interpreter.cpp index 87a0441b58d..e22bb7ae1e7 100644 --- a/tests/tt_metal/tt_metal/test_graph_interpreter.cpp +++ b/tests/tt_metal/tt_metal/test_graph_interpreter.cpp @@ -36,9 +36,9 @@ void run_compile_blank(tt_metal::Device *device) { .dummy = 0, }; build_kernel_for_riscv_options.set_hlk_args_all_cores(hlk_args, sizeof(blank::hlk_args_t)); - build_kernel_for_riscv_options.set_hlk_file_name_all_cores("tt_metal/kernels/compute/blank.cpp"); - build_kernel_for_riscv_options.ncrisc_kernel_file_name = "tt_metal/kernels/dataflow/blank.cpp"; - build_kernel_for_riscv_options.brisc_kernel_file_name = "tt_metal/kernels/dataflow/blank.cpp"; + build_kernel_for_riscv_options.set_hlk_file_name_all_cores("tests/tt_metal/tt_metal/test_kernels/compute/blank.cpp"); + build_kernel_for_riscv_options.ncrisc_kernel_file_name = "tests/tt_metal/tt_metal/test_kernels/dataflow/blank.cpp"; + build_kernel_for_riscv_options.brisc_kernel_file_name = "tests/tt_metal/tt_metal/test_kernels/dataflow/blank.cpp"; generate_binaries_params_t params; tt_metal::detail::GenerateDeviceHeaders(device, &build_kernel_for_riscv_options, build_kernel_for_riscv_options.name); @@ -189,13 +189,13 @@ bool run_chained_sfpu_test(int chain_length) { auto unary_reader_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/reader_unary.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default}); auto unary_writer_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/writer_unary.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); @@ -206,7 +206,7 @@ bool run_chained_sfpu_test(int chain_length) { auto graph_interpreter_kernel = tt_metal::CreateComputeKernel( program, - "tt_metal/kernels/compute/graph_interpreter.cpp", + "tests/tt_metal/tt_metal/test_kernels/compute/graph_interpreter.cpp", core, tt_metal::ComputeConfig{.compile_args = compute_kernel_args} ); @@ -403,13 +403,13 @@ bool run_binary_add_and_then_eltwise_gelu_test() { auto binary_reader_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/reader_binary.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_binary.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default}); auto unary_writer_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/writer_unary.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); @@ -420,7 +420,7 @@ bool run_binary_add_and_then_eltwise_gelu_test() { auto graph_interpreter_kernel = tt_metal::CreateComputeKernel( program, - "tt_metal/kernels/compute/graph_interpreter.cpp", + "tests/tt_metal/tt_metal/test_kernels/compute/graph_interpreter.cpp", core, 
tt_metal::ComputeConfig{.compile_args = compute_kernel_args} ); @@ -642,13 +642,13 @@ bool run_forked_binary_test() { auto nary_reader_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/reader_nary.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_nary.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default}); auto unary_writer_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/writer_unary.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); @@ -659,7 +659,7 @@ bool run_forked_binary_test() { auto graph_interpreter_kernel = tt_metal::CreateComputeKernel( program, - "tt_metal/kernels/compute/graph_interpreter.cpp", + "tests/tt_metal/tt_metal/test_kernels/compute/graph_interpreter.cpp", core, tt_metal::ComputeConfig{.compile_args = compute_kernel_args} ); diff --git a/tests/tt_metal/tt_metal/test_interleaved_layouts.cpp b/tests/tt_metal/tt_metal/test_interleaved_layouts.cpp index ab0841f70a4..96b5c38b244 100644 --- a/tests/tt_metal/tt_metal/test_interleaved_layouts.cpp +++ b/tests/tt_metal/tt_metal/test_interleaved_layouts.cpp @@ -125,13 +125,13 @@ bool interleaved_stick_reader_single_bank_tilized_writer_datacopy_test(const tt: auto unary_reader_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/reader_unary_stick_layout_8bank.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_stick_layout_8bank.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default, .compile_args = {1}}); auto unary_writer_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/writer_unary.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); @@ -141,7 +141,7 @@ bool interleaved_stick_reader_single_bank_tilized_writer_datacopy_test(const tt: auto eltwise_unary_kernel = tt_metal::CreateComputeKernel( program, - "tt_metal/kernels/compute/eltwise_copy.cpp", + "tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy.cpp", core, tt_metal::ComputeConfig{.compile_args = compute_kernel_args} ); @@ -291,13 +291,13 @@ bool interleaved_tilized_reader_interleaved_stick_writer_datacopy_test(const tt: auto unary_reader_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/reader_unary_stick_layout_8bank.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_stick_layout_8bank.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default, .compile_args = {1}}); auto unary_writer_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/writer_unary_stick_layout_8bank.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary_stick_layout_8bank.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); @@ -307,7 +307,7 @@ bool interleaved_tilized_reader_interleaved_stick_writer_datacopy_test(const tt: auto eltwise_unary_kernel = tt_metal::CreateComputeKernel( program, - "tt_metal/kernels/compute/eltwise_copy.cpp", 
+ "tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy.cpp", core, tt_metal::ComputeConfig{.compile_args = compute_kernel_args} ); @@ -410,13 +410,13 @@ bool test_interleaved_l1_datacopy(const tt::ARCH& arch) { auto unary_reader_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/reader_unary_8bank.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_8bank.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default, .compile_args = {not src_is_in_l1}}); auto unary_writer_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/writer_unary_8bank.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary_8bank.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default, .compile_args = {not dst_is_in_l1}}); @@ -424,7 +424,7 @@ bool test_interleaved_l1_datacopy(const tt::ARCH& arch) { vector compute_kernel_args = { num_pages }; auto eltwise_unary_kernel = tt_metal::CreateComputeKernel( program, - "tt_metal/kernels/compute/eltwise_copy.cpp", + "tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy.cpp", core, tt_metal::ComputeConfig{.compile_args = compute_kernel_args} ); diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/3T/matmul_large_block_zm/zm_3m_math.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/3T/matmul_large_block_zm/zm_3m_math.cpp new file mode 100644 index 00000000000..c68f206eb31 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/3T/matmul_large_block_zm/zm_3m_math.cpp @@ -0,0 +1,124 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "llk_math_common.h" +#include "llk_math_eltwise_unary_datacopy.h" +#include "llk_math_eltwise_unary_datacopy.h" +#include "llk_math_matmul.h" +namespace NAMESPACE +{ + +inline void tilize_activation( + uint32_t in0_subblock_h, uint32_t in0_block_w, uint32_t in0_num_subblocks) { + llk_math_eltwise_unary_datacopy_init(); + for (uint32_t in0_subblock = 0U; in0_subblock < in0_num_subblocks; in0_subblock++) { + for (uint32_t i = 0U; i < in0_subblock_h; i++) { + for (uint32_t j = 0U; j < in0_block_w; j++) { + llk_math_wait_for_dest_available(); + llk_math_eltwise_unary_datacopy(0); + llk_math_dest_section_done(); + } + } + } +} + +inline void reblock_and_untilize_output(uint32_t out_subblock_h, uint32_t out_block_w) { + llk_math_eltwise_unary_datacopy_init(); + + for (uint32_t i = 0; i < out_subblock_h; i++) { + for (int j = 0; j < 2; j++) { + for (uint32_t k = 0; k < out_block_w; k++) { + llk_math_wait_for_dest_available(); + llk_math_eltwise_unary_datacopy(0); + llk_math_dest_section_done(); + } + } + } +} + +void math_main() +{ +uint32_t in0_block_w = get_compile_time_arg_val(0); +llk_math_pack_sync_init(); + +// inner block size in tiles +uint32_t in0_num_subblocks = get_compile_time_arg_val(1); +// outer row block size (in inner row blocks) +uint32_t in0_block_num_tiles = get_compile_time_arg_val(2); +// out_subblock_h*in0_block_w*in0_num_subblocks; +uint32_t in0_subblock_num_tiles = get_compile_time_arg_val(3); + +uint32_t in0_subblock_h = get_compile_time_arg_val(4); + +// out_subblock_h*in0_block_w +uint32_t in1_num_subblocks = get_compile_time_arg_val(5); +// outer column block size (in inner column blocks) +uint32_t in1_block_num_tiles = get_compile_time_arg_val(6); +//out_subblock_w*in0_block_w* in1_num_subblocks; 
+uint32_t in1_per_core_w = get_compile_time_arg_val(7); +// out_subblock_w*in1_num_subblocks +constexpr uint32_t num_blocks = get_compile_time_arg_val(8); +// outer inner dim (in inner dim blocks) +uint32_t out_subblock_h = get_compile_time_arg_val(9); +// inner row block size in tiles +uint32_t out_subblock_w = get_compile_time_arg_val(10); +// inner column block size in tiles +uint32_t out_subblock_num_tiles = get_compile_time_arg_val(11); + +uint32_t out_block_w = in1_per_core_w; + +// If true, this assumes data coming in RM +constexpr bool tilize_in = get_compile_time_arg_val(12); + +// If true, this assumes consumer wants data RM +constexpr bool untilize_out = get_compile_time_arg_val(13); + +constexpr bool spill = num_blocks > 1U; +bool enable_reload = false; + +for (uint32_t block = 0U; block < num_blocks; block++) { + bool last_out = block == num_blocks - 1U; + + if constexpr (tilize_in) { + tilize_activation(in0_subblock_h, in0_block_w, in0_num_subblocks); + } + + for (uint32_t in0_subblock = 0U; in0_subblock < in0_num_subblocks; in0_subblock++) { + for (uint32_t in1_subblock = 0U; in1_subblock < in1_num_subblocks; in1_subblock++) { + + llk_math_wait_for_dest_available(); + if (enable_reload) { + llk_math_eltwise_unary_datacopy_init(); + for (uint32_t i = 0U; i < out_subblock_num_tiles; i++) { + llk_math_eltwise_unary_datacopy(i); + } + } + llk_math_matmul_init(0); + + int dst_index = 0; + for (uint32_t h = 0U; h < out_subblock_h; h++) { + for (uint32_t w = 0U; w < out_subblock_w; w++) { + for (uint32_t inner_dim = 0U; inner_dim < in0_block_w; inner_dim++) { + llk_math_matmul(dst_index); + } + dst_index++; + } + } + + llk_math_dest_section_done(); + } + if constexpr (untilize_out) { + if (last_out) { + reblock_and_untilize_output(out_subblock_h, out_block_w); + } + } + + } + if constexpr (spill) { + enable_reload = true; + } +} +} +} diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/3T/matmul_large_block_zm/zm_3m_pack.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/3T/matmul_large_block_zm/zm_3m_pack.cpp new file mode 100644 index 00000000000..53de202b66c --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/3T/matmul_large_block_zm/zm_3m_pack.cpp @@ -0,0 +1,179 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "llk_pack_common.h" +#include "llk_pack.h" +namespace NAMESPACE +{ + +inline void tilize_activation( + uint32_t in0_subblock_h, uint32_t in0_block_w, uint32_t in0_num_subblocks, uint32_t in0_block_num_tiles, uint32_t matmul_act_cb_id) { + llk_wait_for_free_tiles(matmul_act_cb_id, in0_block_num_tiles); + for (uint32_t in0_subblock = 0U; in0_subblock < in0_num_subblocks; in0_subblock++) { + for (uint32_t i = 0U; i < in0_subblock_h; i++) { + for (uint32_t j = 0U; j < in0_block_w; j++) { + llk_packer_wait_for_math_done(); + llk_pack(0, matmul_act_cb_id); + llk_pack_dest_section_done(); + llk_push_tiles(matmul_act_cb_id, 1); + } + } + } +} + +inline void pack_row(uint32_t num_tiles_to_pack, uint32_t cb_id) { + /* + Used either for packing reblocked tiles for untilized tiles + */ + llk_wait_for_free_tiles(cb_id, num_tiles_to_pack); + for (uint32_t i = 0; i < num_tiles_to_pack; i++) { + llk_packer_wait_for_math_done(); + llk_pack(0, cb_id); + llk_pack_dest_section_done(); + } + llk_push_tiles(cb_id, num_tiles_to_pack); +} + +inline void reblock_and_untilize_output(uint32_t out_subblock_h, uint32_t out_block_w, uint32_t reblock_cb_id, uint32_t untilize_cb_id) { + for (uint32_t h = 0; h < out_subblock_h; h++) { + // Can only push row because the CB can only fit + // one row + pack_row(out_block_w, reblock_cb_id); + pack_row(out_block_w, untilize_cb_id); + } +} + +inline void pack_block_and_untilize( + uint32_t in0_num_subblocks, uint32_t in1_num_subblocks, + uint32_t out_subblock_num_tiles, uint32_t out_subblock_h, uint32_t out_block_w, + uint32_t interm_cb_id, uint32_t reblock_cb_id) { + + for (uint32_t in0_subblock = 0U; in0_subblock < in0_num_subblocks; in0_subblock++) { + for (uint32_t in1_subblock = 0U; in1_subblock < in1_num_subblocks; in1_subblock++) { + llk_packer_wait_for_math_done(); + + llk_wait_for_free_tiles(interm_cb_id, out_subblock_num_tiles); + for (uint32_t i = 0U; i < out_subblock_num_tiles; i++) { + llk_pack(i, interm_cb_id); + } + llk_push_tiles(interm_cb_id, out_subblock_num_tiles); + llk_pack_dest_section_done(); + } + reblock_and_untilize_output(out_subblock_h, out_block_w, reblock_cb_id, 16); + } +} + +inline void pack_block(uint32_t in0_num_subblocks, uint32_t in1_num_subblocks, uint32_t out_subblock_num_tiles, uint32_t cb_id) { + + for (uint32_t in0_subblock = 0U; in0_subblock < in0_num_subblocks; in0_subblock++) { + for (uint32_t in1_subblock = 0U; in1_subblock < in1_num_subblocks; in1_subblock++) { + llk_packer_wait_for_math_done(); + + llk_wait_for_free_tiles(cb_id, out_subblock_num_tiles); + for (uint32_t i = 0U; i < out_subblock_num_tiles; i++) { + llk_pack(i, cb_id); + } + llk_push_tiles(cb_id, out_subblock_num_tiles); + llk_pack_dest_section_done(); + } + } +} + + +void pack_main() +{ +uint32_t in0_block_w = get_compile_time_arg_val(0); +llk_pack_init(); +llk_setup_outputs(); +llk_pack_dest_init(); +llk_init_packer_dest_offset_registers(); +llk_pack_hw_configure_disaggregated(16); +// inner block size in tiles +uint32_t in0_num_subblocks = get_compile_time_arg_val(1); +// outer row block size (in inner row blocks) +uint32_t in0_block_num_tiles = get_compile_time_arg_val(2); +// out_subblock_h*in0_block_w*in0_num_subblocks; +uint32_t in0_subblock_num_tiles = get_compile_time_arg_val(3); +uint32_t in0_subblock_h = get_compile_time_arg_val(4); +uint32_t in1_num_subblocks = get_compile_time_arg_val(5); +uint32_t in1_block_num_tiles = get_compile_time_arg_val(6); +uint32_t in1_per_core_w = 
get_compile_time_arg_val(7); +constexpr uint32_t num_blocks = get_compile_time_arg_val(8); +uint32_t out_subblock_h = get_compile_time_arg_val(9); +uint32_t out_subblock_w = get_compile_time_arg_val(10); +uint32_t out_subblock_num_tiles = get_compile_time_arg_val(11); + +uint32_t out_block_w = in1_per_core_w; + +// If true, this assumes data coming in RM +constexpr bool tilize_in = get_compile_time_arg_val(12); + +// If true, this assumes consumer wants data RM +constexpr bool untilize_out = get_compile_time_arg_val(13); + +constexpr bool spill = num_blocks > 1U; +bool enable_reload = false; + +// These are required depending on tilize/untilize +uint32_t matmul_act_cb_id = 0; +uint32_t matmul_out_intermediate_cb_id = 24; +if constexpr (tilize_in) { + // If we tilize, matmul doesn't consume original input, + // it consumes what is produced by tilize + matmul_act_cb_id = 24; + matmul_out_intermediate_cb_id = 25; // Given 24 is no longer available, we use 25 instead +} + +uint32_t reblock_cb_id = 26; // Only used if untilize is required +uint32_t matmul_out_cb_id = 16; + +for (uint32_t block = 0U; block < num_blocks - 1; block++) { + if constexpr (tilize_in) { + tilize_activation( + in0_subblock_h, + in0_block_w, + in0_num_subblocks, + in0_block_num_tiles, + matmul_act_cb_id); + } + + pack_block( + in0_num_subblocks, + in1_num_subblocks, + out_subblock_num_tiles, + matmul_out_intermediate_cb_id); +} + +// Last block +if constexpr (tilize_in) { + tilize_activation( + in0_subblock_h, + in0_block_w, + in0_num_subblocks, + in0_block_num_tiles, + matmul_act_cb_id); +} + +if constexpr (untilize_out) { + pack_block_and_untilize( + in0_num_subblocks, + in1_num_subblocks, + out_subblock_num_tiles, + out_subblock_h, + out_block_w, + matmul_out_intermediate_cb_id, + reblock_cb_id + ); +} else { + pack_block( + in0_num_subblocks, + in1_num_subblocks, + out_subblock_num_tiles, + matmul_out_cb_id); +} + + +} +} diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/3T/matmul_large_block_zm/zm_3m_unpack.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/3T/matmul_large_block_zm/zm_3m_unpack.cpp new file mode 100644 index 00000000000..7f6b7684c68 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/3T/matmul_large_block_zm/zm_3m_unpack.cpp @@ -0,0 +1,228 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "llk_unpack_common.h" +#include "llk_unpack_tilize.h" +#include "llk_unpack_untilize.h" +#include "llk_unpack_A.h" +#include "llk_unpack_AB_matmul.h" +namespace NAMESPACE +{ + +inline void tilize_activation(uint32_t in0_subblock_h, uint32_t in0_block_w, uint32_t in0_num_subblocks) { + // Tilize block code + llk_unpack_tilize_init(0, in0_block_w); + for (uint32_t in0_subblock = 0U; in0_subblock < in0_num_subblocks; in0_subblock++) { + for (uint32_t i = 0U; i < in0_subblock_h; i++) { + llk_wait_tiles(0, in0_block_w); // These "tiles" are actually not real tiles + llk_unpack_tilize_(0,in0_block_w); + llk_pop_tiles(0,in0_block_w); // Pop the original untilized inputs + } + } + llk_unpack_tilize_uninit(); +} + + +inline __attribute__((always_inline)) +void reblock_and_untilize( + uint32_t num_out_subblocks_in_col, + uint32_t out_subblock_num_tiles, + uint32_t out_subblock_h, + uint32_t out_subblock_w, + uint32_t out_block_w, + uint32_t interm_cb_id, + uint32_t reblock_cb_id) { + + // Wait for a row of subblocks such that the total width matches + // the out block width. 
Must wait for a whole row of subblocks to arrive + // before we can proceed. + uint32_t num_tiles_in_row_of_subblocks = mulsi3(out_subblock_num_tiles, num_out_subblocks_in_col); + llk_wait_tiles(interm_cb_id, num_tiles_in_row_of_subblocks); + + int within_block_index = 0; + for (uint32_t h = 0; h < out_subblock_h; h++) { + int block_offset = 0; + + llk_unpack_A_init(); + for (uint32_t n = 0; n < num_out_subblocks_in_col; n++) { + for (uint32_t w = 0; w < out_subblock_w; w++) { + uint32_t tile_index = block_offset + within_block_index + w; + llk_unpack_A(interm_cb_id, tile_index); + } + block_offset += out_subblock_num_tiles; + } + + // Since our reblock CB can only fit one row of + // tiles, we need to immediately untilize to + // consume this row + llk_wait_tiles(reblock_cb_id, out_block_w); + /* + for (uint32_t i = 0; i < out_block_w; i++) { + llk_unpack_A(reblock_cb_id, i); + } + */ + + llk_unpack_untilize_init(reblock_cb_id); + llk_unpack_untilize_(reblock_cb_id, out_block_w); + llk_unpack_untilize_(reblock_cb_id, out_block_w); + llk_unpack_untilize_uninit(reblock_cb_id); + + llk_pop_tiles(reblock_cb_id, out_block_w); + + within_block_index += out_subblock_w; + } + llk_pop_tiles(interm_cb_id, num_tiles_in_row_of_subblocks); +} + +inline void unpack_for_matmul_output_row( + uint32_t in1_num_subblocks, + bool enable_reload, + uint32_t out_subblock_num_tiles, + uint32_t out_subblock_h, + uint32_t out_subblock_w, + uint32_t in0_block_w, + uint32_t in0_index_subblock_offset, + uint32_t in1_per_core_w, + uint32_t matmul_act_cb_id, + uint32_t matmul_out_intermediate_cb_id) { + + uint32_t in1_index_subblock_offset = 0; + for (uint32_t in1_subblock = 0U; in1_subblock < in1_num_subblocks; in1_subblock++) { + if (enable_reload) { + llk_unpack_A_init(); + llk_wait_tiles(matmul_out_intermediate_cb_id, out_subblock_num_tiles); + for (uint32_t i = 0U; i < out_subblock_num_tiles; i++) { + llk_unpack_A(matmul_out_intermediate_cb_id, i); + } + llk_pop_tiles(matmul_out_intermediate_cb_id, out_subblock_num_tiles); + } + + llk_unpack_AB_matmul_init(0); + int dst_index = 0; + int in0_index_h_offset = 0; + for (uint32_t h = 0U; h < out_subblock_h; h++) { + for (uint32_t w = 0U; w < out_subblock_w; w++) { + int in1_index_inner_dim_offset = 0; + for (uint32_t inner_dim = 0U; inner_dim < in0_block_w; inner_dim++) { + int in0_index = ((in0_index_subblock_offset + in0_index_h_offset) + inner_dim); + int in1_index = ((in1_index_subblock_offset + in1_index_inner_dim_offset) + w); + llk_unpack_AB_matmul(matmul_act_cb_id, 1, in0_index, in1_index); + in1_index_inner_dim_offset += in1_per_core_w; + } + dst_index++; + } + in0_index_h_offset += in0_block_w; + } + in1_index_subblock_offset += out_subblock_w; + } +} + +void unpack_main() +{ +uint32_t in0_block_w = get_compile_time_arg_val(0); +llk_setup_operands(); +llk_unpack_AB_matmul_init(0); +// inner block size in tiles +uint32_t in0_num_subblocks = get_compile_time_arg_val(1); +// outer row block size (in inner row blocks) +uint32_t in0_block_num_tiles = get_compile_time_arg_val(2); +// out_subblock_h*in0_block_w*in0_num_subblocks; +uint32_t in0_subblock_num_tiles = get_compile_time_arg_val(3); + +uint32_t in0_subblock_h = get_compile_time_arg_val(4); + +// out_subblock_h*in0_block_w +uint32_t in1_num_subblocks = get_compile_time_arg_val(5); +// outer column block size (in inner column blocks) +uint32_t in1_block_num_tiles = get_compile_time_arg_val(6); +//out_subblock_w*in0_block_w* in1_num_subblocks; +uint32_t in1_per_core_w = get_compile_time_arg_val(7); +// 
out_subblock_w*in1_num_subblocks +constexpr uint32_t num_blocks = get_compile_time_arg_val(8); +// outer inner dim (in inner dim blocks) +uint32_t out_subblock_h = get_compile_time_arg_val(9); +// inner row block size in tiles +uint32_t out_subblock_w = get_compile_time_arg_val(10); +// inner column block size in tiles +uint32_t out_subblock_num_tiles = get_compile_time_arg_val(11); + +uint32_t out_block_w = in1_per_core_w; + +// If true, this assumes data coming in RM +constexpr bool tilize_in = get_compile_time_arg_val(12); + +// If true, this assumes consumer wants data RM +constexpr bool untilize_out = get_compile_time_arg_val(13); + + +// These are required depending on tilize/untilize +uint32_t matmul_act_cb_id = 0; +uint32_t matmul_out_intermediate_cb_id = 24; +if constexpr (tilize_in) { + // If we tilize, matmul doesn't consume original input, + // it consumes what is produced by tilize + matmul_act_cb_id = 24; + + matmul_out_intermediate_cb_id = 25; // Given 24 is no longer available, we use 25 instead +} + +llk_unpack_AB_matmul_hw_configure_disaggregated(0,1,0); + +uint32_t reblock_cb_id = 26; + +constexpr bool spill = num_blocks > 1U; +bool enable_reload = false; +for (uint32_t block = 0U; block < num_blocks; block++) { + bool last_out = block == num_blocks - 1U; + + if constexpr (tilize_in) { + tilize_activation(in0_subblock_h, in0_block_w, in0_num_subblocks); + } else { + llk_wait_tiles(matmul_act_cb_id, in0_block_num_tiles); + } + + // Wait on weight tiles + llk_wait_tiles(1, in1_block_num_tiles); + int in0_index_subblock_offset = 0; + for (uint32_t in0_subblock = 0U; in0_subblock < in0_num_subblocks; in0_subblock++) { + unpack_for_matmul_output_row( + in1_num_subblocks, + enable_reload, + out_subblock_num_tiles, + out_subblock_h, + out_subblock_w, + in0_block_w, + in0_index_subblock_offset, + in1_per_core_w, + matmul_act_cb_id, + matmul_out_intermediate_cb_id); + + if constexpr (untilize_out) { + if (last_out) { + reblock_and_untilize( + in1_num_subblocks, + out_subblock_num_tiles, + out_subblock_h, + out_subblock_w, + out_block_w, + matmul_out_intermediate_cb_id, + reblock_cb_id); + } + } + + in0_index_subblock_offset += in0_subblock_num_tiles; + } + + // Need to do a reblock datacopy + if constexpr (spill) { + enable_reload = true; + } + + llk_pop_tiles(matmul_act_cb_id, in0_block_num_tiles); + llk_pop_tiles(1, in1_block_num_tiles); +} + +} +} diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/3T/untilize_A_and_eltwise_binary/chlkc_math.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/3T/untilize_A_and_eltwise_binary/chlkc_math.cpp new file mode 100644 index 00000000000..bdc0507c5ce --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/3T/untilize_A_and_eltwise_binary/chlkc_math.cpp @@ -0,0 +1,39 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "llk_math_common.h" +#include "llk_math_eltwise_binary.h" +#include "llk_math_eltwise_unary_datacopy.h" + +namespace NAMESPACE +{ + +void math_main() +{ +uint32_t per_core_num_blocks = get_compile_time_arg_val(0); +uint32_t per_core_block_r_tiles = get_compile_time_arg_val(1); +uint32_t per_core_block_c_tiles = get_compile_time_arg_val(2); + +llk_math_pack_sync_init(); +for (uint32_t block = 0; block < per_core_num_blocks; block++) { + for (uint32_t r = 0; r < per_core_block_r_tiles; r++) { + // Untilize + llk_math_eltwise_unary_datacopy_init(); + for (uint32_t c = 0; c < per_core_block_c_tiles; c++) { + llk_math_wait_for_dest_available(); + llk_math_eltwise_unary_datacopy(0); + llk_math_dest_section_done(); + } + + llk_math_eltwise_binary_init(); + for (uint32_t c = 0; c < per_core_block_c_tiles; c++) { + llk_math_wait_for_dest_available(); + llk_math_eltwise_binary(0); + llk_math_dest_section_done(); + } + } +} +} +} diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/3T/untilize_A_and_eltwise_binary/chlkc_pack.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/3T/untilize_A_and_eltwise_binary/chlkc_pack.cpp new file mode 100644 index 00000000000..ef6afbc0113 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/3T/untilize_A_and_eltwise_binary/chlkc_pack.cpp @@ -0,0 +1,42 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "llk_pack_common.h" +#include "llk_pack.h" +namespace NAMESPACE +{ + +void pack_main() +{ +uint32_t per_core_num_blocks = get_compile_time_arg_val(0); +uint32_t per_core_block_r_tiles = get_compile_time_arg_val(1); +uint32_t per_core_block_c_tiles = get_compile_time_arg_val(2); +llk_pack_init(); +llk_pack_hw_configure_disaggregated(16); +llk_setup_outputs(); +llk_pack_dest_init(); + +for (uint32_t block = 0; block < per_core_num_blocks; block++) { + for (uint32_t r = 0; r < per_core_block_r_tiles; r++) { + llk_wait_for_free_tiles(24, per_core_block_c_tiles); + for (uint32_t c = 0; c < per_core_block_c_tiles; c++) { + llk_packer_wait_for_math_done(); + llk_pack(0,24); + llk_pack_dest_section_done(); + } + llk_push_tiles(24, per_core_block_c_tiles); + + llk_wait_for_free_tiles(16, per_core_block_c_tiles); + for (uint32_t c = 0; c < per_core_block_c_tiles; c++) { + llk_packer_wait_for_math_done(); + llk_pack(0,16); + llk_pack_dest_section_done(); + } + llk_push_tiles(16, per_core_block_c_tiles); + } +} + +} +} diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/3T/untilize_A_and_eltwise_binary/chlkc_unpack.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/3T/untilize_A_and_eltwise_binary/chlkc_unpack.cpp new file mode 100644 index 00000000000..7f1e967ac54 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/3T/untilize_A_and_eltwise_binary/chlkc_unpack.cpp @@ -0,0 +1,45 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include <cstdint>
+#include "llk_unpack_common.h"
+#include "llk_unpack_AB.h"
+#include "llk_unpack_untilize.h"
+
+namespace NAMESPACE
+{
+
+void unpack_main()
+{
+uint32_t per_core_num_blocks = get_compile_time_arg_val(0);
+uint32_t per_core_block_r_tiles = get_compile_time_arg_val(1);
+uint32_t per_core_block_c_tiles = get_compile_time_arg_val(2);
+
+llk_setup_operands();
+llk_unpack_AB_hw_configure_disaggregated(0,1);
+// llk_unpack_untilize_hw_configure_disaggregated(0);
+
+// llk_unpack_untilize_init(0);
+for (uint32_t block = 0U; block < per_core_num_blocks; ++block) {
+    for (uint32_t r = 0; r < per_core_block_r_tiles; r++) {
+        llk_unpack_untilize_init(0);
+        llk_wait_tiles(0, per_core_block_c_tiles);
+        llk_unpack_untilize_(0, per_core_block_c_tiles);
+        llk_unpack_untilize_(0, per_core_block_c_tiles);
+        llk_unpack_untilize_uninit(0);
+        llk_pop_tiles(0, per_core_block_c_tiles);
+        llk_pop_tiles(1, per_core_block_c_tiles);
+
+        llk_unpack_AB_init();
+        for (uint32_t c = 0; c < per_core_block_c_tiles; c++) {
+            llk_wait_tiles(24, 1);
+            llk_wait_tiles(1, 1);
+            llk_unpack_AB(24, 1, 0, 0);
+            llk_pop_tiles(24, 1);
+            llk_pop_tiles(1, 1);
+        }
+    }
+}
+}
+}
diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/bcast_h.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/bcast_h.cpp
new file mode 100644
index 00000000000..747765489ac
--- /dev/null
+++ b/tests/tt_metal/tt_metal/test_kernels/compute/bcast_h.cpp
@@ -0,0 +1,42 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include <cstdint>
+#include "compute_kernel_api/bcast.h"
+
+
+namespace NAMESPACE {
+void MAIN {
+    constexpr uint32_t onetile = 1;
+    uint32_t B = get_arg_val<uint32_t>(0);
+    uint32_t Ht = get_arg_val<uint32_t>(1);
+    uint32_t Wt = get_arg_val<uint32_t>(2);
+    init_bcast(tt::CB::c_in0, tt::CB::c_in1);
+
+    for (uint32_t b = 0; b < B; b++) {
+    for (uint32_t h = 0; h < Ht; h++) {
+    for (uint32_t w = 0; w < Wt; w++) {
+        // For this bcast-h op the reader will wrap the RHS source tile around at Wt,
+        // so here we just linearly read two parallel arrays and apply the bcast op per tile
+        // (bcast_h propagates the op down the H dimension, so it can be thought of as bcast to H)
+        cb_wait_front(tt::CB::c_in1, onetile);
+
+        cb_reserve_back(tt::CB::c_out0, onetile);
+
+        acquire_dst(tt::DstMode::Half);
+
+        cb_wait_front(tt::CB::c_in0, onetile);
+
+        BCAST_OP(tt::CB::c_in0, tt::CB::c_in1, 0, 0, 0);
+        pack_tile(0, tt::CB::c_out0);
+
+        cb_pop_front(tt::CB::c_in0, onetile);
+
+        release_dst(tt::DstMode::Half);
+
+        cb_push_back(tt::CB::c_out0, onetile);
+        cb_pop_front(tt::CB::c_in1, onetile);
+    } } }
+}
+} // NAMESPACE
diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/bcast_hw.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/bcast_hw.cpp
new file mode 100644
index 00000000000..230ee8b9c36
--- /dev/null
+++ b/tests/tt_metal/tt_metal/test_kernels/compute/bcast_hw.cpp
@@ -0,0 +1,46 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "compute_kernel_api/bcast.h" + +namespace NAMESPACE { +void MAIN { + constexpr uint32_t onetile = 1; + uint32_t B = get_arg_val(0); + uint32_t Ht = get_arg_val(1); + uint32_t Wt = get_arg_val(2); + init_bcast(tt::CB::c_in0, tt::CB::c_in1); + + #ifdef BCAST_SCALAR + cb_wait_front(tt::CB::c_in1, onetile); + #endif + + for (uint32_t b = 0; b < B; b++) { + for (uint32_t h = 0; h < Ht; h++) { + for (uint32_t w = 0; w < Wt; w++) { + #ifndef BCAST_SCALAR + cb_wait_front(tt::CB::c_in1, onetile); + #endif + cb_reserve_back(tt::CB::c_out0, onetile); + + acquire_dst(tt::DstMode::Half); + + cb_wait_front(tt::CB::c_in0, onetile); + + BCAST_OP(tt::CB::c_in0, tt::CB::c_in1, 0, 0, 0); + pack_tile(0, tt::CB::c_out0); + + cb_pop_front(tt::CB::c_in0, onetile); + #ifndef BCAST_SCALAR + cb_pop_front(tt::CB::c_in1, onetile); + #endif + release_dst(tt::DstMode::Half); + + cb_push_back(tt::CB::c_out0, onetile); + } } } + +} +} // NAMESPACE diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/bcast_w.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/bcast_w.cpp new file mode 100644 index 00000000000..0de0e2f82c0 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/bcast_w.cpp @@ -0,0 +1,41 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "compute_kernel_api/bcast.h" + +namespace NAMESPACE { +void MAIN { + uint32_t w = 0; + constexpr uint32_t onetile = 1; + uint32_t B = get_arg_val(0); + uint32_t Ht = get_arg_val(1); + uint32_t Wt = get_arg_val(2); + + init_bcast(tt::CB::c_in0, tt::CB::c_in1); + + for (uint32_t b = 0; b < B; b++) { + for (uint32_t h = 0; h < Ht; h++) { + cb_wait_front(tt::CB::c_in1, onetile); + for (uint32_t w = 0; w < Wt; w++) { + + cb_reserve_back(tt::CB::c_out0, onetile); + + acquire_dst(tt::DstMode::Half); + + cb_wait_front(tt::CB::c_in0, onetile); + BCAST_OP(tt::CB::c_in0, tt::CB::c_in1, 0, 0, 0); + pack_tile(0, tt::CB::c_out0); + cb_pop_front(tt::CB::c_in0, onetile); + + release_dst(tt::DstMode::Half); + + cb_push_back(tt::CB::c_out0, onetile); + + } + cb_pop_front(tt::CB::c_in1, onetile); + }} +} +} // NAMESPACE diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/blank.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/blank.cpp new file mode 100644 index 00000000000..accc0b59fc2 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/blank.cpp @@ -0,0 +1,10 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include "compute_kernel_api/blank.h" + +namespace NAMESPACE { +void MAIN { +} +} diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/bmm.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/bmm.cpp new file mode 100644 index 00000000000..6e42eb29d49 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/bmm.cpp @@ -0,0 +1,54 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "compute_kernel_api/tile_move_copy.h" +#include "compute_kernel_api/matmul.h" + +using std::uint32_t; + +// matmul C=A*B using dims MK*KN = MN (row major order) +// +namespace NAMESPACE { +void MAIN { + + constexpr int onetile = 1; + + int dst_tile_index = 0; + int in0_block_tile_index = 0; + + uint32_t batch = get_compile_time_arg_val(0); + uint32_t Mt = get_compile_time_arg_val(1); + uint32_t Kt = get_compile_time_arg_val(2); + uint32_t Nt = get_compile_time_arg_val(3); + + mm_init(); + + // the simplest possible version of outer product blocked matmul + // the reader is expected to read the A's and B's tile rows and tile columns for each output tile + for (uint32_t nb = 0; nb < batch; nb++) + for (uint32_t mt_C = 0; mt_C < Mt; ++mt_C) // output tile of C + for (uint32_t nt_C = 0; nt_C < Nt; ++nt_C) // output tile index of C + { + acquire_dst(tt::DstMode::Full); + for (uint32_t kt = 0; kt < Kt; kt++) { + cb_wait_front(tt::CB::c_in0, onetile); + cb_wait_front(tt::CB::c_in1, onetile); + + matmul_tiles(tt::CB::c_in0, tt::CB::c_in1, 0, 0, 0, false); + + cb_pop_front(tt::CB::c_in0, onetile); + cb_pop_front(tt::CB::c_in1, onetile); + } + + cb_reserve_back(tt::CB::c_out0, onetile); + pack_tile(0, tt::CB::c_out0); + cb_push_back(tt::CB::c_out0, onetile); + + release_dst(tt::DstMode::Full); + } + + +} +} // NAMESPACE diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/bmm_large_block_zm.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/bmm_large_block_zm.cpp new file mode 100644 index 00000000000..ec293c8c7bb --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/bmm_large_block_zm.cpp @@ -0,0 +1,108 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "compute_kernel_api/tile_move_copy.h" +#include "compute_kernel_api/matmul.h" + +namespace NAMESPACE { +void MAIN { + + uint32_t in0_block_w = get_compile_time_arg_val(0); // inner block size in tiles + uint32_t in0_num_subblocks = get_compile_time_arg_val(1); // outer row block size (in inner row blocks) + uint32_t in0_block_num_tiles = get_compile_time_arg_val(2); // out_subblock_h*in0_block_w*in0_num_subblocks; + uint32_t in0_subblock_num_tiles = get_compile_time_arg_val(3); // out_subblock_h*in0_block_w + uint32_t in1_num_subblocks = get_compile_time_arg_val(4); // outer column block size (in inner column blocks) + uint32_t in1_block_num_tiles = get_compile_time_arg_val(5); //out_subblock_w*in0_block_w* in1_num_subblocks; + uint32_t in1_per_core_w = get_compile_time_arg_val(6); // out_subblock_w*in1_num_subblocks + uint32_t num_blocks = get_compile_time_arg_val(7); // outer inner dim (in inner dim blocks) + uint32_t out_subblock_h = get_compile_time_arg_val(8); // inner row block size in tiles + uint32_t out_subblock_w = get_compile_time_arg_val(9); // inner column block size in tiles + uint32_t out_subblock_num_tiles = get_compile_time_arg_val(10); // out_subblock_h * out_subblock_w; + uint32_t batch = get_compile_time_arg_val(11); // batch dim + + mm_init(); + + for (uint32_t b = 0; b < batch; b++){ + bool spill = num_blocks > 1; + bool enable_reload = false; + uint32_t out_num_tiles_to_wait = out_subblock_num_tiles; + + for(uint32_t block = 0; block < num_blocks; block++) + { + bool last_out = block == (num_blocks-1); + + cb_wait_front(tt::CB::c_in0, in0_block_num_tiles); + cb_wait_front(tt::CB::c_in1, in1_block_num_tiles); + int in0_index_subblock_offset = 0; + for (uint32_t in0_subblock = 
0; in0_subblock < in0_num_subblocks; in0_subblock++) { + int in1_index_subblock_offset = 0; + for (uint32_t in1_subblock = 0; in1_subblock < in1_num_subblocks; in1_subblock++) { + + acquire_dst(tt::DstMode::Half); + + if (enable_reload) { + copy_tile_to_dst_init_short(); + cb_wait_front(tt::CB::c_intermed0, out_subblock_num_tiles); + for (uint32_t i = 0; i < out_subblock_num_tiles; i++) { + copy_tile(tt::CB::c_intermed0, i, i); + } + cb_pop_front(tt::CB::c_intermed0, out_subblock_num_tiles); + mm_init_short(); + } + + // Compute output sub-block from in0_subblock x in1_subblock + int dst_index = 0; + int in0_index_h_offset = 0; + for (uint32_t h = 0; h < out_subblock_h; h++) { + for (uint32_t w = 0; w < out_subblock_w; w++) { + int in1_index_inner_dim_offset = 0; + for (uint32_t inner_dim = 0; inner_dim < in0_block_w; inner_dim++) { + int in0_index = in0_index_subblock_offset + in0_index_h_offset + inner_dim; + int in1_index = in1_index_subblock_offset + in1_index_inner_dim_offset + w; + matmul_tiles(tt::CB::c_in0, tt::CB::c_in1, in0_index, in1_index, dst_index, false /* transpose */); + in1_index_inner_dim_offset += in1_per_core_w; + } + dst_index++; + } + in0_index_h_offset += in0_block_w; + } + + if (last_out) { + // Pack out to output buffer + cb_reserve_back(tt::CB::c_out0, out_subblock_num_tiles); + for (uint32_t i = 0; i < out_subblock_num_tiles; i++) { + pack_tile(i, tt::CB::c_out0); + } + cb_push_back(tt::CB::c_out0, out_subblock_num_tiles); + } else { + // Wait for tiles in output buffer to be written out since interm and output share memory + if (block == 0) { + cb_reserve_back(tt::CB::c_out0, out_num_tiles_to_wait); + out_num_tiles_to_wait += out_subblock_num_tiles; + } + // Move partial result to interm buffer + cb_reserve_back(tt::CB::c_intermed0, out_subblock_num_tiles); + for (uint32_t i = 0; i < out_subblock_num_tiles; i++) { + pack_tile(i, tt::CB::c_intermed0); + } + cb_push_back(tt::CB::c_intermed0, out_subblock_num_tiles); + } + + release_dst(tt::DstMode::Half); + in1_index_subblock_offset += out_subblock_w; + } + in0_index_subblock_offset += in0_subblock_num_tiles; + } + + if (spill) enable_reload = true; + + cb_pop_front(tt::CB::c_in0, in0_block_num_tiles); + cb_pop_front(tt::CB::c_in1, in1_block_num_tiles); + + } + } +} +} diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp new file mode 100644 index 00000000000..8f61fca907f --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp @@ -0,0 +1,167 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "compute_kernel_api/tile_move_copy.h" +#include "compute_kernel_api/matmul.h" + +#ifdef FUSE_BIAS +#include "compute_kernel_api/bcast.h" +#endif + +#include "compute_kernel_api/eltwise_unary/sfpu_split_includes.h" + +namespace NAMESPACE { +void MAIN { + + uint32_t in0_block_w = get_compile_time_arg_val(0); // inner block size in tiles + uint32_t in0_num_subblocks = get_compile_time_arg_val(1); // outer row block size (in inner row blocks) + uint32_t in0_block_num_tiles = get_compile_time_arg_val(2); // out_subblock_h*in0_block_w*in0_num_subblocks; + uint32_t in0_subblock_num_tiles = get_compile_time_arg_val(3); // out_subblock_h*in0_block_w + uint32_t in1_num_subblocks = get_compile_time_arg_val(4); // outer column block size (in inner column blocks) + uint32_t in1_block_num_tiles = get_compile_time_arg_val(5); //out_subblock_w*in0_block_w* in1_num_subblocks; + uint32_t in1_per_core_w = get_compile_time_arg_val(6); // out_subblock_w*in1_num_subblocks + uint32_t num_blocks = get_compile_time_arg_val(7); // outer inner dim (in inner dim blocks) + uint32_t out_subblock_h = get_compile_time_arg_val(8); // inner row block size in tiles + uint32_t out_subblock_w = get_compile_time_arg_val(9); // inner column block size in tiles + uint32_t out_subblock_num_tiles = get_compile_time_arg_val(10); // out_subblock_h * out_subblock_w; + uint32_t batch = get_compile_time_arg_val(11); // batch dim + + uint32_t in0_cb_id = tt::CB::c_in0; + uint32_t in1_cb_id = tt::CB::c_in1; + uint32_t out_cb_id = tt::CB::c_out0; + uint32_t mm_partials_cb_id = tt::CB::c_intermed0; + uint32_t mm_bias_intermediate_cb_id = tt::CB::c_intermed1; + uint32_t bias_cb_id = tt::CB::c_in3; + + #ifdef FUSE_BIAS + init_bcast(mm_bias_intermediate_cb_id, bias_cb_id); + #endif + + mm_init(in0_cb_id, in1_cb_id, out_cb_id); + + for (uint32_t b = 0; b < batch; b++){ + bool spill = num_blocks > 1; + bool enable_reload = false; + uint32_t out_num_tiles_to_wait = out_subblock_num_tiles; + + for(uint32_t block = 0; block < num_blocks; block++) + { + bool last_out = block == (num_blocks-1); + + cb_wait_front(in0_cb_id, in0_block_num_tiles); + cb_wait_front(in1_cb_id, in1_block_num_tiles); + int in0_index_subblock_offset = 0; + for (uint32_t in0_subblock = 0; in0_subblock < in0_num_subblocks; in0_subblock++) { + int in1_index_subblock_offset = 0; + for (uint32_t in1_subblock = 0; in1_subblock < in1_num_subblocks; in1_subblock++) { + + acquire_dst(tt::DstMode::Half); + + if (enable_reload) { + // Reconfigure input + copy_tile_to_dst_init_short_with_dt(mm_partials_cb_id); + cb_wait_front(mm_partials_cb_id, out_subblock_num_tiles); + for (uint32_t i = 0; i < out_subblock_num_tiles; i++) { + copy_tile(mm_partials_cb_id, i, i); + } + cb_pop_front(mm_partials_cb_id, out_subblock_num_tiles); + // Reconfigure srcA back + mm_init_short_with_dt(mm_partials_cb_id); + } + + // Compute output sub-block from in0_subblock x in1_subblock + int dst_index = 0; + int in0_index_h_offset = 0; + for (uint32_t h = 0; h < out_subblock_h; h++) { + for (uint32_t w = 0; w < out_subblock_w; w++) { + int in1_index_inner_dim_offset = 0; + for (uint32_t inner_dim = 0; inner_dim < in0_block_w; inner_dim++) { + int in0_index = in0_index_subblock_offset + in0_index_h_offset + inner_dim; + int in1_index = in1_index_subblock_offset + in1_index_inner_dim_offset + w; + matmul_tiles(in0_cb_id, in1_cb_id, in0_index, in1_index, dst_index, false /* transpose */); + in1_index_inner_dim_offset += in1_per_core_w; + } + 
dst_index++; + } + in0_index_h_offset += in0_block_w; + } + + if (last_out) { + + #ifdef FUSE_BIAS + // Move matmul result to interm buffer + cb_reserve_back(mm_bias_intermediate_cb_id, out_subblock_num_tiles); + for (uint32_t i = 0; i < out_subblock_num_tiles; i++) { + pack_tile(i, mm_bias_intermediate_cb_id); + } + cb_push_back(mm_bias_intermediate_cb_id, out_subblock_num_tiles); + release_dst(tt::DstMode::Half); + + // Redundant wait since we know data was just pushed + cb_wait_front(mm_bias_intermediate_cb_id, out_subblock_num_tiles); + cb_wait_front(bias_cb_id, in1_per_core_w); + add_bcast_rows_init_short(); + // reconfigure unpacker df for src B + unpack_reconfig_data_format(mm_bias_intermediate_cb_id, bias_cb_id); + // reconfigure packer df for out + pack_reconfig_data_format(out_cb_id); + acquire_dst(tt::DstMode::Half); + for (uint32_t i = 0, j = 0; j < out_subblock_h; j++) { + uint32_t bcast_tile_idx = in1_index_subblock_offset; + for (uint32_t k = 0; k < out_subblock_w; k++, i++) { + add_tiles_bcast_rows(mm_bias_intermediate_cb_id, bias_cb_id, i, bcast_tile_idx, i); + bcast_tile_idx++; + } + } + cb_pop_front(mm_bias_intermediate_cb_id, out_subblock_num_tiles); + // reconfigure init for matmul + mm_init_short(); + // reconfigure unpacker df for src B + unpack_reconfig_data_format(in1_cb_id, in0_cb_id); + #endif + + // sfpu activation + #ifdef SFPU_OP_INIT_ACTIVATION + SFPU_OP_INIT_ACTIVATION + for (uint32_t i = 0; i < out_subblock_num_tiles; i++) { + SFPU_OP_FUNC_ACTIVATION + } + #endif + // Pack out to output buffer + cb_reserve_back(out_cb_id, out_subblock_num_tiles); + for (uint32_t i = 0; i < out_subblock_num_tiles; i++) { + pack_tile(i, out_cb_id); + } + cb_push_back(out_cb_id, out_subblock_num_tiles); + } else { + // Wait for tiles in output buffer to be written out since interm and output share memory + if (block == 0) { + cb_reserve_back(out_cb_id, out_num_tiles_to_wait); + out_num_tiles_to_wait += out_subblock_num_tiles; + } + // Move partial result to interm buffer + cb_reserve_back(mm_partials_cb_id, out_subblock_num_tiles); + for (uint32_t i = 0; i < out_subblock_num_tiles; i++) { + pack_tile(i, mm_partials_cb_id); + } + cb_push_back(mm_partials_cb_id, out_subblock_num_tiles); + } + + release_dst(tt::DstMode::Half); + in1_index_subblock_offset += out_subblock_w; + } + in0_index_subblock_offset += in0_subblock_num_tiles; + } + + if (spill) enable_reload = true; + + cb_pop_front(in0_cb_id, in0_block_num_tiles); + cb_pop_front(in1_cb_id, in1_block_num_tiles); + + } + } +} +} diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/bmm_large_block_zm_mixed_precision.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/bmm_large_block_zm_mixed_precision.cpp new file mode 100644 index 00000000000..b802f2303c4 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/bmm_large_block_zm_mixed_precision.cpp @@ -0,0 +1,115 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "compute_kernel_api/tile_move_copy.h" +#include "compute_kernel_api/matmul.h" + +namespace NAMESPACE { +void MAIN { + + uint32_t in0_block_w = get_compile_time_arg_val(0); // inner block size in tiles + uint32_t in0_num_subblocks = get_compile_time_arg_val(1); // outer row block size (in inner row blocks) + uint32_t in0_block_num_tiles = get_compile_time_arg_val(2); // out_subblock_h*in0_block_w*in0_num_subblocks; + uint32_t in0_subblock_num_tiles = get_compile_time_arg_val(3); // out_subblock_h*in0_block_w + uint32_t in1_num_subblocks = get_compile_time_arg_val(4); // outer column block size (in inner column blocks) + uint32_t in1_block_num_tiles = get_compile_time_arg_val(5); //out_subblock_w*in0_block_w* in1_num_subblocks; + uint32_t in1_per_core_w = get_compile_time_arg_val(6); // out_subblock_w*in1_num_subblocks + uint32_t num_blocks = get_compile_time_arg_val(7); // outer inner dim (in inner dim blocks) + uint32_t out_subblock_h = get_compile_time_arg_val(8); // inner row block size in tiles + uint32_t out_subblock_w = get_compile_time_arg_val(9); // inner column block size in tiles + uint32_t out_subblock_num_tiles = get_compile_time_arg_val(10); // out_subblock_h * out_subblock_w; + uint32_t batch = get_compile_time_arg_val(11); // batch dim + + uint32_t in0_cb_id = tt::CB::c_in0; + uint32_t in1_cb_id = tt::CB::c_in1; + uint32_t out_cb_id = tt::CB::c_out0; + uint32_t mm_partials_cb_id = tt::CB::c_intermed0; + + mm_init(in0_cb_id, in1_cb_id, out_cb_id); + + for (uint32_t b = 0; b < batch; b++){ + bool spill = num_blocks > 1; + bool enable_reload = false; + uint32_t out_num_tiles_to_wait = out_subblock_num_tiles; + + for(uint32_t block = 0; block < num_blocks; block++) + { + bool last_out = block == (num_blocks-1); + + cb_wait_front(in0_cb_id, in0_block_num_tiles); + cb_wait_front(in1_cb_id, in1_block_num_tiles); + int in0_index_subblock_offset = 0; + for (uint32_t in0_subblock = 0; in0_subblock < in0_num_subblocks; in0_subblock++) { + int in1_index_subblock_offset = 0; + for (uint32_t in1_subblock = 0; in1_subblock < in1_num_subblocks; in1_subblock++) { + + acquire_dst(tt::DstMode::Half); + + if (enable_reload) { + // Reconfigure input + copy_tile_to_dst_init_short_with_dt(mm_partials_cb_id); + cb_wait_front(mm_partials_cb_id, out_subblock_num_tiles); + for (uint32_t i = 0; i < out_subblock_num_tiles; i++) { + copy_tile(mm_partials_cb_id, i, i); + } + cb_pop_front(mm_partials_cb_id, out_subblock_num_tiles); + // Reconfigure srcA back + mm_init_short_with_dt(mm_partials_cb_id); + } + + // Compute output sub-block from in0_subblock x in1_subblock + int dst_index = 0; + int in0_index_h_offset = 0; + for (uint32_t h = 0; h < out_subblock_h; h++) { + for (uint32_t w = 0; w < out_subblock_w; w++) { + int in1_index_inner_dim_offset = 0; + for (uint32_t inner_dim = 0; inner_dim < in0_block_w; inner_dim++) { + int in0_index = in0_index_subblock_offset + in0_index_h_offset + inner_dim; + int in1_index = in1_index_subblock_offset + in1_index_inner_dim_offset + w; + matmul_tiles(in0_cb_id, in1_cb_id, in0_index, in1_index, dst_index, false /* transpose */); + in1_index_inner_dim_offset += in1_per_core_w; + } + dst_index++; + } + in0_index_h_offset += in0_block_w; + } + + if (last_out) { + // Pack out to output buffer + cb_reserve_back(out_cb_id, out_subblock_num_tiles); + for (uint32_t i = 0; i < out_subblock_num_tiles; i++) { + pack_tile(i, out_cb_id); + } + cb_push_back(out_cb_id, out_subblock_num_tiles); + } else { + // Wait 
for tiles in output buffer to be written out since interm and output share memory + if (block == 0) { + cb_reserve_back(out_cb_id, out_num_tiles_to_wait); + out_num_tiles_to_wait += out_subblock_num_tiles; + } + // Move partial result to interm buffer + cb_reserve_back(mm_partials_cb_id, out_subblock_num_tiles); + for (uint32_t i = 0; i < out_subblock_num_tiles; i++) { + pack_tile(i, mm_partials_cb_id); + } + cb_push_back(mm_partials_cb_id, out_subblock_num_tiles); + } + + release_dst(tt::DstMode::Half); + in1_index_subblock_offset += out_subblock_w; + } + in0_index_subblock_offset += in0_subblock_num_tiles; + } + + if (spill) enable_reload = true; + + cb_pop_front(in0_cb_id, in0_block_num_tiles); + cb_pop_front(in1_cb_id, in1_block_num_tiles); + + } + } +} +} diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/bmm_tilize_untilize.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/bmm_tilize_untilize.cpp new file mode 100644 index 00000000000..a91141fb45e --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/bmm_tilize_untilize.cpp @@ -0,0 +1,279 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "compute_kernel_api/tilize.h" +#include "compute_kernel_api/untilize.h" +#include "compute_kernel_api/tile_move_copy.h" +#include "compute_kernel_api/matmul.h" + +#ifdef FUSE_BIAS +#include "compute_kernel_api/bcast.h" +#endif + +#include "compute_kernel_api/eltwise_unary/sfpu_split_includes.h" + +#define DEBUG_PRINT 0 + +// #include "debug_macros.h" + +// SliceRange srt = SliceRange{.h0 = 0, .h1 = 4, .hs = 1, .w0 = 0, .w1 = 8, .ws = 1}; +// SliceRange srr = SliceRange{.h0 = 0, .h1 = 1, .hs = 8, .w0 = 0, .w1 = 32, .ws = 1}; +// SliceRange srr1 = SliceRange{.h0 = 1, .h1 = 2, .hs = 8, .w0 = 0, .w1 = 32, .ws = 1}; +// SliceRange src = SliceRange{.h0 = 0, .h1 = 32, .hs = 1, .w0 = 0, .w1 = 1, .ws = 1}; + + +inline void tilize_in( + uint32_t in_cb_id, + uint32_t in_subblock_h, + uint32_t in_block_w, + uint32_t in_num_subblocks, + uint32_t out_cb_id) { + + tilize_init_short(in_cb_id, in_block_w); + for (uint32_t in_subblock = 0; in_subblock < in_num_subblocks; ++in_subblock) { + for (uint32_t h = 0; h < in_subblock_h; ++h) { + cb_wait_front(in_cb_id, in_block_w); + cb_reserve_back(out_cb_id, in_block_w);; + tilize_block(in_cb_id, in_block_w, out_cb_id); + cb_push_back(out_cb_id, in_block_w); + cb_pop_front(in_cb_id, in_block_w); + } + } + tilize_uninit(); +} // tilize_in() + +// NOTE: Bias is not supported with the untilize option +#ifndef FUSE_BIAS + + inline void reblock_and_untilize( + uint32_t num_out_subblocks_in_col, + uint32_t out_subblock_num_tiles, + uint32_t out_subblock_h, + uint32_t out_subblock_w, + uint32_t out_block_w, + uint32_t interm_cb_id, + uint32_t reblock_cb_id, + uint32_t out_cb_id) { + + uint32_t num_tiles_in_row_of_subblocks = mulsi3(out_subblock_num_tiles, num_out_subblocks_in_col); + cb_wait_front(interm_cb_id, num_tiles_in_row_of_subblocks); + + int within_block_index = 0; + for (uint32_t h = 0; h < out_subblock_h; h++) { + int block_offset = 0; + + // Reblock + copy_tile_to_dst_init_short(); + cb_reserve_back(reblock_cb_id, out_block_w); + for (uint32_t n = 0; n < num_out_subblocks_in_col; n++) { + for (uint32_t w = 0; w < out_subblock_w; w++) { + uint32_t tile_index = block_offset + within_block_index + w; + acquire_dst(tt::DstMode::Half); + copy_tile(interm_cb_id, tile_index, 0); + pack_tile(0, reblock_cb_id); + release_dst(tt::DstMode::Half); + } + block_offset += 
out_subblock_num_tiles; + } + cb_push_back(reblock_cb_id, out_block_w); + + // Untilize + untilize_init_short(reblock_cb_id); + cb_wait_front(reblock_cb_id, out_block_w); + cb_reserve_back(out_cb_id, out_block_w); + untilize_block(reblock_cb_id, out_block_w, out_cb_id); + cb_pop_front(reblock_cb_id, out_block_w); + cb_push_back(out_cb_id, out_block_w); + untilize_uninit(reblock_cb_id); + + within_block_index += out_subblock_w; + } + cb_pop_front(interm_cb_id, num_tiles_in_row_of_subblocks); + } // reblock_and_untilize() + +#endif + +inline void pack_matmul_subblock(uint32_t cb_id, uint32_t out_subblock_num_tiles) { + cb_reserve_back(cb_id, out_subblock_num_tiles); + for (uint32_t i = 0; i < out_subblock_num_tiles; ++i) { + pack_tile(i, cb_id); + } + cb_push_back(cb_id, out_subblock_num_tiles); +} + +namespace NAMESPACE { +void MAIN { + + uint32_t in0_block_w = get_compile_time_arg_val(0); // inner block size in tiles + uint32_t in0_num_subblocks = get_compile_time_arg_val(1); // outer row block size (in inner row blocks) + uint32_t in0_block_num_tiles = get_compile_time_arg_val(2); // out_subblock_h*in0_block_w*in0_num_subblocks; + uint32_t in0_subblock_num_tiles = get_compile_time_arg_val(3); // out_subblock_h*in0_block_w + uint32_t in0_subblock_h = get_compile_time_arg_val(4); + uint32_t in1_num_subblocks = get_compile_time_arg_val(5); // outer column block size (in inner column blocks) + uint32_t in1_block_num_tiles = get_compile_time_arg_val(6); //out_subblock_w*in0_block_w* in1_num_subblocks; + uint32_t in1_block_w = get_compile_time_arg_val(7); // out_subblock_w*in1_num_subblocks + // if these are not defined as volatile, it causes code size for TRISC2 to be too large if num_blocks > 1 + volatile uint32_t in0_num_blocks_h = get_compile_time_arg_val(8); + volatile uint32_t in0_num_blocks_w = get_compile_time_arg_val(9); + volatile uint32_t in1_num_blocks_w = get_compile_time_arg_val(10); + uint32_t out_subblock_h = get_compile_time_arg_val(11); // inner row block size in tiles + uint32_t out_subblock_w = get_compile_time_arg_val(12); // inner column block size in tiles + uint32_t out_subblock_num_tiles = get_compile_time_arg_val(13); // out_subblock_h * out_subblock_w; + bool tilize_in0 = get_compile_time_arg_val(14); + bool untilize_out = get_compile_time_arg_val(15); + + uint32_t out_block_w = in1_block_w; + bool spill = in0_num_blocks_w > 1; + + // CB indices + constexpr uint32_t in0_cb_id = tt::CB::c_in0; + constexpr uint32_t in1_cb_id = tt::CB::c_in1; + constexpr uint32_t matmul_partials_cb = tt::CB::c_intermed0; + constexpr uint32_t tilized_in0_cb_id = tt::CB::c_intermed1; + constexpr uint32_t untilize_mode_final_matmul_partials_cb = tt::CB::c_intermed2; + constexpr uint32_t untilize_mode_reblock_cb = tt::CB::c_intermed3; + constexpr uint32_t out_cb_id = tt::CB::c_out0; + + #ifdef FUSE_BIAS + uint32_t bias_ntiles_w = get_compile_time_arg_val(16); + constexpr uint32_t bias_cb_id = tt::CB::c_in2; + constexpr uint32_t out_for_bias_cb_id = tt::CB::c_intermed4; + init_bcast(out_for_bias_cb_id, bias_cb_id, out_cb_id); + #endif + + mm_init(in0_cb_id, in1_cb_id, out_cb_id); + for(uint32_t in0_block_h_i = 0; in0_block_h_i < in0_num_blocks_h; ++in0_block_h_i) { + #ifdef FUSE_BIAS + uint32_t bias_block_offset = 0; + #endif + for(uint32_t in1_block_w_i = 0; in1_block_w_i < in1_num_blocks_w; ++in1_block_w_i) { + bool enable_reload = false; + for(uint32_t in0_block_w_i = 0; in0_block_w_i < in0_num_blocks_w; ++in0_block_w_i) { + bool last_out = (in0_block_w_i == in0_num_blocks_w - 1); + if 
(tilize_in0) { + tilize_in(in0_cb_id, in0_subblock_h, in0_block_w, in0_num_subblocks, tilized_in0_cb_id); + mm_init_short(); + cb_wait_front(tilized_in0_cb_id, in0_block_num_tiles); + } else { + cb_wait_front(in0_cb_id, in0_block_num_tiles); + } + cb_wait_front(in1_cb_id, in1_block_num_tiles); + int in0_index_subblock_offset = 0; + for (uint32_t in0_subblock_i = 0; in0_subblock_i < in0_num_subblocks; ++in0_subblock_i) { + int in1_index_subblock_offset = 0; + for (uint32_t in1_subblock_i = 0; in1_subblock_i < in1_num_subblocks; ++in1_subblock_i) { + acquire_dst(tt::DstMode::Half); + if (enable_reload) { + // Reconfigure input + copy_tile_to_dst_init_short_with_dt(matmul_partials_cb); + cb_wait_front(matmul_partials_cb, out_subblock_num_tiles); + for (uint32_t i = 0; i < out_subblock_num_tiles; ++i) { + copy_tile(matmul_partials_cb, i, i); + } + cb_pop_front(matmul_partials_cb, out_subblock_num_tiles); + // Reconfigure srcA back + mm_init_short_with_dt(matmul_partials_cb); + } // enable_reload + // Compute output sub-block from in0_subblock x in1_subblock + int dst_index = 0; + int in0_index_h_offset = 0; + for (uint32_t h = 0; h < out_subblock_h; ++h) { + for (uint32_t w = 0; w < out_subblock_w; ++w) { + int in1_index_inner_dim_offset = 0; + for (uint32_t inner_dim = 0; inner_dim < in0_block_w; ++inner_dim) { + matmul_tiles(tilize_in0 ? tilized_in0_cb_id : in0_cb_id, // in0_cb + in1_cb_id, // in1_cb + in0_index_subblock_offset + in0_index_h_offset + inner_dim, // in0 tile + in1_index_subblock_offset + in1_index_inner_dim_offset + w, // in1 tile + dst_index, // dst + false); + in1_index_inner_dim_offset += in1_block_w; + } // for in0_block_w + ++dst_index; + } // for out_subblock_w + in0_index_h_offset += in0_block_w; + } // for out_subblock_h + #ifdef FUSE_BIAS + // if bias is to be added, add it to the data in dst before packing into the out cb + if (last_out) { + // first move the current result from dst to interim CB + pack_matmul_subblock(out_for_bias_cb_id, out_subblock_num_tiles); + release_dst(tt::DstMode::Half); + // reconfig unpacker df for src B + // unpack_reconfig_data_format(out_for_bias_cb_id, bias_cb_id); + // bcast add data from bias_cb_id + cb_wait_front(bias_cb_id, bias_ntiles_w); + cb_wait_front(out_for_bias_cb_id, out_subblock_num_tiles); + add_bcast_rows_init_short(); + // reconfig packer df for out + // pack_reconfig_data_format(out_cb_id); + acquire_dst(tt::DstMode::Half); + uint32_t i = 0; + for (uint32_t h = 0; h < out_subblock_h; ++ h) { + uint32_t bcast_tile_i = bias_block_offset + in1_index_subblock_offset; + for (uint32_t w = 0; w < out_subblock_w; ++ w) { + add_tiles_bcast_rows(out_for_bias_cb_id, bias_cb_id, i, bcast_tile_i, i); + ++ bcast_tile_i; + ++ i; + } + } + // do not pop front bias as it may be used again for subsequent blocks + cb_pop_front(out_for_bias_cb_id, out_subblock_num_tiles); + // reconfig for matmul + mm_init_short(); + // reconfig unpacker df for srcB + // unpack_reconfig_data_format(in1_cb_id, in0_cb_id); + } + #endif + + #ifdef SFPU_OP_INIT_ACTIVATION + if (last_out) { + SFPU_OP_INIT_ACTIVATION + for (uint32_t i = 0; i < out_subblock_num_tiles; ++ i) { + SFPU_OP_FUNC_ACTIVATION + } + } + #endif + + auto curr_matmul_out_cb = last_out + ? (untilize_out + ? 
untilize_mode_final_matmul_partials_cb + : out_cb_id) + : matmul_partials_cb; + pack_matmul_subblock(curr_matmul_out_cb, out_subblock_num_tiles); + release_dst(tt::DstMode::Half); + in1_index_subblock_offset += out_subblock_w; + } // for in1_num_subblocks + #ifndef FUSE_BIAS + // untilizing is only supported if there is no bias + if (last_out && untilize_out) { + reblock_and_untilize( + in1_num_subblocks, + out_subblock_num_tiles, + out_subblock_h, + out_subblock_w, + out_block_w, + untilize_mode_final_matmul_partials_cb, + untilize_mode_reblock_cb, + out_cb_id); + mm_init_short(); + } // last_out + #endif + in0_index_subblock_offset += in0_subblock_num_tiles; + } + + if (spill) enable_reload = true; + + cb_pop_front(tilize_in0 ? tilized_in0_cb_id : in0_cb_id, in0_block_num_tiles); + cb_pop_front(in1_cb_id, in1_block_num_tiles); + } // for in0_num_blocks_w + #ifdef FUSE_BIAS + bias_block_offset += in1_block_w; + #endif + } // for in1_num_blocks_w + } // for in0_num_blocks_h +} // MAIN +} // NAMESPACE diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_binary.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_binary.cpp new file mode 100644 index 00000000000..cf04922122a --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_binary.cpp @@ -0,0 +1,107 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "compute_kernel_api/eltwise_binary.h" +#include "compute_kernel_api/tile_move_copy.h" + +#include "compute_kernel_api/eltwise_unary/sfpu_split_includes.h" + +ALWI void ACQ() { acquire_dst(tt::DstMode::Half); } +ALWI void REL() { release_dst(tt::DstMode::Half); } + + +#define PRE_SCALE defined SFPU_OP_INIT_PRE_IN0_0 || defined SFPU_OP_INIT_PRE_IN1_0 + +namespace NAMESPACE { +void MAIN { + uint32_t per_core_block_cnt = get_arg_val(0); + uint32_t per_core_block_size = get_arg_val(1); + + #ifdef SFPU_OP_INIT_PRE_IN0_0 + constexpr auto cb_inp0 = tt::CB::c_intermed0; + #else + constexpr auto cb_inp0 = tt::CB::c_in0; + #endif + + #ifdef SFPU_OP_INIT_PRE_IN1_0 + constexpr auto cb_inp1 = tt::CB::c_intermed1; + #else + constexpr auto cb_inp1 = tt::CB::c_in1; + #endif + + binary_op_init_common(cb_inp0, cb_inp1); + + #if not PRE_SCALE + binary_op_specific_init(ELTWISE_OP_CODE); + #endif + + for(uint32_t block = 0; block < per_core_block_cnt; ++block) { + + cb_reserve_back(tt::CB::c_out0, per_core_block_size); + + #ifdef SFPU_OP_INIT_PRE_IN0_0 + cb_wait_front(tt::CB::c_in0, per_core_block_size); + cb_reserve_back(cb_inp0, per_core_block_size); + copy_tile_init(); // need to copy from CB to DST to be able to run sfpu math + ACQ(); + SFPU_OP_INIT_PRE_IN0_0 + for(uint32_t i = 0; i < per_core_block_size; ++i) + { + copy_tile(tt::CB::c_in0, i, i); // copy from c_in[0] to DST[0] + SFPU_OP_FUNC_PRE_IN0_0 + pack_tile(i, cb_inp0); // DST[0]->cb + } + REL(); + cb_pop_front(tt::CB::c_in0, per_core_block_size); + cb_push_back(cb_inp0, per_core_block_size); + #endif + + #ifdef SFPU_OP_INIT_PRE_IN1_0 + cb_wait_front(tt::CB::c_in1, per_core_block_size); + cb_reserve_back(cb_inp1, per_core_block_size); + copy_tile_init(); // need to copy from CB to DST to be able to run sfpu math + ACQ(); + SFPU_OP_INIT_PRE_IN1_0 + for(uint32_t i = 0; i < per_core_block_size; ++i) + { + copy_tile(tt::CB::c_in1, i, i); // copy from c_in[0] to DST[0] + SFPU_OP_FUNC_PRE_IN1_0 + pack_tile(i, cb_inp1); // DST[0]->cb + } + REL(); + cb_pop_front(tt::CB::c_in1, per_core_block_size); + cb_push_back(cb_inp1, per_core_block_size); + #endif + 
+ cb_wait_front(cb_inp0, per_core_block_size); + cb_wait_front(cb_inp1, per_core_block_size); + + #if PRE_SCALE + binary_op_specific_init(ELTWISE_OP_CODE); + #endif + ACQ(); + for(uint32_t i = 0; i < per_core_block_size; ++i) + { + ELTWISE_OP(cb_inp0, cb_inp1, i, i, i); + + #ifdef SFPU_OP_INIT_0 + SFPU_OP_INIT_0 + SFPU_OP_FUNC_0 + #endif + + #ifdef SFPU_OP_CHAIN_0 + SFPU_OP_CHAIN_0 + #endif + + pack_tile(i, tt::CB::c_out0); + } + REL(); + cb_pop_front(cb_inp0, per_core_block_size); + cb_pop_front(cb_inp1, per_core_block_size); + cb_push_back(tt::CB::c_out0, per_core_block_size); + } + +} +} diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy.cpp new file mode 100644 index 00000000000..1e7c029d9a3 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy.cpp @@ -0,0 +1,33 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "compute_kernel_api/common.h" +#include "compute_kernel_api/tile_move_copy.h" +#include "compute_kernel_api/eltwise_unary/eltwise_unary.h" + +namespace NAMESPACE { +void MAIN { + uint32_t per_core_tile_cnt = get_compile_time_arg_val(0); + + unary_op_init_common(tt::CB::c_in0); + for(uint32_t b=0;b + + +#include "compute_kernel_api/eltwise_unary/sfpu_split_includes.h" + +#include "compute_kernel_api/eltwise_binary.h" +#include "compute_kernel_api.h" + +namespace NAMESPACE { + +#ifdef TRISC_MATH +#include "llk_math_common.h" +#include "llk_math_eltwise_unary_datacopy.h" + +void math_main() +{ + int __outer_loop_iter; + #ifdef ARCH_GRAYSKULL + llk_math_eltwise_unary_datacopy_init(); + #else + MATH(( llk_math_eltwise_unary_datacopy_init(0, 0, 0) )); + #endif + llk_math_pack_sync_init(); + constexpr uint32_t per_core_tile_cnt = get_compile_time_arg_val(0); + for (uint32_t b = 0; b < per_core_tile_cnt; ++b) { + llk_math_wait_for_dest_available(); + llk_math_eltwise_unary_datacopy(0); + llk_math_dest_section_done(); + } +} +#endif + +#ifdef TRISC_PACK +#include "llk_pack_common.h" +#include "llk_pack.h" + +void pack_main() +{ + int __outer_loop_iter; + llk_pack_init(); + llk_pack_hw_configure_disaggregated(16); + llk_setup_outputs(); + llk_pack_dest_init(); + constexpr uint32_t per_core_tile_cnt = get_compile_time_arg_val(0); + for (uint32_t b = 0; b < per_core_tile_cnt; ++b) { + llk_packer_wait_for_math_done(); + llk_wait_for_free_tiles(16,1); + llk_pack(0,16); + llk_push_tiles(16,1); + llk_pack_dest_section_done(); + } +} +#endif + +#ifdef TRISC_UNPACK +void unpack_main() +{ + int __outer_loop_iter; + llk_setup_operands(); + #ifdef ARCH_GRAYSKULL + llk_unpack_A_init(); + llk_unpack_A_hw_configure_disaggregated(0); + #else + UNPACK(( llk_unpack_A_init() )); + UNPACK(( llk_unpack_A_hw_configure_disaggregated<>(0) )); + #endif + constexpr uint32_t per_core_tile_cnt = get_compile_time_arg_val(0); + for (uint32_t b = 0; b < per_core_tile_cnt; ++b) { + llk_wait_tiles(0,1); + llk_unpack_A(0,0); + llk_pop_tiles(0,1); + } +} +#endif + +} // NAMESPACE diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy_block.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy_block.cpp new file mode 100644 index 00000000000..7d0df464e7a --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy_block.cpp @@ -0,0 +1,35 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include <cstdint>
+#include "compute_kernel_api/common.h"
+
+namespace NAMESPACE {
+void MAIN {
+
+    constexpr uint32_t block_num_tiles = get_compile_time_arg_val(0);
+    constexpr uint32_t num_blocks = get_compile_time_arg_val(1);
+
+    for(uint32_t block = 0; block < num_blocks; ++block) {
+        acquire_dst(tt::DstMode::Half);
+
+        // Wait for tiles on the input / copy to dst / pop from input
+        cb_wait_front(tt::CB::c_in0, block_num_tiles);
+        for(uint32_t t = 0; t < block_num_tiles; ++t) {
+            copy_tile(tt::CB::c_in0, t, t);
+        }
+        cb_pop_front(tt::CB::c_in0, block_num_tiles);
+
+        // Reserve space in output / pack / push to output
+        cb_reserve_back(tt::CB::c_out0, block_num_tiles);
+        for(uint32_t t = 0; t < block_num_tiles; ++t) {
+            pack_tile(t, tt::CB::c_out0);
+        }
+        cb_push_back(tt::CB::c_out0, block_num_tiles);
+
+        release_dst(tt::DstMode::Half);
+    }
+
+}
+}
diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_sfpi.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_sfpi.cpp
new file mode 100644
index 00000000000..51b21cff002
--- /dev/null
+++ b/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_sfpi.cpp
@@ -0,0 +1,45 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include <cstdint>
+
+#include "llk_3c.h"
+
+namespace NAMESPACE {
+void MAIN {
+    // expands to hlk_relu_config(nullptr, 1); for relu only
+
+    uint32_t per_core_block_cnt = get_compile_time_arg_val(0);
+    uint32_t per_core_block_dim = get_compile_time_arg_val(1);
+
+    INIT_RELU
+    for (uint32_t block_index = 0; block_index < per_core_block_cnt; block_index++) {
+        cb_reserve_back(CB::c_out0, per_core_block_dim);
+        for(uint32_t tile_index = 0; tile_index < per_core_block_dim; ++tile_index) {
+            acquire_dst(DstMode::Full);
+
+            // Pop tile after tile, copy to DST and pack
+            cb_wait_front(CB::c_in0, 1);
+
+            copy_tile(CB::c_in0, 0, 0);
+            // SFPU_OP is expected to be defined via add_define as one of
+            // exp_tile, gelu_tile, recip_tile, etc., followed by pack_tile
+            // (except for relu, because the llk is fused for relu)
+            // "sfpu_gelu(0); pack_tile(0, CB::c_out0);"
+
+            SFPI_OP_AND_PACK
+            // comes from add_define in the kernel config
+            // It is also expected to include pack_tile(0, CB::c_out0); for non-relu
+            // For relu it expects the hlk_pack_relu variant
+
+            cb_pop_front(CB::c_in0, 1);
+
+            release_dst(DstMode::Full);
+        }
+        cb_push_back(CB::c_out0, per_core_block_dim);
+    }
+    DEINIT_RELU
+    // expands to hlk_relu_config(nullptr, 0); for relu only
+}
+}
diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_sfpu.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_sfpu.cpp
new file mode 100644
index 00000000000..e1dd04fad5d
--- /dev/null
+++ b/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_sfpu.cpp
@@ -0,0 +1,45 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "compute_kernel_api/common.h" +#include "compute_kernel_api/tile_move_copy.h" +#include "compute_kernel_api/eltwise_unary/eltwise_unary.h" +#include "compute_kernel_api/eltwise_unary/sfpu_split_includes.h" + +namespace NAMESPACE { +void MAIN { + uint32_t per_core_block_cnt = get_compile_time_arg_val(0); + uint32_t per_core_block_dim = get_compile_time_arg_val(1); + + kernel_profiler::mark_time(9997); + + init_sfpu(tt::CB::c_in0); + for (uint32_t block_index = 0; block_index < per_core_block_cnt; block_index++) { + cb_reserve_back(tt::CB::c_out0, per_core_block_dim); + for(uint32_t tile_index = 0; tile_index < per_core_block_dim; ++tile_index) { + acquire_dst(tt::DstMode::Half); + + // Pop tile after tile, copy to DST and pack + cb_wait_front(tt::CB::c_in0, 1); + + copy_tile(tt::CB::c_in0, 0, 0); + + #ifdef SFPU_OP_CHAIN_0 + SFPU_OP_CHAIN_0 + #endif + + pack_tile(0, tt::CB::c_out0); + + cb_pop_front(tt::CB::c_in0, 1); + + release_dst(tt::DstMode::Half); + } + cb_push_back(tt::CB::c_out0, per_core_block_dim); + } + + kernel_profiler::mark_time(9998); + +} +} diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/graph_interpreter.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/graph_interpreter.cpp new file mode 100644 index 00000000000..9cf8fca349a --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/graph_interpreter.cpp @@ -0,0 +1,78 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "compute_kernel_api/common.h" +#include "compute_kernel_api/tile_move_copy.h" + + +#include "compute_kernel_api/eltwise_unary/exp.h" +#include "compute_kernel_api/eltwise_unary/gelu.h" +#include "compute_kernel_api/eltwise_unary/recip.h" +#include "compute_kernel_api/eltwise_binary.h" +#include "compute_kernel_api.h" + +namespace NAMESPACE { +void MAIN { + + uint32_t per_core_tile_cnt = get_compile_time_arg_val(0); + uint32_t num_ops = get_compile_time_arg_val(1); + + // Need to pre-initialize an op_info struct and pass into get_next_op_info and modify in that func, since hlkc doesn't support funcs returning vals yet + tt::op_info_t op_info = {0, 0, 0, 0, 0, 0, 0}; + graph_interpreter_init(); + + for (uint32_t op_idx = 0; op_idx < num_ops; op_idx++) { + get_next_op_info(op_info); + + for (uint32_t idx = 0; idx < per_core_tile_cnt; idx++) { + cb_reserve_back(op_info.cb_out_id, 1); + acquire_dst(tt::DstMode::Half); + cb_wait_front(op_info.cb_in0_id, 1); + + + if (op_info.unary) { + copy_tile_init(); + copy_tile(op_info.cb_in0_id, 0, 0); + } else { + cb_wait_front(op_info.cb_in1_id, 1); + } + + if (op_info.op_code == (int)tt::OpCode::Exponential) { // 0 + exp_tile_init(); + exp_tile(0); + } else if (op_info.op_code == (int)tt::OpCode::Reciprocal) { // 1 + recip_tile_init(); + recip_tile(0); + } else if (op_info.op_code == (int)tt::OpCode::Gelu) { // 2 + gelu_tile_init(); + gelu_tile(0, false); + } else if (op_info.op_code == (int)tt::OpCode::Add) { // 3 + add_tiles_init(); + add_tiles(op_info.cb_in0_id, op_info.cb_in1_id, 0, 0, 0); + } else if (op_info.op_code == (int)tt::OpCode::Subtract) { // 4 + sub_tiles_init(); + sub_tiles(op_info.cb_in0_id, op_info.cb_in1_id, 0, 0, 0); + } else if (op_info.op_code == (int)tt::OpCode::Multiply) { // 5 + mul_tiles_init(); + mul_tiles(op_info.cb_in0_id, op_info.cb_in1_id, 0, 0, 0); + } + + pack_tile(0, op_info.cb_out_id); + + if (op_info.pop0) { + cb_pop_front(op_info.cb_in0_id, 1); // Don't always pop, may need the input 
for later + } + + if (not op_info.unary and op_info.pop1) { + cb_pop_front(op_info.cb_in1_id, 1); + } + + release_dst(tt::DstMode::Half); + cb_push_back(op_info.cb_out_id, 1); + } + } +} +} diff --git a/tests/tt_metal/tt_metal/test_kernels/increment_runtime_arg.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/increment_runtime_arg.cpp similarity index 100% rename from tests/tt_metal/tt_metal/test_kernels/increment_runtime_arg.cpp rename to tests/tt_metal/tt_metal/test_kernels/compute/increment_runtime_arg.cpp diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/layernorm.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/layernorm.cpp new file mode 100644 index 00000000000..77eb18beda7 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/layernorm.cpp @@ -0,0 +1,260 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#define REDUCE_OP PoolType::SUM +#define REDUCE_DIM ReduceDim::REDUCE_ROW + +#define BCAST_LLKOP EltwiseBinaryType::ELWMUL +#define BCAST_DIM BroadcastType::COL + + +#include "compute_kernel_api/reduce.h" +#include "compute_kernel_api/bcast.h" +#include "compute_kernel_api/eltwise_binary.h" +#include "compute_kernel_api/layernorm.h" + + +ALWI void ACQ() { acquire_dst(tt::DstMode::Half); } +ALWI void REL() { release_dst(tt::DstMode::Half); } + + +namespace NAMESPACE { +void MAIN { + uint32_t NCHt = get_arg_val(0); + constexpr uint32_t Wt = get_compile_time_arg_val(0); + constexpr uint32_t blk = get_compile_time_arg_val(1); + constexpr uint32_t do_gamma = get_compile_time_arg_val(2); + constexpr uint32_t do_beta = get_compile_time_arg_val(3); + + + #ifdef FUSE_PRE_ADD + binary_op_init_common(tt::CB::c_in0, tt::CB::c_in1); + #else + binary_op_init_common(tt::CB::c_in0, tt::CB::c_in0); + #endif + + constexpr uint32_t onetile = 1; + // reserve one tile for zeros on cb_in2 + // TODO(AP): check that if DST is indeed zeroed by release_dst (and initially), we can use it as zeroes + + // Note that the entire W dimension must fit in the intermed0 CB for this kernel to be correct + constexpr auto cb_scaler = tt::CB::c_in2; // single tile generated by the reader + constexpr auto cb_eps = tt::CB::c_in3; // single tile generated by the reader + constexpr auto cb_xmm = tt::CB::c_intermed0; // x minus mean, this is a large buffer, see setup code in layernorm_op.cpp + constexpr auto cb_ex = tt::CB::c_intermed1; // E[x] + constexpr auto cb_ex2 = tt::CB::c_intermed2; // E[(x-E[x])^2] + constexpr auto cb_xmm2 = tt::CB::c_intermed3; // xmm^2 + constexpr auto cb_ex2pe = tt::CB::c_intermed4; // E[(x-E[x])^2]+eps + constexpr auto cb_in = tt::CB::c_in0; // input x or a for fused pre-add (x=a+b) + constexpr auto cb_inb = tt::CB::c_in1; // input b for fused pre-add + constexpr auto cb_out = tt::CB::c_out0; // output + constexpr auto cb_gamma = tt::CB::c_in5; + constexpr auto cb_beta = tt::CB::c_in6; + constexpr auto cb_fusion = tt::CB::c_intermed5; // stream gamma/beta + constexpr auto scaler0 = 0; + #ifdef FUSE_PRE_ADD + constexpr auto cb_x = tt::CB::c_intermed6; + #else + constexpr auto cb_x = tt::CB::c_in0; + #endif + + cb_wait_front(cb_scaler, 1); // comes from the reader + cb_wait_front(cb_eps, 1); // comes from the reader + + + constexpr int cb_im_or_out = (do_gamma|do_beta) ? 
cb_fusion : tt::CB::c_out0; + + + for (uint32_t ncht = 0; ncht < NCHt; ncht++) { + + constexpr int onetile = 1; + constexpr int dst0 = 0; + + /* + * X + Y + */ + #ifdef FUSE_PRE_ADD + add_tiles_init(); + for (uint32_t wt = 0; wt < Wt; wt += blk) { + ACQ(); + //UNPACK(( { DPRINT << "Waiting on cb_x" << ENDL(); } )); + cb_wait_front(cb_in, blk); + //UNPACK(( { DPRINT << "Waiting on cb_inb" << ENDL(); } )); + cb_wait_front(cb_inb, blk); + //UNPACK(( { DPRINT << "Done Waiting on cb_inb" << ENDL(); } )); + cb_reserve_back(cb_x, blk); + for (uint32_t j = 0; j < blk; j++) { + add_tiles(cb_in, cb_inb, j, j, j); + pack_tile(j, cb_x); + } + REL(); + cb_push_back(cb_x, blk); // push the sum into the same buffer + cb_pop_front(cb_in, blk); + cb_pop_front(cb_inb, blk); + } + // by the end of this loop we should end up with Wt tiles in cb_x + #endif + + /* + * E[x] + * means = tensor.reduce(x, RSUM, RW, 1.0/W) # -> NCH1 + */ + ACQ(); + cb_reserve_back(cb_ex, 1*onetile); + reduce_init_delta(REDUCE_OP, REDUCE_DIM); + for (uint32_t wt = 0; wt < Wt; wt += blk) { + cb_wait_front(cb_x, wt+blk); + for (uint32_t j = 0; j < blk; j++) { + reduce_tile(REDUCE_OP, REDUCE_DIM, cb_x, cb_scaler, wt+j, scaler0, dst0); + } + // we don't pop cb_x until we compute Ex + } + pack_tile(dst0, cb_ex); + reduce_revert_delta(); + REL(); + + cb_push_back(cb_ex, 1); + + /* + * x - E[x] + * compute xmm=x-mean. Reuse cb_x since we didn't pop anything from it + */ + cb_wait_front(cb_ex, 1); // should have 1 tile + cb_reserve_back(cb_xmm, Wt); + sub_bcast_cols_init_short(); + for (uint32_t wt = 0; wt < Wt; wt += blk) { + ACQ(); + for (uint32_t wtr = 0; wtr(REDUCE_OP, REDUCE_DIM); + ACQ(); + cb_wait_front(cb_xmm2, Wt); + //cb_wait_front(cb_xmm, Wt); + for (uint32_t wt = 0; wt < Wt; wt += blk) { + // reduce + for (uint32_t wtr = 0; wtr + +#include "compute_kernel_api/tilize.h" +#include "compute_kernel_api/untilize.h" +#include "compute_kernel_api/tile_move_copy.h" +#include "compute_kernel_api/matmul.h" + + + +inline void tilize_activation(uint32_t in0_cb, uint32_t in0_subblock_h, uint32_t in0_block_w, uint32_t in0_num_subblocks, uint32_t out_cb) +{ + tilize_init_short(in0_cb, in0_block_w); + + for (uint32_t in0_subblock = 0; in0_subblock < in0_num_subblocks; in0_subblock++) { + for (uint32_t h = 0; h < in0_subblock_h; h++) { + cb_wait_front(in0_cb, in0_block_w); + cb_reserve_back(out_cb, in0_block_w); + tilize_block(in0_cb, in0_block_w, out_cb); + cb_push_back(out_cb, in0_block_w); + cb_pop_front(in0_cb, in0_block_w); + } + } + + tilize_uninit(); + +} + +inline void reblock_and_untilize( + uint32_t num_out_subblocks_in_col, + uint32_t out_subblock_num_tiles, + uint32_t out_subblock_h, + uint32_t out_subblock_w, + uint32_t out_block_w, + uint32_t interm_cb_id, + uint32_t reblock_cb_id, + uint32_t out_cb_id) +{ + uint32_t num_tiles_in_row_of_subblocks = mulsi3(out_subblock_num_tiles, num_out_subblocks_in_col); + cb_wait_front(interm_cb_id, num_tiles_in_row_of_subblocks); + + int within_block_index = 0; + for (uint32_t h = 0; h < out_subblock_h; h++) { + int block_offset = 0; + + // Reblock + copy_tile_to_dst_init_short(); + cb_reserve_back(reblock_cb_id, out_block_w); + for (uint32_t n = 0; n < num_out_subblocks_in_col; n++) { + for (uint32_t w = 0; w < out_subblock_w; w++) { + uint32_t tile_index = block_offset + within_block_index + w; + acquire_dst(tt::DstMode::Half); + copy_tile(interm_cb_id, tile_index, 0); + pack_tile(0, reblock_cb_id); + release_dst(tt::DstMode::Half); + } + block_offset += out_subblock_num_tiles; + } + 
cb_push_back(reblock_cb_id, out_block_w); + + // Untilize + untilize_init_short(reblock_cb_id); + cb_wait_front(reblock_cb_id, out_block_w); + cb_reserve_back(out_cb_id, out_block_w); + untilize_block(reblock_cb_id, out_block_w, out_cb_id); + cb_pop_front(reblock_cb_id, out_block_w); + cb_push_back(out_cb_id, out_block_w); + untilize_uninit(reblock_cb_id); + + within_block_index += out_subblock_w; + } + cb_pop_front(interm_cb_id, num_tiles_in_row_of_subblocks); +} + +inline void pack_matmul_subblock(uint32_t cb_id, uint32_t out_subblock_num_tiles) { + cb_reserve_back(cb_id, out_subblock_num_tiles); + for (uint32_t i = 0; i < out_subblock_num_tiles; i++) { + pack_tile(i, cb_id); + } + cb_push_back(cb_id, out_subblock_num_tiles); +} + +namespace NAMESPACE { +void MAIN { + + uint32_t in0_block_w = get_compile_time_arg_val(0); // inner block size in tiles + uint32_t in0_num_subblocks = get_compile_time_arg_val(1); // outer row block size (in inner row blocks) + uint32_t in0_block_num_tiles = get_compile_time_arg_val(2); // out_subblock_h*in0_block_w*in0_num_subblocks; + uint32_t in0_subblock_num_tiles = get_compile_time_arg_val(3); // out_subblock_h*in0_block_w + uint32_t in0_subblock_h = get_compile_time_arg_val(4); + uint32_t in1_num_subblocks = get_compile_time_arg_val(5); // outer column block size (in inner column blocks) + uint32_t in1_block_num_tiles = get_compile_time_arg_val(6); //out_subblock_w*in0_block_w* in1_num_subblocks; + uint32_t in1_per_core_w = get_compile_time_arg_val(7); // out_subblock_w*in1_num_subblocks + + // If I don't make this volatile, causes code size for TRISC2 to be too large if num_blocks > 1 + volatile uint32_t num_blocks = get_compile_time_arg_val(8); // outer inner dim (in inner dim blocks) + + uint32_t out_subblock_h = get_compile_time_arg_val(9); // inner row block size in tiles + uint32_t out_subblock_w = get_compile_time_arg_val(10); // inner column block size in tiles + uint32_t out_subblock_num_tiles = get_compile_time_arg_val(11); // out_subblock_h * out_subblock_w; + + uint32_t out_block_w = in1_per_core_w; + + // If true, this assumes data coming in RM + bool tilize_in = get_compile_time_arg_val(12); + + // If true, this assumes consumer wants data RM + bool untilize_out = get_compile_time_arg_val(13); + + bool spill = num_blocks > 1; + + bool enable_reload = false; + + // CB mapping of in0, union of all possible variants (with + // and without fusing combinations of tilize/untilize) + // in0: + // input 0 + // in1: + // input 1 + // interm0: + // If under tilized mode, this is CB in which we write the tilized + // input 0 + // interm1: + // intermediate CB we write to so that we store partial matmul results + // interm2: + // if under untilize mode, this is the CB we write to so that we store + // the final matmul result + // interm3: + // if under untilize mode, this is the CB we write to so that we can + // reblock the output + uint32_t in0_cb = tt::CB::c_in0; + uint32_t tilize_mode_tilized_in0_cb = tt::CB::c_intermed0; + uint32_t matmul_partials_cb = tt::CB::c_intermed1; + uint32_t untilize_mode_final_matmul_partials_cb = tt::CB::c_intermed2; + uint32_t untilize_mode_reblock_cb = tt::CB::c_intermed3; + uint32_t out0_cb = tt::CB::c_out0; + + mm_init(); + for(uint32_t block = 0; block < num_blocks; block++) + { + bool last_out = block == (num_blocks-1); + if (tilize_in) { + tilize_activation(in0_cb, in0_subblock_h, in0_block_w, in0_num_subblocks, tilize_mode_tilized_in0_cb); + mm_init_short(); + cb_wait_front(tilize_mode_tilized_in0_cb, 
in0_block_num_tiles); + } else { + cb_wait_front(in0_cb, in0_block_num_tiles); + } + + cb_wait_front(tt::CB::c_in1, in1_block_num_tiles); + int in0_index_subblock_offset = 0; + for (uint32_t in0_subblock = 0; in0_subblock < in0_num_subblocks; in0_subblock++) { + int in1_index_subblock_offset = 0; + for (uint32_t in1_subblock = 0; in1_subblock < in1_num_subblocks; in1_subblock++) { + + acquire_dst(tt::DstMode::Half); + + if (enable_reload) { + copy_tile_to_dst_init_short(); + cb_wait_front(matmul_partials_cb, out_subblock_num_tiles); + for (uint32_t i = 0; i < out_subblock_num_tiles; i++) { + copy_tile(matmul_partials_cb, i, i); + } + cb_pop_front(matmul_partials_cb, out_subblock_num_tiles); + mm_init_short(); + } + + // Compute output sub-block from in0_subblock x in1_subblock + int dst_index = 0; + int in0_index_h_offset = 0; + for (uint32_t h = 0; h < out_subblock_h; h++) { + for (uint32_t w = 0; w < out_subblock_w; w++) { + int in1_index_inner_dim_offset = 0; + for (uint32_t inner_dim = 0; inner_dim < in0_block_w; inner_dim++) { + int in0_index = in0_index_subblock_offset + in0_index_h_offset + inner_dim; + int in1_index = in1_index_subblock_offset + in1_index_inner_dim_offset + w; + if (tilize_in) { + matmul_tiles(tilize_mode_tilized_in0_cb, tt::CB::c_in1, in0_index, in1_index, dst_index, false /* transpose */); + } else { + matmul_tiles(in0_cb, tt::CB::c_in1, in0_index, in1_index, dst_index, false /* transpose */); + } + in1_index_inner_dim_offset += in1_per_core_w; + } + dst_index++; + } + in0_index_h_offset += in0_block_w; + } + + if (last_out) { + if (not untilize_out) { + pack_matmul_subblock(out0_cb, out_subblock_num_tiles); + } else { + pack_matmul_subblock(untilize_mode_final_matmul_partials_cb, out_subblock_num_tiles); + } + } else { + pack_matmul_subblock(matmul_partials_cb, out_subblock_num_tiles); + } + + release_dst(tt::DstMode::Half); + + in1_index_subblock_offset += out_subblock_w; + } + + if (untilize_out) { + if (last_out) { + reblock_and_untilize( + in1_num_subblocks, + out_subblock_num_tiles, + out_subblock_h, + out_subblock_w, + out_block_w, + untilize_mode_final_matmul_partials_cb, + untilize_mode_reblock_cb, + out0_cb + ); + mm_init_short(); + } + } + + in0_index_subblock_offset += in0_subblock_num_tiles; + } + + if (spill) enable_reload = true; + + if (tilize_in) { + cb_pop_front(tilize_mode_tilized_in0_cb, in0_block_num_tiles); + } else { + cb_pop_front(in0_cb, in0_block_num_tiles); + } + cb_pop_front(tt::CB::c_in1, in1_block_num_tiles); + } + +} +} diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/matmul_large_block_generalized.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/matmul_large_block_generalized.cpp new file mode 100644 index 00000000000..1158d26f61e --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/matmul_large_block_generalized.cpp @@ -0,0 +1,245 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "compute_kernel_api/tilize_untilize.h" +#include "compute_kernel_api/tile_move_copy.h" +#include "compute_kernel_api/matmul.h" + + +inline void tilize_activation(uint32_t in0_cb, uint32_t in0_subblock_h, uint32_t in0_block_w, uint32_t in0_num_subblocks, uint32_t out_cb) +{ + tilize_init_short(in0_cb, in0_block_w); + + for (uint32_t in0_subblock = 0; in0_subblock < in0_num_subblocks; in0_subblock++) { + for (uint32_t h = 0; h < in0_subblock_h; h++) { + cb_wait_front(in0_cb, in0_block_w); + cb_reserve_back(out_cb, in0_block_w); + tilize_block(in0_cb, in0_block_w, out_cb); + cb_push_back(out_cb, in0_block_w); + cb_pop_front(in0_cb, in0_block_w); + } + } + + tilize_uninit(); + +} + +inline void reblock_and_untilize( + uint32_t num_out_subblocks_in_col, + uint32_t out_subblock_num_tiles, + uint32_t out_subblock_h, + uint32_t out_subblock_w, + uint32_t out_block_w, + uint32_t interm_cb_id, + uint32_t reblock_cb_id, + uint32_t out_cb_id) +{ + uint32_t num_tiles_in_row_of_subblocks = mulsi3(out_subblock_num_tiles, num_out_subblocks_in_col); + cb_wait_front(interm_cb_id, num_tiles_in_row_of_subblocks); + + int within_block_index = 0; + for (uint32_t h = 0; h < out_subblock_h; h++) { + int block_offset = 0; + + // Reblock + copy_tile_to_dst_init_short(); + cb_reserve_back(reblock_cb_id, out_block_w); + for (uint32_t n = 0; n < num_out_subblocks_in_col; n++) { + for (uint32_t w = 0; w < out_subblock_w; w++) { + uint32_t tile_index = block_offset + within_block_index + w; + acquire_dst(tt::DstMode::Half); + copy_tile(interm_cb_id, tile_index, 0); + pack_tile(0, reblock_cb_id); + release_dst(tt::DstMode::Half); + } + block_offset += out_subblock_num_tiles; + } + cb_push_back(reblock_cb_id, out_block_w); + + // Untilize + untilize_init_short(reblock_cb_id); + cb_wait_front(reblock_cb_id, out_block_w); + cb_reserve_back(out_cb_id, out_block_w); + untilize_block(reblock_cb_id, out_block_w, out_cb_id); + cb_pop_front(reblock_cb_id, out_block_w); + cb_push_back(out_cb_id, out_block_w); + untilize_uninit(reblock_cb_id); + + within_block_index += out_subblock_w; + } + cb_pop_front(interm_cb_id, num_tiles_in_row_of_subblocks); +} + +inline void pack_matmul_subblock(uint32_t cb_id, uint32_t out_subblock_num_tiles) { + cb_reserve_back(cb_id, out_subblock_num_tiles); + for (uint32_t i = 0; i < out_subblock_num_tiles; i++) { + pack_tile(i, cb_id); + } + cb_push_back(cb_id, out_subblock_num_tiles); +} + +namespace NAMESPACE { +void MAIN { + + uint32_t in0_block_w = get_compile_time_arg_val(0); // inner block size in tiles + uint32_t in0_num_subblocks = get_compile_time_arg_val(1); // outer row block size (in inner row blocks) + uint32_t in0_block_num_tiles = get_compile_time_arg_val(2); // out_subblock_h*in0_block_w*in0_num_subblocks; + uint32_t in0_subblock_num_tiles = get_compile_time_arg_val(3); // out_subblock_h*in0_block_w + uint32_t in0_subblock_h = get_compile_time_arg_val(4); + uint32_t in1_num_subblocks = get_compile_time_arg_val(5); // outer column block size (in inner column blocks) + uint32_t in1_block_num_tiles = get_compile_time_arg_val(6); //out_subblock_w*in0_block_w* in1_num_subblocks; + uint32_t in1_per_core_w = get_compile_time_arg_val(7); // out_subblock_w*in1_num_subblocks + // If I don't make this volatile, causes code size for TRISC2 to be too large if num_blocks > 1 + volatile uint32_t num_blocks_in0_h = get_compile_time_arg_val(8); // outer inner dim (in inner dim blocks) + volatile uint32_t num_blocks_in0_w = 
get_compile_time_arg_val(9); // outer inner dim (in inner dim blocks) + volatile uint32_t num_blocks_in1_w = get_compile_time_arg_val(10); + + uint32_t out_subblock_h = get_compile_time_arg_val(11); // inner row block size in tiles + uint32_t out_subblock_w = get_compile_time_arg_val(12); // inner column block size in tiles + uint32_t out_subblock_num_tiles = get_compile_time_arg_val(13); // out_subblock_h * out_subblock_w; + + uint32_t out_block_w = in1_per_core_w; + + // If true, this assumes data coming in RM + bool tilize_in = get_compile_time_arg_val(13); + + // If true, this assumes consumer wants data RM + bool untilize_out = get_compile_time_arg_val(14); + + bool spill = num_blocks_in0_w > 1; + + bool enable_reload = false; + + // CB mapping of in0, union of all possible variants (with + // and without fusing combinations of tilize/untilize) + // in0: + // input 0 + // in1: + // input 1 + // interm0: + // If under tilized mode, this is CB in which we write the tilized + // input 0 + // interm1: + // intermediate CB we write to so that we store partial matmul results + // interm2: + // if under untilize mode, this is the CB we write to so that we store + // the final matmul result + // interm3: + // if under untilize mode, this is the CB we write to so that we can + // reblock the output + uint32_t in0_cb = tt::CB::c_in0; + uint32_t tilize_mode_tilized_in0_cb = tt::CB::c_intermed0; + uint32_t matmul_partials_cb = tt::CB::c_intermed1; + uint32_t untilize_mode_final_matmul_partials_cb = tt::CB::c_intermed2; + uint32_t untilize_mode_reblock_cb = tt::CB::c_intermed3; + uint32_t out0_cb = tt::CB::c_out0; + mm_init(); + for(uint32_t block_in0_h = 0; block_in0_h < num_blocks_in0_h; block_in0_h++) { + for(uint32_t block_in1_w = 0; block_in1_w < num_blocks_in1_w; block_in1_w++) { + enable_reload = false; + //DPRINT << 'B' << ENDL(); + for(uint32_t block_in0_w = 0; block_in0_w < num_blocks_in0_w; block_in0_w++) + { + + bool last_out = block_in0_w == (num_blocks_in0_w-1); + if (tilize_in) { + tilize_activation(in0_cb, in0_subblock_h, in0_block_w, in0_num_subblocks, tilize_mode_tilized_in0_cb); + mm_init_short(); + cb_wait_front(tilize_mode_tilized_in0_cb, in0_block_num_tiles); + + } else { + cb_wait_front(in0_cb, in0_block_num_tiles); + } + + cb_wait_front(tt::CB::c_in1, in1_block_num_tiles); + + int in0_index_subblock_offset = 0; + for (uint32_t in0_subblock = 0; in0_subblock < in0_num_subblocks; in0_subblock++) { + int in1_index_subblock_offset = 0; + for (uint32_t in1_subblock = 0; in1_subblock < in1_num_subblocks; in1_subblock++) { + + acquire_dst(tt::DstMode::Half); + + if (enable_reload) { + copy_tile_to_dst_init_short(); + cb_wait_front(matmul_partials_cb, out_subblock_num_tiles); + for (uint32_t i = 0; i < out_subblock_num_tiles; i++) { + copy_tile(matmul_partials_cb, i, i); + } + cb_pop_front(matmul_partials_cb, out_subblock_num_tiles); + mm_init_short(); + } + + // Compute output sub-block from in0_subblock x in1_subblock + int dst_index = 0; + int in0_index_h_offset = 0; + for (uint32_t h = 0; h < out_subblock_h; h++) { + for (uint32_t w = 0; w < out_subblock_w; w++) { + int in1_index_inner_dim_offset = 0; + for (uint32_t inner_dim = 0; inner_dim < in0_block_w; inner_dim++) { + int in0_index = in0_index_subblock_offset + in0_index_h_offset + inner_dim; + int in1_index = in1_index_subblock_offset + in1_index_inner_dim_offset + w; + if (tilize_in) { + matmul_tiles(tilize_mode_tilized_in0_cb, tt::CB::c_in1, in0_index, in1_index, dst_index, false /* transpose */); + } else { + 
matmul_tiles(in0_cb, tt::CB::c_in1, in0_index, in1_index, dst_index, false /* transpose */); + } + in1_index_inner_dim_offset += in1_per_core_w; + } + dst_index++; + } + in0_index_h_offset += in0_block_w; + } + if (last_out) { + if (not untilize_out) { + pack_matmul_subblock(out0_cb, out_subblock_num_tiles); + } else { + pack_matmul_subblock(untilize_mode_final_matmul_partials_cb, out_subblock_num_tiles); + } + } else { + pack_matmul_subblock(matmul_partials_cb, out_subblock_num_tiles); + } + release_dst(tt::DstMode::Half); + + in1_index_subblock_offset += out_subblock_w; + } + + if (untilize_out) { + if (last_out) { + reblock_and_untilize( + in1_num_subblocks, + out_subblock_num_tiles, + out_subblock_h, + out_subblock_w, + out_block_w, + untilize_mode_final_matmul_partials_cb, + untilize_mode_reblock_cb, + out0_cb + ); + mm_init_short(); + } + } + + + in0_index_subblock_offset += in0_subblock_num_tiles; + } + + if (spill) enable_reload = true; + + if (tilize_in) { + cb_pop_front(tilize_mode_tilized_in0_cb, in0_block_num_tiles); + } else { + cb_pop_front(in0_cb, in0_block_num_tiles); + } + cb_pop_front(tt::CB::c_in1, in1_block_num_tiles); + } + } + + } + + +} +} diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/matmul_large_block_zm.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/matmul_large_block_zm.cpp new file mode 100644 index 00000000000..c1fd470c266 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/matmul_large_block_zm.cpp @@ -0,0 +1,102 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "compute_kernel_api/tile_move_copy.h" +#include "compute_kernel_api/matmul.h" + +// #include "tools/profiler/kernel_profiler.hpp" +namespace NAMESPACE { +void MAIN { + + uint32_t in0_block_w = get_compile_time_arg_val(0); // inner block size in tiles + uint32_t in0_num_subblocks = get_compile_time_arg_val(1); // outer row block size (in inner row blocks) + uint32_t in0_block_num_tiles = get_compile_time_arg_val(2); // out_subblock_h*in0_block_w*in0_num_subblocks; + uint32_t in0_subblock_num_tiles = get_compile_time_arg_val(3); // out_subblock_h*in0_block_w + uint32_t in1_num_subblocks = get_compile_time_arg_val(4); // outer column block size (in inner column blocks) + uint32_t in1_block_num_tiles = get_compile_time_arg_val(5); //out_subblock_w*in0_block_w* in1_num_subblocks; + uint32_t in1_per_core_w = get_compile_time_arg_val(6); // out_subblock_w*in1_num_subblocks + uint32_t num_blocks = get_compile_time_arg_val(7); // outer inner dim (in inner dim blocks) + uint32_t out_subblock_h = get_compile_time_arg_val(8); // inner row block size in tiles + uint32_t out_subblock_w = get_compile_time_arg_val(9); // inner column block size in tiles + uint32_t out_subblock_num_tiles = get_compile_time_arg_val(10); // out_subblock_h * out_subblock_w; + + bool spill = num_blocks > uint32_t(1); + + mm_init(); + bool enable_reload = false; + + for(uint32_t block = 0; block < num_blocks; block++) + { + bool last_out = block == (num_blocks-1); + + cb_wait_front(tt::CB::c_in0, in0_block_num_tiles); + cb_wait_front(tt::CB::c_in1, in1_block_num_tiles); + int in0_index_subblock_offset = 0; + for (uint32_t in0_subblock = 0; in0_subblock < in0_num_subblocks; in0_subblock++) { + // kernel_profiler::mark_time(6); + int in1_index_subblock_offset = 0; + for (uint32_t in1_subblock = 0; in1_subblock < in1_num_subblocks; in1_subblock++) { + + acquire_dst(tt::DstMode::Half); + + if (enable_reload) { + copy_tile_to_dst_init_short(); + 
cb_wait_front(tt::CB::c_intermed0, out_subblock_num_tiles); + for (uint32_t i = 0; i < out_subblock_num_tiles; i++) { + copy_tile(tt::CB::c_intermed0, i, i); + } + cb_pop_front(tt::CB::c_intermed0, out_subblock_num_tiles); + mm_init_short(); + } + + // Compute output sub-block from in0_subblock x in1_subblock + int dst_index = 0; + int in0_index_h_offset = 0; + for (uint32_t h = 0; h < out_subblock_h; h++) { + for (uint32_t w = 0; w < out_subblock_w; w++) { + int in1_index_inner_dim_offset = 0; + for (uint32_t inner_dim = 0; inner_dim < in0_block_w; inner_dim++) { + int in0_index = in0_index_subblock_offset + in0_index_h_offset + inner_dim; + int in1_index = in1_index_subblock_offset + in1_index_inner_dim_offset + w; + matmul_tiles(tt::CB::c_in0, tt::CB::c_in1, in0_index, in1_index, dst_index, false /* transpose */); + in1_index_inner_dim_offset += in1_per_core_w; + } + dst_index++; + } + in0_index_h_offset += in0_block_w; + } + + if (last_out) { + // Pack out to output buffer + cb_reserve_back(tt::CB::c_out0, out_subblock_num_tiles); + for (uint32_t i = 0; i < out_subblock_num_tiles; i++) { + pack_tile(i, tt::CB::c_out0); + } + cb_push_back(tt::CB::c_out0, out_subblock_num_tiles); + } else { + // Move partial result to interm buffer + cb_reserve_back(tt::CB::c_intermed0, out_subblock_num_tiles); + for (uint32_t i = 0; i < out_subblock_num_tiles; i++) { + pack_tile(i, tt::CB::c_intermed0); + } + cb_push_back(tt::CB::c_intermed0, out_subblock_num_tiles); + } + + release_dst(tt::DstMode::Half); + in1_index_subblock_offset += out_subblock_w; + } + in0_index_subblock_offset += in0_subblock_num_tiles; + } + + if (spill) enable_reload = true; + + cb_pop_front(tt::CB::c_in0, in0_block_num_tiles); + cb_pop_front(tt::CB::c_in1, in1_block_num_tiles); + + } + +} +} diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/matmul_with_bias.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/matmul_with_bias.cpp new file mode 100644 index 00000000000..560aba53a77 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/matmul_with_bias.cpp @@ -0,0 +1,93 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
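The matmul_large_block_zm kernel above shows the bare spill/reload pattern: after every inner-dim block except the last, the accumulated sub-block is packed to c_intermed0 and copied back into DST before the next block's matmul_tiles calls. A small host-side model of that control flow (scalars stand in for tiles, names and sizes are hypothetical, and none of this is the device API):

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

int main() {
    const uint32_t num_blocks = 3;                 // inner-dim blocks (hypothetical)
    const uint32_t out_subblock_num_tiles = 4;
    std::vector<float> dst(out_subblock_num_tiles);        // models DST registers
    std::vector<float> intermed(out_subblock_num_tiles);   // models c_intermed0
    std::vector<float> out(out_subblock_num_tiles);        // models c_out0
    bool enable_reload = false;

    for (uint32_t block = 0; block < num_blocks; ++block) {
        const bool last_out = (block == num_blocks - 1);
        std::fill(dst.begin(), dst.end(), 0.0f);    // acquire_dst: clean accumulator
        if (enable_reload)
            dst = intermed;                         // copy_tile: c_intermed0 -> DST
        for (uint32_t i = 0; i < out_subblock_num_tiles; ++i)
            dst[i] += 1.0f;                         // stands in for one block of matmul_tiles
        if (last_out) out = dst;                    // pack to c_out0
        else          intermed = dst;               // pack partials to c_intermed0
        enable_reload = true;                       // spill => reload from now on
    }
    for (float v : out) assert(v == static_cast<float>(num_blocks));
    return 0;
}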
+// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#define BCAST_LLKOP ELWADD +#define BCAST_DIM BroadcastType::ROW + +#include "compute_kernel_api/matmul.h" +#include "compute_kernel_api/bcast.h" + +namespace NAMESPACE { +void MAIN { + + + uint32_t block_tile_dim = get_compile_time_arg_val(0); + uint32_t dst_tile_rows = get_compile_time_arg_val(1); + uint32_t dst_tile_cols = get_compile_time_arg_val(2); + uint32_t block_cnt = get_compile_time_arg_val(3); + uint32_t in0_block_tile_cnt = get_compile_time_arg_val(4); + uint32_t in1_block_tile_cnt = get_compile_time_arg_val(5); + uint32_t out_block_tile_cnt = get_compile_time_arg_val(6); + uint32_t with_bias = get_compile_time_arg_val(7); + + + acquire_dst(tt::DstMode::Full); + + mm_init(); + for(uint32_t b=0;b(tt::HlkOperand::intermed0, tt::HlkOperand::in2, dst_tile_index, c, dst_tile_index); + dst_tile_index++; + } + } + cb_pop_front(tt::CB::c_in2, dst_tile_cols); + } + + // Pack to c_out0 + cb_reserve_back(tt::CB::c_out0, out_block_tile_cnt); + for(uint32_t i=0;i + +// #include "compute_kernel_api.h" +#include "compute_kernel_api/tilize.h" +#include "compute_kernel_api/reduce.h" +// #include "tools/profiler/kernel_profiler.hpp" + +#define DEBUG_PRINT 0 + +#if DEBUG_PRINT == 1 + #include "debug_macros.h" + + SliceRange srt = SliceRange{.h0 = 0, .h1 = 32, .hs = 8, .w0 = 0, .w1 = 32, .ws = 4}; + SliceRange srr = SliceRange{.h0 = 0, .h1 = 1, .hs = 8, .w0 = 0, .w1 = 32, .ws = 1}; + SliceRange srr1 = SliceRange{.h0 = 1, .h1 = 2, .hs = 8, .w0 = 0, .w1 = 32, .ws = 1}; + SliceRange src = SliceRange{.h0 = 0, .h1 = 32, .hs = 1, .w0 = 0, .w1 = 1, .ws = 1}; + + inline void print_full_tile(uint32_t cb_id, uint32_t tile_id = 0, bool untilize = false) { + PDPRINT("======"); + for (int32_t r = 0; r < 32; ++ r) { + SliceRange sr = SliceRange{.h0 = r, .h1 = r+1, .hs = 1, .w0 = 0, .w1 = 32, .ws = 1}; + PDPRINT((uint)r << TileSlice(cb_id, tile_id, sr, true, untilize)); + } + PDPRINT("++++++"); + } + + inline void print_cb_details(uint32_t cb_id) { + PDPRINT("cb_id " << cb_id << ": { " + << "size: " << cb_interface[cb_id].fifo_size << ", " + << "limit: " << cb_interface[cb_id].fifo_limit << ", " + << "page_size: " << cb_interface[cb_id].fifo_page_size << ", " + << "num_pages: " << cb_interface[cb_id].fifo_num_pages << ", " + << "rd_ptr: " << cb_interface[cb_id].fifo_rd_ptr << ", " + << "wr_ptr: " << cb_interface[cb_id].fifo_wr_ptr << ", " + << "wr_tile_ptr: " << cb_interface[cb_id].fifo_wr_tile_ptr << " }"); + } +#endif + +inline void tilize(uint32_t out_nelems, + uint32_t in_cb_id, + uint32_t in_ntiles_hw, + uint32_t in_ntiles_c, + uint32_t in_ntiles_hwc, + uint32_t window_hw_padded, + uint32_t out_cb_id) { + tilize_init_short(in_cb_id, in_ntiles_hwc); + for (uint32_t out_elem_i = 0; out_elem_i < out_nelems; ++ out_elem_i) { + cb_wait_front(in_cb_id, 1); + cb_reserve_back(out_cb_id, in_ntiles_hwc); + tilize_block(in_cb_id, in_ntiles_hwc, out_cb_id); // TODO: need to ensure the ordering for reduction when in_ntiles_hw > 1 + // print_full_tile(in_cb_id, 0, false); + // PDPRINT("OUT TILE :: " << TileSlice(out_cb_id, 0, srr, true, true)); + // print_cb_details(in_cb_id); + cb_push_back(out_cb_id, in_ntiles_hwc); + cb_pop_front(in_cb_id, 1); + } + tilize_uninit(); +} + +inline void reduce_h(uint32_t out_nelems, + uint32_t in_cb_id, + uint32_t in_scalar_cb_id, + uint32_t in_ntiles_hw, + uint32_t in_ntiles_c, + uint32_t in_ntiles_hwc, + uint32_t out_ntiles_c, + uint32_t out_cb_id) { + cb_wait_front(in_cb_id, in_ntiles_hwc * out_nelems); + 
cb_reserve_back(out_cb_id, out_ntiles_c * out_nelems); + reduce_init_delta(PoolType::MAX, ReduceDim::REDUCE_COL, out_cb_id); + uint32_t base_tile_id = 0; + for (uint32_t c_i = 0; c_i < in_ntiles_c * out_nelems; ++c_i) { + // add to accumulator all the in_ntiles_hw in a column of tiles + acquire_dst(tt::DstMode::Half); + uint32_t dst_i = 0; // TODO [AS]: Use more than one dst tile at a time + for (uint32_t hw_i = 0; hw_i < in_ntiles_hw; ++hw_i) { + uint32_t tile_i = base_tile_id + hw_i; + reduce_tile(PoolType::MAX, ReduceDim::REDUCE_COL, in_cb_id, in_scalar_cb_id, tile_i, 0, dst_i); + } + pack_tile(dst_i, out_cb_id); + release_dst(tt::DstMode::Half); + base_tile_id += in_ntiles_hw; + } + reduce_revert_delta(out_cb_id); + cb_push_back(out_cb_id, out_ntiles_c * out_nelems); + cb_pop_front(in_cb_id, in_ntiles_hwc * out_nelems); +} + +namespace NAMESPACE { + +void MAIN { + constexpr uint32_t in_cb_id = tt::CB::c_in0; + constexpr uint32_t in_scalar_cb_id = tt::CB::c_in1; + constexpr uint32_t in_tiled_cb_id = tt::CB::c_intermed0; + constexpr uint32_t out_cb_id = tt::CB::c_out0; + + const uint32_t in_ntiles_hw = get_compile_time_arg_val(0); + const uint32_t in_ntiles_c = get_compile_time_arg_val(1); + const uint32_t in_ntiles_hwc = get_compile_time_arg_val(2); + const uint32_t window_hw_padded = get_compile_time_arg_val(3); + const uint32_t out_h = get_compile_time_arg_val(4); + const uint32_t out_w = get_compile_time_arg_val(5); + const uint32_t out_ntiles_c = get_compile_time_arg_val(7); + const uint32_t out_nelems = get_compile_time_arg_val(8); + const uint32_t out_w_loop_count = get_compile_time_arg_val(9); + const uint32_t nbatch = get_compile_time_arg_val(10); + const uint32_t out_h_per_core = get_compile_time_arg_val(11); + + tilize_init(in_cb_id, in_ntiles_hwc, in_tiled_cb_id); + + #if DEBUG_PRINT == 1 + print_cb_details(in_cb_id); + print_cb_details(in_scalar_cb_id); + print_cb_details(in_tiled_cb_id); + print_cb_details(out_cb_id); + #endif + + cb_wait_front(in_scalar_cb_id, 1); + for (uint32_t batch = 0; batch < nbatch; ++ batch) { + for (uint32_t out_h_i = 0; out_h_i < out_h_per_core; ++out_h_i) { + for (uint32_t out_w_i = 0; out_w_i < out_w_loop_count; ++out_w_i) { + // NOTE: Assuming in_ntiles_hw < 8 for now. + // TODO: subblocking to support this. + // kernel_profiler::mark_time(11); + // UDPRINT('T' << out_w_i); + // tilize + tilize(out_nelems, in_cb_id, in_ntiles_hw, in_ntiles_c, in_ntiles_hwc, window_hw_padded, in_tiled_cb_id); + // UDPRINT('R' << out_w_i); + // Reduce H + reduce_h(out_nelems, in_tiled_cb_id, in_scalar_cb_id, in_ntiles_hw, in_ntiles_c, in_ntiles_hwc, out_ntiles_c, out_cb_id); + // kernel_profiler::mark_time(12); + } + } + } + cb_pop_front(in_scalar_cb_id, 1); +} + +} // namespace NAMESPACE diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/max_pool_multi_core.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/max_pool_multi_core.cpp new file mode 100644 index 00000000000..d06c4094d50 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/max_pool_multi_core.cpp @@ -0,0 +1,138 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
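The max_pool kernel above stages each (padded) pooling window as row-major sticks, tilizes them, and then reduces MAX down the resulting column of tiles, one channel tile at a time. A plain-loop host reference of that net effect, with made-up sizes and no claim about the exact on-device data layout:

#include <algorithm>
#include <cassert>
#include <vector>

int main() {
    const int window_hw = 9;     // e.g. a 3x3 kernel, flattened (window_hw_padded in the kernel)
    const int channels  = 4;
    // Input staged as [window_hw][channels], as the reader lays out one output element's window.
    std::vector<float> win(window_hw * channels);
    for (int i = 0; i < window_hw * channels; ++i) win[i] = static_cast<float>(i % 7);

    std::vector<float> out(channels, -1e30f);
    for (int c = 0; c < channels; ++c)            // one pooled value per channel
        for (int hw = 0; hw < window_hw; ++hw)    // MAX down the window (the REDUCE_COL direction)
            out[c] = std::max(out[c], win[hw * channels + c]);

    assert(out[0] >= 0.0f);
    return 0;
}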
+// +// SPDX-License-Identifier: Apache-2.0 + +#include + +// #include "compute_kernel_api.h" +#include "compute_kernel_api/tilize.h" +#include "compute_kernel_api/reduce.h" +// #include "tools/profiler/kernel_profiler.hpp" + +#define DEBUG_PRINT 0 + +#if DEBUG_PRINT == 1 + #include "debug_macros.h" + + SliceRange srt = SliceRange{.h0 = 0, .h1 = 32, .hs = 8, .w0 = 0, .w1 = 32, .ws = 4}; + SliceRange srr = SliceRange{.h0 = 0, .h1 = 1, .hs = 8, .w0 = 0, .w1 = 32, .ws = 1}; + SliceRange srr1 = SliceRange{.h0 = 1, .h1 = 2, .hs = 8, .w0 = 0, .w1 = 32, .ws = 1}; + SliceRange src = SliceRange{.h0 = 0, .h1 = 32, .hs = 1, .w0 = 0, .w1 = 1, .ws = 1}; + + inline void print_full_tile(uint32_t cb_id, uint32_t tile_id = 0, bool untilize = false) { + PDPRINT("======"); + for (int32_t r = 0; r < 32; ++ r) { + SliceRange sr = SliceRange{.h0 = r, .h1 = r+1, .hs = 1, .w0 = 0, .w1 = 32, .ws = 1}; + PDPRINT((uint)r << TileSlice(cb_id, tile_id, sr, true, untilize)); + } + PDPRINT("++++++"); + } + + inline void print_cb_details(uint32_t cb_id) { + PDPRINT("cb_id " << cb_id << ": { " + << "size: " << cb_interface[cb_id].fifo_size << ", " + << "limit: " << cb_interface[cb_id].fifo_limit << ", " + << "page_size: " << cb_interface[cb_id].fifo_page_size << ", " + << "num_pages: " << cb_interface[cb_id].fifo_num_pages << ", " + << "rd_ptr: " << cb_interface[cb_id].fifo_rd_ptr << ", " + << "wr_ptr: " << cb_interface[cb_id].fifo_wr_ptr << ", " + << "wr_tile_ptr: " << cb_interface[cb_id].fifo_wr_tile_ptr << " }"); + } +#endif + +inline void tilize(uint32_t out_nelems, + uint32_t in_cb_id, + uint32_t in_ntiles_hw, + uint32_t in_ntiles_c, + uint32_t in_ntiles_hwc, + uint32_t window_hw_padded, + uint32_t out_cb_id) { + tilize_init_short(in_cb_id, in_ntiles_hwc); + for (uint32_t out_elem_i = 0; out_elem_i < out_nelems; ++ out_elem_i) { + cb_wait_front(in_cb_id, 1); + cb_reserve_back(out_cb_id, in_ntiles_hwc); + tilize_block(in_cb_id, in_ntiles_hwc, out_cb_id); // TODO: need to ensure the ordering for reduction when in_ntiles_hw > 1 + // print_full_tile(in_cb_id, 0, false); + // PDPRINT("OUT TILE :: " << TileSlice(out_cb_id, 0, srr, true, true)); + // print_cb_details(in_cb_id); + cb_push_back(out_cb_id, in_ntiles_hwc); + cb_pop_front(in_cb_id, 1); + } + tilize_uninit(); +} + +inline void reduce_h(uint32_t out_nelems, + uint32_t in_cb_id, + uint32_t in_scalar_cb_id, + uint32_t in_ntiles_hw, + uint32_t in_ntiles_c, + uint32_t in_ntiles_hwc, + uint32_t out_ntiles_c, + uint32_t out_cb_id) { + cb_wait_front(in_cb_id, in_ntiles_hwc * out_nelems); + cb_reserve_back(out_cb_id, out_ntiles_c * out_nelems); + reduce_init_delta(PoolType::MAX, ReduceDim::REDUCE_COL, out_cb_id); + uint32_t base_tile_id = 0; + for (uint32_t c_i = 0; c_i < in_ntiles_c * out_nelems; ++c_i) { + // add to accumulator all the in_ntiles_hw in a column of tiles + acquire_dst(tt::DstMode::Half); + uint32_t dst_i = 0; // TODO [AS]: Use more than one dst tile at a time + for (uint32_t hw_i = 0; hw_i < in_ntiles_hw; ++hw_i) { + uint32_t tile_i = base_tile_id + hw_i; + reduce_tile(PoolType::MAX, ReduceDim::REDUCE_COL, in_cb_id, in_scalar_cb_id, tile_i, 0, dst_i); + } + pack_tile(dst_i, out_cb_id); + release_dst(tt::DstMode::Half); + base_tile_id += in_ntiles_hw; + } + reduce_revert_delta(out_cb_id); + cb_push_back(out_cb_id, out_ntiles_c * out_nelems); + cb_pop_front(in_cb_id, in_ntiles_hwc * out_nelems); +} + +namespace NAMESPACE { + +void MAIN { + constexpr uint32_t in_cb_id = tt::CB::c_in0; + constexpr uint32_t in_scalar_cb_id = tt::CB::c_in1; + constexpr uint32_t 
in_tiled_cb_id = tt::CB::c_intermed0; + constexpr uint32_t out_cb_id = tt::CB::c_out0; + + const uint32_t in_ntiles_hw = get_compile_time_arg_val(0); + const uint32_t in_ntiles_c = get_compile_time_arg_val(1); + const uint32_t in_ntiles_hwc = get_compile_time_arg_val(2); + const uint32_t window_hw_padded = get_compile_time_arg_val(3); + const uint32_t out_h = get_compile_time_arg_val(4); + const uint32_t out_w = get_compile_time_arg_val(5); + const uint32_t out_ntiles_c = get_compile_time_arg_val(7); + const uint32_t out_nelems = get_compile_time_arg_val(8); + const uint32_t out_w_loop_count = get_compile_time_arg_val(9); + const uint32_t nbatch = get_compile_time_arg_val(10); + const uint32_t out_h_per_core = get_compile_time_arg_val(11); + const uint32_t nsticks_per_core = get_compile_time_arg_val(12); + const uint32_t nsticks_per_core_by_nblocks = get_compile_time_arg_val(13); + + tilize_init(in_cb_id, in_ntiles_hwc, in_tiled_cb_id); + + #if DEBUG_PRINT == 1 + print_cb_details(in_cb_id); + print_cb_details(in_scalar_cb_id); + print_cb_details(in_tiled_cb_id); + print_cb_details(out_cb_id); + #endif + + cb_wait_front(in_scalar_cb_id, 1); + for (uint32_t i = 0; i < nsticks_per_core_by_nblocks; ++ i) { + // NOTE: Assuming in_ntiles_hw < 8 for now. + // TODO: subblocking to support this. + // kernel_profiler::mark_time(11); + // tilize + tilize(out_nelems, in_cb_id, in_ntiles_hw, in_ntiles_c, in_ntiles_hwc, window_hw_padded, in_tiled_cb_id); + // Reduce H + reduce_h(out_nelems, in_tiled_cb_id, in_scalar_cb_id, in_ntiles_hw, in_ntiles_c, in_ntiles_hwc, out_ntiles_c, out_cb_id); + // kernel_profiler::mark_time(12); + } + cb_pop_front(in_scalar_cb_id, 1); +} + +} // namespace NAMESPACE diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/reduce_h.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/reduce_h.cpp new file mode 100644 index 00000000000..267233258a1 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/reduce_h.cpp @@ -0,0 +1,44 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "debug_print.h" + +#include "compute_kernel_api/reduce.h" + +namespace NAMESPACE { +void MAIN { + + uint32_t Ht = get_compile_time_arg_val(0); + uint32_t Wt = get_compile_time_arg_val(1); + uint32_t NC = get_compile_time_arg_val(2); + + reduce_init(REDUCE_OP, REDUCE_DIM, tt::CB::c_in0, tt::CB::c_in2); + cb_wait_front(tt::CB::c_in2, 1); // scaler tile from the reader + + for (uint32_t nc = 0; nc < NC; nc++) { + + constexpr int onetile = 1; + int reduce_dst_idx = 0; + for(uint32_t wt = 0; wt < Wt; ++wt) { + // tiles are expected to be coming in in NCWH order (H-contiguous) + // reducing in W means out[0][w] = sum(h=0..H-1, in[h][w]) + // in this case we just sequentially add to accumulator all the H-tiles in a column + acquire_dst(tt::DstMode::Half); + for(uint32_t ht = 0; ht < Ht; ++ht) { + cb_wait_front(tt::CB::c_in0, onetile); + // REDUCE_OP is expected to come from add_define + reduce_tile(REDUCE_OP, REDUCE_DIM, tt::CB::c_in0, tt::CB::c_in2, 0, 0, reduce_dst_idx); + cb_pop_front(tt::CB::c_in0, onetile); + } + + cb_reserve_back(tt::CB::c_out0, onetile); + pack_tile(reduce_dst_idx, tt::CB::c_out0); + cb_push_back(tt::CB::c_out0, onetile); + release_dst(tt::DstMode::Half); + } + } +} +} diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/reduce_hw.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/reduce_hw.cpp new file mode 100644 index 00000000000..5c956a4804a --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/reduce_hw.cpp @@ -0,0 +1,42 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "debug_print.h" + +#include "compute_kernel_api/reduce.h" + +namespace NAMESPACE { +void MAIN { + + uint32_t Ht = get_compile_time_arg_val(0); + uint32_t Wt = get_compile_time_arg_val(1); + uint32_t NC = get_compile_time_arg_val(2); + + reduce_init(REDUCE_OP, REDUCE_DIM, tt::CB::c_in0, tt::CB::c_in2); + + cb_wait_front(tt::CB::c_in2, 1); // scaler tile from the reader + for (uint32_t nc = 0; nc < NC; nc++) { + constexpr int onetile = 1; + int reduce_dst_idx = 0; + acquire_dst(tt::DstMode::Half); + for(uint32_t ht = 0; ht < Ht; ++ht) { + // tiles are expected to be coming in in NCHW order (W-contiguous) + // reducing in W means out[h][0] = sum(w=0..W-1, in[h][w]) + // in this case we just sequentially add to accumulator all the W-tiles in a row + for(uint32_t wt = 0; wt < Wt; ++wt) { + cb_wait_front(tt::CB::c_in0, onetile); + // REDUCE_OP/DIM is expected to come from add_define + reduce_tile(REDUCE_OP, REDUCE_DIM, tt::CB::c_in0, tt::CB::c_in2, 0, 0, reduce_dst_idx); + cb_pop_front(tt::CB::c_in0, onetile); + } + } + cb_reserve_back(tt::CB::c_out0, onetile); + pack_tile(reduce_dst_idx, tt::CB::c_out0); + cb_push_back(tt::CB::c_out0, onetile); + release_dst(tt::DstMode::Half); + } +} +} diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/reduce_w.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/reduce_w.cpp new file mode 100644 index 00000000000..64da8ad468d --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/reduce_w.cpp @@ -0,0 +1,44 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
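The reduce kernels above (reduce_h, reduce_hw, and the reduce_w variant that follows) all accumulate tiles along the reduced dimension and fold in the scaler tile supplied by the reader, which is typically 1.0 for a plain sum or 1/N for a mean. A host-side reference with one float standing in for each tile (illustrative sizes only, not the device API):

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
    const uint32_t Ht = 3, Wt = 4;            // hypothetical tile grid for one NC slice
    const float scaler = 1.0f;                // the tile from c_in2 in the kernels above
    std::vector<float> in(Ht * Wt, 2.0f);     // one float per tile, NCHW order

    // reduce_h: out[w] = scaler * sum over h of in[h][w]
    std::vector<float> out_h(Wt, 0.0f);
    for (uint32_t w = 0; w < Wt; ++w)
        for (uint32_t h = 0; h < Ht; ++h)
            out_h[w] += scaler * in[h * Wt + w];

    // reduce_hw: a single value per NC slice
    float out_hw = 0.0f;
    for (float v : in) out_hw += scaler * v;

    assert(out_h[0] == 2.0f * Ht && out_hw == 2.0f * Ht * Wt);
    return 0;
}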
+// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "debug_print.h" + +#include "compute_kernel_api/reduce.h" + +namespace NAMESPACE { +void MAIN { + + uint32_t Ht = get_compile_time_arg_val(0); + uint32_t Wt = get_compile_time_arg_val(1); + uint32_t NC = get_compile_time_arg_val(2); + + reduce_init(REDUCE_OP, REDUCE_DIM, tt::CB::c_in0, tt::CB::c_in2); + + cb_wait_front(tt::CB::c_in2, 1); // scaler tile from the reader + for (uint32_t nc = 0; nc < NC; nc++) { + + constexpr int onetile = 1; + int reduce_dst_idx = 0; + for(uint32_t ht = 0; ht < Ht; ++ht) { + // tiles are expected to be coming in in NCHW order (W-contiguous) + // reducing in W means out[h][0] = sum(w=0..W-1, in[h][w]) + // in this case we just sequentially add to accumulator all the W-tiles in a row + acquire_dst(tt::DstMode::Half); + for(uint32_t wt = 0; wt < Wt; ++wt) { + cb_wait_front(tt::CB::c_in0, onetile); + // REDUCE_OP is expected to come from add_define + reduce_tile(REDUCE_OP, REDUCE_DIM, tt::CB::c_in0, tt::CB::c_in2, 0, 0, reduce_dst_idx); + cb_pop_front(tt::CB::c_in0, onetile); + } + + cb_reserve_back(tt::CB::c_out0, onetile); + pack_tile(reduce_dst_idx, tt::CB::c_out0); + cb_push_back(tt::CB::c_out0, onetile); + release_dst(tt::DstMode::Half); + } + } +} +} diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/rmsnorm.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/rmsnorm.cpp new file mode 100644 index 00000000000..04e21805ff9 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/rmsnorm.cpp @@ -0,0 +1,217 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#define REDUCE_OP PoolType::SUM +#define REDUCE_DIM ReduceDim::REDUCE_ROW + +#define BCAST_LLKOP EltwiseBinaryType::ELWMUL +#define BCAST_DIM BroadcastType::COL + +#include "compute_kernel_api/reduce.h" +#include "compute_kernel_api/bcast.h" +#include "compute_kernel_api/eltwise_binary.h" +#include "compute_kernel_api/layernorm.h" + + +ALWI void ACQ() { acquire_dst(tt::DstMode::Half); } +ALWI void REL() { release_dst(tt::DstMode::Half); } + + +namespace NAMESPACE { +void MAIN { + const uint32_t NCHt = get_arg_val(0); + constexpr uint32_t Wt = get_compile_time_arg_val(0); + constexpr uint32_t blk = get_compile_time_arg_val(1); + constexpr uint32_t do_gamma = get_compile_time_arg_val(2); + constexpr uint32_t do_beta = get_compile_time_arg_val(3); + + + #ifdef FUSE_PRE_ADD + binary_op_init_common(tt::CB::c_in0, tt::CB::c_in1); + #else + binary_op_init_common(tt::CB::c_in0, tt::CB::c_in0); + #endif + + constexpr uint32_t onetile = 1; + // reserve one tile for zeros on cb_in2 + // TODO(AP): check that if DST is indeed zeroed by release_dst (and initially), we can use it as zeroes + + // Note that the entire W dimension must fit in the intermed0 CB for this kernel to be correct + constexpr auto cb_scaler = tt::CB::c_in2; // single tile generated by the reader + constexpr auto cb_eps = tt::CB::c_in3; // single tile generated by the reader + constexpr auto cb_ex = tt::CB::c_intermed1; // E[x] + constexpr auto cb_ex2 = tt::CB::c_intermed2; // E[(x-E[x])^2] + constexpr auto cb_x2 = tt::CB::c_intermed3; // x^2 + constexpr auto cb_ex2pe = tt::CB::c_intermed4; // E[(x-E[x])^2]+eps + constexpr auto cb_in = tt::CB::c_in0; // input x or a for fused pre-add (x=a+b) + constexpr auto cb_inb = tt::CB::c_in1; // input b for fused pre-add + constexpr auto cb_out = tt::CB::c_out0; // output + constexpr auto cb_gamma = tt::CB::c_in5; + constexpr auto cb_beta = tt::CB::c_in6; + constexpr 
auto cb_fusion = tt::CB::c_intermed5; // stream gamma/beta + constexpr auto scaler0 = 0; + #ifdef FUSE_PRE_ADD + constexpr auto cb_x = tt::CB::c_intermed6; + #else + constexpr auto cb_x = tt::CB::c_in0; + #endif + + cb_wait_front(cb_scaler, 1); // comes from the reader + cb_wait_front(cb_eps, 1); // comes from the reader + + + constexpr int cb_im_or_out = (do_gamma|do_beta) ? cb_fusion : tt::CB::c_out0; + + + for (uint32_t ncht = 0; ncht < NCHt; ncht++) { + + constexpr int onetile = 1; + constexpr int dst0 = 0; + + /* + * X + Y + */ + #ifdef FUSE_PRE_ADD + add_tiles_init(); + for (uint32_t wt = 0; wt < Wt; wt += blk) { + ACQ(); + //UNPACK(( { DPRINT << "Waiting on cb_x" << ENDL(); } )); + cb_wait_front(cb_in, blk); + //UNPACK(( { DPRINT << "Waiting on cb_inb" << ENDL(); } )); + cb_wait_front(cb_inb, blk); + //UNPACK(( { DPRINT << "Done Waiting on cb_inb" << ENDL(); } )); + cb_reserve_back(cb_x, blk); + for (uint32_t j = 0; j < blk; j++) { + add_tiles(cb_in, cb_inb, j, j, j); + pack_tile(j, cb_x); + } + REL(); + cb_push_back(cb_x, blk); // push the sum into the same buffer + cb_pop_front(cb_in, blk); + cb_pop_front(cb_inb, blk); + } + // by the end of this loop we should end up with Wt tiles in cb_x + #endif + + /* (x)^2 + * compute temp = x^2 + */ + mul_tiles_init(); + for (uint32_t wt = 0; wt < Wt; wt += blk) { + cb_wait_front(cb_x, wt+blk); + cb_reserve_back(cb_x2, blk); // can probably use less space for this if we block + ACQ(); + for (uint32_t wtr = 0; wtr(REDUCE_OP, REDUCE_DIM); + ACQ(); + cb_wait_front(cb_x2, Wt); + //cb_wait_front(cb_xmm, Wt); + for (uint32_t wt = 0; wt < Wt; wt += blk) { + // reduce + for (uint32_t wtr = 0; wtr + +#define REDUCE_OP PoolType::SUM +#define REDUCE_DIM ReduceDim::REDUCE_ROW + +#include "compute_kernel_api/eltwise_binary.h" +#include "compute_kernel_api/tile_move_copy.h" +#include "compute_kernel_api/bcast.h" +#include "compute_kernel_api/softmax.h" +#include "compute_kernel_api/reduce.h" + +ALWI void ACQ() { acquire_dst(tt::DstMode::Half); } +ALWI void REL() { release_dst(tt::DstMode::Half); } + +// for scale+mask+softmax: +// bcast HW (mul by 1 tile) example: ( [2,1,1024,64] * [1,1,32,32] ) +// bcast add H example: ( [2,1,1024,64] + [2,1,32,64] ) (bcast W -> H) +// Note that the attention mask will not fit in L1 for the entire tensor +// The buffer for the att mask is currently sized as (1t,Wt) so we only reuse it for one HtWt-sized batch of x +// then read another Wt tiles of mask for the next batch + +namespace NAMESPACE { +void MAIN { + + const uint32_t NCHt = get_arg_val(0); + const uint32_t Ht = get_arg_val(1); + const uint32_t Wt = get_arg_val(2); + const uint32_t ndst = get_arg_val(3); + const uint32_t start_ht = get_arg_val(4); + binary_op_init_common(tt::CB::c_in0, tt::CB::c_in2); + + constexpr uint32_t onetile = 1; + // reserve one tile for zeros on cb_in2 + // We only do the reserve for the intermediates once and use pack_tile + // So effectively these are used as pre-allocated arrays + // Note that the entire W dimension must fit in the intermed0 CB for this kernel to be correct + constexpr auto cb_bcast_scaler = tt::CB::c_in2; + constexpr auto cb_fused_scale = tt::CB::c_in3; + constexpr auto cb_fused_attn = tt::CB::c_in4; + constexpr auto cb_exps = tt::CB::c_intermed0; + constexpr auto cb_scale_mask = tt::CB::c_intermed3; + constexpr auto cb_recipsumexps = tt::CB::c_intermed1; + constexpr auto cb_in0 = tt::CB::c_in0; + constexpr auto cb_out0 = tt::CB::c_out0; + + + cb_wait_front(cb_bcast_scaler, 1); // comes from the reader + + #if 
FUSED_SCALE_MASK + cb_wait_front(cb_fused_scale, 1); + #endif + + constexpr int dst0 = 0; + uint32_t ht = start_ht; + bool wait_mask = true; + for (uint32_t ncht = 0; ncht < NCHt; ncht++) { + #if FUSED_SCALE_MASK + for (uint32_t wt = 0; wt < Wt; wt+=ndst) { + // apply fused scale [*= 1/sqrt(...)] + ACQ(); + mul_tiles_bcast_scalar_init_short(); + cb_wait_front(cb_in0, ndst); + cb_reserve_back(cb_scale_mask, ndst); + for (uint32_t wt8 = 0; wt8 < ndst; wt8++) { + mul_tiles_bcast_scalar(cb_in0, cb_fused_scale, wt8, 0, wt8); // mul bcast-HW -> DST[wt8] + pack_tile(wt8, cb_scale_mask); // reuse exps buffer + } + cb_push_back(cb_scale_mask, ndst); + cb_pop_front(cb_in0, ndst); + REL(); + } + + for (uint32_t wt = 0; wt < Wt; wt+=ndst) { + ACQ(); + if (wait_mask) { + cb_wait_front(cb_fused_attn, wt+ndst); // cumulative wait for up to Wt tiles, only at first ht + } + cb_wait_front(cb_scale_mask, ndst); + add_bcast_rows_init_short(); + for (uint32_t wt8 = 0; wt8 < ndst; wt8++) { + add_tiles_bcast_rows(cb_scale_mask, cb_fused_attn, wt8, wt+wt8, wt8); // tile *= 1/(sum(exp(x))) + } + cb_pop_front(cb_scale_mask, ndst); + cb_reserve_back(cb_exps, ndst); + exp_tile_init(true); + for (uint32_t wt8 = 0; wt8 < ndst; wt8++) { + exp_tile(wt8,true); // exp on DST[0] + pack_tile(wt8, cb_exps); // reuse the exps buffer again, this time in a circular manner + } + cb_push_back(cb_exps, ndst); + REL(); + } + if (wait_mask) { + wait_mask = false; + } + ht++; + if (ht == Ht) { + cb_pop_front(cb_fused_attn, Wt); + ht = 0; + wait_mask = true; + } + #else + + for (uint32_t wt = 0; wt < Wt; wt+=ndst) { + + ACQ(); + cb_wait_front(cb_in0, ndst); + copy_tile_init(); // need to copy from CB to DST to be able to run sfpu math + for (uint32_t wt8 = 0; wt8 < ndst; ++wt8) { + copy_tile(cb_in0, wt8, wt8); // copy from c_in[0] to DST[0] + } + cb_pop_front(cb_in0, ndst); + + cb_reserve_back(cb_exps, ndst); + exp_tile_init(true); + for (uint32_t wt8 = 0; wt8 < ndst; ++wt8) { + exp_tile(wt8, true); // exp on DST[0] + pack_tile(wt8, cb_exps); // DST[0]->cb_id[wt] + } + cb_push_back(cb_exps, ndst); + REL(); + } + #endif + + ACQ(); + cb_reserve_back(cb_recipsumexps, onetile); + reduce_init_delta(REDUCE_OP, REDUCE_DIM); + for (uint32_t wt = 0; wt < Wt; wt++) { + cb_wait_front(cb_exps, wt+1); // must be a cumulative wait for correctness + constexpr uint32_t bcast_scaler0 = 0; // 0th index from bcast_scaler CB + reduce_tile(REDUCE_OP, REDUCE_DIM, cb_exps, cb_bcast_scaler, wt, bcast_scaler0, dst0); + } + reduce_revert_delta(); + recip_tile_init(); + recip_tile(dst0); // DST[0] = 1/sum(exp(x)) + pack_tile(dst0, cb_recipsumexps); + cb_push_back(cb_recipsumexps, 1); + + REL(); + + + cb_wait_front(cb_recipsumexps, 1); // will reuse Wt times for bcast + + // now cb_sumexps has exp tiles, need to multiply by our DST[2] + // by now we already did a umulative wait for Wt tiles in cb_exps + mul_bcast_cols_init_short(); + for (uint32_t wt = 0; wt < Wt; wt += ndst) { + ACQ(); + cb_reserve_back(tt::CB::c_out0, ndst); + for (uint32_t wt8 = 0; wt8 < ndst; wt8++) { + // wt+wt8 since we pop Wt after the entire loop + mul_tiles_bcast(cb_exps, cb_recipsumexps, wt+wt8, 0, wt8); // tile *= 1/(sum(exp(x))) + pack_tile(wt8, tt::CB::c_out0); + } + cb_push_back(tt::CB::c_out0, ndst); + REL(); + } + cb_pop_front(cb_recipsumexps, 1); + cb_pop_front(cb_exps, Wt); + } // NCHt loop + //cb_pop_front(cb_bcast_scaler, 1); // we don't actually have to do this + //cb_pop_front(cb_fused_scale, 1); // we don't actually have to do this +} +} diff --git 
a/tests/tt_metal/tt_metal/test_kernels/compute/tilize.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/tilize.cpp new file mode 100644 index 00000000000..2218d5393f5 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/tilize.cpp @@ -0,0 +1,30 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "compute_kernel_api/tilize.h" + +//#include "debug_print.h" + +namespace NAMESPACE { +void MAIN { + + uint32_t per_core_block_cnt = get_compile_time_arg_val(0); + uint32_t per_core_block_tile_cnt = get_compile_time_arg_val(1); + //UNPACK(( DPRINT << "Block count=" << uint32_t(per_core_block_cnt) << " tile count=" << per_core_block_tile_cnt << ENDL() )); + tilize_init(tt::CB::c_in0, per_core_block_tile_cnt, tt::CB::c_out0); + + for(uint32_t b=0;b +#include "compute_kernel_api/tile_move_copy.h" +#include "compute_kernel_api/matmul.h" +#include "compute_kernel_api/tilize.h" +#include "compute_kernel_api/untilize.h" + +using std::uint32_t; + +// matmul C=A*B using dims MK*KN = MN (row major order) +// +namespace NAMESPACE { +void MAIN { + + constexpr uint32_t onetile = 1; + + constexpr uint32_t transpose_hw = get_compile_time_arg_val(0); + uint32_t batch = get_arg_val(0); + uint32_t Mt = get_arg_val(1); + uint32_t Kt = get_arg_val(2); + uint32_t Nt = get_arg_val(3); + + constexpr uint32_t cb_intermed0 = 24; + constexpr uint32_t cb_intermed1 = 25; + constexpr uint32_t cb_intermed2 = 26; + constexpr uint32_t out_cb_id = 16; + + constexpr uint32_t num_rows_in_one_tile = 32; + + mm_init(tt::CB::c_in0, tt::CB::c_in1, out_cb_id, transpose_hw); + + for (uint32_t nb = 0; nb < batch; nb++) + for (uint32_t mt_C = 0; mt_C < Mt; ++mt_C) // output tile of C + for (uint32_t nt_C = 0; nt_C < Nt; ++nt_C) // output tile index of C + { + for (uint32_t tile_row_id = 0; tile_row_id < num_rows_in_one_tile; tile_row_id++) { + acquire_dst(tt::DstMode::Half); + for (uint32_t kt = 0; kt < Kt; kt++) { + if (tile_row_id == 0) { + cb_wait_front(tt::CB::c_in0, kt+1); + } + cb_wait_front(tt::CB::c_in1, onetile); + + matmul_tiles(tt::CB::c_in0, tt::CB::c_in1, kt, 0, 0, transpose_hw); + + cb_pop_front(tt::CB::c_in1, onetile); + } + + cb_reserve_back(cb_intermed0, onetile); + pack_tile(0, cb_intermed0); + release_dst(tt::DstMode::Half); + cb_push_back(cb_intermed0, onetile); + + // untilize tile and write to CB::c_intermed1 + cb_wait_front(cb_intermed0, onetile); + untilize_init_short(cb_intermed0); + cb_reserve_back(cb_intermed1, 1); + untilize_block(cb_intermed0, 1, cb_intermed1); + cb_push_back(cb_intermed1, 1); + + cb_pop_front(cb_intermed0, 1); + untilize_uninit(cb_intermed0); + + mm_init_short(transpose_hw); + } + cb_pop_front(tt::CB::c_in0, Kt); + + // cb_intermed2 comes from reader; untilized row-major tile + cb_wait_front(cb_intermed2, 1); + cb_reserve_back(tt::CB::c_out0, onetile); + + // tilize CB::intermed2 and write to CB::c_out0 + tilize_init_short(cb_intermed2, 1); + tilize_block(cb_intermed2, 1, out_cb_id); + cb_push_back(out_cb_id, 1); + + cb_pop_front(cb_intermed2, 1); + tilize_uninit(); + + mm_init_short(transpose_hw); + } + +} +} // NAMESPACE diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/transpose_wh.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/transpose_wh.cpp new file mode 100644 index 00000000000..42452f19fd4 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/transpose_wh.cpp @@ -0,0 +1,32 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
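tilize_block above converts a row-major block into 32x32 tiles before the math engine consumes it, and untilize_block performs the inverse on the way out. A conceptual host-side sketch of the row-major-to-tiled index mapping; the real hardware layout additionally orders 16x16 faces inside each tile, which this sketch deliberately ignores, so treat it as a model rather than a bit-exact reference:

#include <cassert>
#include <cstdint>
#include <vector>

constexpr uint32_t TILE = 32;

std::vector<float> tilize(const std::vector<float>& rm, uint32_t rows, uint32_t cols) {
    std::vector<float> tiled(rm.size());
    const uint32_t tiles_per_row = cols / TILE;
    for (uint32_t r = 0; r < rows; ++r)
        for (uint32_t c = 0; c < cols; ++c) {
            const uint32_t tile_id = (r / TILE) * tiles_per_row + (c / TILE);
            const uint32_t in_tile = (r % TILE) * TILE + (c % TILE);
            tiled[tile_id * TILE * TILE + in_tile] = rm[r * cols + c];
        }
    return tiled;
}

int main() {
    const uint32_t rows = 32, cols = 64;              // one tile row, two tiles wide
    std::vector<float> rm(rows * cols);
    for (uint32_t i = 0; i < rm.size(); ++i) rm[i] = static_cast<float>(i);
    const auto t = tilize(rm, rows, cols);
    // Element (0, 32) lands at the start of the second tile.
    assert(t[1 * TILE * TILE + 0] == rm[32]);
    return 0;
}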
+// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "compute_kernel_api/transpose_wh.h" + +namespace NAMESPACE { +void MAIN { + + uint32_t NHtWt = get_compile_time_arg_val(0); + transpose_wh_init(tt::CB::c_in0); + + // transpose a row-major block: + // - assumes the tiles come in in column major order from reader + // - uses reader_unary_transpose_wh + // - transpose_wh each tile + for (uint32_t n = 0; n < NHtWt; n++) { + cb_wait_front(tt::CB::c_in0, 1); + cb_reserve_back(tt::CB::c_out0, 1); + + acquire_dst(tt::DstMode::Half); + transpose_wh_tile(tt::CB::c_in0, 0, 0); + pack_tile(0, tt::CB::c_out0); + release_dst(tt::DstMode::Half); + + cb_push_back(tt::CB::c_out0, 1); + cb_pop_front(tt::CB::c_in0, 1); + } +} +} diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/multi_block_compute.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/multi_block_compute.cpp new file mode 100644 index 00000000000..2b31e7b49dd --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/multi_block_compute.cpp @@ -0,0 +1,74 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "compute_kernel_api/matmul.h" +#include "compute_kernel_api.h" + +namespace NAMESPACE { +void MAIN { + const uint32_t in0_cb = get_compile_time_arg_val(0); + const uint32_t in1_cb = get_compile_time_arg_val(1); + const uint32_t out_cb = get_compile_time_arg_val(2); + const uint32_t partials_cb = get_compile_time_arg_val(3); + const uint32_t in0_block_num_tiles = get_compile_time_arg_val(4); + const uint32_t in1_block_num_tiles = get_compile_time_arg_val(5); + const uint32_t out_block_num_tiles = get_compile_time_arg_val(6); + const uint32_t out_r = get_compile_time_arg_val(7); + const uint32_t out_c = get_compile_time_arg_val(8); + const uint32_t in0_k = get_compile_time_arg_val(9); + const uint32_t num_blocks = get_compile_time_arg_val(10); + const bool transpose = false; + const uint32_t last_block_id = num_blocks - 1; + + // we are looking at block + // out = in0[r x k]*in1[k x c] + mm_init(); + for (uint32_t block_id = 0; block_id < num_blocks; block_id++) { + acquire_dst(tt::DstMode::Half); + if (block_id > 0) { + copy_tile_to_dst_init_short(); + cb_wait_front(partials_cb, out_block_num_tiles); + for (uint32_t i = 0; i < out_block_num_tiles; i++) { + copy_tile(partials_cb, i, i); + } + cb_pop_front(partials_cb, out_block_num_tiles); + mm_init_short(); + } + uint32_t out_tile_index = 0; + uint32_t in0_index_r_offset = 0; + cb_wait_front(in0_cb, in0_block_num_tiles); + cb_wait_front(in1_cb, in1_block_num_tiles); + for (uint32_t r = 0; r < out_r; r++) { + for (uint32_t c = 0; c < out_c; c++) { + uint32_t in1_index_c_offset = 0; + for (uint32_t k = 0; k < in0_k; k++) { + int in0_tile_index = in0_index_r_offset + k; + int in1_tile_index = in1_index_c_offset + c; + matmul_tiles(in0_cb, in1_cb, in0_tile_index, in1_tile_index, out_tile_index, transpose); + in1_index_c_offset += k; + } + out_tile_index++; + } + in0_index_r_offset += in0_k; + } + cb_pop_front(in0_cb, in0_block_num_tiles); + cb_pop_front(in1_cb, in1_block_num_tiles); + + for (uint32_t tile_index = 0; tile_index < out_block_num_tiles; tile_index++) { + if (block_id == last_block_id) { + cb_reserve_back(out_cb, out_block_num_tiles); + pack_tile(tile_index, out_cb); + cb_push_back(out_cb, out_block_num_tiles); + } else { + cb_reserve_back(partials_cb, out_block_num_tiles); + pack_tile(tile_index, partials_cb); + 
cb_push_back(partials_cb, out_block_num_tiles); + } + } + release_dst(tt::DstMode::Half); + } +} +} // namespace NAMESPACE diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/multi_tile_compute.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/multi_tile_compute.cpp new file mode 100644 index 00000000000..6190640c8bd --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/multi_tile_compute.cpp @@ -0,0 +1,55 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "compute_kernel_api/matmul.h" +#include "compute_kernel_api.h" + +namespace NAMESPACE { +void MAIN { + const uint32_t in0_cb = get_compile_time_arg_val(0); + const uint32_t in1_cb = get_compile_time_arg_val(1); + const uint32_t out_cb = get_compile_time_arg_val(2); + const uint32_t in0_num_tiles = get_compile_time_arg_val(3); + const uint32_t in1_num_tiles = get_compile_time_arg_val(4); + const uint32_t out_num_tiles = get_compile_time_arg_val(5); + const uint32_t out_r = get_compile_time_arg_val(6); + const uint32_t out_c = get_compile_time_arg_val(7); + const uint32_t in0_k = get_compile_time_arg_val(8); + const bool transpose = false; + + // we are looking at block + // out = in0[r x k]*in1[k x c] + mm_init(); + acquire_dst(tt::DstMode::Half); + + uint32_t out_tile_index = 0; + uint32_t in0_index_r_offset = 0; + cb_wait_front(in0_cb, in0_num_tiles); + cb_wait_front(in1_cb, in1_num_tiles); + for (uint32_t r = 0; r < out_r; r++) { + for (uint32_t c = 0; c < out_c; c++) { + uint32_t in1_index_c_offset = 0; + for (uint32_t k = 0; k < in0_k; k++) { + int in0_tile_index = in0_index_r_offset + k; + int in1_tile_index = in1_index_c_offset + c; + matmul_tiles(in0_cb, in1_cb, in0_tile_index, in1_tile_index, out_tile_index, transpose); + in1_index_c_offset += k; + } + out_tile_index++; + } + in0_index_r_offset += in0_k; + } + cb_pop_front(in0_cb, in0_num_tiles); + cb_pop_front(in1_cb, in1_num_tiles); + + cb_reserve_back(out_cb, out_num_tiles); + for (uint32_t tile_index = 0; tile_index < out_num_tiles; tile_index++) { + pack_tile(tile_index, out_cb); + } + cb_push_back(out_cb, out_num_tiles); + release_dst(tt::DstMode::Half); +} +} // namespace NAMESPACE diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/reader_binary.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/reader_binary.cpp new file mode 100644 index 00000000000..60105fd134c --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/reader_binary.cpp @@ -0,0 +1,50 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
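The unit-test compute kernels above iterate out = in0[out_r x in0_k] * in1[in0_k x out_c] one destination tile at a time. The host-side check below spells out the conventional row-major tile-index mapping (in0: r*in0_k + k, in1: k*out_c + c, out: r*out_c + c) so the kernels' offset bookkeeping can be compared against it; sizes are arbitrary and scalars stand in for tiles:

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
    const uint32_t out_r = 2, out_c = 3, in0_k = 4;   // hypothetical block shape, in tiles
    std::vector<float> in0(out_r * in0_k, 1.0f);
    std::vector<float> in1(in0_k * out_c, 1.0f);
    std::vector<float> out(out_r * out_c, 0.0f);

    for (uint32_t r = 0; r < out_r; ++r)
        for (uint32_t c = 0; c < out_c; ++c)
            for (uint32_t k = 0; k < in0_k; ++k) {
                const uint32_t in0_tile = r * in0_k + k;     // row-major over [out_r x in0_k]
                const uint32_t in1_tile = k * out_c + c;     // row-major over [in0_k x out_c]
                out[r * out_c + c] += in0[in0_tile] * in1[in1_tile];   // matmul_tiles accumulate
            }

    for (float v : out) assert(v == static_cast<float>(in0_k));
    return 0;
}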
+// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "dataflow_api.h" + +void kernel_main() { + const uint32_t in0_cb = get_compile_time_arg_val(0); + const uint32_t in1_cb = get_compile_time_arg_val(1); + uint32_t src0_addr = get_arg_val(0); + uint32_t src0_noc_x = get_arg_val(1); + uint32_t src0_noc_y = get_arg_val(2); + uint32_t src1_addr = get_arg_val(3); + uint32_t src1_noc_x = get_arg_val(4); + uint32_t src1_noc_y = get_arg_val(5); + uint32_t num_tiles = get_arg_val(6); + + // single-tile ublocks + uint32_t ublock_size_bytes_0 = get_tile_size(in0_cb); + uint32_t ublock_size_bytes_1 = get_tile_size(in1_cb); + uint32_t ublock_size_tiles = 1; + + uint32_t l1_write_addr_in0; + uint32_t l1_write_addr_in1; + + // read ublocks from src0/src1 to CB0/CB1, then push ublocks to compute (unpacker) + for (uint32_t i = 0; i < num_tiles; i += ublock_size_tiles) { + uint64_t src0_noc_addr = get_noc_addr(src0_noc_x, src0_noc_y, src0_addr); + uint64_t src1_noc_addr = get_noc_addr(src1_noc_x, src1_noc_y, src1_addr); + + cb_reserve_back(in0_cb, ublock_size_tiles); + cb_reserve_back(in1_cb, ublock_size_tiles); + + l1_write_addr_in0 = get_write_ptr(in0_cb); + l1_write_addr_in1 = get_write_ptr(in1_cb); + + noc_async_read(src0_noc_addr, l1_write_addr_in0, ublock_size_bytes_0); + noc_async_read(src1_noc_addr, l1_write_addr_in1, ublock_size_bytes_1); + + noc_async_read_barrier(); + + cb_push_back(in0_cb, ublock_size_tiles); + cb_push_back(in1_cb, ublock_size_tiles); + + src0_addr += ublock_size_bytes_0; + src1_addr += ublock_size_bytes_1; + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/reader_binary_blocked.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/reader_binary_blocked.cpp new file mode 100644 index 00000000000..69f230a6001 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/reader_binary_blocked.cpp @@ -0,0 +1,50 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
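The reader above leans entirely on the circular-buffer handshake: cb_reserve_back blocks until pages are free, cb_push_back hands them to the consumer, and cb_wait_front/cb_pop_front mirror that on the compute side. A single-threaded toy model of that bookkeeping (the real calls block and synchronize across RISC-V cores; here reserve/wait merely report availability):

#include <cassert>
#include <cstdint>
#include <vector>

struct ToyCB {
    uint32_t capacity;                            // pages
    uint32_t used = 0, wr = 0, rd = 0;
    std::vector<int> pages;
    explicit ToyCB(uint32_t cap) : capacity(cap), pages(cap, 0) {}
    // Reports whether n pages are free (the real cb_reserve_back blocks until they are).
    bool reserve_back(uint32_t n) const { return capacity - used >= n; }
    void push_back(uint32_t n, int value) {       // producer side: publish n pages
        for (uint32_t i = 0; i < n; ++i) pages[(wr + i) % capacity] = value;
        wr = (wr + n) % capacity; used += n;
    }
    // Reports whether n pages are ready (the real cb_wait_front blocks until they are).
    bool wait_front(uint32_t n) const { return used >= n; }
    void pop_front(uint32_t n) { rd = (rd + n) % capacity; used -= n; }
};

int main() {
    ToyCB cb(2);                                   // double-buffered: 2 pages
    uint32_t produced = 0, consumed = 0;
    const uint32_t total = 8;
    while (consumed < total) {
        if (produced < total && cb.reserve_back(1)) { cb.push_back(1, 42); ++produced; }
        if (cb.wait_front(1)) { assert(cb.pages[cb.rd] == 42); cb.pop_front(1); ++consumed; }
    }
    return 0;
}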
+// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "dataflow_api.h" + +void kernel_main() { + const uint32_t in0_cb = get_compile_time_arg_val(0); + const uint32_t in1_cb = get_compile_time_arg_val(1); + uint32_t src0_addr = get_arg_val(0); + uint32_t src0_noc_x = get_arg_val(1); + uint32_t src0_noc_y = get_arg_val(2); + uint32_t src1_addr = get_arg_val(3); + uint32_t src1_noc_x = get_arg_val(4); + uint32_t src1_noc_y = get_arg_val(5); + uint32_t num_blocks = get_arg_val(6); + uint32_t in0_block_tile_cnt = get_arg_val(7); + uint32_t in1_block_tile_cnt = get_arg_val(8); + uint32_t in0_block_size_bytes = get_arg_val(9); + uint32_t in1_block_size_bytes = get_arg_val(10); + + uint32_t l1_write_addr_in0; + uint32_t l1_write_addr_in1; + + for (uint32_t i = 0; i < num_blocks; i++) { + uint64_t src0_noc_addr = get_noc_addr(src0_noc_x, src0_noc_y, src0_addr); + uint64_t src1_noc_addr = get_noc_addr(src1_noc_x, src1_noc_y, src1_addr); + + cb_reserve_back(in0_cb, in0_block_tile_cnt); + cb_reserve_back(in1_cb, in1_block_tile_cnt); + + l1_write_addr_in0 = get_write_ptr(in0_cb); + l1_write_addr_in1 = get_write_ptr(in1_cb); + + noc_async_read(src0_noc_addr, l1_write_addr_in0, in0_block_size_bytes); + noc_async_read(src1_noc_addr, l1_write_addr_in1, in1_block_size_bytes); + + noc_async_read_barrier(); + auto ptr0 = reinterpret_cast (l1_write_addr_in0); + auto ptr1 = reinterpret_cast (l1_write_addr_in1); + + cb_push_back(in0_cb, in0_block_tile_cnt); + cb_push_back(in1_cb, in1_block_tile_cnt); + + src0_addr += in0_block_size_bytes; + src1_addr += in1_block_size_bytes; + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/single_tile_compute.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/single_tile_compute.cpp new file mode 100644 index 00000000000..8792a0af75e --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/single_tile_compute.cpp @@ -0,0 +1,34 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "compute_kernel_api/matmul.h" +#include "compute_kernel_api.h" + +namespace NAMESPACE { +void MAIN { + const uint32_t in0_cb = get_compile_time_arg_val(0); + const uint32_t in1_cb = get_compile_time_arg_val(1); + const uint32_t out_cb = get_compile_time_arg_val(2); + const uint32_t num_in0_tiles = 1; + const uint32_t num_in1_tiles = 1; + const uint32_t num_out_tiles = 1; + const uint32_t in0_tile_index = 0; + const uint32_t in1_tile_index = 0; + const uint32_t out_tile_index = 0; + const bool transpose = false; + mm_init(); + cb_reserve_back(out_cb, num_out_tiles); + acquire_dst(tt::DstMode::Half); + cb_wait_front(in0_cb, num_in0_tiles); + cb_wait_front(in1_cb, num_in1_tiles); + matmul_tiles(in0_cb, in1_cb, in0_tile_index, in1_tile_index, out_tile_index, transpose); + pack_tile(0, out_cb); + cb_pop_front(in0_cb, num_in0_tiles); + cb_pop_front(in1_cb, num_in1_tiles); + release_dst(tt::DstMode::Half); + cb_push_back(out_cb, num_out_tiles); +} +} // namespace NAMESPACE diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/writer_unary.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/writer_unary.cpp new file mode 100644 index 00000000000..3caec0ae567 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/writer_unary.cpp @@ -0,0 +1,30 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include "dataflow_api.h" + +void kernel_main() { + const uint32_t out_cb = get_compile_time_arg_val(0); + uint32_t dst_addr = get_arg_val(0); + uint32_t dst_noc_x = get_arg_val(1); + uint32_t dst_noc_y = get_arg_val(2); + uint32_t num_tiles = get_arg_val(3); + + // single-tile ublocks + uint32_t ublock_size_bytes = get_tile_size(out_cb); + uint32_t ublock_size_tiles = 1; + + for (uint32_t i = 0; i < num_tiles; i += ublock_size_tiles) { + uint64_t dst_noc_addr = get_noc_addr(dst_noc_x, dst_noc_y, dst_addr); + + cb_wait_front(out_cb, ublock_size_tiles); + uint32_t l1_read_addr = get_read_ptr(out_cb); + noc_async_write(l1_read_addr, dst_noc_addr, ublock_size_bytes); + + noc_async_write_barrier(); + + cb_pop_front(out_cb, ublock_size_tiles); + dst_addr += ublock_size_bytes; + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/untilA_elwbin_3m.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/untilA_elwbin_3m.cpp new file mode 100644 index 00000000000..56be069bed3 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/untilA_elwbin_3m.cpp @@ -0,0 +1,127 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#define ELTWISE_OP_CODE 0 // TODO(AP): temporary - refactor + +#include "compute_kernel_api/eltwise_unary/sfpu_split_includes.h" + +#include "compute_kernel_api/eltwise_binary.h" +#include "compute_kernel_api.h" + +namespace NAMESPACE { + +#ifdef TRISC_MATH +#include +#include "llk_math_common.h" +#include "llk_math_eltwise_binary.h" +#include "llk_math_eltwise_unary_datacopy.h" + +void math_main() +{ + uint32_t per_core_num_blocks = get_compile_time_arg_val(0); + uint32_t per_core_block_r_tiles = get_compile_time_arg_val(1); + uint32_t per_core_block_c_tiles = get_compile_time_arg_val(2); + + llk_math_pack_sync_init(); + for (uint32_t block = 0; block < per_core_num_blocks; block++) { + for (uint32_t r = 0; r < per_core_block_r_tiles; r++) { + // Untilize + llk_math_eltwise_unary_datacopy_init(); + for (uint32_t c = 0; c < per_core_block_c_tiles; c++) { + llk_math_wait_for_dest_available(); + llk_math_eltwise_unary_datacopy(0); + llk_math_dest_section_done(); + } + + llk_math_eltwise_binary_init(); + for (uint32_t c = 0; c < per_core_block_c_tiles; c++) { + llk_math_wait_for_dest_available(); + llk_math_eltwise_binary(0); + llk_math_dest_section_done(); + } + } + } +} +#endif + +#ifdef TRISC_UNPACK +#include +#include "llk_unpack_common.h" +#include "llk_unpack_AB.h" +#include "llk_unpack_untilize.h" + +void unpack_main() +{ +uint32_t per_core_num_blocks = get_compile_time_arg_val(0); +uint32_t per_core_block_r_tiles = get_compile_time_arg_val(1); +uint32_t per_core_block_c_tiles = get_compile_time_arg_val(2); + +llk_setup_operands(); +llk_unpack_AB_hw_configure_disaggregated(0,1); +// llk_unpack_untilize_hw_configure_disaggregated(0); + +// llk_unpack_untilize_init(0); +for (uint32_t block = 0U; block < per_core_num_blocks; ++block) { + for (uint32_t r = 0; r < per_core_block_r_tiles; r++) { + llk_unpack_untilize_init(0); + llk_wait_tiles(0, per_core_block_c_tiles); + llk_unpack_untilize(0, per_core_block_c_tiles); + llk_unpack_untilize_uninit(0); + llk_pop_tiles(0, per_core_block_c_tiles); + llk_pop_tiles(1, per_core_block_c_tiles); + + llk_unpack_AB_init(); + for (uint32_t c = 0; c < per_core_block_c_tiles; c++) { + llk_wait_tiles(24, 1); + llk_wait_tiles(1, 1); + llk_unpack_AB(24, 1, 0, 0); + llk_pop_tiles(24, 1); + llk_pop_tiles(1, 1); + } + } +} +} +#endif 
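The TRISC_MATH and TRISC_UNPACK sections above, together with the TRISC_PACK section that follows, hand-build a fused op whose net effect is: untilize operand A block by block, then apply an elementwise binary op (ELWADD here) against operand B. A scalar host reference of that net effect, with all layout detail omitted and purely for illustration:

#include <cassert>
#include <cstddef>
#include <vector>

std::vector<float> untilA_elwadd_reference(const std::vector<float>& a_row_major,
                                           const std::vector<float>& b_row_major) {
    // Once A has been untilized to row-major, the fused op reduces to a plain elementwise add.
    std::vector<float> out(a_row_major.size());
    for (std::size_t i = 0; i < out.size(); ++i) out[i] = a_row_major[i] + b_row_major[i];
    return out;
}

int main() {
    std::vector<float> a(32 * 32, 1.0f), b(32 * 32, 2.0f);   // one tile's worth of data
    const auto out = untilA_elwadd_reference(a, b);
    assert(out[0] == 3.0f);
    return 0;
}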
+ + +#ifdef TRISC_PACK +#include +#include "llk_pack_common.h" +#include "llk_pack.h" + +void pack_main() +{ + uint32_t per_core_num_blocks = get_compile_time_arg_val(0); + uint32_t per_core_block_r_tiles = get_compile_time_arg_val(1); + uint32_t per_core_block_c_tiles = get_compile_time_arg_val(2); + llk_pack_init(); + llk_pack_hw_configure_disaggregated(16); + llk_setup_outputs(); + llk_pack_dest_init(); + + for (uint32_t block = 0; block < per_core_num_blocks; block++) { + for (uint32_t r = 0; r < per_core_block_r_tiles; r++) { + llk_wait_for_free_tiles(24, per_core_block_c_tiles); + for (uint32_t c = 0; c < per_core_block_c_tiles; c++) { + llk_packer_wait_for_math_done(); + llk_pack(0,24); + llk_pack_dest_section_done(); + } + llk_push_tiles(24, per_core_block_c_tiles); + + llk_wait_for_free_tiles(16, per_core_block_c_tiles); + for (uint32_t c = 0; c < per_core_block_c_tiles; c++) { + llk_packer_wait_for_math_done(); + llk_pack(0,16); + llk_pack_dest_section_done(); + } + llk_push_tiles(16, per_core_block_c_tiles); + } + } +} +#endif + +} // NAMESPACE diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/untilize.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/untilize.cpp new file mode 100644 index 00000000000..df416389dfc --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/untilize.cpp @@ -0,0 +1,29 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "compute_kernel_api/untilize.h" +//#include "debug_print.h" + +namespace NAMESPACE { +void MAIN { + + uint32_t per_core_block_cnt = get_compile_time_arg_val(0); + uint32_t per_core_block_tile_cnt = get_compile_time_arg_val(1); + untilize_init(tt::CB::c_in0); + + //UNPACK(( DPRINT << "Block count=" << uint32_t(per_core_block_cnt) << " tile count=" << per_core_block_tile_cnt << ENDL() )); + + for(uint32_t b = 0; b < per_core_block_cnt; ++ b) { + cb_wait_front(tt::CB::c_in0, per_core_block_tile_cnt); + cb_reserve_back(tt::CB::c_out0, per_core_block_tile_cnt); + + untilize_block(tt::CB::c_in0, per_core_block_tile_cnt, tt::CB::c_out0); + + cb_push_back(tt::CB::c_out0, per_core_block_tile_cnt); + cb_pop_front(tt::CB::c_in0, per_core_block_tile_cnt); + } +} +} diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/update_cache.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/update_cache.cpp new file mode 100644 index 00000000000..007f4ae618f --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/compute/update_cache.cpp @@ -0,0 +1,59 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "compute_kernel_api/common.h" +#include "compute_kernel_api/untilize.h" +#include "compute_kernel_api/tilize.h" + + +namespace NAMESPACE { +void MAIN { + constexpr uint32_t onetile = 1; + + constexpr uint32_t cache_cb = get_compile_time_arg_val(0); + constexpr uint32_t in_cb = get_compile_time_arg_val(1); + constexpr uint32_t untilized_cache_cb = get_compile_time_arg_val(2); + constexpr uint32_t untilized_cache2_cb = get_compile_time_arg_val(3); + constexpr uint32_t untilized_in_cb = get_compile_time_arg_val(4); + constexpr uint32_t out_cb = get_compile_time_arg_val(5); + constexpr uint32_t B = get_compile_time_arg_val(6); + constexpr uint32_t Wt = get_compile_time_arg_val(7); + + untilize_init(in_cb, untilized_in_cb); + + for (uint32_t b = 0; b < B / 32; b++) { + untilize_init_short(in_cb); + + cb_wait_front(in_cb, Wt); + cb_reserve_back(untilized_in_cb, Wt); + untilize_block(in_cb, Wt, untilized_in_cb); + cb_push_back(untilized_in_cb, Wt); + cb_pop_front(in_cb, Wt); + untilize_uninit(in_cb); + + for(uint32_t u = 0; u < 32; u++) { + untilize_init_short(cache_cb); + cb_wait_front(cache_cb, Wt); + cb_reserve_back(untilized_cache_cb, Wt); + untilize_block(cache_cb, Wt, untilized_cache_cb); + cb_push_back(untilized_cache_cb, Wt); + cb_pop_front(cache_cb, Wt); + untilize_uninit(cache_cb); + + tilize_init_short(untilized_cache2_cb, Wt); + cb_wait_front(untilized_cache2_cb, Wt); + cb_reserve_back(out_cb, Wt); + tilize_block(untilized_cache2_cb, Wt, out_cb); + cb_push_back(out_cb, Wt); + // Untilized cache CBs share same address space + // Compute pops both + cb_pop_front(untilized_cache2_cb, Wt); + cb_pop_front(untilized_cache_cb, Wt); + tilize_uninit(); + } + } +} +} // NAMESPACE diff --git a/tests/tt_metal/tt_metal/test_kernels/banked_reader.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/banked_reader.cpp similarity index 100% rename from tests/tt_metal/tt_metal/test_kernels/banked_reader.cpp rename to tests/tt_metal/tt_metal/test_kernels/dataflow/banked_reader.cpp diff --git a/tests/tt_metal/tt_metal/test_kernels/banked_writer.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/banked_writer.cpp similarity index 100% rename from tests/tt_metal/tt_metal/test_kernels/banked_writer.cpp rename to tests/tt_metal/tt_metal/test_kernels/dataflow/banked_writer.cpp diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/blank.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/blank.cpp new file mode 100644 index 00000000000..04ba7e3c561 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/blank.cpp @@ -0,0 +1,8 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
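update_cache.cpp above round-trips the cache block through an untilized layout so that a single row (one user) can be overwritten in place before the block is re-tilized. As a reference for what that layout change does to one 32x32 tile, here is a standalone sketch, under the assumption that a tile is stored as four 16x16 faces, face-by-face and row-major within each face:

#include <array>
#include <cassert>
#include <cstdint>

constexpr int TILE = 32, FACE = 16;

// row-major (untilized) position -> position inside the tilized tile
constexpr int tilized_index(int r, int c) {
    int face = (r / FACE) * 2 + (c / FACE);   // f0 top-left, f1 top-right, f2 bottom-left, f3 bottom-right
    return face * FACE * FACE + (r % FACE) * FACE + (c % FACE);
}

int main() {
    std::array<uint16_t, TILE * TILE> row_major{}, tilized{}, back{};
    for (int i = 0; i < TILE * TILE; i++) row_major[i] = static_cast<uint16_t>(i);

    // tilize: gather row-major data into face order
    for (int r = 0; r < TILE; r++)
        for (int c = 0; c < TILE; c++)
            tilized[tilized_index(r, c)] = row_major[r * TILE + c];

    // untilize: scatter face-ordered data back to row-major
    for (int r = 0; r < TILE; r++)
        for (int c = 0; c < TILE; c++)
            back[r * TILE + c] = tilized[tilized_index(r, c)];

    assert(back == row_major);  // the two transforms are exact inverses
    return 0;
}

Because the two transforms are inverses, the kernel can untilize, let the writer patch individual rows, and tilize again without losing anything.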
+// +// SPDX-License-Identifier: Apache-2.0 + + +void kernel_main() { + +} diff --git a/tests/tt_metal/tt_metal/test_kernels/direct_reader_unary.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/direct_reader_unary.cpp similarity index 100% rename from tests/tt_metal/tt_metal/test_kernels/direct_reader_unary.cpp rename to tests/tt_metal/tt_metal/test_kernels/dataflow/direct_reader_unary.cpp diff --git a/tests/tt_metal/tt_metal/test_kernels/direct_writer_unary.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/direct_writer_unary.cpp similarity index 100% rename from tests/tt_metal/tt_metal/test_kernels/direct_writer_unary.cpp rename to tests/tt_metal/tt_metal/test_kernels/dataflow/direct_writer_unary.cpp diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy.cpp new file mode 100644 index 00000000000..c32eedaece1 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy.cpp @@ -0,0 +1,35 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +/** + * NOC APIs are prefixed w/ "ncrisc" (legacy name) but there's nothing NCRISC specific, they can be used on BRISC or other RISCs + * Any two RISC processors cannot use the same CMD_BUF + * non_blocking APIs shouldn't be mixed with slow noc.h APIs + * explicit flushes need to be used since the calls are non-blocking + * */ +void kernel_main() { + std::uint32_t l1_buffer_addr = get_arg_val(0); + + std::uint32_t dram_buffer_src_addr = get_arg_val(1); + std::uint32_t dram_src_noc_x = get_arg_val(2); + std::uint32_t dram_src_noc_y = get_arg_val(3); + + std::uint32_t dram_buffer_dst_addr = get_arg_val(4); + std::uint32_t dram_dst_noc_x = get_arg_val(5); + std::uint32_t dram_dst_noc_y = get_arg_val(6); + + std::uint32_t dram_buffer_size = get_arg_val(7); + + // DRAM NOC src address + std::uint64_t dram_buffer_src_noc_addr = get_noc_addr(dram_src_noc_x, dram_src_noc_y, dram_buffer_src_addr); + noc_async_read(dram_buffer_src_noc_addr, l1_buffer_addr, dram_buffer_size); + noc_async_read_barrier(); + + // DRAM NOC dst address + std::uint64_t dram_buffer_dst_noc_addr = get_noc_addr(dram_dst_noc_x, dram_dst_noc_y, dram_buffer_dst_addr); + noc_async_write(l1_buffer_addr, dram_buffer_dst_noc_addr, dram_buffer_size); + noc_async_write_barrier(); +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy_db.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy_db.cpp new file mode 100644 index 00000000000..411fc6a494d --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy_db.cpp @@ -0,0 +1,101 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
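dram_copy.cpp above stages the whole transfer through a single L1 buffer. The dram_copy_db.cpp kernel that follows splits the L1 buffer into two halves and ping-pongs between them, so the read of the next chunk can be issued before the previous half is written back out. A host-side analogue of that schedule (standalone C++, plain memcpy standing in for the NOC transactions) is:

#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

int main() {
    constexpr size_t chunk = 512;                      // bytes per staging half
    std::vector<uint8_t> src(8 * chunk), dst(src.size());
    for (size_t i = 0; i < src.size(); i++) src[i] = static_cast<uint8_t>(i);

    std::vector<uint8_t> l1(2 * chunk);                // the "L1 buffer", split into two halves
    uint8_t* half_a = l1.data();
    uint8_t* half_b = l1.data() + chunk;

    size_t rd = 0, wr = 0;
    std::memcpy(half_a, src.data() + rd, chunk);       // prime the pipeline: first read
    rd += chunk;

    while (rd < src.size()) {
        std::memcpy(half_b, src.data() + rd, chunk);   // "read" the next chunk into the other half
        rd += chunk;
        std::memcpy(dst.data() + wr, half_a, chunk);   // "write" the previously filled half
        wr += chunk;
        std::swap(half_a, half_b);                     // ping-pong
    }
    std::memcpy(dst.data() + wr, half_a, chunk);       // drain the last chunk
    assert(dst == src);
    return 0;
}

The two halves are what allow the kernel to issue the read for chunk i+1 before it writes chunk i back to DRAM.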
+// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "dataflow_api.h" + +/** + * NOC APIs are prefixed w/ "ncrisc" (legacy name) but there's nothing NCRISC specific, they can be used on BRISC or other RISCs + * Any two RISC processors cannot use the same CMD_BUF + * non_blocking APIs shouldn't be mixed with slow noc.h APIs + * explicit flushes need to be used since the calls are non-blocking + * */ +void kernel_main() { + std::uint32_t dram_buffer_src_addr_base = get_arg_val(0); + std::uint32_t dram_src_noc_x = get_arg_val(1); + std::uint32_t dram_src_noc_y = get_arg_val(2); + + std::uint32_t dram_buffer_dst_addr_base = get_arg_val(3); + std::uint32_t dram_dst_noc_x = get_arg_val(4); + std::uint32_t dram_dst_noc_y = get_arg_val(5); + + std::uint32_t dram_buffer_size = get_arg_val(6); + std::uint32_t num_tiles = get_arg_val(7); + + std::uint32_t l1_buffer_addr = get_arg_val(8); + std::uint32_t l1_buffer_size_tiles = get_arg_val(9); + std::uint32_t l1_buffer_size_bytes = get_arg_val(10); + + std::uint32_t rd_wr_l1_buffer_size_tiles = l1_buffer_size_tiles / 2; + std::uint32_t rd_wr_l1_buffer_size_bytes = l1_buffer_size_bytes / 2; + + // Keeps track of how many tiles we copied so far + std::uint32_t num_tiles_read = 0; + + std::uint32_t dram_buffer_src_addr = dram_buffer_src_addr_base; + std::uint32_t dram_buffer_dst_addr = dram_buffer_dst_addr_base; + std::uint64_t dram_buffer_src_noc_addr; + std::uint64_t dram_buffer_dst_noc_addr; + + std::uint32_t l1_addr1 = l1_buffer_addr; + std::uint32_t l1_addr2 = l1_buffer_addr + rd_wr_l1_buffer_size_bytes; + + // DRAM NOC src address + dram_buffer_src_noc_addr = get_noc_addr(dram_src_noc_x, dram_src_noc_y, dram_buffer_src_addr); + + // Copy data from DRAM into destination L1 buffer + noc_async_read( + dram_buffer_src_noc_addr, + l1_addr1, + rd_wr_l1_buffer_size_bytes + ); + dram_buffer_src_addr += rd_wr_l1_buffer_size_bytes; + num_tiles_read += rd_wr_l1_buffer_size_tiles; + + while (num_tiles_read < num_tiles) { + // DRAM NOC src address + dram_buffer_src_noc_addr = get_noc_addr(dram_src_noc_x, dram_src_noc_y, dram_buffer_src_addr); + // DRAM NOC dst address + dram_buffer_dst_noc_addr = get_noc_addr(dram_dst_noc_x, dram_dst_noc_y, dram_buffer_dst_addr); + + noc_async_read( + dram_buffer_src_noc_addr, + l1_addr2, + rd_wr_l1_buffer_size_bytes + ); + dram_buffer_src_addr += rd_wr_l1_buffer_size_bytes; + num_tiles_read += rd_wr_l1_buffer_size_tiles; + + // Wait all reads flushed (ie received) + noc_async_read_barrier(); + + noc_async_write( + l1_addr1, + dram_buffer_dst_noc_addr, + rd_wr_l1_buffer_size_bytes + ); + + dram_buffer_dst_addr += rd_wr_l1_buffer_size_bytes; + + // Wait for all the writes to complete (ie acked) + noc_async_write_barrier(); + + // Swap L1 addr locations + if (num_tiles_read < num_tiles) { + std::uint32_t temp_l1_addr = l1_addr1; + l1_addr1 = l1_addr2; + l1_addr2 = temp_l1_addr; + } + } + + // DRAM NOC dst address + dram_buffer_dst_noc_addr = get_noc_addr(dram_dst_noc_x, dram_dst_noc_y, dram_buffer_dst_addr); + noc_async_write( + l1_addr2, + dram_buffer_dst_noc_addr, + rd_wr_l1_buffer_size_bytes + ); + // Wait for all the writes to complete (ie acked) + noc_async_write_barrier(); +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy_sticks.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy_sticks.cpp new file mode 100644 index 00000000000..91cf24d0ca5 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy_sticks.cpp @@ -0,0 +1,31 @@ +// SPDX-FileCopyrightText: © 2023 
Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +/** + * NOC APIs are prefixed w/ "ncrisc" (legacy name) but there's nothing NCRISC specific, they can be used on BRISC or other RISCs + * Any two RISC processors cannot use the same CMD_BUF + * non_blocking APIs shouldn't be mixed with slow noc.h APIs + * explicit flushes need to be used since the calls are non-blocking + * */ +void kernel_main() { + std::uint32_t l1_buffer_addr = get_arg_val(0); + + std::uint32_t dram_buffer_src_addr = get_arg_val(1); + std::uint32_t dram_src_noc_x = get_arg_val(2); + std::uint32_t dram_src_noc_y = get_arg_val(3); + + std::uint32_t num_sticks = get_arg_val(4); + std::uint32_t stick_size = get_arg_val(5); + for(uint32_t i = 0; i < 1; i++) { + for(uint32_t stick_id = 0; stick_id < num_sticks; stick_id++) { + // DRAM NOC src address + std::uint64_t dram_buffer_src_noc_addr = get_noc_addr(dram_src_noc_x, dram_src_noc_y, dram_buffer_src_addr); + noc_async_read(dram_buffer_src_noc_addr, l1_buffer_addr, stick_size); + noc_async_read_barrier(); + l1_buffer_addr += stick_size; + } + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_loader_sync.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_loader_sync.cpp new file mode 100644 index 00000000000..c4817644167 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_loader_sync.cpp @@ -0,0 +1,59 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "hostdevcommon/common_runtime_address_map.h" +/** + * NOC APIs are prefixed w/ "ncrisc" (legacy name) but there's nothing NCRISC specific, they can be used on BRISC or other RISCs + * Any two RISC processors cannot use the same CMD_BUF + * non_blocking APIs shouldn't be mixed with slow noc.h APIs + * explicit flushes need to be used since the calls are non-blocking + * */ +constexpr static std::uint32_t VALID_VAL = 0x1234; +constexpr static std::uint32_t INVALID_VAL = 0x4321; +void kernel_main() { + std::uint32_t dram_buffer_src_addr_base = get_arg_val(0); + std::uint32_t dram_src_noc_x = get_arg_val(1); + std::uint32_t dram_src_noc_y = get_arg_val(2); + std::uint32_t local_buffer_addr = get_arg_val(3); + std::uint32_t consumer_core_noc_x = get_arg_val(4); + std::uint32_t consumer_core_noc_y = get_arg_val(5); + std::uint32_t stream_register_address = get_arg_val(6); + std::uint32_t num_tiles = get_arg_val(7); + std::uint32_t transient_buffer_size_tiles = get_arg_val(8); + std::uint32_t transient_buffer_size_bytes = get_arg_val(9); + + // Scratch address in L1, to write register value before we copy it to into local/remote registers + volatile tt_l1_ptr uint32_t* constant_ptr = reinterpret_cast(CONSTANT_REGISTER_VALUE); + *(constant_ptr) = VALID_VAL; + // Local and remote register addresses (used for sync) + std::uint64_t local = get_noc_addr(stream_register_address); + std::uint64_t remote = get_noc_addr(consumer_core_noc_x, consumer_core_noc_y, stream_register_address); + + // keeps track of how many tiles we moved so far + std::uint32_t counter = 0; + std::uint32_t dram_buffer_src_addr = dram_buffer_src_addr_base; + while(counter < num_tiles) { + // DRAM NOC src address + std::uint64_t dram_buffer_src_noc_addr = get_noc_addr(dram_src_noc_x, dram_src_noc_y, dram_buffer_src_addr); + // Wait until sync register is INVALID_VAL (means its safe to corrupt destination buffer) + wait_for_sync_register_value(stream_register_address, INVALID_VAL); + // Copy data from dram into destination buffer + 
noc_async_read(dram_buffer_src_noc_addr, local_buffer_addr, transient_buffer_size_bytes); + dram_buffer_src_addr += transient_buffer_size_bytes; + // wait all reads flushed (ie received) + noc_async_read_barrier(); + + // Write VALID_VAL into local register + noc_async_write(CONSTANT_REGISTER_VALUE, local, 4); + noc_async_write_barrier(); + + + // Write VALID_VAL into remote register + noc_async_write(CONSTANT_REGISTER_VALUE, remote, 4); + noc_async_write_barrier(); + + counter += transient_buffer_size_tiles; + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_loader_sync_db.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_loader_sync_db.cpp new file mode 100644 index 00000000000..dfe6fc7ede0 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_loader_sync_db.cpp @@ -0,0 +1,69 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "hostdevcommon/common_runtime_address_map.h" +/** + * NOC APIs are prefixed w/ "ncrisc" (legacy name) but there's nothing NCRISC specific, they can be used on BRISC or other RISCs + * Any two RISC processors cannot use the same CMD_BUF + * non_blocking APIs shouldn't be mixed with slow noc.h APIs + * explicit flushes need to be used since the calls are non-blocking + * */ +constexpr static std::uint32_t VALID_VAL = 0x1234; +constexpr static std::uint32_t INVALID_VAL = 0x4321; + +inline std::uint32_t ping_pong_address(std::uint32_t addr1, std::uint32_t addr2, std::uint32_t index) { + if((index & 0x1) == 0) { + return addr1; + } else { + return addr2; + } +} +void kernel_main() { + std::uint32_t dram_buffer_src_addr_base = get_arg_val(0); + std::uint32_t dram_src_noc_x = get_arg_val(1); + std::uint32_t dram_src_noc_y = get_arg_val(2); + std::uint32_t local_buffer_addr1 = get_arg_val(3); + std::uint32_t local_buffer_addr2 = get_arg_val(4); + std::uint32_t consumer_core_noc_x = get_arg_val(5); + std::uint32_t consumer_core_noc_y = get_arg_val(6); + std::uint32_t stream_register_address1 = get_arg_val(7); + std::uint32_t stream_register_address2 = get_arg_val(8); + std::uint32_t num_tiles = get_arg_val(9); + std::uint32_t transient_buffer_size_tiles = get_arg_val(10); + std::uint32_t transient_buffer_size_bytes = get_arg_val(11); + + // Scratch address in L1, to write register value before we copy it to into local/remote registers + volatile tt_l1_ptr uint32_t* constant_ptr = reinterpret_cast(CONSTANT_REGISTER_VALUE); + *(constant_ptr) = VALID_VAL; + + // keeps track of how many tiles we moved so far + std::uint32_t counter = 0; + std::uint32_t dram_buffer_src_addr = dram_buffer_src_addr_base; + std::uint64_t dram_buffer_src_noc_addr; + while(counter < num_tiles) { + std::uint32_t reg_addr = ping_pong_address(stream_register_address1, stream_register_address2, counter); + std::uint64_t local = get_noc_addr(reg_addr); + std::uint64_t remote = get_noc_addr(consumer_core_noc_x, consumer_core_noc_y, reg_addr); + std::uint32_t local_buffer_address = ping_pong_address(local_buffer_addr1, local_buffer_addr2, counter); + + // DRAM NOC src address + dram_buffer_src_noc_addr = get_noc_addr(dram_src_noc_x, dram_src_noc_y, dram_buffer_src_addr); + // Wait until sync register is INVALID_VAL (means its safe to corrupt destination buffer) + wait_for_sync_register_value(reg_addr, INVALID_VAL); + // Copy data from dram into destination buffer + noc_async_read(dram_buffer_src_noc_addr, local_buffer_address, transient_buffer_size_bytes); + dram_buffer_src_addr += 
transient_buffer_size_bytes; + // wait all reads flushed (ie received) + noc_async_read_barrier(); + + noc_async_write(CONSTANT_REGISTER_VALUE, local, 4); + noc_async_write_barrier(); + // Write VALID_VAL into remote register + noc_async_write(CONSTANT_REGISTER_VALUE, remote, 4); + noc_async_write_barrier(); + + counter += transient_buffer_size_tiles; + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_to_l1_multicast.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_to_l1_multicast.cpp new file mode 100644 index 00000000000..f2e2fad994a --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_to_l1_multicast.cpp @@ -0,0 +1,37 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include "dataflow_api.h" + +void kernel_main() { + uint32_t src_addr = get_arg_val(0); + uint32_t src_noc_x = get_arg_val(1); + uint32_t src_noc_y = get_arg_val(2); + uint32_t src_buffer_size = get_arg_val(3); + + uint32_t local_addr = get_arg_val(4); + + uint32_t dst_addr = get_arg_val(5); + uint32_t dst_noc_x_start = get_arg_val(6); + uint32_t dst_noc_y_start = get_arg_val(7); + uint32_t dst_noc_x_end = get_arg_val(8); + uint32_t dst_noc_y_end = get_arg_val(9); + uint32_t num_dests = get_arg_val(10); + + + // Read src buffer into local L1 buffer + uint64_t src_buffer_noc_addr = get_noc_addr(src_noc_x, src_noc_y, src_addr); + noc_async_read(src_buffer_noc_addr, local_addr, src_buffer_size); + noc_async_read_barrier(); + + // multicast local L1 buffer to all destination cores + uint64_t dst_noc_multicast_addr = get_noc_multicast_addr( + dst_noc_x_start, + dst_noc_y_start, + dst_noc_x_end, + dst_noc_y_end, + dst_addr); + noc_async_write_multicast(local_addr, dst_noc_multicast_addr, src_buffer_size, num_dests); + noc_async_write_barrier(); +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_to_l1_multicast_include_src.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_to_l1_multicast_include_src.cpp new file mode 100644 index 00000000000..c1b390934d8 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_to_l1_multicast_include_src.cpp @@ -0,0 +1,37 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
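dram_to_l1_multicast.cpp above writes one L1 buffer to an inclusive rectangle of cores in a single NOC multicast, and the variant whose hunk starts here differs only in using the loopback_src call so the sender's own L1 also receives a copy. The num_dests runtime argument has to match the number of cores expected to ack the write; the sketch below shows how the host would typically size it, under the assumption (not spelled out in this diff) that the rectangle is inclusive and that the plain multicast excludes the sender while the loopback variant includes it:

#include <cstdint>
#include <cstdio>

// Cores covered by an inclusive multicast rectangle.
uint32_t mcast_grid_size(uint32_t x0, uint32_t y0, uint32_t x1, uint32_t y1) {
    uint32_t dx = x1 > x0 ? x1 - x0 : x0 - x1;
    uint32_t dy = y1 > y0 ? y1 - y0 : y0 - y1;
    return (dx + 1) * (dy + 1);
}

int main() {
    uint32_t grid = mcast_grid_size(1, 1, 4, 3);              // 4 x 3 = 12 cores
    bool sender_in_grid = true;                               // hypothetical placement
    uint32_t dests_excl = sender_in_grid ? grid - 1 : grid;   // noc_async_write_multicast
    uint32_t dests_incl = grid;                               // noc_async_write_multicast_loopback_src
    std::printf("num_dests excluding sender=%u, including sender=%u\n", dests_excl, dests_incl);
    return 0;
}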
+// +// SPDX-License-Identifier: Apache-2.0 + +#include "dataflow_api.h" + +void kernel_main() { + uint32_t src_addr = get_arg_val(0); + uint32_t src_noc_x = get_arg_val(1); + uint32_t src_noc_y = get_arg_val(2); + uint32_t src_buffer_size = get_arg_val(3); + + uint32_t local_addr = get_arg_val(4); + + uint32_t dst_addr = get_arg_val(5); + uint32_t dst_noc_x_start = get_arg_val(6); + uint32_t dst_noc_y_start = get_arg_val(7); + uint32_t dst_noc_x_end = get_arg_val(8); + uint32_t dst_noc_y_end = get_arg_val(9); + uint32_t num_dests = get_arg_val(10); + + + // Read src buffer into local L1 buffer + uint64_t src_buffer_noc_addr = get_noc_addr(src_noc_x, src_noc_y, src_addr); + noc_async_read(src_buffer_noc_addr, local_addr, src_buffer_size); + noc_async_read_barrier(); + + // multicast local L1 buffer to all destination cores + uint64_t dst_noc_multicast_addr = get_noc_multicast_addr( + dst_noc_x_start, + dst_noc_y_start, + dst_noc_x_end, + dst_noc_y_end, + dst_addr); + noc_async_write_multicast_loopback_src(local_addr, dst_noc_multicast_addr, src_buffer_size, num_dests); + noc_async_write_barrier(); +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/flatten.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/flatten.cpp new file mode 100644 index 00000000000..5181692863e --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/flatten.cpp @@ -0,0 +1,75 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "dataflow_api.h" + +void kernel_main() { + // Kernel args + uint32_t src_addr = get_arg_val(0); + uint32_t src_noc_x = get_arg_val(1); + uint32_t src_noc_y = get_arg_val(2); + uint32_t num_tiles_r = get_arg_val(3); + uint32_t num_tiles_c = get_arg_val(4); + + // How many bytes along a row in the original tensor + uint32_t num_bytes_per_tensor_row = get_arg_val(5); + + /* + Constants + Since I am 'constexpr'ing here, I can multiply + */ + constexpr uint32_t cb_id_in0 = 0; + constexpr uint32_t num_bytes_per_tile_row = 64; // 32 bfloat16, each 2 bytes + constexpr uint32_t num_bytes_for_sending_eight_tile_rows = num_bytes_per_tile_row * 8; + constexpr uint32_t num_bytes_for_sending_seven_tile_rows = num_bytes_per_tile_row * 7; + constexpr uint32_t num_bytes_for_sending_twenty_four_tile_rows = num_bytes_per_tile_row * 24; + uint32_t num_bytes_per_tile = get_tile_size(cb_id_in0); + + // Variables + uint64_t replicate_dest_addr; + uint32_t start_dram_addr_offset_for_tensor_row = 0; + + constexpr uint32_t num_elements_in_zeros_buffer = MEM_ZEROS_SIZE / sizeof(uint32_t); + volatile tt_l1_ptr uint32_t* zero_base_ptr = reinterpret_cast(MEM_ZEROS_BASE); + for (uint32_t zero_base_offset = 0; zero_base_offset < num_elements_in_zeros_buffer; zero_base_offset++) { + *(zero_base_ptr + zero_base_offset) = 0; + } + + uint64_t zeros_base_noc_addr = get_noc_addr(MEM_ZEROS_BASE); + for (uint32_t i = 0; i < num_tiles_r; i++) { + for (uint32_t j = 0; j < 32; j++) { + uint32_t src_addr_ = src_addr + start_dram_addr_offset_for_tensor_row; + for (uint32_t k = 0; k < num_tiles_c; k++) { + cb_reserve_back(cb_id_in0, 1); + uint64_t src_noc_addr = get_noc_addr(src_noc_x, src_noc_y, src_addr_); + + // Read one row of data + uint32_t l1_write_addr = get_write_ptr(cb_id_in0); + noc_async_read(src_noc_addr, l1_write_addr, num_bytes_per_tile_row); + + // We move one row down + l1_write_addr += num_bytes_per_tile_row; + + /* + Move 31 rows of zeros behind the row that we just moved. 
We send + 8 rows three times, then we send 7 rows + */ + for (uint32_t z = 0; z < 3; z++) { + noc_async_read(zeros_base_noc_addr, l1_write_addr, num_bytes_for_sending_eight_tile_rows); + l1_write_addr += num_bytes_for_sending_eight_tile_rows; + } + + noc_async_read(zeros_base_noc_addr, l1_write_addr, num_bytes_for_sending_seven_tile_rows); + + src_addr_ += num_bytes_per_tile; + noc_async_read_barrier(); + cb_push_back(cb_id_in0, 1); + + } // End num_tiles_c loop + start_dram_addr_offset_for_tensor_row += num_bytes_per_tile_row; + } // End 32 iter loop + start_dram_addr_offset_for_tensor_row += num_bytes_per_tensor_row; + } // End num_tiles_r loop +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/generic_binary_reader_blocked.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/generic_binary_reader_blocked.cpp new file mode 100644 index 00000000000..0d53bc3cfa3 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/generic_binary_reader_blocked.cpp @@ -0,0 +1,60 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "dataflow_api.h" +#include "debug_print.h" + +// This kernel is used to read untilized src0 data from DRAM and copy it to L1 in tilized layout. +// For layout transformation, it uses a list of source addresses (a vector in L1 written by the host) to perform scattered and multiple reads from DRAM. +// The kernel writes to contiguous location in L1 CB. Therefore, the src addresses must be provided in the order in which tiles are generated. +// It expects src1 data to already be tilized and it simply copies it to L1. +void kernel_main() { + std::uint32_t dram_buffer_src0_addr = get_arg_val(0); + std::uint32_t dram_src0_noc_x = get_arg_val(1); + std::uint32_t dram_src0_noc_y = get_arg_val(2); + std::uint32_t dram_buffer_src1_addr = get_arg_val(3); + std::uint32_t dram_src1_noc_x = get_arg_val(4); + std::uint32_t dram_src1_noc_y = get_arg_val(5); + std::uint32_t address_map_size = get_arg_val(6); + std::uint32_t address_map_l1_addr = get_arg_val(7); + std::uint32_t num_blocks = get_arg_val(8); + std::uint32_t src0_num_reads_per_block = get_arg_val(9); + std::uint32_t src0_dram_read_size_bytes = get_arg_val(10); + std::uint32_t src1_num_bytes_per_block = get_arg_val(11); + std::uint32_t src0_num_tiles_per_block = get_arg_val(12); + std::uint32_t src1_num_tiles_per_block = get_arg_val(13); + + constexpr uint32_t cb0_id = 0; + constexpr uint32_t cb1_id = 1; + + volatile tt_l1_ptr std::uint32_t* source_addresses = (volatile tt_l1_ptr uint32_t*)(address_map_l1_addr); + + uint32_t source_addresses_list_index = 0; + // We push one block of tiles of src0 and src1. + // src0 and src1 can have different number of tiles per block. + for(uint32_t b = 0; b < num_blocks; b+=1) { + cb_reserve_back(cb0_id, src0_num_tiles_per_block); + cb_reserve_back(cb1_id, src1_num_tiles_per_block); + uint32_t l1_write0_addr = get_write_ptr(cb0_id); + uint32_t l1_write1_addr = get_write_ptr(cb1_id); + std::uint64_t dram_buffer_src1_noc_addr = get_noc_addr(dram_src1_noc_x, dram_src1_noc_y, dram_buffer_src1_addr); + // src1 is already tilized in DRAM. Read the whole block of tiles in a single DRAM read access. + noc_async_read(dram_buffer_src1_noc_addr, l1_write1_addr, src1_num_bytes_per_block); + // src0 is not tilized in DRAM. 
+ // For src0, Do multiple DRAM read accesses using addresses provided in "source_addresses" to produce one block of tiles + // The source addresses in the list must be in the order of tiles + for(uint32_t i = 0; i < src0_num_reads_per_block; i++) { + uint32_t src_addr = source_addresses[source_addresses_list_index]; + std::uint64_t dram_buffer_src0_noc_addr = get_noc_addr(dram_src0_noc_x, dram_src0_noc_y, dram_buffer_src0_addr + src_addr); + noc_async_read(dram_buffer_src0_noc_addr, l1_write0_addr, src0_dram_read_size_bytes); + l1_write0_addr += src0_dram_read_size_bytes; + source_addresses_list_index += 1; + } + noc_async_read_barrier(); + dram_buffer_src1_addr += src1_num_bytes_per_block; + cb_push_back(cb0_id, src0_num_tiles_per_block); + cb_push_back(cb1_id, src1_num_tiles_per_block); + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/l1_to_l1.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/l1_to_l1.cpp new file mode 100644 index 00000000000..495855e0d5d --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/l1_to_l1.cpp @@ -0,0 +1,30 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +void kernel_main() { + std::uint32_t dram_buffer_src_addr = get_arg_val(0); + std::uint32_t dram_src_noc_x = get_arg_val(1); + std::uint32_t dram_src_noc_y = get_arg_val(2); + std::uint32_t l1_buffer_src_addr_base = get_arg_val(3); + std::uint32_t l1_buffer_dst_addr_base = get_arg_val(4); + std::uint32_t l1_dst_noc_x = get_arg_val(5); + std::uint32_t l1_dst_noc_y = get_arg_val(6); + std::uint32_t num_tiles = get_arg_val(7); + std::uint32_t single_tile_size_bytes = get_arg_val(8); + std::uint32_t total_tiles_size_bytes = get_arg_val(9); + + // DRAM NOC src address + std::uint64_t dram_buffer_src_noc_addr = get_noc_addr(dram_src_noc_x, dram_src_noc_y, dram_buffer_src_addr); + noc_async_read(dram_buffer_src_noc_addr, l1_buffer_src_addr_base, total_tiles_size_bytes); + noc_async_read_barrier(); + + for(uint32_t i = 0; i < 1000; i++) { + // L1 NOC dst address + std::uint64_t l1_buffer_dst_noc_addr = get_noc_addr(l1_dst_noc_x, l1_dst_noc_y, l1_buffer_dst_addr_base); + noc_async_write(l1_buffer_src_addr_base, l1_buffer_dst_noc_addr, total_tiles_size_bytes); + noc_async_write_barrier(); + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_h.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_h.cpp new file mode 100644 index 00000000000..e79f77ff069 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_h.cpp @@ -0,0 +1,59 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
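The comment block in generic_binary_reader_blocked.cpp above says the host supplies a list of source byte offsets ordered so that back-to-back reads of src0_dram_read_size_bytes fill the CB in tile order. That host-side construction is not part of this diff; the following is a hypothetical sketch of one way to build such a map for a row-major bfloat16 tensor read out tile by tile (helper name and shape parameters are illustrative only), where each read covers one 32-element tile row, i.e. 64 bytes:

#include <cstdint>
#include <vector>

std::vector<uint32_t> build_address_map(uint32_t rows, uint32_t cols, uint32_t elem_bytes) {
    constexpr uint32_t TILE = 32;
    const uint32_t row_bytes = cols * elem_bytes;
    const uint32_t tiles_r = rows / TILE, tiles_c = cols / TILE;

    std::vector<uint32_t> map;
    map.reserve(static_cast<size_t>(rows) * tiles_c);
    for (uint32_t tr = 0; tr < tiles_r; tr++)            // walk output tiles row-major
        for (uint32_t tc = 0; tc < tiles_c; tc++)
            for (uint32_t r = 0; r < TILE; r++)          // 32 strided reads assemble one tile
                map.push_back((tr * TILE + r) * row_bytes + tc * TILE * elem_bytes);
    return map;                                          // each read is TILE * elem_bytes long
}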
+// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "dataflow_api.h" + +void kernel_main() { + uint32_t src0_addr = get_arg_val(0); + uint32_t src0_noc_x = get_arg_val(1); + uint32_t src0_noc_y = get_arg_val(2); + uint32_t src0_num_tiles = get_arg_val(3); + uint32_t src1_addr = get_arg_val(4); + uint32_t src1_noc_x = get_arg_val(5); + uint32_t src1_noc_y = get_arg_val(6); + // skip arg 7 for compat with reader_diff_lengths + uint32_t NCHtWt = get_arg_val(8); + uint32_t NC = get_arg_val(9); + uint32_t Ht = get_arg_val(10); + uint32_t Wt = get_arg_val(11); + + constexpr uint32_t cb_id_in0 = 0; + constexpr uint32_t cb_id_in1 = 1; + constexpr uint32_t onetile = 1; + + // single-tile ublocks + uint32_t tile_bytes = get_tile_size(cb_id_in0); + + uint32_t l1_write_addr_in0; + uint32_t l1_write_addr_in1; + + uint32_t num_tiles = src0_num_tiles; + uint32_t i1 = 0; + for (uint32_t i = 0; i < NCHtWt; i += onetile) { + uint64_t src0_noc_addr = get_noc_addr(src0_noc_x, src0_noc_y, src0_addr); + cb_reserve_back(cb_id_in0, onetile); + l1_write_addr_in0 = get_write_ptr(cb_id_in0); + noc_async_read(src0_noc_addr, l1_write_addr_in0, tile_bytes); + noc_async_read_barrier(); + cb_push_back(cb_id_in0, onetile); + src0_addr += tile_bytes; + + // for each W-tile of the first tensor we push one tile from the second arg tile list + // but we loop the second list around + cb_reserve_back(cb_id_in1, onetile); + uint64_t src1_noc_addr = get_noc_addr(src1_noc_x, src1_noc_y, src1_addr); + l1_write_addr_in1 = get_write_ptr(cb_id_in1); + noc_async_read(src1_noc_addr, l1_write_addr_in1, tile_bytes); + noc_async_read_barrier(); + cb_push_back(cb_id_in1, onetile); + i1 ++; + src1_addr += tile_bytes; + if (i1 == Wt) { + // wrap around + i1 = 0; + src1_addr = get_arg_val(4); + } + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_h_8bank.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_h_8bank.cpp new file mode 100644 index 00000000000..697c9253013 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_h_8bank.cpp @@ -0,0 +1,76 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
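In reader_bcast_h.cpp above, one broadcast tile is pushed for every src0 tile and the src1 cursor wraps every Wt tiles, which is the same as indexing the broadcast row with i % Wt over the whole NC*Ht*Wt loop. A standalone check of that equivalence:

#include <cassert>
#include <cstdint>

int main() {
    const uint32_t NC = 2, Ht = 3, Wt = 4, NCHtWt = NC * Ht * Wt;
    uint32_t i1 = 0;
    for (uint32_t i = 0; i < NCHtWt; i++) {
        assert(i1 == i % Wt);     // tile pulled from the broadcast operand
        i1++;
        if (i1 == Wt) i1 = 0;     // wrap around, mirroring the kernel
    }
    return 0;
}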
+// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "dataflow_api.h" + +void kernel_main() { + uint32_t src0_addr = get_arg_val(0); + uint32_t src0_num_tiles = get_arg_val(3); + uint32_t src1_addr = get_arg_val(4); + // skip args 1,2,5,6,7 for compat with single bank readers and reader_diff_lengths + uint32_t NCHtWt = get_arg_val(8); + uint32_t NC = get_arg_val(9); + uint32_t Ht = get_arg_val(10); + uint32_t Wt = get_arg_val(11); + uint32_t nc1 = get_arg_val(12); // if 1 we expect the bcast tensor to have NC=1 + + constexpr bool src0_is_dram = get_compile_time_arg_val(0) == 1; + constexpr bool src1_is_dram = get_compile_time_arg_val(1) == 1; + + constexpr uint32_t cb_id_in0 = 0; + constexpr uint32_t cb_id_in1 = 1; + constexpr uint32_t onetile = 1; + + // single-tile ublocks + const uint32_t tile_bytes = get_tile_size(cb_id_in0); + const DataFormat data_format = get_dataformat(cb_id_in0); + + const InterleavedAddrGenFast s0 = { + .bank_base_address = src0_addr, + .page_size = tile_bytes, + .data_format = data_format + }; + + const InterleavedAddrGenFast s1 = { + .bank_base_address = src1_addr, + .page_size = tile_bytes, + .data_format = data_format + }; + + uint32_t l1_write_addr_in0; + uint32_t l1_write_addr_in1; + + uint32_t num_tiles = src0_num_tiles; + uint32_t i = 0; + uint32_t i1 = 0; + for (uint32_t nc = 0; nc < NC; nc++) { + for (uint32_t ht = 0; ht < Ht; ht++) { + for (uint32_t wt = 0; wt < Wt; wt++) { + cb_reserve_back(cb_id_in0, onetile); + l1_write_addr_in0 = get_write_ptr(cb_id_in0); + noc_async_read_tile(i, s0, l1_write_addr_in0); + noc_async_read_barrier(); + cb_push_back(cb_id_in0, onetile); + + // for each W-tile of the first tensor we push one tile from the second arg tile list + // but we loop the second list around + cb_reserve_back(cb_id_in1, onetile); + l1_write_addr_in1 = get_write_ptr(cb_id_in1); + noc_async_read_tile(i1, s1, l1_write_addr_in1); + noc_async_read_barrier(); + cb_push_back(cb_id_in1, onetile); + i1 ++; + i ++; // input tile iterates over NC Ht Wt + } + + // bcast tensor should be NC1W (actually NC32W padded with 0s in H) + // wrap W around for each h (broadcast) + i1 -= Wt; + } + // we reused Wt tiles out of NCWt bcast tensor Ht times, now advance for next NC + if (nc1 == 0) // if bcast NC==1 we don't advance but reuse the tensor + i1 += Wt; + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_hw_8bank.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_hw_8bank.cpp new file mode 100644 index 00000000000..55f2a7154b7 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_hw_8bank.cpp @@ -0,0 +1,74 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
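The *_8bank readers above address tiles through interleaved address generators instead of explicit NOC coordinates: a flat tile index selects the bank and offset and noc_async_read_tile issues the transfer. In tt-metal these generators are templated on whether the buffer lives in DRAM, and the argument accessors on the value type, so the compile-time src0_is_dram/src1_is_dram flags read in these kernels are presumably those template arguments. A sketch of the presumed instantiation (the template parameters shown here are an assumption about the intended code, not taken verbatim from this diff):

#include <stdint.h>
#include "dataflow_api.h"

void kernel_main() {
    uint32_t src0_addr = get_arg_val<uint32_t>(0);                 // value type assumed as template arg
    constexpr bool src0_is_dram = get_compile_time_arg_val(0) == 1;

    constexpr uint32_t cb_id_in0 = 0;
    const uint32_t tile_bytes = get_tile_size(cb_id_in0);
    const DataFormat data_format = get_dataformat(cb_id_in0);

    // DRAM vs. L1 banking is presumed to be selected via the template parameter.
    const InterleavedAddrGenFast<src0_is_dram> s0 = {
        .bank_base_address = src0_addr,
        .page_size = tile_bytes,
        .data_format = data_format
    };

    cb_reserve_back(cb_id_in0, 1);
    noc_async_read_tile(/*tile index*/ 0, s0, get_write_ptr(cb_id_in0));
    noc_async_read_barrier();
    cb_push_back(cb_id_in0, 1);
}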
+// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "dataflow_api.h" + +void kernel_main() { + uint32_t src0_addr = get_arg_val(0); + uint32_t src0_num_tiles = get_arg_val(3); + uint32_t src1_addr = get_arg_val(4); + // skip args 1,2,5,6,7 for compat with single bank readers and reader_diff_lengths + uint32_t NCHtWt = get_arg_val(8); + uint32_t NC = get_arg_val(9); + uint32_t Ht = get_arg_val(10); + uint32_t Wt = get_arg_val(11); + uint32_t nc1 = get_arg_val(12); // if 1 we expect the bcast tensor to have NC=1 and wrap around in NC + + constexpr bool src0_is_dram = get_compile_time_arg_val(0) == 1; + constexpr bool src1_is_dram = get_compile_time_arg_val(1) == 1; + + constexpr uint32_t cb_id_in0 = 0; + constexpr uint32_t cb_id_in1 = 1; + constexpr uint32_t onetile = 1; + + // single-tile ublocks + const uint32_t in0_tile_bytes = get_tile_size(cb_id_in0); + const DataFormat in0_data_format = get_dataformat(cb_id_in0); + const uint32_t in1_tile_bytes = get_tile_size(cb_id_in1); + const DataFormat in1_data_format = get_dataformat(cb_id_in1); + + uint32_t l1_write_addr_in0; + uint32_t l1_write_addr_in1; + + uint32_t num_tiles = src0_num_tiles; + uint32_t i = 0; + uint32_t i1 = 0; + + const InterleavedAddrGenFast s0 = { + .bank_base_address = src0_addr, + .page_size = in0_tile_bytes, + .data_format = in0_data_format + }; + + const InterleavedAddrGenFast s1 = { + .bank_base_address = src1_addr, + .page_size = in1_tile_bytes, + .data_format = in1_data_format + }; + + for (uint32_t nc = 0; nc < NC; nc++) { + for (uint32_t ht = 0; ht < Ht; ht++) { + for (uint32_t wt = 0; wt < Wt; wt++) { + cb_reserve_back(cb_id_in0, onetile); + l1_write_addr_in0 = get_write_ptr(cb_id_in0); + noc_async_read_tile(i, s0, l1_write_addr_in0); + noc_async_read_barrier(); + cb_push_back(cb_id_in0, onetile); + + // for each H,W-tile of the first tensor we push one tile from the second arg tile list + // but we don't advance the second tile index for H,W + cb_reserve_back(cb_id_in1, onetile); + l1_write_addr_in1 = get_write_ptr(cb_id_in1); + noc_async_read_tile(i1, s1, l1_write_addr_in1); + noc_async_read_barrier(); + cb_push_back(cb_id_in1, onetile); + + i ++; // input tile iterates over NC Ht Wt + } // wt loop + } // ht loop + if (nc1 == 0) + i1 ++; // bcast-HW tile iterates only for nc loop and only if NC>1 + } // nc loop +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_w.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_w.cpp new file mode 100644 index 00000000000..974820cf28b --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_w.cpp @@ -0,0 +1,60 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
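reader_bcast_hw_8bank.cpp above keeps the broadcast tile fixed across all of Ht*Wt and only advances it once per batch, and even then only when the broadcast tensor really has NC > 1 (nc1 == 0). A standalone check of that indexing:

#include <cassert>
#include <cstdint>
#include <initializer_list>

int main() {
    const uint32_t NC = 3, Ht = 2, Wt = 2;
    for (uint32_t nc1 : {0u, 1u}) {                 // nc1 == 1 means the bcast tensor has NC == 1
        uint32_t i = 0, i1 = 0;
        for (uint32_t nc = 0; nc < NC; nc++) {
            for (uint32_t ht = 0; ht < Ht; ht++)
                for (uint32_t wt = 0; wt < Wt; wt++) {
                    assert(i1 == (nc1 ? 0 : nc));   // same bcast tile for the whole batch
                    i++;
                }
            if (nc1 == 0) i1++;                     // advance only when NC > 1 on the bcast side
        }
        assert(i == NC * Ht * Wt);
    }
    return 0;
}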
+// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "dataflow_api.h" + +void kernel_main() { + uint32_t src0_addr = get_arg_val(0); + uint32_t src0_noc_x = get_arg_val(1); + uint32_t src0_noc_y = get_arg_val(2); + uint32_t src0_num_tiles = get_arg_val(3); + uint32_t src1_addr = get_arg_val(4); + uint32_t src1_noc_x = get_arg_val(5); + uint32_t src1_noc_y = get_arg_val(6); + // skip arg 7 for compat with reader_diff_lengths + uint32_t NCHtWt = get_arg_val(8); + uint32_t NC = get_arg_val(9); + uint32_t Ht = get_arg_val(10); + uint32_t Wt = get_arg_val(11); + + constexpr uint32_t cb_id_in0 = 0; + constexpr uint32_t cb_id_in1 = 1; + constexpr uint32_t onetile = 1; + + // single-tile ublocks + uint32_t tile_bytes = get_tile_size(cb_id_in0); + + uint32_t l1_write_addr_in0; + uint32_t l1_write_addr_in1; + + uint32_t num_tiles = src0_num_tiles; + uint32_t i1 = 0; + for (uint32_t nc = 0; nc < NC; nc ++ ) { + for (uint32_t ht = 0; ht < Ht; ht++ ) { + { + // only read one tile in H per W-line of tiles + // So we push a total of NC*H tiles from src1 + cb_reserve_back(cb_id_in1, onetile); + uint64_t src1_noc_addr = get_noc_addr(src1_noc_x, src1_noc_y, src1_addr); + l1_write_addr_in1 = get_write_ptr(cb_id_in1); + noc_async_read(src1_noc_addr, l1_write_addr_in1, tile_bytes); + noc_async_read_barrier(); + cb_push_back(cb_id_in1, onetile); + src1_addr += tile_bytes; + } + + for (uint32_t wt = 0; wt < Wt; wt++) { + uint64_t src0_noc_addr = get_noc_addr(src0_noc_x, src0_noc_y, src0_addr); + cb_reserve_back(cb_id_in0, onetile); + l1_write_addr_in0 = get_write_ptr(cb_id_in0); + noc_async_read(src0_noc_addr, l1_write_addr_in0, tile_bytes); + noc_async_read_barrier(); + cb_push_back(cb_id_in0, onetile); + src0_addr += tile_bytes; + } // Wt loop + } // Ht loop + src1_addr = get_arg_val(4); // reset the H-tile ptr + } // NC loop +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_w_8bank.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_w_8bank.cpp new file mode 100644 index 00000000000..a57865b6016 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_w_8bank.cpp @@ -0,0 +1,77 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "dataflow_api.h" + +void kernel_main() { + uint32_t src0_addr = get_arg_val(0); + uint32_t src0_num_tiles = get_arg_val(3); + uint32_t src1_addr = get_arg_val(4); + // skip args 1,2,5,6,7 for compat with single-bank readers and reader_diff_lengths + uint32_t NCHtWt = get_arg_val(8); + uint32_t NC = get_arg_val(9); + uint32_t Ht = get_arg_val(10); + uint32_t Wt = get_arg_val(11); + uint32_t nc1 = get_arg_val(12); // if 1 we expect the bcast tensor to have NC=1 + + constexpr bool src0_is_dram = get_compile_time_arg_val(0) == 1; + constexpr bool src1_is_dram = get_compile_time_arg_val(1) == 1; + + constexpr uint32_t cb_id_in0 = 0; + constexpr uint32_t cb_id_in1 = 1; + constexpr uint32_t onetile = 1; + + // single-tile ublocks + const uint32_t in0_tile_bytes = get_tile_size(cb_id_in0); + const DataFormat in0_data_format = get_dataformat(cb_id_in0); + const uint32_t in1_tile_bytes = get_tile_size(cb_id_in1); + const DataFormat in1_data_format = get_dataformat(cb_id_in1); + + uint32_t l1_write_addr_in0; + uint32_t l1_write_addr_in1; + + uint32_t num_tiles = src0_num_tiles; + uint32_t i = 0; + uint32_t i_bcast = 0; + + const InterleavedAddrGenFast s0 = { + .bank_base_address = src0_addr, + .page_size = in0_tile_bytes, + .data_format = in0_data_format + }; + + const InterleavedAddrGenFast s1 = { + .bank_base_address = src1_addr, + .page_size = in1_tile_bytes, + .data_format = in1_data_format + }; + + for (uint32_t nc = 0; nc < NC; nc ++ ) { + for (uint32_t ht = 0; ht < Ht; ht++ ) { + { + // only read one tile in H per W-line of tiles + // So we push a total of NC*H tiles from src1 + cb_reserve_back(cb_id_in1, onetile); + l1_write_addr_in1 = get_write_ptr(cb_id_in1); + noc_async_read_tile(i_bcast, s1, l1_write_addr_in1); + noc_async_read_barrier(); + cb_push_back(cb_id_in1, onetile); + i_bcast++; + } + + for (uint32_t wt = 0; wt < Wt; wt++) { + cb_reserve_back(cb_id_in0, onetile); + l1_write_addr_in0 = get_write_ptr(cb_id_in0); + noc_async_read_tile(i, s0, l1_write_addr_in0); + noc_async_read_barrier(); + cb_push_back(cb_id_in0, onetile); + i++; + } // Wt loop + } // Ht loop + + if (nc1) // if we also bcast from NC=1, go back Ht tiles on bcasted tensor + i_bcast -= Ht; + } // NC loop +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_binary.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_binary.cpp new file mode 100644 index 00000000000..e21b62ea57d --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_binary.cpp @@ -0,0 +1,49 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "dataflow_api.h" + +void kernel_main() { + uint32_t src0_addr = get_arg_val(0); + uint32_t src0_noc_x = get_arg_val(1); + uint32_t src0_noc_y = get_arg_val(2); + uint32_t src1_addr = get_arg_val(3); + uint32_t src1_noc_x = get_arg_val(4); + uint32_t src1_noc_y = get_arg_val(5); + uint32_t num_tiles = get_arg_val(6); + + constexpr uint32_t cb_id_in0 = 0; + constexpr uint32_t cb_id_in1 = 1; + + // single-tile ublocks + uint32_t ublock_size_bytes_0 = get_tile_size(cb_id_in0); + uint32_t ublock_size_bytes_1 = get_tile_size(cb_id_in1); + uint32_t ublock_size_tiles = 1; + + uint32_t l1_write_addr_in0; + uint32_t l1_write_addr_in1; + + // read ublocks from src0/src1 to CB0/CB1, then push ublocks to compute (unpacker) + for (uint32_t i=0; i +#include "dataflow_api.h" + +void kernel_main() { + uint32_t src0_addr = get_arg_val(0); + uint32_t src0_noc_x = get_arg_val(1); + uint32_t src0_noc_y = get_arg_val(2); + uint32_t src0_num_tiles = get_arg_val(3); + uint32_t src1_addr = get_arg_val(4); + uint32_t src1_noc_x = get_arg_val(5); + uint32_t src1_noc_y = get_arg_val(6); + uint32_t src1_num_tiles = get_arg_val(7); + + constexpr uint32_t cb_id_in0 = 0; + constexpr uint32_t cb_id_in1 = 1; + + // single-tile ublocks + uint32_t ublock_size_bytes_0 = get_tile_size(cb_id_in0); + uint32_t ublock_size_bytes_1 = get_tile_size(cb_id_in1); + uint32_t ublock_size_tiles = 1; + + uint32_t l1_write_addr_in0; + uint32_t l1_write_addr_in1; + + uint32_t num_tiles = src0_num_tiles > src1_num_tiles ? src0_num_tiles : src1_num_tiles; + + // read ublocks from src0/src1 to CB0/CB1, then push ublocks to compute (unpacker) + for (uint32_t i=0; i +#include "dataflow_api.h" + +#include "debug_print.h" + +void kernel_main() { + // same arg indices as in reader_binary_diff_lenghts for compat + uint32_t src0_addr = get_arg_val(0); + uint32_t src1_addr = get_arg_val(1); + uint32_t Mt = get_arg_val(2); + uint32_t Kt = get_arg_val(3); + uint32_t Nt = get_arg_val(4); + uint32_t MtKt = get_arg_val(5); // if 0 + uint32_t KtNt = get_arg_val(6); + uint32_t batch = get_arg_val(7); + uint32_t bcast_B = get_arg_val(8); // if 1 we broadcast B to batch + + constexpr bool src0_is_dram = get_compile_time_arg_val(0) == 1; + constexpr bool src1_is_dram = get_compile_time_arg_val(1) == 1; + + //DPRINT << "Mt=" << Mt << " Kt=" << Kt << " Nt=" << Nt << " MtKt=" << MtKt << "KtNt=" << KtNt << ENDL(); + //DPRINT << "src0=" << src0_addr << " src1=" << src1_addr << ENDL(); + //DPRINT << "batch=" << batch << ENDL(); + + constexpr uint32_t cb_id_in0 = 0; + constexpr uint32_t cb_id_in1 = 1; + + constexpr uint32_t onetile = 1; + const uint32_t src0_tile_bytes = get_tile_size(cb_id_in0); + const DataFormat src0_data_format = get_dataformat(cb_id_in0); + const uint32_t src1_tile_bytes = get_tile_size(cb_id_in1); + const DataFormat src1_data_format = get_dataformat(cb_id_in1); + + uint32_t itileA_batch = 0; + uint32_t itileB_batch = 0; + + const InterleavedAddrGenFast s0 = { + .bank_base_address = src0_addr, + .page_size = src0_tile_bytes, + .data_format = src0_data_format + }; + + const InterleavedAddrGenFast s1 = { + .bank_base_address = src1_addr, + .page_size = src1_tile_bytes, + .data_format = src1_data_format + }; + + for (uint32_t nb = 0; nb < batch; nb++) { + uint32_t itileA = itileA_batch; + for (uint32_t mt = 0; mt < Mt; mt++) { + uint32_t itileB = itileB_batch; + for (uint32_t nt = 0; nt < Nt; nt++) { + for (uint32_t kt = 0; kt < Kt; kt++) { + { // Read A's tile at (mt, kt) + 
cb_reserve_back(cb_id_in0, onetile); + uint32_t l1_write_addr_in0 = get_write_ptr(cb_id_in0); + noc_async_read_tile(itileA, s0, l1_write_addr_in0); + noc_async_read_barrier(); + cb_push_back(cb_id_in0, onetile); + } + + { // Read B's tile at (kt, nt) + cb_reserve_back(cb_id_in1, onetile); + uint32_t l1_write_addr_in1 = get_write_ptr(cb_id_in1); + noc_async_read_tile(itileB, s1, l1_write_addr_in1); + noc_async_read_barrier(); + cb_push_back(cb_id_in1, onetile); + } + //DPRINT << "Pushed itileA=" << itileA << " itileB=" << itileB << ENDL(); + + itileA += 1; // A is MK + itileB += Nt; // B is KN, so to get k++ we stride by Nt + } // Kt loop + itileB -= KtNt; // revert B to previous state before the K loop (to avoid multiplies) + itileB += 1; // B is KN, so here in the end of Nt loop we increment N by 1 + itileA -= Kt; // resets tileA to kt=0, keep the same mt + } // Nt loop + itileA += Kt; // A is MK, advance to next M + } // Mt loop + itileA_batch += MtKt; // update batch strides + if (bcast_B == 0) // don't increment batch if we broadcast matrix B + itileB_batch += KtNt; + } // batch loop +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_cb_test.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_cb_test.cpp new file mode 100644 index 00000000000..ff94c5ec424 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_cb_test.cpp @@ -0,0 +1,38 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "dataflow_api.h" + +inline __attribute__((always_inline)) +void read_and_push_to_cb(const uint32_t cb_id, uint32_t num_tiles_per_cb, uint32_t ublock_size_tiles, uint32_t ublock_size_bytes, + uint32_t dram_src_noc_x, uint32_t dram_src_noc_y, uint32_t& dram_buffer_src_addr) { + // read a ublock of tiles at the time from DRAM to L1 buffer, and push a ublock at the time to unpacker + for (uint32_t i = 0; i(0); + std::uint32_t dram_src_noc_x = get_arg_val(1); + std::uint32_t dram_src_noc_y = get_arg_val(2); + std::uint32_t num_tiles_per_cb = get_arg_val(3); + + constexpr uint32_t cb_id = get_compile_time_arg_val(0); + constexpr uint32_t ublock_size_tiles = get_compile_time_arg_val(1); + uint32_t ublock_size_bytes = get_tile_size(cb_id) * ublock_size_tiles; + + read_and_push_to_cb(cb_id, num_tiles_per_cb, ublock_size_tiles, ublock_size_bytes, + dram_src_noc_x, dram_src_noc_y, dram_buffer_src_addr); +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_dual_8bank.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_dual_8bank.cpp new file mode 100644 index 00000000000..b8688948cc1 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_dual_8bank.cpp @@ -0,0 +1,70 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "dataflow_api.h" + +void kernel_main() { + // same arg indices as in reader_binary_diff_lenghts for compat + uint32_t src0_addr = get_arg_val(0); + uint32_t src0_num_tiles = get_arg_val(3); + uint32_t src1_addr = get_arg_val(4); + uint32_t src1_num_tiles = get_arg_val(7); + + constexpr uint32_t cb_id_in0 = 0; + constexpr uint32_t cb_id_in1 = 1; + + // single-tile ublocks + uint32_t ublock_size_bytes_0 = get_tile_size(cb_id_in0); + uint32_t ublock_size_bytes_1 = get_tile_size(cb_id_in1); + uint32_t ublock_size_tiles = 1; + + uint32_t l1_write_addr_in0; + uint32_t l1_write_addr_in1; + + uint32_t num_tiles = src0_num_tiles > src1_num_tiles ? 
src0_num_tiles : src1_num_tiles; + + const InterleavedPow2AddrGen s0 = { + .bank_base_address = src0_addr, + + + .log_base_2_of_page_size = 11 + }; + + const InterleavedPow2AddrGen s1 = { + .bank_base_address = src1_addr, + + + .log_base_2_of_page_size = 11 + }; + + // read ublocks from src0/src1 to CB0/CB1, then push ublocks to compute (unpacker) + for (uint32_t i=0; i +#include "dataflow_api.h" +// #include "tools/profiler/kernel_profiler.hpp" + +void kernel_main() { + + std::uint32_t buffer_src_addr = get_arg_val(0); + std::uint32_t src_noc_x = get_arg_val(1); + std::uint32_t src_noc_y = get_arg_val(2); + std::uint32_t num_tiles = get_arg_val(3); + std::uint32_t num_repetitions = get_arg_val(4); + + constexpr uint32_t cb_id = get_compile_time_arg_val(0); + constexpr uint32_t block_size_tiles = get_compile_time_arg_val(1); + uint32_t block_size_bytes = get_tile_size(cb_id) * block_size_tiles; + + for (uint32_t j = 0; j < num_repetitions; j++) { + uint32_t src_addr = buffer_src_addr; + for (uint32_t i = 0; i +#include "dataflow_api.h" + +void kernel_main() { + uint32_t src0_addr = get_arg_val(0); + uint32_t src0_noc_x = get_arg_val(1); + uint32_t src0_noc_y = get_arg_val(2); + uint32_t src1_addr = get_arg_val(3); + uint32_t src1_noc_x = get_arg_val(4); + uint32_t src1_noc_y = get_arg_val(5); + uint32_t num_blocks = get_arg_val(6); + uint32_t in0_block_tile_cnt = get_arg_val(7); + uint32_t in1_block_tile_cnt = get_arg_val(8); + uint32_t in0_block_size_bytes = get_arg_val(9); + uint32_t in1_block_size_bytes = get_arg_val(10); + + constexpr uint32_t cb_id_in0 = 0; + constexpr uint32_t cb_id_in1 = 1; + + uint32_t l1_write_addr_in0; + uint32_t l1_write_addr_in1; + + for(uint32_t i = 0; i < num_blocks; i++) { + uint64_t src0_noc_addr = get_noc_addr(src0_noc_x, src0_noc_y, src0_addr); + uint64_t src1_noc_addr = get_noc_addr(src1_noc_x, src1_noc_y, src1_addr); + + cb_reserve_back(cb_id_in0, in0_block_tile_cnt); + cb_reserve_back(cb_id_in1, in1_block_tile_cnt); + + l1_write_addr_in0 = get_write_ptr(cb_id_in0); + l1_write_addr_in1 = get_write_ptr(cb_id_in1); + + noc_async_read(src0_noc_addr, l1_write_addr_in0, in0_block_size_bytes); + noc_async_read(src1_noc_addr, l1_write_addr_in1, in1_block_size_bytes); + + noc_async_read_barrier(); + + cb_push_back(cb_id_in0, in0_block_tile_cnt); + cb_push_back(cb_id_in1, in1_block_tile_cnt); + + src0_addr += in0_block_size_bytes; + src1_addr += in1_block_size_bytes; + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_small_block.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_small_block.cpp new file mode 100644 index 00000000000..572811cb90d --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_small_block.cpp @@ -0,0 +1,49 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
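The trickiest index arithmetic in this group of readers is in reader_bmm_8bank.cpp earlier in this diff, where running itileA/itileB counters replace explicit multiplies. A standalone check that the increments and rewinds visit A(mt, kt) = mt*Kt + kt and B(kt, nt) = kt*Nt + nt for every (mt, nt, kt), shown here for a single batch with no broadcast:

#include <cassert>
#include <cstdint>

int main() {
    const uint32_t Mt = 3, Kt = 4, Nt = 2, KtNt = Kt * Nt;

    uint32_t itileA = 0;                              // batch base for A
    for (uint32_t mt = 0; mt < Mt; mt++) {
        uint32_t itileB = 0;                          // B restarts at the batch base every mt
        for (uint32_t nt = 0; nt < Nt; nt++) {
            for (uint32_t kt = 0; kt < Kt; kt++) {
                assert(itileA == mt * Kt + kt);       // A tile pushed to CB0
                assert(itileB == kt * Nt + nt);       // B tile pushed to CB1
                itileA += 1;                          // A is MK: next k
                itileB += Nt;                         // B is KN: next k strides by Nt
            }
            itileB -= KtNt;                           // rewind B to the top of the column
            itileB += 1;                              // and move one column to the right
            itileA -= Kt;                             // reuse the same A row for the next nt
        }
        itileA += Kt;                                 // next row of A tiles
    }
    return 0;
}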
+// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "dataflow_api.h" + +void kernel_main() { + std::uint32_t dram_buffer_src0_addr = get_arg_val(0); + std::uint32_t dram_src0_noc_x = get_arg_val(1); + std::uint32_t dram_src0_noc_y = get_arg_val(2); + + std::uint32_t dram_buffer_src1_addr = get_arg_val(3); + std::uint32_t dram_src1_noc_x = get_arg_val(4); + std::uint32_t dram_src1_noc_y = get_arg_val(5); + + std::uint32_t num_tiles = get_arg_val(6); + + // single-tile chunks + uint32_t chunk_size_bytes_0 = get_tile_size(0); + uint32_t chunk_size_bytes_1 = get_tile_size(1); + uint32_t chunk_size_tiles = 1; + + uint32_t l1_write_addr_in0; + uint32_t l1_write_addr_in1; + + // read a chunk of tiles at the time from DRAM to L1 buffer, and push a chunk at the time to unpacker + for (uint32_t i=0; i +#include "dataflow_api.h" + +void kernel_main() { + + + bool one_time_profile = true; + + // in0 tensor args + uint32_t in0_tensor_addr = get_arg_val(0); + uint32_t in0_tensor_start_tile_id = get_arg_val(1); + uint32_t in0_tensor_stride_w = get_arg_val(2); + uint32_t in0_tensor_stride_h = get_arg_val(3); + uint32_t in0_tensor_next_block_stride = get_arg_val(4); + + // in0 block args + uint32_t in0_block_w = get_arg_val(5); + uint32_t in0_block_h = get_arg_val(6); + uint32_t in0_block_num_tiles = get_arg_val(7); + + // in1 tensor args + uint32_t in1_tensor_addr = get_arg_val(8); + uint32_t in1_tensor_start_tile_id = get_arg_val(9); + uint32_t in1_tensor_stride_w = get_arg_val(10); + uint32_t in1_tensor_stride_h = get_arg_val(11); + uint32_t in1_tensor_next_block_stride = get_arg_val(12); + + // in1 block args + uint32_t in1_block_w = get_arg_val(13); + uint32_t in1_block_h = get_arg_val(14); + uint32_t in1_block_num_tiles = get_arg_val(15); + + // in0/in1 common args + uint32_t num_blocks = get_arg_val(16); + + // const args for tile-based bank-swizzled layout + // could be added to the arg list in the future to test different + // bank-swizzling configurations + constexpr uint32_t num_used_dram_ch = 8; + constexpr uint32_t num_used_dram_ch_pow2_exponent = 3; + constexpr uint32_t tile_size_pow2_exponent = 11; + + constexpr uint32_t cb_id_in0 = 0; + constexpr uint32_t cb_id_in1 = 1; + + uint32_t single_tile_size_bytes = get_tile_size(cb_id_in0); + + uint32_t l1_write_addr_in0; + uint32_t l1_write_addr_in1; + + uint32_t in0_tensor_current_block_start_tile_id = in0_tensor_start_tile_id; + uint32_t in1_tensor_current_block_start_tile_id = in1_tensor_start_tile_id; + + const InterleavedPow2AddrGen s0 = { + .bank_base_address = in0_tensor_addr, + + + .log_base_2_of_page_size = tile_size_pow2_exponent + }; + + const InterleavedPow2AddrGen s1 = { + .bank_base_address = in1_tensor_addr, + + + .log_base_2_of_page_size = tile_size_pow2_exponent + }; + + + for(uint32_t b = 0; b < num_blocks; b++) { + cb_reserve_back(cb_id_in0, in0_block_num_tiles); + cb_reserve_back(cb_id_in1, in1_block_num_tiles); + + l1_write_addr_in0 = get_write_ptr(cb_id_in0); + l1_write_addr_in1 = get_write_ptr(cb_id_in1); + + uint32_t in0_tensor_row_start_tile_id = in0_tensor_current_block_start_tile_id; + for(uint32_t h = 0; h < in0_block_h; h++) { + uint32_t in0_tensor_tile_id = in0_tensor_row_start_tile_id; + for(uint32_t w = 0; w < in0_block_w; w++) { + // kernel_profiler::mark_time(5); + uint64_t in0_tile_noc_address = get_noc_addr(in0_tensor_tile_id, s0); + noc_async_read(in0_tile_noc_address, l1_write_addr_in0, single_tile_size_bytes); + l1_write_addr_in0 += single_tile_size_bytes; + in0_tensor_tile_id += 
in0_tensor_stride_w; + } + in0_tensor_row_start_tile_id += in0_tensor_stride_h; + } + in0_tensor_current_block_start_tile_id += in0_tensor_next_block_stride; + + + uint32_t in1_tensor_row_start_tile_id = in1_tensor_current_block_start_tile_id; + for(uint32_t h = 0; h < in1_block_h; h++) { + uint32_t in1_tensor_tile_id = in1_tensor_row_start_tile_id; + for(uint32_t w = 0; w < in1_block_w; w++) { + uint64_t in1_tile_noc_addr = get_noc_addr(in1_tensor_tile_id, s1); + noc_async_read(in1_tile_noc_addr, l1_write_addr_in1, single_tile_size_bytes); + l1_write_addr_in1 += single_tile_size_bytes; + in1_tensor_tile_id += in1_tensor_stride_w; + } + in1_tensor_row_start_tile_id += in1_tensor_stride_h; + } + in1_tensor_current_block_start_tile_id += in1_tensor_next_block_stride; + + noc_async_read_barrier(); + + cb_push_back(cb_id_in0, in0_block_num_tiles); + cb_push_back(cb_id_in1, in1_block_num_tiles); + + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_tile_layout_in0_mcast_receiver.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_tile_layout_in0_mcast_receiver.cpp new file mode 100644 index 00000000000..3c260f77ab8 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_tile_layout_in0_mcast_receiver.cpp @@ -0,0 +1,113 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "dataflow_api.h" +#include "hostdevcommon/common_values.hpp" + +void kernel_main() { + // kernel_profiler::mark_time(7); + // in0 tensor args + uint32_t in0_tensor_addr = get_arg_val(0); + uint32_t in0_tensor_start_tile_id = get_arg_val(1); + uint32_t in0_tensor_stride_w = get_arg_val(2); + uint32_t in0_tensor_stride_h = get_arg_val(3); + uint32_t in0_tensor_next_block_stride = get_arg_val(4); + + // in0 block args + uint32_t in0_block_w = get_arg_val(5); + uint32_t in0_block_h = get_arg_val(6); + uint32_t in0_block_num_tiles = get_arg_val(7); + + // in1 tensor args + uint32_t in1_tensor_addr = get_arg_val(8); + uint32_t in1_tensor_start_tile_id = get_arg_val(9); + uint32_t in1_tensor_stride_w = get_arg_val(10); + uint32_t in1_tensor_stride_h = get_arg_val(11); + uint32_t in1_tensor_next_block_stride = get_arg_val(12); + + // in1 block args + uint32_t in1_block_w = get_arg_val(13); + uint32_t in1_block_h = get_arg_val(14); + uint32_t in1_block_num_tiles = get_arg_val(15); + + // in0/in1 common args + uint32_t num_blocks = get_arg_val(16); + + // in0 mcast args + uint32_t in0_mcast_dest_noc_start_x = get_arg_val(17); + uint32_t in0_mcast_dest_noc_start_y = get_arg_val(18); + uint32_t in0_mcast_dest_noc_end_x = get_arg_val(19); + uint32_t in0_mcast_dest_noc_end_y = get_arg_val(20); + uint32_t in0_mcast_num_dests = get_arg_val(21); + uint32_t in0_mcast_sender_noc_x = get_arg_val(22); + uint32_t in0_mcast_sender_noc_y = get_arg_val(23); + uint32_t in0_mcast_sender_semaphore_addr = get_arg_val(24); + uint32_t in0_mcast_receiver_semaphore_addr = get_arg_val(25); + + constexpr uint32_t num_used_dram_ch = 8; + constexpr uint32_t num_used_dram_ch_pow2_exponent = 3; + constexpr uint32_t tile_size_pow2_exponent = 11; + + constexpr uint32_t cb_id_in0 = 0; + constexpr uint32_t cb_id_in1 = 1; + + uint32_t single_tile_size_bytes = get_tile_size(cb_id_in1); + + uint32_t l1_write_addr_in1; + + uint32_t in1_tensor_current_block_start_tile_id = in1_tensor_start_tile_id; + + volatile tt_l1_ptr uint32_t* in0_mcast_receiver_semaphore_addr_ptr = reinterpret_cast(in0_mcast_receiver_semaphore_addr); + + bool 
one_time_noc_wait = true; + bool one_time_cb_push = true; + + const InterleavedPow2AddrGen s1 = { + .bank_base_address = in1_tensor_addr, + + + .log_base_2_of_page_size = tile_size_pow2_exponent + }; + + for(uint32_t b = 0; b < num_blocks; b++) { + // Operand 0 + cb_reserve_back(cb_id_in0, in0_block_num_tiles); + + // Set in0 semaphore value to INVALID + noc_semaphore_set(in0_mcast_receiver_semaphore_addr_ptr, INVALID); + + // Atomic increment source core counter + uint64_t in0_mcast_sender_semaphore_noc_addr = get_noc_addr(in0_mcast_sender_noc_x, in0_mcast_sender_noc_y, in0_mcast_sender_semaphore_addr); + noc_semaphore_inc(in0_mcast_sender_semaphore_noc_addr, 1); + + // wait on in0 semaphore value to become VALID (set by mcast sender after it multicasts data) + noc_semaphore_wait(in0_mcast_receiver_semaphore_addr_ptr, VALID); + + // kernel_profiler::mark_time_once(8, &one_time_noc_wait); + + // Operand 1 + cb_reserve_back(cb_id_in1, in1_block_num_tiles); + l1_write_addr_in1 = get_write_ptr(cb_id_in1); + + uint32_t in1_tensor_row_start_tile_id = in1_tensor_current_block_start_tile_id; + for(uint32_t h = 0; h < in1_block_h; h++) { + uint32_t in1_tensor_tile_id = in1_tensor_row_start_tile_id; + for(uint32_t w = 0; w < in1_block_w; w++) { + uint64_t in1_tile_noc_addr = get_noc_addr(in1_tensor_tile_id, s1); + noc_async_read(in1_tile_noc_addr, l1_write_addr_in1, single_tile_size_bytes); + l1_write_addr_in1 += single_tile_size_bytes; + in1_tensor_tile_id += in1_tensor_stride_w; + } + in1_tensor_row_start_tile_id += in1_tensor_stride_h; + } + in1_tensor_current_block_start_tile_id += in1_tensor_next_block_stride; + + noc_async_read_barrier(); + + cb_push_back(cb_id_in0, in0_block_num_tiles); + cb_push_back(cb_id_in1, in1_block_num_tiles); + // kernel_profiler::mark_time_once(9, &one_time_cb_push); + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_tile_layout_in0_mcast_sender.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_tile_layout_in0_mcast_sender.cpp new file mode 100644 index 00000000000..85ce6be87ca --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_tile_layout_in0_mcast_sender.cpp @@ -0,0 +1,165 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "dataflow_api.h" +#include "hostdevcommon/common_values.hpp" + +void kernel_main() { + // kernel_profiler::mark_time(10); + // in0 tensor args + uint32_t in0_tensor_addr = get_arg_val(0); + uint32_t in0_tensor_start_tile_id = get_arg_val(1); + uint32_t in0_tensor_stride_w = get_arg_val(2); + uint32_t in0_tensor_stride_h = get_arg_val(3); + uint32_t in0_tensor_next_block_stride = get_arg_val(4); + + // in0 block args + uint32_t in0_block_w = get_arg_val(5); + uint32_t in0_block_h = get_arg_val(6); + uint32_t in0_block_num_tiles = get_arg_val(7); + + // in1 tensor args + uint32_t in1_tensor_addr = get_arg_val(8); + uint32_t in1_tensor_start_tile_id = get_arg_val(9); + uint32_t in1_tensor_stride_w = get_arg_val(10); + uint32_t in1_tensor_stride_h = get_arg_val(11); + uint32_t in1_tensor_next_block_stride = get_arg_val(12); + + // in1 block args + uint32_t in1_block_w = get_arg_val(13); + uint32_t in1_block_h = get_arg_val(14); + uint32_t in1_block_num_tiles = get_arg_val(15); + + // in0/in1 common args + uint32_t num_blocks = get_arg_val(16); + + // in0 mcast args + uint32_t in0_mcast_dest_noc_start_x = get_arg_val(17); + uint32_t in0_mcast_dest_noc_start_y = get_arg_val(18); + uint32_t in0_mcast_dest_noc_end_x = get_arg_val(19); + uint32_t in0_mcast_dest_noc_end_y = get_arg_val(20); + uint32_t in0_mcast_num_dests = get_arg_val(21); + uint32_t in0_mcast_sender_noc_x = get_arg_val(22); + uint32_t in0_mcast_sender_noc_y = get_arg_val(23); + uint32_t in0_mcast_sender_semaphore_addr = get_arg_val(24); + uint32_t in0_mcast_receiver_semaphore_addr = get_arg_val(25); + + // const args for tile-based bank-swizzled layout + // could be added to the arg list in the future to test different + // bank-swizzling configurations + constexpr uint32_t num_used_dram_ch = 8; + constexpr uint32_t num_used_dram_ch_pow2_exponent = 3; + constexpr uint32_t tile_size_pow2_exponent = 11; + + constexpr uint32_t cb_id_in0 = 0; + constexpr uint32_t cb_id_in1 = 1; + + uint32_t single_tile_size_bytes = get_tile_size(cb_id_in0); + + uint32_t l1_write_addr_in0; + uint32_t l1_write_addr_in1; + + uint32_t in0_tensor_current_block_start_tile_id = in0_tensor_start_tile_id; + uint32_t in1_tensor_current_block_start_tile_id = in1_tensor_start_tile_id; + + // Set ur local VALID value, to be mcasted to destinations flag address after the data has been mcasted + volatile tt_l1_ptr uint32_t* in0_mcast_receiver_semaphore_addr_ptr = reinterpret_cast(in0_mcast_receiver_semaphore_addr); + *(in0_mcast_receiver_semaphore_addr_ptr) = VALID; + // local address that will be atomically incremented by mcast receivers, to know when all receivers are ready + // to receive the mcast + volatile tt_l1_ptr uint32_t* in0_mcast_sender_semaphore_addr_ptr = reinterpret_cast(in0_mcast_sender_semaphore_addr); + + const InterleavedPow2AddrGen s0 = { + .bank_base_address = in0_tensor_addr, + + + .log_base_2_of_page_size = tile_size_pow2_exponent + }; + + const InterleavedPow2AddrGen s1 = { + .bank_base_address = in1_tensor_addr, + + + .log_base_2_of_page_size = tile_size_pow2_exponent + }; + + bool one_time_multicast = true; + bool one_time_cb_push = true; + for(uint32_t b = 0; b < num_blocks; b++) { + cb_reserve_back(cb_id_in0, in0_block_num_tiles); + l1_write_addr_in0 = get_write_ptr(cb_id_in0); + + uint32_t in0_start_address = l1_write_addr_in0; // copy start address of block, to be used for mcasting + uint32_t in0_block_size_bytes = 0; // can be optimized later, pass it to kernel + + // 
Copy in0 block into CB, as the default kernel + uint32_t in0_tensor_row_start_tile_id = in0_tensor_current_block_start_tile_id; + for(uint32_t h = 0; h < in0_block_h; h++) { + uint32_t in0_tensor_tile_id = in0_tensor_row_start_tile_id; + for(uint32_t w = 0; w < in0_block_w; w++) { + uint64_t in0_tile_noc_address = get_noc_addr(in0_tensor_tile_id, s0); + noc_async_read(in0_tile_noc_address, l1_write_addr_in0, single_tile_size_bytes); + l1_write_addr_in0 += single_tile_size_bytes; + in0_tensor_tile_id += in0_tensor_stride_w; + in0_block_size_bytes += single_tile_size_bytes; + } + in0_tensor_row_start_tile_id += in0_tensor_stride_h; + } + in0_tensor_current_block_start_tile_id += in0_tensor_next_block_stride; + + // Barrier! make sure the reads are done + noc_async_read_barrier(); + + // wait until all in0 mcast destinations have atomically incremented the in0 semaphore_addr (i.e. its value should be in0_mcast_num_dests), then reset + // the semaphore_addr value back to zero for the next block + noc_semaphore_wait(in0_mcast_sender_semaphore_addr_ptr, in0_mcast_num_dests); + noc_semaphore_set(in0_mcast_sender_semaphore_addr_ptr, 0); + + // kernel_profiler::mark_time_once(11, &one_time_multicast); + + // Now we have the block in the CB address, we can mcast to dests! + uint64_t in0_multicast_data_addr = get_noc_multicast_addr( + in0_mcast_dest_noc_start_x, + in0_mcast_dest_noc_start_y, + in0_mcast_dest_noc_end_x, + in0_mcast_dest_noc_end_y, + in0_start_address); + // num_dests must not include source, since we are NOT really doing a local copy! + noc_async_write_multicast(in0_start_address, in0_multicast_data_addr, in0_block_size_bytes, in0_mcast_num_dests); + noc_async_write_barrier(); + // We should also multicast the flag to destinations + uint64_t in0_mcast_receiver_semaphore_noc_addr = get_noc_multicast_addr( + in0_mcast_dest_noc_start_x, + in0_mcast_dest_noc_start_y, + in0_mcast_dest_noc_end_x, + in0_mcast_dest_noc_end_y, + in0_mcast_receiver_semaphore_addr); + // num_dests must not include source, since we are NOT really doing a local copy! 
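+        // Multicast the locally staged VALID flag to the in0 receiver semaphore address on every
+        // destination core; each receiver is spinning in noc_semaphore_wait() on that address and is
+        // released once the flag lands. The write barrier above guarantees the data block has already
+        // arrived before this flag is sent.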
+ noc_semaphore_set_multicast(in0_mcast_receiver_semaphore_addr, in0_mcast_receiver_semaphore_noc_addr, in0_mcast_num_dests); + + // Copy in1 block into CB, as the default kernel + cb_reserve_back(cb_id_in1, in1_block_num_tiles); + l1_write_addr_in1 = get_write_ptr(cb_id_in1); + + uint32_t in1_tensor_row_start_tile_id = in1_tensor_current_block_start_tile_id; + for(uint32_t h = 0; h < in1_block_h; h++) { + uint32_t in1_tensor_tile_id = in1_tensor_row_start_tile_id; + for(uint32_t w = 0; w < in1_block_w; w++) { + uint64_t in1_tile_noc_addr = get_noc_addr(in1_tensor_tile_id, s1); + noc_async_read(in1_tile_noc_addr, l1_write_addr_in1, single_tile_size_bytes); + l1_write_addr_in1 += single_tile_size_bytes; + in1_tensor_tile_id += in1_tensor_stride_w; + } + in1_tensor_row_start_tile_id += in1_tensor_stride_h; + } + in1_tensor_current_block_start_tile_id += in1_tensor_next_block_stride; + + noc_async_read_barrier(); + + cb_push_back(cb_id_in0, in0_block_num_tiles); + cb_push_back(cb_id_in1, in1_block_num_tiles); + // kernel_profiler::mark_time_once(12, &one_time_cb_push); + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_tile_layout_in0_receiver_in1_receiver.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_tile_layout_in0_receiver_in1_receiver.cpp new file mode 100644 index 00000000000..7437946d76c --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_tile_layout_in0_receiver_in1_receiver.cpp @@ -0,0 +1,103 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "dataflow_api.h" +#include "hostdevcommon/common_values.hpp" + +void kernel_main() { + // kernel_profiler::mark_time(34); + // in0 tensor args + uint32_t in0_tensor_addr = get_arg_val(0); + uint32_t in0_tensor_start_tile_id = get_arg_val(1); + uint32_t in0_tensor_stride_w = get_arg_val(2); + uint32_t in0_tensor_stride_h = get_arg_val(3); + uint32_t in0_tensor_next_block_stride = get_arg_val(4); + + // in0 block args + uint32_t in0_block_w = get_arg_val(5); + uint32_t in0_block_h = get_arg_val(6); + uint32_t in0_block_num_tiles = get_arg_val(7); + + // in1 tensor args + uint32_t in1_tensor_addr = get_arg_val(8); + uint32_t in1_tensor_start_tile_id = get_arg_val(9); + uint32_t in1_tensor_stride_w = get_arg_val(10); + uint32_t in1_tensor_stride_h = get_arg_val(11); + uint32_t in1_tensor_next_block_stride = get_arg_val(12); + + // in1 block args + uint32_t in1_block_w = get_arg_val(13); + uint32_t in1_block_h = get_arg_val(14); + uint32_t in1_block_num_tiles = get_arg_val(15); + + // in0/in1 common args + uint32_t num_blocks = get_arg_val(16); + + // in0 mcast args + uint32_t in0_mcast_dest_noc_start_x = get_arg_val(17); + uint32_t in0_mcast_dest_noc_start_y = get_arg_val(18); + uint32_t in0_mcast_dest_noc_end_x = get_arg_val(19); + uint32_t in0_mcast_dest_noc_end_y = get_arg_val(20); + uint32_t in0_mcast_num_dests = get_arg_val(21); + uint32_t in0_mcast_sender_noc_x = get_arg_val(22); + uint32_t in0_mcast_sender_noc_y = get_arg_val(23); + uint32_t in0_mcast_sender_semaphore_addr = get_arg_val(24); + uint32_t in0_mcast_receiver_semaphore_addr = get_arg_val(25); + + // in1 mcast args + uint32_t in1_mcast_dest_noc_start_x = get_arg_val(26); + uint32_t in1_mcast_dest_noc_start_y = get_arg_val(27); + uint32_t in1_mcast_dest_noc_end_x = get_arg_val(28); + uint32_t in1_mcast_dest_noc_end_y = get_arg_val(29); + uint32_t in1_mcast_num_dests = get_arg_val(30); + uint32_t in1_mcast_sender_noc_x = get_arg_val(31); + 
uint32_t in1_mcast_sender_noc_y = get_arg_val(32); + uint32_t in1_mcast_sender_semaphore_addr = get_arg_val(33); + uint32_t in1_mcast_receiver_semaphore_addr = get_arg_val(34); + + constexpr uint32_t cb_id_in0 = 0; + constexpr uint32_t cb_id_in1 = 1; + + volatile tt_l1_ptr uint32_t* in0_mcast_receiver_semaphore_addr_ptr = reinterpret_cast(in0_mcast_receiver_semaphore_addr); + volatile tt_l1_ptr uint32_t* in1_mcast_receiver_semaphore_addr_ptr = reinterpret_cast(in1_mcast_receiver_semaphore_addr); + + bool one_time_noc_wait_0 = true; + bool one_time_noc_wait_1 = true; + bool one_time_cb_push = true; + + for(uint32_t b = 0; b < num_blocks; b++) { + // Operand 0 + cb_reserve_back(cb_id_in0, in0_block_num_tiles); + + // Set in0 semaphore value to INVALID + noc_semaphore_set(in0_mcast_receiver_semaphore_addr_ptr, INVALID); + + // Atomic increment source core counter + uint64_t in0_mcast_sender_semaphore_noc_addr = get_noc_addr(in0_mcast_sender_noc_x, in0_mcast_sender_noc_y, in0_mcast_sender_semaphore_addr); + noc_semaphore_inc(in0_mcast_sender_semaphore_noc_addr, 1); + + // wait on in0 semaphore value to become VALID (set by mcast sender after it multicasts data) + noc_semaphore_wait(in0_mcast_receiver_semaphore_addr_ptr, VALID); + // kernel_profiler::mark_time_once(35, &one_time_noc_wait_0); + + cb_push_back(cb_id_in0, in0_block_num_tiles); + + // Operand 1 + cb_reserve_back(cb_id_in1, in1_block_num_tiles); + + // Set in1 semaphore value to INVALID + noc_semaphore_set(in1_mcast_receiver_semaphore_addr_ptr, INVALID); + + uint64_t in1_mcast_sender_semaphore_noc_addr = get_noc_addr(in1_mcast_sender_noc_x, in1_mcast_sender_noc_y, in1_mcast_sender_semaphore_addr); + noc_semaphore_inc(in1_mcast_sender_semaphore_noc_addr, 1); + + // wait on in1 semaphore value to become VALID (set by mcast sender after it multicasts data) + noc_semaphore_wait(in1_mcast_receiver_semaphore_addr_ptr, VALID); + // kernel_profiler::mark_time_once(36, &one_time_noc_wait_1); + + cb_push_back(cb_id_in1, in1_block_num_tiles); + // kernel_profiler::mark_time_once(37, &one_time_cb_push); + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_tile_layout_in0_receiver_in1_sender.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_tile_layout_in0_receiver_in1_sender.cpp new file mode 100644 index 00000000000..3f7b50fd881 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_tile_layout_in0_receiver_in1_sender.cpp @@ -0,0 +1,169 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "dataflow_api.h" +#include "hostdevcommon/common_values.hpp" + +void kernel_main() { + // kernel_profiler::mark_time(24); + // in0 tensor args + uint32_t in0_tensor_addr = get_arg_val(0); + uint32_t in0_tensor_start_tile_id = get_arg_val(1); + uint32_t in0_tensor_stride_w = get_arg_val(2); + uint32_t in0_tensor_stride_h = get_arg_val(3); + uint32_t in0_tensor_next_block_stride = get_arg_val(4); + + // in0 block args + uint32_t in0_block_w = get_arg_val(5); + uint32_t in0_block_h = get_arg_val(6); + uint32_t in0_block_num_tiles = get_arg_val(7); + + // in1 tensor args + uint32_t in1_tensor_addr = get_arg_val(8); + uint32_t in1_tensor_start_tile_id = get_arg_val(9); + uint32_t in1_tensor_stride_w = get_arg_val(10); + uint32_t in1_tensor_stride_h = get_arg_val(11); + uint32_t in1_tensor_next_block_stride = get_arg_val(12); + + // in1 block args + uint32_t in1_block_w = get_arg_val(13); + uint32_t in1_block_h = get_arg_val(14); + uint32_t in1_block_num_tiles = get_arg_val(15); + + // in0/in1 common args + uint32_t num_blocks = get_arg_val(16); + + // in0 mcast args + uint32_t in0_mcast_dest_noc_start_x = get_arg_val(17); + uint32_t in0_mcast_dest_noc_start_y = get_arg_val(18); + uint32_t in0_mcast_dest_noc_end_x = get_arg_val(19); + uint32_t in0_mcast_dest_noc_end_y = get_arg_val(20); + uint32_t in0_mcast_num_dests = get_arg_val(21); + uint32_t in0_mcast_sender_noc_x = get_arg_val(22); + uint32_t in0_mcast_sender_noc_y = get_arg_val(23); + uint32_t in0_mcast_sender_semaphore_addr = get_arg_val(24); + uint32_t in0_mcast_receiver_semaphore_addr = get_arg_val(25); + + // in1 mcast args + uint32_t in1_mcast_dest_noc_start_x = get_arg_val(26); + uint32_t in1_mcast_dest_noc_start_y = get_arg_val(27); + uint32_t in1_mcast_dest_noc_end_x = get_arg_val(28); + uint32_t in1_mcast_dest_noc_end_y = get_arg_val(29); + uint32_t in1_mcast_num_dests = get_arg_val(30); + uint32_t in1_mcast_sender_noc_x = get_arg_val(31); + uint32_t in1_mcast_sender_noc_y = get_arg_val(32); + uint32_t in1_mcast_sender_semaphore_addr = get_arg_val(33); + uint32_t in1_mcast_receiver_semaphore_addr = get_arg_val(34); + // const args for tile-based bank-swizzled layout + // could be added to the arg list in the future to test different + // bank-swizzling configurations + constexpr uint32_t num_used_dram_ch = 8; + constexpr uint32_t num_used_dram_ch_pow2_exponent = 3; + constexpr uint32_t tile_size_pow2_exponent = 11; + + constexpr uint32_t cb_id_in0 = 0; + constexpr uint32_t cb_id_in1 = 1; + + uint32_t single_tile_size_bytes = get_tile_size(cb_id_in0); + + uint32_t l1_write_addr_in1; + + uint32_t in1_tensor_current_block_start_tile_id = in1_tensor_start_tile_id; + + volatile tt_l1_ptr uint32_t* in0_mcast_receiver_semaphore_addr_ptr = reinterpret_cast(in0_mcast_receiver_semaphore_addr); + + // Set ur local VALID value, to be mcasted to destinations flag address after the data has been mcasted + volatile tt_l1_ptr uint32_t* in1_mcast_receiver_semaphore_addr_ptr = reinterpret_cast(in1_mcast_receiver_semaphore_addr); + *(in1_mcast_receiver_semaphore_addr_ptr) = VALID; + // local address that will be atomically incremented by mcast receivers, to know when all receivers are ready + // to receive the mcast + volatile tt_l1_ptr uint32_t* in1_mcast_sender_semaphore_addr_ptr = reinterpret_cast(in1_mcast_sender_semaphore_addr); + + bool one_time_noc_wait_0 = true; + bool one_time_noc_wait_1 = true; + bool one_time_cb_push = true; + + const InterleavedPow2AddrGen s1 = { + 
.bank_base_address = in1_tensor_addr, + + + .log_base_2_of_page_size = tile_size_pow2_exponent + }; + + for(uint32_t b = 0; b < num_blocks; b++) { + // Operand 0 + cb_reserve_back(cb_id_in0, in0_block_num_tiles); + + // Set in0 semaphore value to INVALID + noc_semaphore_set(in0_mcast_receiver_semaphore_addr_ptr, INVALID); + + // Atomic increment source core counter + uint64_t in0_mcast_sender_semaphore_noc_addr = get_noc_addr(in0_mcast_sender_noc_x, in0_mcast_sender_noc_y, in0_mcast_sender_semaphore_addr); + noc_semaphore_inc(in0_mcast_sender_semaphore_noc_addr, 1); + + // wait on in0 semaphore value to become VALID (set by mcast sender after it multicasts data) + noc_semaphore_wait(in0_mcast_receiver_semaphore_addr_ptr, VALID); + // kernel_profiler::mark_time_once(25, &one_time_noc_wait_0); + + cb_push_back(cb_id_in0, in0_block_num_tiles); + + // Operand 1 + cb_reserve_back(cb_id_in1, in1_block_num_tiles); + l1_write_addr_in1 = get_write_ptr(cb_id_in1); + + uint32_t in1_start_address = l1_write_addr_in1; // copy start address of block, to be used for mcasting + uint32_t in1_block_size_bytes = 0; // can be optimized later, pass it to kernel + + // Copy in1 block into CB, as the default kernel + uint32_t in1_tensor_row_start_tile_id = in1_tensor_current_block_start_tile_id; + for(uint32_t h = 0; h < in1_block_h; h++) { + uint32_t in1_tensor_tile_id = in1_tensor_row_start_tile_id; + for(uint32_t w = 0; w < in1_block_w; w++) { + uint64_t in1_tile_noc_address = get_noc_addr(in1_tensor_tile_id, s1); + noc_async_read(in1_tile_noc_address, l1_write_addr_in1, single_tile_size_bytes); + l1_write_addr_in1 += single_tile_size_bytes; + in1_tensor_tile_id += in1_tensor_stride_w; + in1_block_size_bytes += single_tile_size_bytes; + } + in1_tensor_row_start_tile_id += in1_tensor_stride_h; + } + in1_tensor_current_block_start_tile_id += in1_tensor_next_block_stride; + + // Barrier! make sure the reads are done + noc_async_read_barrier(); + + // wait until all in1 mcast destinations have atomically incremented the in1 semaphore_addr (i.e. its value should be in0_mcast_num_dests), then reset + // the semaphore_addr value back to zero for the next block + noc_semaphore_wait(in1_mcast_sender_semaphore_addr_ptr, in1_mcast_num_dests); + noc_semaphore_set(in1_mcast_sender_semaphore_addr_ptr, 0); + // kernel_profiler::mark_time_once(26, &one_time_noc_wait_1); + + // Now we have the block in the CB address, we can mcast to dests! + uint64_t in1_multicast_data_addr = get_noc_multicast_addr( + in1_mcast_dest_noc_start_x, + in1_mcast_dest_noc_start_y, + in1_mcast_dest_noc_end_x, + in1_mcast_dest_noc_end_y, + in1_start_address); + // num_dests must not include source, since we are NOT really doing a local copy! + noc_async_write_multicast(in1_start_address, in1_multicast_data_addr, in1_block_size_bytes, in1_mcast_num_dests); + + // Note: no need for write barrier, since these two multicasts are done on the same noc id, same vc, same cmd_buf + // Also, this only works because we are setting VCs statically (using NOC_CMD_STATIC_VC). + + // We should also multicast the flag to destinations + uint64_t in1_mcast_receiver_semaphore_noc_addr = get_noc_multicast_addr( + in1_mcast_dest_noc_start_x, + in1_mcast_dest_noc_start_y, + in1_mcast_dest_noc_end_x, + in1_mcast_dest_noc_end_y, + in1_mcast_receiver_semaphore_addr); + // num_dests must not include source, since we are NOT really doing a local copy! 
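+        // Multicast VALID into every in1 receiver's semaphore slot, releasing the cores blocked in
+        // noc_semaphore_wait(). As noted above, ordering on the shared statically assigned VC ensures
+        // this flag cannot overtake the in1 data block sent just before it, so no extra write barrier
+        // is needed here.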
+ noc_semaphore_set_multicast(in1_mcast_receiver_semaphore_addr, in1_mcast_receiver_semaphore_noc_addr, in1_mcast_num_dests); + + cb_push_back(cb_id_in1, in1_block_num_tiles); + // kernel_profiler::mark_time_once(27, &one_time_cb_push); + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_tile_layout_in0_sender_in1_receiver.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_tile_layout_in0_sender_in1_receiver.cpp new file mode 100644 index 00000000000..dd29b87ec0f --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_tile_layout_in0_sender_in1_receiver.cpp @@ -0,0 +1,169 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "dataflow_api.h" +#include "hostdevcommon/common_values.hpp" + +void kernel_main() { + // kernel_profiler::mark_time(39); + // in0 tensor args + uint32_t in0_tensor_addr = get_arg_val(0); + uint32_t in0_tensor_start_tile_id = get_arg_val(1); + uint32_t in0_tensor_stride_w = get_arg_val(2); + uint32_t in0_tensor_stride_h = get_arg_val(3); + uint32_t in0_tensor_next_block_stride = get_arg_val(4); + + // in0 block args + uint32_t in0_block_w = get_arg_val(5); + uint32_t in0_block_h = get_arg_val(6); + uint32_t in0_block_num_tiles = get_arg_val(7); + + // in1 tensor args + uint32_t in1_tensor_addr = get_arg_val(8); + uint32_t in1_tensor_start_tile_id = get_arg_val(9); + uint32_t in1_tensor_stride_w = get_arg_val(10); + uint32_t in1_tensor_stride_h = get_arg_val(11); + uint32_t in1_tensor_next_block_stride = get_arg_val(12); + + // in1 block args + uint32_t in1_block_w = get_arg_val(13); + uint32_t in1_block_h = get_arg_val(14); + uint32_t in1_block_num_tiles = get_arg_val(15); + + // in0/in1 common args + uint32_t num_blocks = get_arg_val(16); + + // in0 mcast args + uint32_t in0_mcast_dest_noc_start_x = get_arg_val(17); + uint32_t in0_mcast_dest_noc_start_y = get_arg_val(18); + uint32_t in0_mcast_dest_noc_end_x = get_arg_val(19); + uint32_t in0_mcast_dest_noc_end_y = get_arg_val(20); + uint32_t in0_mcast_num_dests = get_arg_val(21); + uint32_t in0_mcast_sender_noc_x = get_arg_val(22); + uint32_t in0_mcast_sender_noc_y = get_arg_val(23); + uint32_t in0_mcast_sender_semaphore_addr = get_arg_val(24); + uint32_t in0_mcast_receiver_semaphore_addr = get_arg_val(25); + + // in1 mcast args + uint32_t in1_mcast_dest_noc_start_x = get_arg_val(26); + uint32_t in1_mcast_dest_noc_start_y = get_arg_val(27); + uint32_t in1_mcast_dest_noc_end_x = get_arg_val(28); + uint32_t in1_mcast_dest_noc_end_y = get_arg_val(29); + uint32_t in1_mcast_num_dests = get_arg_val(30); + uint32_t in1_mcast_sender_noc_x = get_arg_val(31); + uint32_t in1_mcast_sender_noc_y = get_arg_val(32); + uint32_t in1_mcast_sender_semaphore_addr = get_arg_val(33); + uint32_t in1_mcast_receiver_semaphore_addr = get_arg_val(34); + + // const args for tile-based bank-swizzled layout + // could be added to the arg list in the future to test different + // bank-swizzling configurations + constexpr uint32_t num_used_dram_ch = 8; + constexpr uint32_t num_used_dram_ch_pow2_exponent = 3; + constexpr uint32_t tile_size_pow2_exponent = 11; + + constexpr uint32_t cb_id_in0 = 0; + constexpr uint32_t cb_id_in1 = 1; + + uint32_t single_tile_size_bytes = get_tile_size(cb_id_in0); + + uint32_t l1_write_addr_in0; + + uint32_t in0_tensor_current_block_start_tile_id = in0_tensor_start_tile_id; + + // Set ur local VALID value, to be mcasted to destinations flag address after the data has been mcasted + 
volatile tt_l1_ptr uint32_t* in0_mcast_receiver_semaphore_addr_ptr = reinterpret_cast(in0_mcast_receiver_semaphore_addr); + *(in0_mcast_receiver_semaphore_addr_ptr) = VALID; + // local address that will be atomically incremented by mcast receivers, to know when all receivers are ready + // to receive the mcast + volatile tt_l1_ptr uint32_t* in0_mcast_sender_semaphore_addr_ptr = reinterpret_cast(in0_mcast_sender_semaphore_addr); + + + volatile tt_l1_ptr uint32_t* in1_mcast_receiver_semaphore_addr_ptr = reinterpret_cast(in1_mcast_receiver_semaphore_addr); + + bool one_time_noc_wait_0 = true; + bool one_time_noc_wait_1 = true; + bool one_time_cb_push = true; + + const InterleavedPow2AddrGen s0 = { + .bank_base_address = in0_tensor_addr, + + + .log_base_2_of_page_size = tile_size_pow2_exponent + }; + + for(uint32_t b = 0; b < num_blocks; b++) { + cb_reserve_back(cb_id_in0, in0_block_num_tiles); + l1_write_addr_in0 = get_write_ptr(cb_id_in0); + + uint32_t in0_start_address = l1_write_addr_in0; // copy start address of block, to be used for mcasting + uint32_t in0_block_size_bytes = 0; // can be optimized later, pass it to kernel + + // Copy in0 block into CB, as the default kernel + uint32_t in0_tensor_row_start_tile_id = in0_tensor_current_block_start_tile_id; + for(uint32_t h = 0; h < in0_block_h; h++) { + uint32_t in0_tensor_tile_id = in0_tensor_row_start_tile_id; + for(uint32_t w = 0; w < in0_block_w; w++) { + uint64_t in0_tile_noc_address = get_noc_addr(in0_tensor_tile_id, s0); + noc_async_read(in0_tile_noc_address, l1_write_addr_in0, single_tile_size_bytes); + l1_write_addr_in0 += single_tile_size_bytes; + in0_tensor_tile_id += in0_tensor_stride_w; + in0_block_size_bytes += single_tile_size_bytes; + } + in0_tensor_row_start_tile_id += in0_tensor_stride_h; + } + in0_tensor_current_block_start_tile_id += in0_tensor_next_block_stride; + + // Barrier! make sure the reads are done + noc_async_read_barrier(); + + // wait until all in0 mcast destinations have atomically incremented the in0 semaphore_addr (i.e. its value should be in0_mcast_num_dests), then reset + // the semaphore_addr value back to zero for the next block + noc_semaphore_wait(in0_mcast_sender_semaphore_addr_ptr, in0_mcast_num_dests); + noc_semaphore_set(in0_mcast_sender_semaphore_addr_ptr, 0); + // kernel_profiler::mark_time_once(40, &one_time_noc_wait_0); + // Now we have the block in the CB address, we can mcast to dests! + uint64_t in0_multicast_data_addr = get_noc_multicast_addr( + in0_mcast_dest_noc_end_x, + in0_mcast_dest_noc_end_y, + in0_mcast_dest_noc_start_x, + in0_mcast_dest_noc_start_y, + in0_start_address); + // num_dests must not include source, since we are NOT really doing a local copy! + noc_async_write_multicast(in0_start_address, in0_multicast_data_addr, in0_block_size_bytes, in0_mcast_num_dests); + + // Note: no need for write barrier, since these two multicasts are done on the same noc id, same vc, same cmd_buf + // Also, this only works because we are setting VCs statically (using NOC_CMD_STATIC_VC). + + // We should also multicast the flag to destinations + uint64_t in0_mcast_receiver_semaphore_noc_addr = get_noc_multicast_addr( + in0_mcast_dest_noc_end_x, + in0_mcast_dest_noc_end_y, + in0_mcast_dest_noc_start_x, + in0_mcast_dest_noc_start_y, + in0_mcast_receiver_semaphore_addr); + // num_dests must not include source, since we are NOT really doing a local copy! 
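+        // Multicast VALID to the in0 receiver semaphore on every destination core; once released from
+        // noc_semaphore_wait(), each receiver pushes the block that was deposited directly into its CB.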
+ noc_semaphore_set_multicast(in0_mcast_receiver_semaphore_addr, in0_mcast_receiver_semaphore_noc_addr, in0_mcast_num_dests); + + cb_push_back(cb_id_in0, in0_block_num_tiles); + + + // Operand 1 + cb_reserve_back(cb_id_in1, in1_block_num_tiles); + + // Set in1 semaphore value to INVALID + noc_semaphore_set(in1_mcast_receiver_semaphore_addr_ptr, INVALID); + + uint64_t in1_mcast_sender_semaphore_noc_addr = get_noc_addr(in1_mcast_sender_noc_x, in1_mcast_sender_noc_y, in1_mcast_sender_semaphore_addr); + noc_semaphore_inc(in1_mcast_sender_semaphore_noc_addr, 1); + + // wait on in1 semaphore value to become VALID (set by mcast sender after it multicasts data) + noc_semaphore_wait(in1_mcast_receiver_semaphore_addr_ptr, VALID); + // kernel_profiler::mark_time_once(41, &one_time_noc_wait_1); + + cb_push_back(cb_id_in1, in1_block_num_tiles); + // kernel_profiler::mark_time_once(42, &one_time_cb_push); + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_tile_layout_in0_sender_in1_sender.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_tile_layout_in0_sender_in1_sender.cpp new file mode 100644 index 00000000000..80896331ed9 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_tile_layout_in0_sender_in1_sender.cpp @@ -0,0 +1,226 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "dataflow_api.h" +#include "hostdevcommon/common_values.hpp" + +void kernel_main() { + // kernel_profiler::mark_time(29); + // in0 tensor args + uint32_t in0_tensor_addr = get_arg_val(0); + uint32_t in0_tensor_start_tile_id = get_arg_val(1); + uint32_t in0_tensor_stride_w = get_arg_val(2); + uint32_t in0_tensor_stride_h = get_arg_val(3); + uint32_t in0_tensor_next_block_stride = get_arg_val(4); + + // in0 block args + uint32_t in0_block_w = get_arg_val(5); + uint32_t in0_block_h = get_arg_val(6); + uint32_t in0_block_num_tiles = get_arg_val(7); + + // in1 tensor args + uint32_t in1_tensor_addr = get_arg_val(8); + uint32_t in1_tensor_start_tile_id = get_arg_val(9); + uint32_t in1_tensor_stride_w = get_arg_val(10); + uint32_t in1_tensor_stride_h = get_arg_val(11); + uint32_t in1_tensor_next_block_stride = get_arg_val(12); + + // in1 block args + uint32_t in1_block_w = get_arg_val(13); + uint32_t in1_block_h = get_arg_val(14); + uint32_t in1_block_num_tiles = get_arg_val(15); + + // in0/in1 common args + uint32_t num_blocks = get_arg_val(16); + + // in0 mcast args + uint32_t in0_mcast_dest_noc_start_x = get_arg_val(17); + uint32_t in0_mcast_dest_noc_start_y = get_arg_val(18); + uint32_t in0_mcast_dest_noc_end_x = get_arg_val(19); + uint32_t in0_mcast_dest_noc_end_y = get_arg_val(20); + uint32_t in0_mcast_num_dests = get_arg_val(21); + uint32_t in0_mcast_sender_noc_x = get_arg_val(22); + uint32_t in0_mcast_sender_noc_y = get_arg_val(23); + uint32_t in0_mcast_sender_semaphore_addr = get_arg_val(24); + uint32_t in0_mcast_receiver_semaphore_addr = get_arg_val(25); + + // in1 mcast args + uint32_t in1_mcast_dest_noc_start_x = get_arg_val(26); + uint32_t in1_mcast_dest_noc_start_y = get_arg_val(27); + uint32_t in1_mcast_dest_noc_end_x = get_arg_val(28); + uint32_t in1_mcast_dest_noc_end_y = get_arg_val(29); + uint32_t in1_mcast_num_dests = get_arg_val(30); + uint32_t in1_mcast_sender_noc_x = get_arg_val(31); + uint32_t in1_mcast_sender_noc_y = get_arg_val(32); + uint32_t in1_mcast_sender_semaphore_addr = get_arg_val(33); + uint32_t in1_mcast_receiver_semaphore_addr = get_arg_val(34); + + + // 
const args for tile-based bank-swizzled layout + // could be added to the arg list in the future to test different + // bank-swizzling configurations + constexpr uint32_t num_used_dram_ch = 8; + constexpr uint32_t num_used_dram_ch_pow2_exponent = 3; + constexpr uint32_t tile_size_pow2_exponent = 11; + + constexpr uint32_t cb_id_in0 = 0; + constexpr uint32_t cb_id_in1 = 1; + + uint32_t single_tile_size_bytes = get_tile_size(cb_id_in0); + + uint32_t l1_write_addr_in0; + uint32_t l1_write_addr_in1; + + uint32_t in0_tensor_current_block_start_tile_id = in0_tensor_start_tile_id; + uint32_t in1_tensor_current_block_start_tile_id = in1_tensor_start_tile_id; + + // Set ur local VALID value, to be mcasted to destinations flag address after the data has been mcasted + volatile tt_l1_ptr uint32_t* in0_mcast_receiver_semaphore_addr_ptr = reinterpret_cast(in0_mcast_receiver_semaphore_addr); + *(in0_mcast_receiver_semaphore_addr_ptr) = VALID; + + // Set ur local VALID value, to be mcasted to destinations flag address after the data has been mcasted + volatile tt_l1_ptr uint32_t* in1_mcast_receiver_semaphore_addr_ptr = reinterpret_cast(in1_mcast_receiver_semaphore_addr); + *(in1_mcast_receiver_semaphore_addr_ptr) = VALID; + // local address that will be atomically incremented by mcast receivers, to know when all receivers are ready + // to receive the mcast + volatile tt_l1_ptr uint32_t* in0_mcast_sender_semaphore_addr_ptr = reinterpret_cast(in0_mcast_sender_semaphore_addr); + + // local address that will be atomically incremented by mcast receivers, to know when all receivers are ready + // to receive the mcast + volatile tt_l1_ptr uint32_t* in1_mcast_sender_semaphore_addr_ptr = reinterpret_cast(in1_mcast_sender_semaphore_addr); + + bool one_time_noc_wait_0 = true; + bool one_time_noc_wait_1 = true; + bool one_time_cb_push = true; + + const InterleavedPow2AddrGen s0 = { + .bank_base_address = in0_tensor_addr, + + + .log_base_2_of_page_size = tile_size_pow2_exponent + }; + + const InterleavedPow2AddrGen s1 = { + .bank_base_address = in1_tensor_addr, + + + .log_base_2_of_page_size = tile_size_pow2_exponent + }; + + + for(uint32_t b = 0; b < num_blocks; b++) { + cb_reserve_back(cb_id_in0, in0_block_num_tiles); + l1_write_addr_in0 = get_write_ptr(cb_id_in0); + + uint32_t in0_start_address = l1_write_addr_in0; // copy start address of block, to be used for mcasting + uint32_t in0_block_size_bytes = 0; // can be optimized later, pass it to kernel + + // Copy in0 block into CB, as the default kernel + uint32_t in0_tensor_row_start_tile_id = in0_tensor_current_block_start_tile_id; + for(uint32_t h = 0; h < in0_block_h; h++) { + uint32_t in0_tensor_tile_id = in0_tensor_row_start_tile_id; + for(uint32_t w = 0; w < in0_block_w; w++) { + uint64_t in0_tile_noc_address = get_noc_addr(in0_tensor_tile_id, s0); + noc_async_read(in0_tile_noc_address, l1_write_addr_in0, single_tile_size_bytes); + l1_write_addr_in0 += single_tile_size_bytes; + in0_tensor_tile_id += in0_tensor_stride_w; + in0_block_size_bytes += single_tile_size_bytes; + } + in0_tensor_row_start_tile_id += in0_tensor_stride_h; + } + in0_tensor_current_block_start_tile_id += in0_tensor_next_block_stride; + + // Barrier! make sure the reads are done + noc_async_read_barrier(); + + // wait until all in0 mcast destinations have atomically incremented the in0 semaphore_addr (i.e. 
its value should be in0_mcast_num_dests), then reset + // the semaphore_addr value back to zero for the next block + noc_semaphore_wait(in0_mcast_sender_semaphore_addr_ptr, in0_mcast_num_dests); + noc_semaphore_set(in0_mcast_sender_semaphore_addr_ptr, 0); + // kernel_profiler::mark_time_once(30, &one_time_noc_wait_0); + + // Now we have the block in the CB address, we can mcast to dests! + uint64_t in0_multicast_data_addr = get_noc_multicast_addr( + in0_mcast_dest_noc_end_x, + in0_mcast_dest_noc_end_y, + in0_mcast_dest_noc_start_x, + in0_mcast_dest_noc_start_y, + in0_start_address); + // num_dests must not include source, since we are NOT really doing a local copy! + noc_async_write_multicast(in0_start_address, in0_multicast_data_addr, in0_block_size_bytes, in0_mcast_num_dests); + + // Note: no need for write barrier, since these two multicasts are done on the same noc id, same vc, same cmd_buf + // Also, this only works because we are setting VCs statically (using NOC_CMD_STATIC_VC). + + // We should also multicast the flag to destinations + uint64_t in0_mcast_receiver_semaphore_noc_addr = get_noc_multicast_addr( + in0_mcast_dest_noc_end_x, + in0_mcast_dest_noc_end_y, + in0_mcast_dest_noc_start_x, + in0_mcast_dest_noc_start_y, + in0_mcast_receiver_semaphore_addr); + // num_dests must not include source, since we are NOT really doing a local copy! + noc_semaphore_set_multicast(in0_mcast_receiver_semaphore_addr, in0_mcast_receiver_semaphore_noc_addr, in0_mcast_num_dests); + + cb_push_back(cb_id_in0, in0_block_num_tiles); + + // Operand 1 + cb_reserve_back(cb_id_in1, in1_block_num_tiles); + l1_write_addr_in1 = get_write_ptr(cb_id_in1); + + uint32_t in1_start_address = l1_write_addr_in1; // copy start address of block, to be used for mcasting + uint32_t in1_block_size_bytes = 0; // can be optimized later, pass it to kernel + + // Copy in1 block into CB, as the default kernel + uint32_t in1_tensor_row_start_tile_id = in1_tensor_current_block_start_tile_id; + for(uint32_t h = 0; h < in1_block_h; h++) { + uint32_t in1_tensor_tile_id = in1_tensor_row_start_tile_id; + for(uint32_t w = 0; w < in1_block_w; w++) { + uint64_t in1_tile_noc_address = get_noc_addr(in1_tensor_tile_id, s1); + noc_async_read(in1_tile_noc_address, l1_write_addr_in1, single_tile_size_bytes); + l1_write_addr_in1 += single_tile_size_bytes; + in1_tensor_tile_id += in1_tensor_stride_w; + in1_block_size_bytes += single_tile_size_bytes; + } + in1_tensor_row_start_tile_id += in1_tensor_stride_h; + } + in1_tensor_current_block_start_tile_id += in1_tensor_next_block_stride; + + // Barrier! make sure the reads are done + noc_async_read_barrier(); + + // wait until all in1 mcast destinations have atomically incremented the in1 semaphore_addr (i.e. its value should be in0_mcast_num_dests), then reset + // the semaphore_addr value back to zero for the next block + noc_semaphore_wait(in1_mcast_sender_semaphore_addr_ptr, in1_mcast_num_dests); + noc_semaphore_set(in1_mcast_sender_semaphore_addr_ptr, 0); + // kernel_profiler::mark_time_once(31, &one_time_noc_wait_1); + + // Now we have the block in the CB address, we can mcast to dests! + uint64_t in1_multicast_data_addr = get_noc_multicast_addr( + in1_mcast_dest_noc_end_x, + in1_mcast_dest_noc_end_y, + in1_mcast_dest_noc_start_x, + in1_mcast_dest_noc_start_y, + in1_start_address); + // num_dests must not include source, since we are NOT really doing a local copy! 
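+        // Multicast the in1 block staged in the local CB to in1_start_address on every core in the
+        // destination rectangle (this assumes the receivers' in1 CBs are configured at the same L1
+        // address as the sender's, so the data lands directly in their CBs).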
+ noc_async_write_multicast(in1_start_address, in1_multicast_data_addr, in1_block_size_bytes, in1_mcast_num_dests); + + // Note: no need for write barrier, since these two multicasts are done on the same noc id, same vc, same cmd_buf + // Also, this only works because we are setting VCs statically (using NOC_CMD_STATIC_VC). + + // We should also multicast the flag to destinations + uint64_t in1_mcast_receiver_semaphore_noc_addr = get_noc_multicast_addr( + in1_mcast_dest_noc_end_x, + in1_mcast_dest_noc_end_y, + in1_mcast_dest_noc_start_x, + in1_mcast_dest_noc_start_y, + in1_mcast_receiver_semaphore_addr); + // num_dests must not include source, since we are NOT really doing a local copy! + noc_semaphore_set_multicast(in1_mcast_receiver_semaphore_addr, in1_mcast_receiver_semaphore_noc_addr, in1_mcast_num_dests); + + cb_push_back(cb_id_in1, in1_block_num_tiles); + // kernel_profiler::mark_time_once(32, &one_time_cb_push); + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_tile_layout_in1_mcast_receiver.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_tile_layout_in1_mcast_receiver.cpp new file mode 100644 index 00000000000..d1ed40fd2f4 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_tile_layout_in1_mcast_receiver.cpp @@ -0,0 +1,110 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "dataflow_api.h" +#include "hostdevcommon/common_values.hpp" + +void kernel_main() { + // kernel_profiler::mark_time(16); + // in0 tensor args + uint32_t in0_tensor_addr = get_arg_val(0); + uint32_t in0_tensor_start_tile_id = get_arg_val(1); + uint32_t in0_tensor_stride_w = get_arg_val(2); + uint32_t in0_tensor_stride_h = get_arg_val(3); + uint32_t in0_tensor_next_block_stride = get_arg_val(4); + + // in0 block args + uint32_t in0_block_w = get_arg_val(5); + uint32_t in0_block_h = get_arg_val(6); + uint32_t in0_block_num_tiles = get_arg_val(7); + + // in1 tensor args + uint32_t in1_tensor_addr = get_arg_val(8); + uint32_t in1_tensor_start_tile_id = get_arg_val(9); + uint32_t in1_tensor_stride_w = get_arg_val(10); + uint32_t in1_tensor_stride_h = get_arg_val(11); + uint32_t in1_tensor_next_block_stride = get_arg_val(12); + + // in1 block args + uint32_t in1_block_w = get_arg_val(13); + uint32_t in1_block_h = get_arg_val(14); + uint32_t in1_block_num_tiles = get_arg_val(15); + + // in0/in1 common args + uint32_t num_blocks = get_arg_val(16); + + // in0 mcast args + uint32_t in1_mcast_dest_noc_start_x = get_arg_val(17); + uint32_t in1_mcast_dest_noc_start_y = get_arg_val(18); + uint32_t in1_mcast_dest_noc_end_x = get_arg_val(19); + uint32_t in1_mcast_dest_noc_end_y = get_arg_val(20); + uint32_t in1_mcast_num_dests = get_arg_val(21); + uint32_t in1_mcast_sender_noc_x = get_arg_val(22); + uint32_t in1_mcast_sender_noc_y = get_arg_val(23); + uint32_t in1_mcast_sender_semaphore_addr = get_arg_val(24); + uint32_t in1_mcast_receiver_semaphore_addr = get_arg_val(25); + + constexpr uint32_t num_used_dram_ch = 8; + constexpr uint32_t num_used_dram_ch_pow2_exponent = 3; + constexpr uint32_t tile_size_pow2_exponent = 11; + + constexpr uint32_t cb_id_in0 = 0; + constexpr uint32_t cb_id_in1 = 1; + + uint32_t single_tile_size_bytes = get_tile_size(cb_id_in1); + + uint32_t l1_write_addr_in0; + + uint32_t in0_tensor_current_block_start_tile_id = in0_tensor_start_tile_id; + volatile tt_l1_ptr uint32_t* in1_mcast_receiver_semaphore_addr_ptr = 
reinterpret_cast(in1_mcast_receiver_semaphore_addr); + + bool one_time_noc_wait = true; + bool one_time_cb_push = true; + + const InterleavedPow2AddrGen s0 = { + .bank_base_address = in0_tensor_addr, + + + .log_base_2_of_page_size = tile_size_pow2_exponent + }; + + for(uint32_t b = 0; b < num_blocks; b++) { + // Operand 0 + cb_reserve_back(cb_id_in0, in0_block_num_tiles); + l1_write_addr_in0 = get_write_ptr(cb_id_in0); + + uint32_t in0_tensor_row_start_tile_id = in0_tensor_current_block_start_tile_id; + for(uint32_t h = 0; h < in0_block_h; h++) { + uint32_t in0_tensor_tile_id = in0_tensor_row_start_tile_id; + for(uint32_t w = 0; w < in0_block_w; w++) { + uint64_t in0_tile_noc_addr = get_noc_addr(in0_tensor_tile_id, s0); + noc_async_read(in0_tile_noc_addr, l1_write_addr_in0, single_tile_size_bytes); + l1_write_addr_in0 += single_tile_size_bytes; + in0_tensor_tile_id += in0_tensor_stride_w; + } + in0_tensor_row_start_tile_id += in0_tensor_stride_h; + } + in0_tensor_current_block_start_tile_id += in0_tensor_next_block_stride; + + noc_async_read_barrier(); + + // Operand 1 + cb_reserve_back(cb_id_in1, in1_block_num_tiles); + + // Set in0 semaphore value to INVALID + noc_semaphore_set(in1_mcast_receiver_semaphore_addr_ptr, INVALID); + + uint64_t in1_mcast_sender_semaphore_noc_addr = get_noc_addr(in1_mcast_sender_noc_x, in1_mcast_sender_noc_y, in1_mcast_sender_semaphore_addr); + noc_semaphore_inc(in1_mcast_sender_semaphore_noc_addr, 1); + + // wait on in0 semaphore value to become VALID (set by mcast sender after it multicasts data) + noc_semaphore_wait(in1_mcast_receiver_semaphore_addr_ptr, VALID); + // kernel_profiler::mark_time_once(17, &one_time_noc_wait); + + cb_push_back(cb_id_in0, in0_block_num_tiles); + cb_push_back(cb_id_in1, in1_block_num_tiles); + // kernel_profiler::mark_time_once(18, &one_time_cb_push); + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_tile_layout_in1_mcast_sender.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_tile_layout_in1_mcast_sender.cpp new file mode 100644 index 00000000000..e2a14e90e00 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_tile_layout_in1_mcast_sender.cpp @@ -0,0 +1,166 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "dataflow_api.h" +#include "hostdevcommon/common_values.hpp" + +void kernel_main() { + // kernel_profiler::mark_time(20); + // in0 tensor args + uint32_t in0_tensor_addr = get_arg_val(0); + uint32_t in0_tensor_start_tile_id = get_arg_val(1); + uint32_t in0_tensor_stride_w = get_arg_val(2); + uint32_t in0_tensor_stride_h = get_arg_val(3); + uint32_t in0_tensor_next_block_stride = get_arg_val(4); + + // in0 block args + uint32_t in0_block_w = get_arg_val(5); + uint32_t in0_block_h = get_arg_val(6); + uint32_t in0_block_num_tiles = get_arg_val(7); + + // in1 tensor args + uint32_t in1_tensor_addr = get_arg_val(8); + uint32_t in1_tensor_start_tile_id = get_arg_val(9); + uint32_t in1_tensor_stride_w = get_arg_val(10); + uint32_t in1_tensor_stride_h = get_arg_val(11); + uint32_t in1_tensor_next_block_stride = get_arg_val(12); + + // in1 block args + uint32_t in1_block_w = get_arg_val(13); + uint32_t in1_block_h = get_arg_val(14); + uint32_t in1_block_num_tiles = get_arg_val(15); + + // in0/in1 common args + uint32_t num_blocks = get_arg_val(16); + + // in0 mcast args + uint32_t in1_mcast_dest_noc_start_x = get_arg_val(17); + uint32_t in1_mcast_dest_noc_start_y = get_arg_val(18); + uint32_t in1_mcast_dest_noc_end_x = get_arg_val(19); + uint32_t in1_mcast_dest_noc_end_y = get_arg_val(20); + uint32_t in1_mcast_num_dests = get_arg_val(21); + uint32_t in1_mcast_sender_noc_x = get_arg_val(22); + uint32_t in1_mcast_sender_noc_y = get_arg_val(23); + uint32_t in1_mcast_sender_semaphore_addr = get_arg_val(24); + uint32_t in1_mcast_receiver_semaphore_addr = get_arg_val(25); + + // const args for tile-based bank-swizzled layout + // could be added to the arg list in the future to test different + // bank-swizzling configurations + constexpr uint32_t num_used_dram_ch = 8; + constexpr uint32_t num_used_dram_ch_pow2_exponent = 3; + constexpr uint32_t tile_size_pow2_exponent = 11; + + const InterleavedPow2AddrGen s0 = { + .bank_base_address = in0_tensor_addr, + + + .log_base_2_of_page_size = tile_size_pow2_exponent + }; + + const InterleavedPow2AddrGen s1 = { + .bank_base_address = in1_tensor_addr, + + + .log_base_2_of_page_size = tile_size_pow2_exponent + }; + + constexpr uint32_t cb_id_in0 = 0; + constexpr uint32_t cb_id_in1 = 1; + + uint32_t single_tile_size_bytes = get_tile_size(cb_id_in0); + + uint32_t l1_write_addr_in0; + uint32_t l1_write_addr_in1; + + uint32_t in0_tensor_current_block_start_tile_id = in0_tensor_start_tile_id; + uint32_t in1_tensor_current_block_start_tile_id = in1_tensor_start_tile_id; + + // Set ur local VALID value, to be mcasted to destinations flag address after the data has been mcasted + volatile tt_l1_ptr uint32_t* in1_mcast_receiver_semaphore_addr_ptr = reinterpret_cast(in1_mcast_receiver_semaphore_addr); + *(in1_mcast_receiver_semaphore_addr_ptr) = VALID; + // local address that will be atomically incremented by mcast receivers, to know when all receivers are ready + // to receive the mcast + volatile tt_l1_ptr uint32_t* in1_mcast_sender_semaphore_addr_ptr = reinterpret_cast(in1_mcast_sender_semaphore_addr); + + bool one_time_noc_wait = true; + bool one_time_cb_push = true; + + for(uint32_t b = 0; b < num_blocks; b++) { + cb_reserve_back(cb_id_in0, in0_block_num_tiles); + l1_write_addr_in0 = get_write_ptr(cb_id_in0); + + // Copy in0 block into CB, as the default kernel + uint32_t in0_tensor_row_start_tile_id = in0_tensor_current_block_start_tile_id; + for(uint32_t h = 0; h < in0_block_h; h++) { + uint32_t 
in0_tensor_tile_id = in0_tensor_row_start_tile_id; + for(uint32_t w = 0; w < in0_block_w; w++) { + uint64_t in0_tile_noc_address = get_noc_addr(in0_tensor_tile_id, s0); + noc_async_read(in0_tile_noc_address, l1_write_addr_in0, single_tile_size_bytes); + l1_write_addr_in0 += single_tile_size_bytes; + in0_tensor_tile_id += in0_tensor_stride_w; + } + in0_tensor_row_start_tile_id += in0_tensor_stride_h; + } + in0_tensor_current_block_start_tile_id += in0_tensor_next_block_stride; + + // Barrier! make sure the reads are done + noc_async_read_barrier(); + + + // Operand 1 + cb_reserve_back(cb_id_in1, in1_block_num_tiles); + l1_write_addr_in1 = get_write_ptr(cb_id_in1); + + uint32_t in1_start_address = l1_write_addr_in1; // copy start address of block, to be used for mcasting + uint32_t in1_block_size_bytes = 0; // can be optimized later, pass it to kernel + + uint32_t in1_tensor_row_start_tile_id = in1_tensor_current_block_start_tile_id; + for(uint32_t h = 0; h < in1_block_h; h++) { + uint32_t in1_tensor_tile_id = in1_tensor_row_start_tile_id; + for(uint32_t w = 0; w < in1_block_w; w++) { + uint64_t in1_tile_noc_addr = get_noc_addr(in1_tensor_tile_id, s1); + noc_async_read(in1_tile_noc_addr, l1_write_addr_in1, single_tile_size_bytes); + l1_write_addr_in1 += single_tile_size_bytes; + in1_tensor_tile_id += in1_tensor_stride_w; + in1_block_size_bytes += single_tile_size_bytes; + } + in1_tensor_row_start_tile_id += in1_tensor_stride_h; + } + in1_tensor_current_block_start_tile_id += in1_tensor_next_block_stride; + + noc_async_read_barrier(); + + // wait until all in1 mcast destinations have atomically incremented the in1 semaphore_addr (i.e. its value should be in0_mcast_num_dests), then reset + // the semaphore_addr value back to zero for the next block + noc_semaphore_wait(in1_mcast_sender_semaphore_addr_ptr, in1_mcast_num_dests); + noc_semaphore_set(in1_mcast_sender_semaphore_addr_ptr, 0); + // kernel_profiler::mark_time_once(21, &one_time_noc_wait); + + // Now we have the block in the CB address, we can mcast to dests! + uint64_t in1_multicast_data_addr = get_noc_multicast_addr( + in1_mcast_dest_noc_start_x, + in1_mcast_dest_noc_start_y, + in1_mcast_dest_noc_end_x, + in1_mcast_dest_noc_end_y, + in1_start_address); + // num_dests must not include source, since we are NOT really doing a local copy! + noc_async_write_multicast(in1_start_address, in1_multicast_data_addr, in1_block_size_bytes, in1_mcast_num_dests); + noc_async_write_barrier(); + // We should also multicast the flag to destinations + uint64_t in1_mcast_receiver_semaphore_noc_addr = get_noc_multicast_addr( + in1_mcast_dest_noc_start_x, + in1_mcast_dest_noc_start_y, + in1_mcast_dest_noc_end_x, + in1_mcast_dest_noc_end_y, + in1_mcast_receiver_semaphore_addr); + // num_dests must not include source, since we are NOT really doing a local copy! + noc_semaphore_set_multicast(in1_mcast_receiver_semaphore_addr, in1_mcast_receiver_semaphore_noc_addr, in1_mcast_num_dests); + + cb_push_back(cb_id_in0, in0_block_num_tiles); + cb_push_back(cb_id_in1, in1_block_num_tiles); + // kernel_profiler::mark_time_once(22, &one_time_cb_push); + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_with_bias_blocked.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_with_bias_blocked.cpp new file mode 100644 index 00000000000..a8861beac39 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_with_bias_blocked.cpp @@ -0,0 +1,75 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "dataflow_api.h" + +void kernel_main() { + uint32_t src0_addr = get_arg_val(0); + uint32_t src0_noc_x = get_arg_val(1); + uint32_t src0_noc_y = get_arg_val(2); + uint32_t src1_addr = get_arg_val(3); + uint32_t src1_noc_x = get_arg_val(4); + uint32_t src1_noc_y = get_arg_val(5); + uint32_t num_blocks = get_arg_val(6); + + uint32_t in0_block_tile_cnt = get_arg_val(7); + uint32_t in1_block_tile_cnt = get_arg_val(8); + uint32_t in0_block_size_bytes = get_arg_val(9); + uint32_t in1_block_size_bytes = get_arg_val(10); + + uint32_t with_bias = get_arg_val(11); + uint32_t src2_addr; + uint32_t src2_noc_x; + uint32_t src2_noc_y; + uint32_t in2_block_tile_cnt; + uint32_t in2_block_size_bytes; + + if (with_bias) { + src2_addr = get_arg_val(12); + src2_noc_x = get_arg_val(13); + src2_noc_y = get_arg_val(14); + in2_block_tile_cnt = get_arg_val(15); + in2_block_size_bytes = get_arg_val(16); + } + + constexpr uint32_t cb_id_in0 = 0; + constexpr uint32_t cb_id_in1 = 1; + constexpr uint32_t cb_id_in2 = 2; + + uint32_t l1_write_addr_in0; + uint32_t l1_write_addr_in1; + uint32_t l1_write_addr_in2; + + for(uint32_t i = 0; i < num_blocks; i++) { + uint64_t src0_noc_addr = get_noc_addr(src0_noc_x, src0_noc_y, src0_addr); + uint64_t src1_noc_addr = get_noc_addr(src1_noc_x, src1_noc_y, src1_addr); + + cb_reserve_back(cb_id_in0, in0_block_tile_cnt); + cb_reserve_back(cb_id_in1, in1_block_tile_cnt); + + l1_write_addr_in0 = get_write_ptr(cb_id_in0); + l1_write_addr_in1 = get_write_ptr(cb_id_in1); + + noc_async_read(src0_noc_addr, l1_write_addr_in0, in0_block_size_bytes); + noc_async_read(src1_noc_addr, l1_write_addr_in1, in1_block_size_bytes); + + noc_async_read_barrier(); + + cb_push_back(cb_id_in0, in0_block_tile_cnt); + cb_push_back(cb_id_in1, in1_block_tile_cnt); + + src0_addr += in0_block_size_bytes; + src1_addr += in1_block_size_bytes; + } + + if (with_bias) { + uint64_t src2_noc_addr = get_noc_addr(src2_noc_x, src2_noc_y, src2_addr); + l1_write_addr_in2 = get_write_ptr(cb_id_in2); + cb_reserve_back(cb_id_in2, in2_block_tile_cnt); + noc_async_read(src2_noc_addr, l1_write_addr_in2, in2_block_size_bytes); + noc_async_read_barrier(); + cb_push_back(cb_id_in2, in2_block_tile_cnt); + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_nary.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_nary.cpp new file mode 100644 index 00000000000..b2135f798f6 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_nary.cpp @@ -0,0 +1,41 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "dataflow_api.h" + +// Make n reads defined by num_reads +// Writes to Specified Circular Buffers in L1 +// Expects n provided src_addr, src_noc_x, src_noc_y, and cb_id_in +void kernel_main() { + uint32_t num_reads = get_arg_val(0); + uint32_t num_tiles_per_read = get_arg_val(1); + + // ublocks size defined in tiles + constexpr uint32_t ublock_size_tiles = 1; + + for (uint32_t i = 0; i(2 + i * 4); + uint32_t src_noc_x = get_arg_val(3 + i * 4); + uint32_t src_noc_y = get_arg_val(4 + i * 4); + uint32_t cb_id_in = get_arg_val(5 + i * 4); + + uint32_t ublock_size_bytes = get_tile_size(cb_id_in); + + // read a ublock of tiles from src to CB, and then push the ublock to unpacker + for (uint32_t i = 0; i + +#include "dataflow_api.h" + +void kernel_main() { + uint32_t src_addr = get_arg_val(0); + uint32_t src_noc_x = get_arg_val(1); + uint32_t src_noc_y = get_arg_val(2); + uint32_t num_tiles = get_arg_val(3); + + constexpr uint32_t cb_id_in0 = 0; + + // ublocks size defined in tiles + constexpr uint32_t ublock_size_tiles = 1; + uint32_t ublock_size_bytes = get_tile_size(cb_id_in0) * ublock_size_tiles; + + // read a ublock of tiles from src to CB, and then push the ublock to unpacker + for (uint32_t i = 0; i +#include "dataflow_api.h" + +//#include "debug_print.h" + +void generate_bcast_scaler() { + constexpr uint32_t cb_in_2 = 2; + uint32_t scaler = get_arg_val(8); + union { float f; uint32_t u; } u; u.u = scaler; + //DPRINT << "basic Scaler = " << F32(u.f) << ENDL(); + cb_reserve_back(cb_in_2, 1); + auto ptr = reinterpret_cast(get_write_ptr(cb_in_2)); + for (int j = 0; j < 1024; j++) + ptr[j] = uint16_t(0); + + for (int k = 0; k < 4; k++) + for (int j = 0; j < 16; j++) + ptr[k*256 + j] = uint16_t(u.u>>16); + cb_push_back(cb_in_2, 1); +} + +void kernel_main() { + uint32_t src_addr = get_arg_val(0); + uint32_t num_tiles = get_arg_val(3); // same arg index as in reader_unary and in reader_unary_transpose_wh_8bank + + constexpr uint32_t cb_id_in0 = 0, cb_id_in1 = 1; + + // ublocks size defined in tiles + constexpr uint32_t onetile = 1; + uint32_t tile_bytes = get_tile_size(cb_id_in0); + + #ifdef KERNEL_COMPILE_TIME_ARG_0 + constexpr bool read_from_dram = get_compile_time_arg_val(0); + #else + constexpr bool read_from_dram = true; + #endif + + const InterleavedPow2AddrGen src_a = { src_addr, 11 }; + + #if GENERATE_BCAST_SCALER + // TODO(AP): cleanup, probably with named args/param pack/reflection. + generate_bcast_scaler(); + constexpr uint32_t blk = BLOCK_SIZE; + #else + constexpr uint32_t blk = 1; // 1 for correctness for unfused kernels + #endif + + #ifdef TILE_OFFSET + uint32_t tile_offset = TILE_OFFSET; + #else + constexpr uint32_t tile_offset = 0; + #endif + //DPRINT << "Reader Tile offset=" << tile_offset << ENDL(); + + // read a ublock of tiles from src to CB, and then push the ublock to unpacker + uint32_t i_tile = 0; + for (uint32_t i = 0; i num_tiles) ? 
num_tiles - i : blk; + cb_reserve_back(cb_id_in0, rem); + uint32_t l1_write_addr = get_write_ptr(cb_id_in0); + + for (uint32_t r = 0; r +#include "dataflow_api.h" + +void kernel_main() { + uint32_t src_addr = get_arg_val(0); + uint32_t src_noc_x = get_arg_val(1); + uint32_t src_noc_y = get_arg_val(2); + uint32_t num_tiles = get_arg_val(3); + + constexpr uint32_t cb_id_in0 = 0; + + // ublocks size defined in tiles + constexpr uint32_t ublock_size_tiles = 4; + uint32_t ublock_size_bytes = get_tile_size(cb_id_in0) * ublock_size_tiles; + + // read a ublock of tiles from src to CB, and then push the ublock to unpacker + for (uint32_t i = 0; i +#include "dataflow_api.h" + +void kernel_main() { + + // Constexpr + constexpr uint32_t num_dram_channels = 8; + constexpr uint32_t log_base_2_of_num_dram_channels = 3; + constexpr uint32_t cb_id_in0 = 0; + + const uint32_t src_addr = get_arg_val(0); + const uint32_t num_sticks = get_arg_val(1); + const uint32_t stick_size = get_arg_val(2); + + // TODO(agrebenisan): This isn't good... here we are assuming + // that the stick size dictates tiles c, but stick size + // doesn't necessarily need to be divisible by tiles c... + // this is only the case really for tilize + const uint32_t num_tiles_c = stick_size / 64; // Assuming 2 bytes per datum, there are 64 bytes per tile row + uint32_t stick_id = 0; + + constexpr bool stick_size_is_power_of_two = (get_compile_time_arg_val(0) == 1); + #if (stick_size_is_power_of_two) + const uint32_t log_base_2_of_page_size = get_arg_val(3); + const InterleavedPow2AddrGen s = { + .bank_base_address = src_addr, + + + .log_base_2_of_page_size = log_base_2_of_page_size // TODO(AP): refactor + }; + #else + const InterleavedAddrGen s = { + .bank_base_address = src_addr, + + + .page_size = stick_size + }; + #endif + + for (uint32_t i = 0; i < num_sticks / 32; i++) { + // We reserve back an entire tile row and issue a bunch of reads + cb_reserve_back(cb_id_in0, num_tiles_c); + uint32_t l1_write_addr = get_write_ptr(cb_id_in0); + for (uint32_t j = 0; j < 32; j++) { + uint64_t src_noc_addr = get_noc_addr( + stick_id, s); + + uint32_t bank_id = stick_id & (num_dram_channels - 1); + noc_async_read(src_noc_addr, l1_write_addr, stick_size); + l1_write_addr += stick_size; + stick_id++; + } + noc_async_read_barrier(); + cb_push_back(cb_id_in0, num_tiles_c); + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_transpose_wh.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_transpose_wh.cpp new file mode 100644 index 00000000000..c74fe410677 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_transpose_wh.cpp @@ -0,0 +1,46 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "dataflow_api.h" + +void kernel_main() { + uint32_t src_addr = get_arg_val(0); + uint32_t src_noc_x = get_arg_val(1); + uint32_t src_noc_y = get_arg_val(2); + // skip 3 for compat with reader_unary_8bank, reader_unary + uint32_t N = get_arg_val(4); + uint32_t Ht = get_arg_val(5); + uint32_t Wt = get_arg_val(6); + uint32_t HtWt = get_arg_val(7); + uint32_t HtWtTileBytes = HtWt*2048; // TODO(AP): assumed 16-bits + uint32_t WtTileBytes = Wt*2048; // TODO(AP): assumed 16-bits + + constexpr uint32_t cb_id_in0 = 0; + + // ublocks size defined in tiles + constexpr uint32_t onetile = 1; + uint32_t tile_bytes = get_tile_size(cb_id_in0); + + uint32_t src_addrN = src_addr; + // this reader will read a NHW tensor in NWH order + for (uint32_t n = 0; n +#include "dataflow_api.h" + +//#include "debug_print.h" + +void kernel_main() { + uint32_t src_addr = get_arg_val(0); + // skip args 1,2,3 for compat with reader_unary, reader_unary_8bank + uint32_t N = get_arg_val(4); // args match the order of reader_unary + uint32_t Ht = get_arg_val(5); + uint32_t Wt = get_arg_val(6); + uint32_t HtWt = get_arg_val(7); + uint32_t scaler = get_arg_val(8); + + constexpr uint32_t cb_id_in0 = 0; + + // ublocks size defined in tiles + constexpr uint32_t onetile = 1; + uint32_t tile_bytes = get_tile_size(cb_id_in0); + + if (scaler != 0) { + union { float f; uint32_t u; } u; u.u = scaler; + //DPRINT << "TWH Scaler = " << F32(u.f) << ENDL(); + constexpr uint32_t cb_in_2 = 2; + cb_reserve_back(cb_in_2, 1); + auto ptr = reinterpret_cast(get_write_ptr(cb_in_2)); + for (int j = 0; j < 1024; j++) + ptr[j] = uint16_t(0); + + for (int k = 0; k < 4; k++) + for (int j = 0; j < 16; j++) + ptr[k*256 + j] = uint16_t(u.u>>16); + cb_push_back(cb_in_2, 1); + } + + uint32_t i_tile_N = 0; // first tile in current batch + uint32_t i_tile = 0; + + const InterleavedPow2AddrGen s = { + .bank_base_address = src_addr, + + + .log_base_2_of_page_size = 11 + }; + + // this reader will read a NHW tensor in NWH order + for (uint32_t n = 0; n +#include "dataflow_api.h" + +void kernel_main() { + uint32_t src_addr = get_arg_val(0); + uint32_t N = get_arg_val(1); + uint32_t Ht = get_arg_val(2); + uint32_t Wt = get_arg_val(3); + uint32_t HtWt = get_arg_val(4); + + constexpr bool src_is_dram = get_compile_time_arg_val(0) == 1; + constexpr uint32_t cb_id_in0 = 0; + + // ublocks size defined in tiles + constexpr uint32_t onetile = 1; + const uint32_t tile_bytes = get_tile_size(cb_id_in0); + const DataFormat data_format = get_dataformat(cb_id_in0); + + #ifdef REDUCE_SCALER + constexpr uint32_t cb_in_2 = 2; + constexpr uint32_t scaler = get_compile_time_arg_val(1); + cb_reserve_back(cb_in_2, 1); + if (scaler != 0) { + uint16_t u = uint16_t(scaler>>16); + auto ptr = reinterpret_cast(get_write_ptr(cb_in_2)); + for (int j = 0; j < 1024; j++) + ptr[j] = uint16_t(0); + + for (int k = 0; k < 4; k++) + for (int j = 0; j < 16; j++) + ptr[k*256 + j] = u; + + } + cb_push_back(cb_in_2, 1); + #endif + + uint32_t i_tile_N = 0; // first tile in current batch + uint32_t i_tile = 0; + + const InterleavedAddrGenFast s = { + .bank_base_address = src_addr, + .page_size = tile_bytes, + .data_format = data_format + }; + + // this reader will read a NHW tensor in NWH order + for (uint32_t n = 0; n +#include "dataflow_api.h" +#include "hostdevcommon/common_values.hpp" +// #include "tools/profiler/kernel_profiler.hpp" + +void kernel_main() { + + uint32_t sender_noc_x = get_arg_val(0); + uint32_t sender_noc_y = 
get_arg_val(1); + uint32_t num_tiles = get_arg_val(2); + uint32_t sender_semaphore_addr = get_arg_val(3); + uint32_t receiver_semaphore_addr = get_arg_val(4); + uint32_t num_repetitions = get_arg_val(5); + + volatile tt_l1_ptr uint32_t* receiver_semaphore_addr_ptr = reinterpret_cast(receiver_semaphore_addr); + + constexpr uint32_t cb_id = get_compile_time_arg_val(0); + constexpr uint32_t block_size_tiles = get_compile_time_arg_val(1); + + uint32_t block_size_bytes = get_tile_size(cb_id) * block_size_tiles; + + uint64_t sender_semaphore_noc_addr = get_noc_addr(sender_noc_x, sender_noc_y, sender_semaphore_addr); + + for (uint32_t j = 0; j < num_repetitions; j++) { + for (uint32_t i = 0; i +#include "hostdevcommon/common_runtime_address_map.h" +/** + * NOC APIs are prefixed w/ "ncrisc" (legacy name) but there's nothing NCRISC specific, they can be used on BRISC or other RISCs + * Any two RISC processors cannot use the same CMD_BUF + * non_blocking APIs shouldn't be mixed with slow noc.h APIs + * explicit flushes need to be used since the calls are non-blocking + * */ +constexpr static std::uint32_t VALID_VAL = 0x1234; +constexpr static std::uint32_t INVALID_VAL = 0x4321; +void kernel_main() { + std::uint32_t buffer_src_addr = get_arg_val(0); + std::uint32_t src_noc_x = get_arg_val(1); + std::uint32_t src_noc_y = get_arg_val(2); + std::uint32_t buffer_dst_addr = get_arg_val(3); + std::uint32_t dst_noc_x = get_arg_val(4); + std::uint32_t dst_noc_y = get_arg_val(5); + std::uint32_t l1_buffer_address = get_arg_val(6); + std::uint32_t stream_register_address = get_arg_val(7); + std::uint32_t num_tiles = get_arg_val(8); + std::uint32_t transient_buffer_size_tiles = get_arg_val(9); + std::uint32_t transient_buffer_size_bytes = get_arg_val(10); + + // Scratch address in L1, two write register value before we copy it to into local/remote registers + volatile tt_l1_ptr uint32_t* constant_ptr = reinterpret_cast(CONSTANT_REGISTER_VALUE); + *(constant_ptr) = INVALID_VAL; + + std::uint32_t counter = 0; + // src noc address + std::uint64_t src_noc_addr = get_noc_addr(src_noc_x, src_noc_y, buffer_src_addr); + // Local and remote register addresses (used for sync) + std::uint64_t local = get_noc_addr(stream_register_address); + std::uint64_t remote= get_noc_addr(src_noc_x, src_noc_y, stream_register_address); + + std::uint32_t dst_buffer_addr = buffer_dst_addr; + while(counter < num_tiles) { + // Wait until sync register is VALID_VAL (means its safe to read data from source buffer into operand buffer) + wait_for_sync_register_value(stream_register_address, VALID_VAL); + noc_async_read(src_noc_addr, l1_buffer_address, transient_buffer_size_bytes); + noc_async_read_barrier(); + + // DRAM NOC dst address + std::uint64_t dst_noc_addr = get_noc_addr(dst_noc_x, dst_noc_y, dst_buffer_addr); + noc_async_write(l1_buffer_address, dst_noc_addr, transient_buffer_size_bytes); + + dst_buffer_addr += transient_buffer_size_bytes; + + // Write INVALID_VAL into local register + noc_async_write(CONSTANT_REGISTER_VALUE, local, 4); + noc_async_write_barrier(); + + noc_async_write(CONSTANT_REGISTER_VALUE, remote, 4); + noc_async_write_barrier(); + + counter += transient_buffer_size_tiles; + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/remote_read_remote_write_sync_db.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/remote_read_remote_write_sync_db.cpp new file mode 100644 index 00000000000..65d0443c414 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/remote_read_remote_write_sync_db.cpp @@ 
-0,0 +1,74 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "hostdevcommon/common_runtime_address_map.h" +/** + * NOC APIs are prefixed w/ "ncrisc" (legacy name) but there's nothing NCRISC specific, they can be used on BRISC or other RISCs + * Any two RISC processors cannot use the same CMD_BUF + * non_blocking APIs shouldn't be mixed with slow noc.h APIs + * explicit flushes need to be used since the calls are non-blocking + * */ +constexpr static std::uint32_t VALID_VAL = 0x1234; +constexpr static std::uint32_t INVALID_VAL = 0x4321; + +inline std::uint32_t ping_pong_address(std::uint32_t addr1, std::uint32_t addr2, std::uint32_t index) { + if((index & 0x1) == 0) { + return addr1; + } else { + return addr2; + } +} + +void kernel_main() { + std::uint32_t buffer_src_addr1 = get_arg_val(0); + std::uint32_t buffer_src_addr2 = get_arg_val(1); + std::uint32_t src_noc_x = get_arg_val(2); + std::uint32_t src_noc_y = get_arg_val(3); + std::uint32_t buffer_dst_addr = get_arg_val(4); + std::uint32_t dst_noc_x = get_arg_val(5); + std::uint32_t dst_noc_y = get_arg_val(6); + std::uint32_t local_buffer_addr1 = get_arg_val(7); + std::uint32_t local_buffer_addr2 = get_arg_val(8); + std::uint32_t stream_register_address1 = get_arg_val(9); + std::uint32_t stream_register_address2 = get_arg_val(10); + std::uint32_t num_tiles = get_arg_val(11); + std::uint32_t transient_buffer_size_tiles = get_arg_val(12); + std::uint32_t transient_buffer_size_bytes = get_arg_val(13); + + // Scratch address in L1, two write register value before we copy it to into local/remote registers + volatile tt_l1_ptr uint32_t* constant_ptr = reinterpret_cast(CONSTANT_REGISTER_VALUE); + *(constant_ptr) = INVALID_VAL; + + std::uint32_t counter = 0; + std::uint32_t dst_buffer_addr = buffer_dst_addr; + std::uint64_t dst_noc_addr; + while(counter < num_tiles) { + std::uint32_t reg_addr = ping_pong_address(stream_register_address1, stream_register_address2, counter); + std::uint64_t local = get_noc_addr(reg_addr); + std::uint64_t remote = get_noc_addr(src_noc_x, src_noc_y, reg_addr); + std::uint32_t local_buffer_address = ping_pong_address(local_buffer_addr1, local_buffer_addr2, counter); + std::uint32_t src_buffer_address = ping_pong_address(buffer_src_addr1, buffer_src_addr2, counter); + std::uint64_t src_noc_addr = get_noc_addr(src_noc_x, src_noc_y, src_buffer_address); + + // Wait until sync register is VALID_VAL (means its safe to read data from source buffer into operand buffer) + wait_for_sync_register_value(reg_addr, VALID_VAL); + noc_async_read(src_noc_addr, local_buffer_address, transient_buffer_size_bytes); + noc_async_read_barrier(); + + // DRAM NOC dst address + dst_noc_addr = get_noc_addr(dst_noc_x, dst_noc_y, dst_buffer_addr); + noc_async_write(local_buffer_address, dst_noc_addr, transient_buffer_size_bytes); + + dst_buffer_addr += transient_buffer_size_bytes; + + noc_async_write(CONSTANT_REGISTER_VALUE, local, 4); + noc_async_write_barrier(); + + noc_async_write(CONSTANT_REGISTER_VALUE, remote, 4); + noc_async_write_barrier(); + + counter += transient_buffer_size_tiles; + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/sender_intermediate_stage.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/sender_intermediate_stage.cpp new file mode 100644 index 00000000000..f4196705869 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/sender_intermediate_stage.cpp @@ -0,0 +1,65 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include "dataflow_api.h" +#include "hostdevcommon/common_values.hpp" +// #include "tools/profiler/kernel_profiler.hpp" + +void kernel_main() { + + uint32_t receiver_noc_x = get_arg_val(0); + uint32_t receiver_noc_y = get_arg_val(1); + uint32_t num_tiles = get_arg_val(2); + uint32_t sender_semaphore_addr = get_arg_val(3); + uint32_t receiver_semaphore_addr = get_arg_val(4); + uint32_t l1_valid_value_addr = get_arg_val(5); + uint32_t num_repetitions = get_arg_val(6); + + // initialized by the host to 0 before program launch + volatile tt_l1_ptr uint32_t* sender_semaphore_addr_ptr = reinterpret_cast(sender_semaphore_addr); + // local valid value in L1 + volatile tt_l1_ptr uint32_t* l1_valid_value_addr_ptr = reinterpret_cast(l1_valid_value_addr); + *(l1_valid_value_addr_ptr) = VALID; + + constexpr uint32_t cb_id = get_compile_time_arg_val(0); + constexpr uint32_t block_size_tiles = get_compile_time_arg_val(1); + + uint32_t block_size_bytes = get_tile_size(cb_id) * block_size_tiles; + + uint64_t receiver_semaphore_noc_addr = get_noc_addr(receiver_noc_x, receiver_noc_y, receiver_semaphore_addr); + + for (uint32_t j = 0; j < num_repetitions; j++) { + for (uint32_t i = 0; i < num_tiles; i += block_size_tiles) { + + // wait until receiver has set the sender's semaphore_addr value to 1, which means receiver has reserved space in the CB + noc_semaphore_wait(sender_semaphore_addr_ptr, 1); + + if (i > 0) { + cb_pop_front(cb_id, block_size_tiles); + } + cb_wait_front(cb_id, block_size_tiles); + uint32_t l1_addr = get_read_ptr(cb_id); + + // now we have the block in the CB (at l1_addr), we can send to receiver + uint64_t receiver_data_noc_addr = get_noc_addr(receiver_noc_x, receiver_noc_y, l1_addr); + noc_async_write(l1_addr, receiver_data_noc_addr, block_size_bytes); + + // set the sender's semaphore value back to zero for the next block + // we need to reset before we set the receiver's semaphore + noc_semaphore_set(sender_semaphore_addr_ptr, 0); + + // we now set the receiver's semaphore, so that it knows that the data has been written to the CB + // must use noc_semaphore_set_remote and not noc_semaphore_inc in the sender + // because we need to ensure that data is written to the remote CB before we set the semaphore + // noc_async_write and noc_semaphore_set_remote are ordered + noc_semaphore_set_remote(l1_valid_value_addr, receiver_semaphore_noc_addr); + + // this barrier is not needed, sempahore inter-lock already guarantees that we won't overwrite local CB with new data + // ie, it is safe to pop here, because the data in the CB won't actually be overwritten until the receiver has set the semaphore (which means it was received) + // this barrier would hurt performance for smaller transfers (<16KB), but for larger transfers it wouldn't make a difference + // noc_async_write_barrier(); + } + cb_pop_front(cb_id, block_size_tiles); + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/test_compile_args.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/test_compile_args.cpp new file mode 100644 index 00000000000..8e7b4bce00f --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/test_compile_args.cpp @@ -0,0 +1,15 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
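sender_intermediate_stage.cpp above, paired with receiver_intermediate_stage.cpp earlier in this diff, moves blocks between cores with two L1 semaphores: the receiver raises the sender's semaphore once it has reserved circular-buffer space; the sender writes the block, resets its own semaphore, and only then marks the receiver's semaphore VALID. A host-side model of that handshake, with std::atomic standing in for the L1 semaphores and a vector standing in for the receiver-side CB block (illustrative only, not device code):

#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

std::atomic<int> sender_sem{0};    // set to 1 by the receiver once CB space is reserved
std::atomic<int> receiver_sem{0};  // set to VALID by the sender once the data has landed
constexpr int VALID = 1;

std::vector<int> channel(4, 0);    // stands in for the receiver-side CB block

void sender(int num_blocks) {
    for (int i = 0; i < num_blocks; i++) {
        while (sender_sem.load() != 1) {}      // noc_semaphore_wait(sender_semaphore_addr_ptr, 1)
        for (auto& v : channel) v = i;         // stands in for noc_async_write of the block
        sender_sem.store(0);                   // reset before signalling, as the kernel comment requires
        receiver_sem.store(VALID);             // noc_semaphore_set_remote(l1_valid_value_addr, ...)
    }
}

void receiver(int num_blocks) {
    for (int i = 0; i < num_blocks; i++) {
        sender_sem.store(1);                   // receiver signals "CB space reserved" on the sender's semaphore
        while (receiver_sem.load() != VALID) {}
        receiver_sem.store(0);                 // back to INVALID for the next block
        printf("received block %d (payload %d)\n", i, channel[0]);
    }
}

int main() {
    std::thread t1(sender, 3), t2(receiver, 3);
    t1.join(); t2.join();
    return 0;
}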
+// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "dataflow_api.h" +#include "debug_print.h" + +void kernel_main() { + DPRINT<<"Kernel Compile Time Args"< +#include "dataflow_api.h" + +#include "debug_print.h" + +using uint32_t = std::uint32_t; + +// tile index to address +inline uint32_t TADDR(uint32_t ti) { + return ti << 11; +} + +void kernel_main() { + uint32_t src_addr = get_arg_val(0); + uint32_t src_noc_x = get_arg_val(1); + uint32_t src_noc_y = get_arg_val(2); + uint32_t W = get_arg_val(3); + uint32_t H = get_arg_val(4); + uint32_t C = get_arg_val(5); + uint32_t HW = get_arg_val(6); + uint32_t N = get_arg_val(7); + uint32_t CHW = get_arg_val(8); + + auto WT = (W >> 5); // number of tiles in W + auto HT = (H >> 5); // number of tiles in H + auto CT = (C >> 5); // number of tiles in C + auto HTWT = (HW >> 10); // product of HT*WT + auto HW2 = (HW << 1); // HW stride in bytes + auto CHW2 = (CHW << 1); // batch stride in bytes + constexpr uint32_t SUBTILE_LINE_BYTES = (16<<1); + constexpr uint32_t onetile = 1; + constexpr uint32_t operand0 = 0; + + + // The basic idea here is to iterate over output tiles (that will be over CT,WT) and H + // this will generate a linearly incremented output address in the inner loop + // we then reverse map this linear dest address to src address + uint64_t batch_addr = get_noc_addr(src_noc_x, src_noc_y, src_addr); + for (uint32_t n = 0; n < N; n++) { + uint32_t htWT = 0; + for (uint32_t h = 0; h < H; h++) { + uint32_t ctoffs = 0; + for (uint32_t ct = 0; ct < CT; ct++) { + for (uint32_t wt = 0; wt < WT; wt++) { + // what is the source address for the current tile? + // c32 = intra-C-tile loop + // every 32 C's acquire a new output tile address + // DPRINT << "h=" << h << " ct=" << ct << " wt=" << wt << " W=" << W << " HW2=" << HW2 << ENDL(); + + cb_reserve_back(operand0, onetile); + + uint32_t dest_tr0_l1 = get_write_ptr(operand0); + uint32_t save_dest = dest_tr0_l1; + uint32_t cSubtileOffs = 0; + for (uint32_t sub = 0; sub < 4; sub++) { + uint32_t c16offs = cSubtileOffs; + for (uint32_t c16 = 0; c16 < 16; c16++) { + // In this loop sub, c16 are source subtile, c16 + // dest in this loop is varying h implicitly via dest address increment + + // Dest is HCW + // We are iterating over it as H Ct Wt-tiles + // intra-tile FC16 for F going over 4-subtiles + // the source address is (bytes): + // src_addr = c*HW2 + (ht*Wt + wt)*1024*2 + f*256*2 + (h16*16 + w16)*2 + // we have 512 bytes per subtile and 32 bytes per subtile row of 16 elems + // here sub<<9 is multiply by 512 which offset in bytes of a subtile + // note that dest h is decomposed as h = ht+h32 and htWT is incremented by WT in the outer H loop + auto h32 = (h&31); + // TODO(AP): not really trivial need better comments here + auto sub_src_offs = (sub & 1) << 9; // if dest subtile w==16, add 512 to src subtile offset + sub_src_offs += (((h32 >> 4) << 1) << 9); // if intra-tile source h is > 16, add 2*512 to subtile offset + // below we only use the lower 4 bits out of 5-bit range for h, shift by 5 because 2 bytes per element + auto src_offs = ctoffs + c16offs + TADDR(htWT + wt) + sub_src_offs + ((h32&15)<<5); // bytes offset + auto src_addr = batch_addr + src_offs; + + //if (h == 0 && ct == 0 && wt == 0) { + // DPRINT << " Sub=" << sub << " c16=" << c16 << ENDL(); + // DPRINT << " Reading from src_offs=" << src_offs << ENDL(); + // DPRINT << " Writing to dst_offs=" << dest_tr0_l1-save_dest << ENDL(); + //} + + // this starts async NOC dma from DRAM to TR0_L1 buffer + 
noc_async_read(src_addr, dest_tr0_l1, SUBTILE_LINE_BYTES); + + //if (h == 0 && ct == 0 && wt == 0) + // DPRINT << uint32_t( reinterpret_cast( dest_tr0_l1 )[0] ) << ENDL(); + + // the output address is just linearly incremented + dest_tr0_l1 += SUBTILE_LINE_BYTES; + c16offs += HW2; + } + // subtiles are ordered like this: + // 0 1 + // 2 3 + // Here we offset C by 16 starting with subtile=2 + if (sub == 1) // after we are done with subtile 1, increment for sub=2 + cSubtileOffs += (HW2<<4); // 16*HWbytes, which is subtile vertical size + } // sub<4 + + // block on all outstanding noc DMA requests to complete + noc_async_read_barrier(); + + // notifies the unpacker that the buffer is populated + cb_push_back(operand0, onetile); + } + ctoffs += (HW2<<5); // since we increment ct, we need to mlutiply by 32 + } // ct loop + // multiplication-free computation of ht*WT, since ht = h/32 + if ((h&31) == 31) + htWT += WT; + } // h < H loop + batch_addr += CHW2; + } // n +#include "dataflow_api.h" + +#include "debug_print.h" + +using uint32_t = std::uint32_t; + +// tile index to address +inline uint32_t TADDR(uint32_t ti) { + return ti << 11; +} + +void kernel_main() { + uint32_t src0_addr = get_arg_val(0); + uint32_t src_noc_x = get_arg_val(1); + uint32_t src_noc_y = get_arg_val(2); + uint32_t W = get_arg_val(3); + uint32_t H = get_arg_val(4); + uint32_t C = get_arg_val(5); + uint32_t HW = get_arg_val(6); + uint32_t N = get_arg_val(7); + uint32_t CHW = get_arg_val(8); + + auto WT = (W >> 5); // number of tiles in W + auto HT = (H >> 5); // number of tiles in H + auto CT = (C >> 5); // number of tiles in C + auto HTWT = (HW >> 10); // product of HT*WT + auto HW2 = (HW << 1); // HW stride in bytes + auto CHW2 = (CHW << 1); // batch stride in bytes + constexpr uint32_t SUBTILE_LINE_BYTES = (16<<1); + constexpr uint32_t onetile = 1; + constexpr uint32_t operand0 = 0; + + + // The basic idea here is to iterate over output tiles (that will be over CT,WT) and H + // this will generate a linearly incremented output address in the inner loop + // we then reverse map this linear dest address to src address + + const InterleavedPow2AddrGen s0 = { + .bank_base_address = src0_addr, + + + .log_base_2_of_page_size = 11 + }; + + uint64_t batch_addr = src0_addr; + for (uint32_t n = 0; n < N; n++) { + uint32_t htWT = 0; + for (uint32_t h = 0; h < H; h++) { + uint32_t ctoffs = 0; + for (uint32_t ct = 0; ct < CT; ct++) { + for (uint32_t wt = 0; wt < WT; wt++) { + // what is the source address for the current tile? 
+ // c32 = intra-C-tile loop + // every 32 C's acquire a new output tile address + // DPRINT << "8B h=" << h << " ct=" << ct << " wt=" << wt << " W=" << W << " HW2=" << HW2 << ENDL(); + + cb_reserve_back(operand0, onetile); + + uint32_t dest_tr0_l1 = get_write_ptr(operand0); + uint32_t save_dest = dest_tr0_l1; + uint32_t cSubtileOffs = 0; + for (uint32_t sub = 0; sub < 4; sub++) { + uint32_t c16offs = cSubtileOffs; + for (uint32_t c16 = 0; c16 < 16; c16++) { + // In this loop sub, c16 are source subtile, c16 + // dest in this loop is varying h implicitly via dest address increment + + // Dest is HCW + // We are iterating over it as H Ct Wt-tiles + // intra-tile FC16 for F going over 4-subtiles + // the source address is (bytes): + // src_addr = c*HW2 + (ht*Wt + wt)*1024*2 + f*256*2 + (h16*16 + w16)*2 + // we have 512 bytes per subtile and 32 bytes per subtile row of 16 elems + // here sub<<9 is multiply by 512 which offset in bytes of a subtile + // note that dest h is decomposed as h = ht+h32 and htWT is incremented by WT in the outer H loop + auto h32 = (h&31); + // TODO(AP): not really trivial need better comments here + auto sub_src_offs = (sub & 1) << 9; // if dest subtile w==16, add 512 to src subtile offset + sub_src_offs += (((h32 >> 4) << 1) << 9); // if intra-tile source h is > 16, add 2*512 to subtile offset + // below we only use the lower 4 bits out of 5-bit range for h, shift by 5 because 2 bytes per element + auto src_offs = ctoffs + c16offs + TADDR(htWT + wt) + sub_src_offs + ((h32&15)<<5); // bytes offset + auto bsrc_offs = (batch_addr + src_offs)-src0_addr; + uint32_t batch_itile = (bsrc_offs >> 11); + uint32_t rem = (bsrc_offs & 2047); + + //if (h == 0 && ct == 0 && wt == 0) { + // DPRINT << " Sub=" << sub << " c16=" << c16 << ENDL(); + // DPRINT << " Reading from src_offs=" << src_offs << ENDL(); + // DPRINT << " Writing to dst_offs=" << dest_tr0_l1-save_dest << ENDL(); + //} + + uint64_t banked_addr = get_noc_addr(batch_itile, s0); + banked_addr += rem; + + // this starts async NOC dma from DRAM to TR0_L1 buffer + noc_async_read(banked_addr, dest_tr0_l1, SUBTILE_LINE_BYTES); + + //if (h == 0 && ct == 0 && wt == 0) + // DPRINT << uint32_t( reinterpret_cast( dest_tr0_l1 )[0] ) << ENDL(); + + // the output address is just linearly incremented + dest_tr0_l1 += SUBTILE_LINE_BYTES; + c16offs += HW2; + } + // subtiles are ordered like this: + // 0 1 + // 2 3 + // Here we offset C by 16 starting with subtile=2 + if (sub == 1) // after we are done with subtile 1, increment for sub=2 + cSubtileOffs += (HW2<<4); // 16*HWbytes, which is subtile vertical size + } // sub<4 + + // block on all outstanding noc DMA requests to complete + noc_async_read_barrier(); + + // notifies the unpacker that the buffer is populated + cb_push_back(operand0, onetile); + } + ctoffs += (HW2<<5); // since we increment ct, we need to mlutiply by 32 + } // ct loop + // multiplication-free computation of ht*WT, since ht = h/32 + if ((h&31) == 31) + htWT += WT; + } // h < H loop + batch_addr += CHW2; + } // n + +#include "dataflow_api.h" + +void kernel_main() { + for (uint32_t i = 0; i < 20; i++) { + uint32_t load = *reinterpret_cast(400 * 1024); + uint32_t local_load1 = *reinterpret_cast(MEM_LOCAL_BASE); + uint32_t local_load2 = *reinterpret_cast(MEM_LOCAL_BASE); + uint32_t local_load3 = *reinterpret_cast(MEM_LOCAL_BASE); + uint32_t local_load4 = *reinterpret_cast(MEM_LOCAL_BASE); + uint32_t local_load5 = *reinterpret_cast(MEM_LOCAL_BASE); + } +} diff --git 
a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/banked_reader_dram.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/banked_reader_dram.cpp new file mode 100644 index 00000000000..c694971cdf7 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/banked_reader_dram.cpp @@ -0,0 +1,35 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "dataflow_api.h" + +void kernel_main() { + constexpr std::uint32_t cb_id = get_compile_time_arg_val(0); + constexpr std::uint32_t page_size = get_compile_time_arg_val(1); + std::uint32_t src_addr_base = get_arg_val(0); + std::uint32_t num_tiles = get_arg_val(1); + + constexpr bool IS_DRAM = true; + const uint32_t ublock_size_tiles = 1; + uint32_t tile_bytes = get_tile_size(cb_id); + InterleavedAddrGen src_addrgen = { + .bank_base_address = src_addr_base, + .page_size = page_size, + }; + + // read tiles from src to CB + for (uint32_t i = 0; i < num_tiles; i += ublock_size_tiles) { + uint64_t src_noc_addr = get_noc_addr(i, src_addrgen); + + cb_reserve_back(cb_id, ublock_size_tiles); + uint32_t l1_write_addr = get_write_ptr(cb_id); + noc_async_read(src_noc_addr, l1_write_addr, tile_bytes); + + noc_async_read_barrier(); + + cb_push_back(cb_id, ublock_size_tiles); + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/banked_reader_unary.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/banked_reader_unary.cpp new file mode 100644 index 00000000000..4593299dd01 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/banked_reader_unary.cpp @@ -0,0 +1,35 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "dataflow_api.h" + +void kernel_main() { + constexpr std::uint32_t cb_id = get_compile_time_arg_val(0); + constexpr std::uint32_t page_size = get_compile_time_arg_val(1); + std::uint32_t src_addr_base = get_arg_val(0); + std::uint32_t num_tiles = get_arg_val(1); + + constexpr bool IS_DRAM = false; + const uint32_t ublock_size_tiles = 1; + uint32_t tile_bytes = get_tile_size(cb_id); + InterleavedAddrGen src_addrgen = { + .bank_base_address = src_addr_base, + .page_size = page_size, + }; + + // read tiles from src to CB + for (uint32_t i = 0; i < num_tiles; i += ublock_size_tiles) { + uint64_t src_noc_addr = get_noc_addr(i, src_addrgen); + + cb_reserve_back(cb_id, ublock_size_tiles); + uint32_t l1_write_addr = get_write_ptr(cb_id); + noc_async_read(src_noc_addr, l1_write_addr, tile_bytes); + + noc_async_read_barrier(); + + cb_push_back(cb_id, ublock_size_tiles); + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/banked_writer_dram.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/banked_writer_dram.cpp new file mode 100644 index 00000000000..67105917c1c --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/banked_writer_dram.cpp @@ -0,0 +1,35 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
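The transpose_hc readers earlier in this diff reverse-map every destination (h, C-tile, W-tile, face) position back to a byte offset in the tilized source, per the in-code comment src_addr = c*HW2 + (ht*Wt + wt)*1024*2 + f*256*2 + (h16*16 + w16)*2. A standalone sketch of that offset computation, folding the kernel's ct/c16/subtile bookkeeping into a single channel index c (shapes and arguments hypothetical):

#include <cstdint>
#include <cstdio>

constexpr uint32_t TADDR(uint32_t ti) { return ti << 11; }  // tile index -> byte offset, 2048 B per 16-bit tile

// Byte offset of one 16-element subtile row in the tilized HW plane of channel c,
// for source row h, W-tile wt and destination face sub (the sub>=2 "+16 channels"
// contribution is assumed folded into c here). Wt = W/32, HW2 = H*W*2 bytes.
uint32_t src_offset(uint32_t c, uint32_t h, uint32_t wt, uint32_t sub, uint32_t Wt, uint32_t HW2) {
    uint32_t htWT = (h >> 5) * Wt;              // ht * Wt, the tile row within the HW plane
    uint32_t h32  = h & 31;                     // row within the source tile
    uint32_t sub_src_offs = (sub & 1) << 9;     // +512 B when the destination face is the right half (w >= 16)
    sub_src_offs += ((h32 >> 4) << 1) << 9;     // +1024 B when the source row falls in the lower faces (h32 >= 16)
    return c * HW2 + TADDR(htWT + wt) + sub_src_offs + ((h32 & 15) << 5);  // 32 B per 16-element row
}

int main() {
    // 64x64 HW plane of 16-bit data: Wt = 2 tiles, HW2 = 64*64*2 = 8192 bytes.
    printf("offset = %u\n", src_offset(/*c=*/0, /*h=*/40, /*wt=*/1, /*sub=*/0, /*Wt=*/2, /*HW2=*/8192));
    return 0;
}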
+// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "dataflow_api.h" + +void kernel_main() { + constexpr std::uint32_t cb_id = get_compile_time_arg_val(0); + constexpr std::uint32_t page_size = get_compile_time_arg_val(1); + std::uint32_t dst_addr_base = get_arg_val(0); + std::uint32_t num_tiles = get_arg_val(1); + + constexpr bool IS_DRAM = true; + const uint32_t ublock_size_tiles = 1; + uint32_t tile_bytes = get_tile_size(cb_id); + InterleavedAddrGen dst_addrgen = { + .bank_base_address = dst_addr_base, + .page_size = page_size, + }; + + // Write tiles from CB to dram(interleaved) + for (uint32_t i = 0; i < num_tiles; i += ublock_size_tiles) { + uint64_t dst_noc_addr = get_noc_addr(i, dst_addrgen); + + cb_wait_front(cb_id, ublock_size_tiles); + uint32_t l1_read_ptr = get_read_ptr(cb_id); + noc_async_write(l1_read_ptr, dst_noc_addr, tile_bytes); + + noc_async_write_barrier(); + + cb_pop_front(cb_id, ublock_size_tiles); + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/banked_writer_unary.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/banked_writer_unary.cpp new file mode 100644 index 00000000000..99caf9fbb62 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/banked_writer_unary.cpp @@ -0,0 +1,35 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "dataflow_api.h" + +void kernel_main() { + constexpr std::uint32_t cb_id = get_compile_time_arg_val(0); + constexpr std::uint32_t page_size = get_compile_time_arg_val(1); + std::uint32_t dst_addr_base = get_arg_val(0); + std::uint32_t num_tiles = get_arg_val(1); + + constexpr bool IS_DRAM = false; + const uint32_t ublock_size_tiles = 1; + uint32_t tile_bytes = get_tile_size(cb_id); + InterleavedAddrGen dst_addrgen = { + .bank_base_address = dst_addr_base, + .page_size = page_size, + }; + + // Write tiles from CB to dram(interleaved) + for (uint32_t i = 0; i < num_tiles; i += ublock_size_tiles) { + uint64_t dst_noc_addr = get_noc_addr(i, dst_addrgen); + + cb_wait_front(cb_id, ublock_size_tiles); + uint32_t l1_read_ptr = get_read_ptr(cb_id); + noc_async_write(l1_read_ptr, dst_noc_addr, tile_bytes); + + noc_async_write_barrier(); + + cb_pop_front(cb_id, ublock_size_tiles); + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_reader_binary.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_reader_binary.cpp new file mode 100644 index 00000000000..1d4884ba7f4 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_reader_binary.cpp @@ -0,0 +1,34 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
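The banked reader/writer unit-test kernels above address their buffers through InterleavedAddrGen, which spreads consecutive pages (here, tiles) round-robin across the banks; the stick-layout kernels in this diff make the same assumption explicit with bank_id = stick_id & (num_dram_channels - 1). A standalone model of that page-to-(bank, in-bank offset) split (the real generator also applies per-bank base offsets and NOC coordinates; numbers hypothetical):

#include <cstdint>
#include <cstdio>

constexpr uint32_t num_banks = 8;  // matches the "8bank" kernels in this diff

struct PageLocation { uint32_t bank; uint32_t offset; };

PageLocation locate(uint32_t page_id, uint32_t bank_base_address, uint32_t page_size) {
    PageLocation loc;
    loc.bank   = page_id % num_banks;                                    // round-robin bank choice
    loc.offset = bank_base_address + (page_id / num_banks) * page_size;  // position within that bank
    return loc;
}

int main() {
    const uint32_t base = 0x1000, tile_bytes = 2048;  // hypothetical buffer base, 16-bit tile size
    for (uint32_t tile = 0; tile < 4; tile++) {
        PageLocation loc = locate(tile, base, tile_bytes);
        printf("tile %u -> bank %u, offset 0x%x\n", tile, loc.bank, loc.offset);
    }
    return 0;
}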
+// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "dataflow_api.h" + +void kernel_main() { + const uint32_t cb_id = get_compile_time_arg_val(0); + uint32_t src_addr = get_arg_val<uint32_t>(0); + uint32_t src_noc_x = get_arg_val<uint32_t>(1); + uint32_t src_noc_y = get_arg_val<uint32_t>(2); + uint32_t num_tiles = get_arg_val<uint32_t>(3); + + // ublocks size defined in tiles + constexpr uint32_t ublock_size_tiles = 1; + uint32_t ublock_size_bytes = get_tile_size(cb_id) * ublock_size_tiles; + + // read a ublock of tiles from src to CB, and then push the ublock to unpacker + for (uint32_t i = 0; i + +/** + * NOC APIs are prefixed w/ "ncrisc" (legacy name) but there's nothing NCRISC specific, they can be used on BRISC or other RISCs + * Any two RISC processors cannot use the same CMD_BUF + * non_blocking APIs shouldn't be mixed with slow noc.h APIs + * explicit flushes need to be used since the calls are non-blocking + * */ +void kernel_main() { + std::uint32_t dram_buffer_src_addr_base = get_arg_val<uint32_t>(0); + std::uint32_t dram_src_noc_x = get_arg_val<uint32_t>(1); + std::uint32_t dram_src_noc_y = get_arg_val<uint32_t>(2); + + std::uint32_t l1_buffer_dst_addr_base = get_arg_val<uint32_t>(3); + std::uint32_t dram_buffer_size = get_arg_val<uint32_t>(4); + + std::uint32_t dram_buffer_src_addr = dram_buffer_src_addr_base; + // DRAM NOC src address + std::uint64_t dram_buffer_src_noc_addr = get_noc_addr(dram_src_noc_x, dram_src_noc_y, dram_buffer_src_addr); + + noc_async_read(dram_buffer_src_noc_addr, l1_buffer_dst_addr_base, dram_buffer_size); + noc_async_read_barrier(); + +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_reader_unary.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_reader_unary.cpp new file mode 100644 index 00000000000..615a4c02274 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_reader_unary.cpp @@ -0,0 +1,33 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "dataflow_api.h" + +void kernel_main() { + const uint32_t cb_id = get_compile_time_arg_val(0); + uint32_t src_addr = get_arg_val(0); + uint32_t src_noc_x = get_arg_val(1); + uint32_t src_noc_y = get_arg_val(2); + uint32_t num_tiles = get_arg_val(3); + + // ublocks size defined in tiles + constexpr uint32_t ublock_size_tiles = 1; + uint32_t ublock_size_bytes = get_tile_size(cb_id) * ublock_size_tiles; + + // read a ublock of tiles from src to CB, and then push the ublock to unpacker + for (uint32_t i = 0; i + +/** + * NOC APIs are prefixed w/ "ncrisc" (legacy name) but there's nothing NCRISC specific, they can be used on NCRISC or other RISCs + * Any two RISC processors cannot use the same CMD_BUF + * non_blocking APIs shouldn't be mixed with slow noc.h APIs + * explicit flushes need to be used since the calls are non-blocking + * */ +void kernel_main() { + std::uint32_t dram_buffer_dst_addr_base = get_arg_val(0); + std::uint32_t dram_dst_noc_x = get_arg_val(1); + std::uint32_t dram_dst_noc_y = get_arg_val(2); + + std::uint32_t l1_buffer_src_addr_base = get_arg_val(3); + std::uint32_t dram_buffer_size = get_arg_val(4); + + std::uint32_t dram_buffer_dst_addr = dram_buffer_dst_addr_base; + + // DRAM NOC dst address + std::uint64_t dram_buffer_dst_noc_addr = get_noc_addr(dram_dst_noc_x, dram_dst_noc_y, dram_buffer_dst_addr); + + noc_async_write(l1_buffer_src_addr_base, dram_buffer_dst_noc_addr, dram_buffer_size); + noc_async_write_barrier(); + +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_writer_unary.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_writer_unary.cpp new file mode 100644 index 00000000000..2ed57a38744 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_writer_unary.cpp @@ -0,0 +1,30 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include "dataflow_api.h" + +void kernel_main() { + const uint32_t cb_id = get_compile_time_arg_val(0); + uint32_t dst_addr = get_arg_val(0); + uint32_t dst_noc_x = get_arg_val(1); + uint32_t dst_noc_y = get_arg_val(2); + uint32_t num_tiles = get_arg_val(3); + + // single-tile ublocks + uint32_t ublock_size_bytes = get_tile_size(cb_id); + uint32_t ublock_size_tiles = 1; + + for (uint32_t i = 0; i < num_tiles; i += ublock_size_tiles) { + uint64_t dst_noc_addr = get_noc_addr(dst_noc_x, dst_noc_y, dst_addr); + + cb_wait_front(cb_id, ublock_size_tiles); + uint32_t l1_read_addr = get_read_ptr(cb_id); + noc_async_write(l1_read_addr, dst_noc_addr, ublock_size_bytes); + + noc_async_write_barrier(); + + cb_pop_front(cb_id, ublock_size_tiles); + dst_addr += ublock_size_bytes; + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_bmm_8bank.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_bmm_8bank.cpp new file mode 100644 index 00000000000..ea72f5598f7 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_bmm_8bank.cpp @@ -0,0 +1,44 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include "dataflow_api.h" + +//#include "debug_print.h" + +void kernel_main() { + // same arg indices as in reader_bmm_8bank for reuse + uint32_t dst_addr = get_arg_val(0); + uint32_t Mt = get_arg_val(2); + uint32_t Nt = get_arg_val(4); + uint32_t batch = get_arg_val(7); + + constexpr bool dst_is_dram = get_compile_time_arg_val(0) == 1; + + constexpr int onetile = 1; + constexpr uint32_t cb_id_out0 = 16; + const uint32_t tile_bytes = get_tile_size(cb_id_out0); + uint32_t itileC = 0; + const DataFormat data_format = get_dataformat(cb_id_out0); + + const InterleavedAddrGenFast s = { + .bank_base_address = dst_addr, + .page_size = tile_bytes, + .data_format = data_format + }; + + // C is MN so we iterate in tile RM order + for (uint32_t nb = 0; nb < batch; nb ++) + for (uint32_t mt_C = 0; mt_C < Mt; ++mt_C) // output tile of C + for (uint32_t nt_C = 0; nt_C < Nt; ++nt_C) { // output tile index of C + // bmm will generate C's tiles C=A*B, MN=MK*KN, in row major order, we just read them from CB and write out to DRAM + cb_wait_front(cb_id_out0, onetile); + uint32_t l1_read_addr = get_read_ptr(cb_id_out0); + noc_async_write_tile(itileC, s, l1_read_addr); + noc_async_write_barrier(); + cb_pop_front(cb_id_out0, onetile); + //DPRINT << 'W' << 'C' << itileC << ' ' << 'a' << dst_addr << ENDL(); + //DPRINT << itileC << ' ' << uint32_t(dst_noc_addr) << ENDL(); + itileC ++; + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_cb_test.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_cb_test.cpp new file mode 100644 index 00000000000..4d62bbda8b5 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_cb_test.cpp @@ -0,0 +1,36 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
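writer_bmm_8bank.cpp above drains C's tiles from the output CB in row-major (batch, Mt, Nt) order and writes tile itileC, which it simply increments per tile. A standalone check that the running index matches the linear index nb*Mt*Nt + mt*Nt + nt (dimensions hypothetical):

#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t batch = 2, Mt = 3, Nt = 4;  // hypothetical shape in tiles
    uint32_t itileC = 0;
    bool ok = true;
    for (uint32_t nb = 0; nb < batch; nb++)
        for (uint32_t mt = 0; mt < Mt; mt++)
            for (uint32_t nt = 0; nt < Nt; nt++, itileC++)
                ok = ok && (itileC == nb * Mt * Nt + mt * Nt + nt);
    printf("row-major tile order %s\n", ok ? "matches" : "does not match");
    return 0;
}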
+// +// SPDX-License-Identifier: Apache-2.0 + +#include "dataflow_api.h" + +inline __attribute__((always_inline)) +void pop_from_cb_and_write(const uint32_t cb_id, uint32_t num_tiles_per_cb, uint32_t ublock_size_tiles, uint32_t ublock_size_bytes, + uint32_t dram_dst_noc_x, uint32_t dram_dst_noc_y, uint32_t& dram_buffer_dst_addr) { + for (uint32_t i = 0; i < num_tiles_per_cb; i += ublock_size_tiles) { + // DRAM NOC dst address + std::uint64_t dram_buffer_dst_noc_addr = get_noc_addr(dram_dst_noc_x, dram_dst_noc_y, dram_buffer_dst_addr); + + cb_wait_front(cb_id, ublock_size_tiles); + uint32_t l1_read_addr = get_read_ptr(cb_id); + + noc_async_write(l1_read_addr, dram_buffer_dst_noc_addr, ublock_size_bytes); + noc_async_write_barrier(); + cb_pop_front(cb_id, ublock_size_tiles); + dram_buffer_dst_addr += ublock_size_bytes; + } +} + +void kernel_main() { + std::uint32_t dram_buffer_dst_addr = get_arg_val(0); + std::uint32_t dram_dst_noc_x = get_arg_val(1); + std::uint32_t dram_dst_noc_y = get_arg_val(2); + std::uint32_t num_tiles_per_cb = get_arg_val(3); + + constexpr uint32_t cb_id = get_compile_time_arg_val(0); + constexpr uint32_t ublock_size_tiles = get_compile_time_arg_val(1); + uint32_t ublock_size_bytes = get_tile_size(cb_id) * ublock_size_tiles; + + pop_from_cb_and_write(cb_id, num_tiles_per_cb, ublock_size_tiles, ublock_size_bytes, + dram_dst_noc_x, dram_dst_noc_y, dram_buffer_dst_addr); +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_last_stage.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_last_stage.cpp new file mode 100644 index 00000000000..4a06fd0043d --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_last_stage.cpp @@ -0,0 +1,44 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include "dataflow_api.h" +// #include "tools/profiler/kernel_profiler.hpp" + +void kernel_main() { + + std::uint32_t buffer_dst_addr = get_arg_val(0); + std::uint32_t dst_noc_x = get_arg_val(1); + std::uint32_t dst_noc_y = get_arg_val(2); + std::uint32_t num_tiles = get_arg_val(3); + std::uint32_t num_repetitions = get_arg_val(4); + + constexpr uint32_t cb_id = get_compile_time_arg_val(0); + constexpr uint32_t block_size_tiles = get_compile_time_arg_val(1); + + uint32_t block_size_bytes = get_tile_size(cb_id) * block_size_tiles; + + for (uint32_t j = 0; j < num_repetitions; j++) { + uint32_t dst_addr = buffer_dst_addr; + for (uint32_t i = 0; i < num_tiles; i += block_size_tiles) { + std::uint64_t buffer_dst_noc_addr = get_noc_addr(dst_noc_x, dst_noc_y, dst_addr); + + cb_wait_front(cb_id, block_size_tiles); + + if (j == 0) { + uint32_t l1_read_addr = get_read_ptr(cb_id); + noc_async_write(l1_read_addr, buffer_dst_noc_addr, block_size_bytes); + noc_async_write_barrier(); + + // some delay to test backpressure + // volatile uint32_t *l1_read_addr_ptr = reinterpret_cast(BRISC_BREAKPOINT); + // for (int delay = 0; delay < 10000; delay++) { + // *l1_read_addr_ptr = 1; + // } + } + + cb_pop_front(cb_id, block_size_tiles); + dst_addr += block_size_bytes; + } + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_matmul_tile_layout.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_matmul_tile_layout.cpp new file mode 100644 index 00000000000..c2a73cf70cb --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_matmul_tile_layout.cpp @@ -0,0 +1,75 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include "dataflow_api.h" + +void kernel_main() { + + + // out tensor args + uint32_t out_tensor_addr = get_arg_val(0); + uint32_t out_tensor_start_tile_id = get_arg_val(1); + uint32_t out_tensor_stride_w = get_arg_val(2); + uint32_t out_tensor_stride_h = get_arg_val(3); + uint32_t out_tensor_next_subblock_stride_w = get_arg_val(4); + uint32_t out_tensor_next_subblock_stride_h = get_arg_val(5); + + // out subblock args + uint32_t out_subblock_w = get_arg_val(6); + uint32_t out_subblock_h = get_arg_val(7); + uint32_t out_subblock_tile_count = get_arg_val(8); + uint32_t out_num_subblocks_w = get_arg_val(9); + uint32_t out_num_subblocks_h = get_arg_val(10); + + // const args for tile-based bank-swizzled layout + // could be added to the arg list in the future to test different + // bank-swizzling configurations + constexpr uint32_t num_used_dram_ch = 8; + constexpr uint32_t num_used_dram_ch_pow2_exponent = 3; + constexpr uint32_t tile_size_pow2_exponent = 11; + + constexpr uint32_t cb_id_out0 = 16; + + // single-tile + uint32_t single_tile_size_bytes = get_tile_size(cb_id_out0); + + const InterleavedPow2AddrGen s = { + .bank_base_address = out_tensor_addr, + + + .log_base_2_of_page_size = tile_size_pow2_exponent + }; + + + bool one_time_profile = true; + uint32_t out_tensor_sbh_start_tile_id = out_tensor_start_tile_id; + for(uint32_t sbh = 0; sbh < out_num_subblocks_h; sbh++) { + uint32_t out_tensor_sbw_start_tile_id = out_tensor_sbh_start_tile_id; + for(uint32_t sbw = 0; sbw < out_num_subblocks_w; sbw++) { + uint32_t out_tensor_sb_row_start_tile_id = out_tensor_sbw_start_tile_id; + + cb_wait_front(cb_id_out0, out_subblock_tile_count); + uint32_t l1_read_addr = get_read_ptr(cb_id_out0); + + for(uint32_t h = 0; h < out_subblock_h; h++) { + uint32_t out_tensor_tile_id = out_tensor_sb_row_start_tile_id; + for(uint32_t w = 0; w < out_subblock_w; w++) { + uint64_t out_tensor_tile_noc_addr = get_noc_addr(out_tensor_tile_id, s); + + // kernel_profiler::mark_time(9); + noc_async_write(l1_read_addr, out_tensor_tile_noc_addr, single_tile_size_bytes); + l1_read_addr+=single_tile_size_bytes; + + out_tensor_tile_id += out_tensor_stride_w; + } + out_tensor_sb_row_start_tile_id += out_tensor_stride_h; + } + + noc_async_write_barrier(); + cb_pop_front(cb_id_out0, out_subblock_tile_count); + out_tensor_sbw_start_tile_id += out_tensor_next_subblock_stride_w; + } + out_tensor_sbh_start_tile_id += out_tensor_next_subblock_stride_h; + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp new file mode 100644 index 00000000000..3d6404e8c71 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp @@ -0,0 +1,31 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
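writer_matmul_tile_layout.cpp above emits the output subblock by subblock: within a subblock it steps by out_tensor_stride_w per tile and out_tensor_stride_h per row, and between subblocks by the next_subblock strides. A standalone sketch that prints the resulting tile-id order for a hypothetical 4x4-tile output split into 2x2 subblocks (strides chosen here for a row-major tile layout; the kernel receives them as runtime args):

#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t Nt = 4;                                // output width in tiles
    const uint32_t out_subblock_w = 2, out_subblock_h = 2;
    const uint32_t out_num_subblocks_w = 2, out_num_subblocks_h = 2;
    const uint32_t stride_w = 1, stride_h = Nt;           // row-major tile ids
    const uint32_t next_sb_stride_w = out_subblock_w * stride_w;
    const uint32_t next_sb_stride_h = out_subblock_h * stride_h;

    uint32_t sbh_start = 0;
    for (uint32_t sbh = 0; sbh < out_num_subblocks_h; sbh++) {
        uint32_t sbw_start = sbh_start;
        for (uint32_t sbw = 0; sbw < out_num_subblocks_w; sbw++) {
            uint32_t row_start = sbw_start;
            for (uint32_t h = 0; h < out_subblock_h; h++) {
                uint32_t tile_id = row_start;
                for (uint32_t w = 0; w < out_subblock_w; w++) {
                    printf("%2u ", tile_id);              // order in which tiles leave the CB
                    tile_id += stride_w;
                }
                row_start += stride_h;
            }
            printf("| ");
            sbw_start += next_sb_stride_w;
        }
        printf("\n");
        sbh_start += next_sb_stride_h;
    }
    return 0;
}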
+// +// SPDX-License-Identifier: Apache-2.0 + +#include "dataflow_api.h" + +void kernel_main() { + uint32_t dst_addr = get_arg_val(0); + uint32_t dst_noc_x = get_arg_val(1); + uint32_t dst_noc_y = get_arg_val(2); + uint32_t num_tiles = get_arg_val(3); + + constexpr uint32_t cb_id_out0 = 16; + + // single-tile ublocks + uint32_t ublock_size_bytes = get_tile_size(cb_id_out0); + uint32_t ublock_size_tiles = 1; + + for (uint32_t i = 0; i < num_tiles; i += ublock_size_tiles) { + uint64_t dst_noc_addr = get_noc_addr(dst_noc_x, dst_noc_y, dst_addr); + + cb_wait_front(cb_id_out0, ublock_size_tiles); + uint32_t l1_read_addr = get_read_ptr(cb_id_out0); + noc_async_write(l1_read_addr, dst_noc_addr, ublock_size_bytes); + + noc_async_write_barrier(); + + cb_pop_front(cb_id_out0, ublock_size_tiles); + dst_addr += ublock_size_bytes; + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary_8bank.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary_8bank.cpp new file mode 100644 index 00000000000..c20798b1e6c --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary_8bank.cpp @@ -0,0 +1,36 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include "dataflow_api.h" + + +void kernel_main() { + uint32_t dst_addr = get_arg_val(0); + uint32_t num_tiles = get_arg_val(3); // Index 3 to match with regular writer_unary + + constexpr uint32_t cb_id_out0 = 16; + constexpr uint32_t onetile = 1; + uint32_t tile_bytes = get_tile_size(cb_id_out0); + + #ifdef KERNEL_COMPILE_TIME_ARG_0 + constexpr bool write_to_dram = get_compile_time_arg_val(0); + #else + constexpr bool write_to_dram = true; + #endif + + const InterleavedPow2AddrGen s = { dst_addr, 11 }; + + for (uint32_t i = 0; i +#include "dataflow_api.h" + +void kernel_main() { + + // Constexpr + constexpr uint32_t num_dram_channels = 8; + constexpr uint32_t log_base_2_of_num_dram_channels = 3; + constexpr uint32_t cb_id_out0 = 16; + + uint32_t dst_addr = get_arg_val(0); + uint32_t num_sticks = get_arg_val(1); + uint32_t stick_size = get_arg_val(2); + + // TODO(agrebenisan): This isn't good... here we are assuming + // that the stick size dictates tiles c, but stick size + // doesn't necessarily need to be divisible by tiles c... + // this is only the case really for tilize + const uint32_t num_tiles_c = stick_size / 64; // Assuming 2 bytes per datum, there are 64 bytes per tile row + uint32_t stick_id = 0; + + const InterleavedAddrGen s = { + .bank_base_address = dst_addr, + + + .page_size = stick_size + }; + + for (uint32_t i = 0; i < num_sticks / 32; i++) { + // We reserve back an entire tile row and issue a bunch of reads + cb_wait_front(cb_id_out0, num_tiles_c); + uint32_t l1_read_addr = get_read_ptr(cb_id_out0); + for (uint32_t j = 0; j < 32; j++) { + uint64_t dst_noc_addr = get_noc_addr( + stick_id, s); + + uint32_t bank_id = stick_id & (num_dram_channels - 1); + noc_async_write(l1_read_addr, dst_noc_addr, stick_size); + l1_read_addr += stick_size; + stick_id++; + } + noc_async_write_barrier(); + cb_pop_front(cb_id_out0, num_tiles_c); + } + +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unswizzle.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unswizzle.cpp new file mode 100644 index 00000000000..ffa1bff29b2 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unswizzle.cpp @@ -0,0 +1,51 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
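Several 8-bank kernels above construct InterleavedPow2AddrGen with a log-base-2 page size of 11. That corresponds to one 32x32 tile of 16-bit data: 32 * 32 * 2 = 2048 = 2^11 bytes, consistent with the 2048-byte tile strides assumed elsewhere in this diff. A one-line check:

#include <cstdio>

int main() {
    constexpr unsigned tile_rows = 32, tile_cols = 32, bytes_per_datum = 2;  // bfloat16
    constexpr unsigned tile_bytes = tile_rows * tile_cols * bytes_per_datum;
    static_assert(tile_bytes == (1u << 11), "log2(page size) for a 16-bit tile is 11");
    printf("tile bytes = %u (2^11)\n", tile_bytes);
    return 0;
}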
+// +// SPDX-License-Identifier: Apache-2.0 + +#include "dataflow_api.h" + +void kernel_main() { + uint32_t dst_addr = get_arg_val(0); + uint32_t dst_noc_x = get_arg_val(1); + uint32_t dst_noc_y = get_arg_val(2); + uint32_t inner_r = get_arg_val(3); + uint32_t inner_c = get_arg_val(4); + uint32_t num_sub_blocks_m = get_arg_val(5); + uint32_t num_sub_blocks_n = get_arg_val(6); + uint32_t stride_r = get_arg_val(7); + uint32_t stride_subblock_r = get_arg_val(8); + uint32_t stride_subblock_c = get_arg_val(9); + + constexpr uint32_t cb_id_out0 = 16; + + // single-tile ublocks + uint32_t ublock_size_bytes = get_tile_size(cb_id_out0); + uint32_t ublock_size_tiles = 1; + + uint32_t dram_address_block_row_beginning = dst_addr; + for(uint32_t sb_m = 0; sb_m < num_sub_blocks_m; sb_m++) { + uint32_t dram_address_block_beginning = dram_address_block_row_beginning; + for(uint32_t sb_n = 0; sb_n < num_sub_blocks_n; sb_n++) { + uint32_t dram_address_r = dram_address_block_beginning; + for(uint32_t r = 0; r < inner_r; r++) { + uint32_t dram_address_c = dram_address_r; + for(uint32_t c = 0; c < inner_c; c++) { + uint64_t dst_noc_addr = get_noc_addr(dst_noc_x, dst_noc_y, dram_address_c); + + cb_wait_front(cb_id_out0, ublock_size_tiles); + uint32_t l1_read_addr = get_read_ptr(cb_id_out0); + + noc_async_write(l1_read_addr, dst_noc_addr, ublock_size_bytes); + + noc_async_write_barrier(); + + cb_pop_front(cb_id_out0, ublock_size_tiles); + dram_address_c += ublock_size_bytes; + } + dram_address_r += stride_r; // goto next row within sub-block + } + dram_address_block_beginning += stride_subblock_c; // move to next sub-block on c dim + } + dram_address_block_row_beginning += stride_subblock_r; // move to next sub-block on r dim + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/misc/add_two_ints.cpp b/tests/tt_metal/tt_metal/test_kernels/misc/add_two_ints.cpp new file mode 100644 index 00000000000..56ba6456d00 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/misc/add_two_ints.cpp @@ -0,0 +1,24 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "debug_print.h" + +/** + * add two ints + * args are in L1 + * result is in L1 +*/ + +void kernel_main() { + + volatile tt_l1_ptr std::uint32_t* arg_a = (volatile tt_l1_ptr uint32_t*)(L1_ARG_BASE); + volatile tt_l1_ptr std::uint32_t* arg_b = (volatile tt_l1_ptr uint32_t*)(L1_ARG_BASE + 4); + volatile tt_l1_ptr std::uint32_t* result = (volatile tt_l1_ptr uint32_t*)(L1_RESULT_BASE); + + //Sample print statement + // DPRINT << 123; + result[0] = arg_a[0] + arg_b[0]; + +} diff --git a/tests/tt_metal/tt_metal/test_kernels/ping_legal_l1s.cpp b/tests/tt_metal/tt_metal/test_kernels/misc/ping_legal_l1s.cpp similarity index 100% rename from tests/tt_metal/tt_metal/test_kernels/ping_legal_l1s.cpp rename to tests/tt_metal/tt_metal/test_kernels/misc/ping_legal_l1s.cpp diff --git a/tests/tt_metal/tt_metal/test_l1_to_l1_multi_core.cpp b/tests/tt_metal/tt_metal/test_l1_to_l1_multi_core.cpp index 829ba1a4505..b6229646f9b 100644 --- a/tests/tt_metal/tt_metal/test_l1_to_l1_multi_core.cpp +++ b/tests/tt_metal/tt_metal/test_l1_to_l1_multi_core.cpp @@ -68,7 +68,7 @@ int main(int argc, char **argv) { auto l1_to_l1_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/l1_to_l1.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/l1_to_l1.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default}); diff --git a/tests/tt_metal/tt_metal/test_matmul_large_block.cpp b/tests/tt_metal/tt_metal/test_matmul_large_block.cpp index 0e46656eb07..590b3d75274 100644 --- a/tests/tt_metal/tt_metal/test_matmul_large_block.cpp +++ b/tests/tt_metal/tt_metal/test_matmul_large_block.cpp @@ -251,7 +251,7 @@ bool test_matmul_large_block(tt_metal::Device *device, bool activations_rm, bool std::vector writer_rt_args; string writer_kernel; if (output_rm) { - writer_kernel = "tt_metal/kernels/dataflow/writer_unary.cpp"; + writer_kernel = "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp"; writer_rt_args = { dst_dram_buffer.address(), (std::uint32_t)dram_dst_noc_xy.x, @@ -259,7 +259,7 @@ bool test_matmul_large_block(tt_metal::Device *device, bool activations_rm, bool uint(M * N) }; } else { - writer_kernel = "tt_metal/kernels/dataflow/writer_unswizzle.cpp"; + writer_kernel = "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unswizzle.cpp"; writer_rt_args = { dst_dram_buffer.address(), (std::uint32_t)dram_dst_noc_xy.x, @@ -276,7 +276,7 @@ bool test_matmul_large_block(tt_metal::Device *device, bool activations_rm, bool auto mm_reader_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/reader_matmul_blocked.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_blocked.cpp", core, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default}); @@ -326,7 +326,7 @@ bool test_matmul_large_block(tt_metal::Device *device, bool activations_rm, bool uint(output_rm) }; - string compute_kernel = "tt_metal/kernels/compute/matmul_large_block.cpp"; + string compute_kernel = "tests/tt_metal/tt_metal/test_kernels/compute/matmul_large_block.cpp"; auto mm_kernel = tt_metal::CreateComputeKernel( program, diff --git a/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram.cpp b/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram.cpp index 9f69cc6ed08..3fd64d6ba6e 100644 --- a/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram.cpp +++ 
b/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram.cpp @@ -141,13 +141,13 @@ std::tuple create_pro auto mm_reader_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/reader_matmul_tile_layout.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_tile_layout.cpp", all_cores, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default}); auto unary_writer_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/writer_matmul_tile_layout.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_matmul_tile_layout.cpp", all_cores, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); @@ -181,7 +181,7 @@ std::tuple create_pro auto mm_kernel = tt_metal::CreateComputeKernel( program, - "tt_metal/kernels/compute/matmul_large_block_zm.cpp", + "tests/tt_metal/tt_metal/test_kernels/compute/matmul_large_block_zm.cpp", all_cores, tt_metal::ComputeConfig{.compile_args = compute_kernel_args}); diff --git a/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram_in0_mcast.cpp b/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram_in0_mcast.cpp index 645bca0b1a2..ab7f83f8b85 100644 --- a/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram_in0_mcast.cpp +++ b/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram_in0_mcast.cpp @@ -154,19 +154,19 @@ std::tuple create_pr auto mm_reader_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/reader_matmul_blocked.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_blocked.cpp", all_cores, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default}); auto unary_writer_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/writer_unswizzle.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unswizzle.cpp", all_cores, tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); @@ -244,7 +244,7 @@ std::tuple create_pr auto mm_kernel = tt_metal::CreateComputeKernel( program, - "tt_metal/kernels/compute/matmul_large_block_zm.cpp", + "tests/tt_metal/tt_metal/test_kernels/compute/matmul_large_block_zm.cpp", all_cores, tt_metal::ComputeConfig{.compile_args = compute_kernel_args} ); diff --git a/tests/tt_metal/tt_metal/test_matmul_multi_tile.cpp b/tests/tt_metal/tt_metal/test_matmul_multi_tile.cpp index 72d7d91be91..ff9737f2ccf 100644 --- a/tests/tt_metal/tt_metal/test_matmul_multi_tile.cpp +++ b/tests/tt_metal/tt_metal/test_matmul_multi_tile.cpp @@ -165,7 +165,7 @@ bool run_matmul(const tt::ARCH& arch, const bool with_bias) { .set_page_size(intermediate_cb_index, single_tile_size); auto cb_output = tt_metal::CreateCircularBuffer(program, cores, cb_output_config); - string reader_kernel = "tt_metal/kernels/dataflow/reader_matmul_with_bias_blocked.cpp"; + string reader_kernel = "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_with_bias_blocked.cpp"; auto mm_reader_kernel = tt_metal::CreateDataMovementKernel( program, @@ -175,7 +175,7 @@ bool run_matmul(const tt::ARCH& arch, const bool with_bias) { auto unary_writer_kernel = tt_metal::CreateDataMovementKernel( program, - "tt_metal/kernels/dataflow/writer_unary.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp", core, 
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default});
@@ -191,7 +191,7 @@ bool run_matmul(const tt::ARCH& arch, const bool with_bias) {
     };
     string compute_kernel_name;
-    compute_kernel_name = "tt_metal/kernels/compute/matmul_with_bias.cpp";
+    compute_kernel_name = "tests/tt_metal/tt_metal/test_kernels/compute/matmul_with_bias.cpp";
     auto mm_kernel = tt_metal::CreateComputeKernel(
         program,
diff --git a/tests/tt_metal/tt_metal/test_matmul_single_core.cpp b/tests/tt_metal/tt_metal/test_matmul_single_core.cpp
index e54ffc8a0c4..4c756b69a1d 100644
--- a/tests/tt_metal/tt_metal/test_matmul_single_core.cpp
+++ b/tests/tt_metal/tt_metal/test_matmul_single_core.cpp
@@ -244,13 +244,13 @@ int main(int argc, char **argv) {
     auto mm_reader_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/reader_matmul_blocked.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_blocked.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default});
     auto unary_writer_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/writer_unswizzle.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unswizzle.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default});
@@ -285,7 +285,7 @@ int main(int argc, char **argv) {
     auto matmul_kernel = tt_metal::CreateComputeKernel(
         program,
-        "tt_metal/kernels/compute/matmul_large_block_zm.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/matmul_large_block_zm.cpp",
         core,
         tt_metal::ComputeConfig{.compile_args = compute_kernel_args}
     );
diff --git a/tests/tt_metal/tt_metal/test_matmul_single_core_small.cpp b/tests/tt_metal/tt_metal/test_matmul_single_core_small.cpp
index b908c8b9c69..53cd1bda173 100644
--- a/tests/tt_metal/tt_metal/test_matmul_single_core_small.cpp
+++ b/tests/tt_metal/tt_metal/test_matmul_single_core_small.cpp
@@ -246,13 +246,13 @@ int main(int argc, char **argv) {
     auto mm_reader_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/reader_matmul_blocked.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_blocked.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default});
     auto unary_writer_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/writer_unswizzle.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unswizzle.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default});
@@ -287,7 +287,7 @@ int main(int argc, char **argv) {
     auto matmul_kernel = tt_metal::CreateComputeKernel(
         program,
-        "tt_metal/kernels/compute/matmul_large_block_zm.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/matmul_large_block_zm.cpp",
         core,
         tt_metal::ComputeConfig{.compile_args = compute_kernel_args}
     );
diff --git a/tests/tt_metal/tt_metal/test_matmul_single_tile.cpp b/tests/tt_metal/tt_metal/test_matmul_single_tile.cpp
index 706a0b64737..116de6ce101 100644
--- a/tests/tt_metal/tt_metal/test_matmul_single_tile.cpp
+++ b/tests/tt_metal/tt_metal/test_matmul_single_tile.cpp
@@ -71,13 +71,13 @@ int main(int argc, char **argv) {
     auto mm_reader_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/reader_matmul_blocked.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_blocked.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default});
     auto unary_writer_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/writer_unary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default});
@@ -92,7 +92,7 @@ int main(int argc, char **argv) {
     };
     auto mm_kernel = tt_metal::CreateComputeKernel(
         program,
-        "tt_metal/kernels/compute/matmul.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/matmul.cpp",
         core,
         tt_metal::ComputeConfig{.compile_args = compute_kernel_args}
     );
diff --git a/tests/tt_metal/tt_metal/test_matmul_single_tile_bfp8b.cpp b/tests/tt_metal/tt_metal/test_matmul_single_tile_bfp8b.cpp
index 75809c84dd2..80a267f6225 100644
--- a/tests/tt_metal/tt_metal/test_matmul_single_tile_bfp8b.cpp
+++ b/tests/tt_metal/tt_metal/test_matmul_single_tile_bfp8b.cpp
@@ -71,13 +71,13 @@ int main(int argc, char **argv) {
     auto mm_reader_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/reader_matmul_blocked.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_blocked.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default});
     auto unary_writer_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/writer_unary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default});
@@ -93,7 +93,7 @@ int main(int argc, char **argv) {
     auto mm_kernel = tt_metal::CreateComputeKernel(
         program,
-        "tt_metal/kernels/compute/matmul.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/matmul.cpp",
         core,
         tt_metal::ComputeConfig{.compile_args = compute_kernel_args}
     );
diff --git a/tests/tt_metal/tt_metal/test_matmul_single_tile_output_in_l1.cpp b/tests/tt_metal/tt_metal/test_matmul_single_tile_output_in_l1.cpp
index 8e6dea68504..40470d5e11f 100644
--- a/tests/tt_metal/tt_metal/test_matmul_single_tile_output_in_l1.cpp
+++ b/tests/tt_metal/tt_metal/test_matmul_single_tile_output_in_l1.cpp
@@ -71,13 +71,13 @@ int main(int argc, char **argv) {
     auto mm_reader_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/reader_matmul_blocked.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_blocked.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default});
     auto unary_writer_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/writer_unary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default});
@@ -93,7 +93,7 @@ int main(int argc, char **argv) {
     auto mm_kernel = tt_metal::CreateComputeKernel(
         program,
-        "tt_metal/kernels/compute/matmul.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/matmul.cpp",
         core,
         tt_metal::ComputeConfig{.compile_args = compute_kernel_args}
     );
diff --git a/tests/tt_metal/tt_metal/test_multi_core_kernel.cpp b/tests/tt_metal/tt_metal/test_multi_core_kernel.cpp
index 20f7a1e9233..a1cb087f4aa 100644
--- a/tests/tt_metal/tt_metal/test_multi_core_kernel.cpp
+++ b/tests/tt_metal/tt_metal/test_multi_core_kernel.cpp
@@ -47,19 +47,19 @@ std::tuple create_pro
     auto reader_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/reader_unary_push_4.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_push_4.cpp",
         all_cores,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default});
     auto writer_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/writer_unary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp",
         all_cores,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default});
     auto eltwise_unary_kernel = tt_metal::CreateComputeKernel(
         program,
-        "tt_metal/kernels/compute/eltwise_copy.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy.cpp",
         all_cores,
         tt_metal::ComputeConfig{.compile_args = eltwise_unary_args}
     );
diff --git a/tests/tt_metal/tt_metal/test_multiple_programs.cpp b/tests/tt_metal/tt_metal/test_multiple_programs.cpp
index 3ba77e2cc9c..f4387e244cc 100644
--- a/tests/tt_metal/tt_metal/test_multiple_programs.cpp
+++ b/tests/tt_metal/tt_metal/test_multiple_programs.cpp
@@ -56,13 +56,13 @@ std::tuple setup_prog
     auto binary_reader_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/reader_binary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_binary.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default});
     auto unary_writer_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/writer_unary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default});
@@ -72,7 +72,7 @@ std::tuple setup_prog
     binary_defines["ELTWISE_OP"] = "add_tiles";
     auto eltwise_binary_kernel = tt_metal::CreateComputeKernel(
         program,
-        "tt_metal/kernels/compute/eltwise_binary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/eltwise_binary.cpp",
         core,
         tt_metal::ComputeConfig{.compile_args = compute_kernel_args, .defines = binary_defines}
     );
@@ -109,13 +109,13 @@ std::tuple setup_prog
     auto mm_reader_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/reader_matmul_small_block.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_small_block.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default});
     auto unary_writer_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/writer_unary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default});
@@ -131,7 +131,7 @@ std::tuple setup_prog
     auto mm_kernel = tt_metal::CreateComputeKernel(
         program,
-        "tt_metal/kernels/compute/matmul.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/matmul.cpp",
         core,
         tt_metal::ComputeConfig{.compile_args = compute_kernel_args}
     );
diff --git a/tests/tt_metal/tt_metal/test_reduce_h.cpp b/tests/tt_metal/tt_metal/test_reduce_h.cpp
index 187a2c151fb..61114b59fe5 100644
--- a/tests/tt_metal/tt_metal/test_reduce_h.cpp
+++ b/tests/tt_metal/tt_metal/test_reduce_h.cpp
@@ -111,15 +111,15 @@ int main(int argc, char **argv) {
     reader_defines["REDUCE_SCALER"] = "1";
     auto unary_reader_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        multibank ? "tt_metal/kernels/dataflow/reader_unary_transpose_wh_interleaved.cpp"
-                  : "tt_metal/kernels/dataflow/reader_unary_transpose_wh.cpp", // TODO(AP): not ported for reduce with scaler
+        multibank ? "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_transpose_wh_interleaved.cpp"
+                  : "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_transpose_wh.cpp", // TODO(AP): not ported for reduce with scaler
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default, .compile_args = reader_compile_args, .defines = reader_defines});
     auto unary_writer_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        multibank ? "tt_metal/kernels/dataflow/writer_unary_8bank.cpp" // no need to transpose the output since output Ht=1
-                  : "tt_metal/kernels/dataflow/writer_unary.cpp",
+        multibank ? "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary_8bank.cpp" // no need to transpose the output since output Ht=1
+                  : "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default});
@@ -135,7 +135,7 @@ int main(int argc, char **argv) {
     };
     auto reduce_h_compute_kernel = tt_metal::CreateComputeKernel(
         program,
-        "tt_metal/kernels/compute/reduce_h.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/reduce_h.cpp",
         core,
         tt_metal::ComputeConfig{.compile_args = compute_kernel_args, .defines = reduce_defines}
     );
diff --git a/tests/tt_metal/tt_metal/test_reduce_hw.cpp b/tests/tt_metal/tt_metal/test_reduce_hw.cpp
index 6d8a7222b4c..fc66a3b7052 100644
--- a/tests/tt_metal/tt_metal/test_reduce_hw.cpp
+++ b/tests/tt_metal/tt_metal/test_reduce_hw.cpp
@@ -106,15 +106,15 @@ int main(int argc, char **argv) {
     auto unary_reader_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        multibank ? "tt_metal/kernels/dataflow/reader_unary_8bank_reduce.cpp"
-                  : "tt_metal/kernels/dataflow/reader_unary.cpp",
+        multibank ? "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_8bank_reduce.cpp"
+                  : "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default});
     auto unary_writer_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        multibank ? "tt_metal/kernels/dataflow/writer_unary_8bank.cpp"
-                  : "tt_metal/kernels/dataflow/writer_unary.cpp",
+        multibank ? "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary_8bank.cpp"
+                  : "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default});
@@ -131,7 +131,7 @@ int main(int argc, char **argv) {
     };
     auto reduce_hw_compute_kernel = tt_metal::CreateComputeKernel(
         program,
-        "tt_metal/kernels/compute/reduce_hw.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/reduce_hw.cpp",
         core,
         tt_metal::ComputeConfig{.compile_args = compute_kernel_args, .defines = reduce_defines}
     );
diff --git a/tests/tt_metal/tt_metal/test_reduce_w.cpp b/tests/tt_metal/tt_metal/test_reduce_w.cpp
index cda9dc3bf93..5dbb3d3f953 100644
--- a/tests/tt_metal/tt_metal/test_reduce_w.cpp
+++ b/tests/tt_metal/tt_metal/test_reduce_w.cpp
@@ -104,15 +104,15 @@ int main(int argc, char **argv) {
    auto unary_reader_kernel = tt_metal::CreateDataMovementKernel(
        program,
-        multibank ? "tt_metal/kernels/dataflow/reader_unary_8bank_reduce.cpp"
-                  : "tt_metal/kernels/dataflow/reader_unary.cpp",
+        multibank ? "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_8bank_reduce.cpp"
+                  : "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary.cpp",
        core,
        tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default});
    auto unary_writer_kernel = tt_metal::CreateDataMovementKernel(
        program,
-        multibank ? "tt_metal/kernels/dataflow/writer_unary_8bank.cpp"
-                  : "tt_metal/kernels/dataflow/writer_unary.cpp",
+        multibank ? "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary_8bank.cpp"
+                  : "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp",
        core,
        tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default});
@@ -128,7 +128,7 @@ int main(int argc, char **argv) {
    };
    auto reduce_w_compute_kernel = tt_metal::CreateComputeKernel(
        program,
-        "tt_metal/kernels/compute/reduce_w.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/reduce_w.cpp",
        core,
        tt_metal::ComputeConfig{.compile_args = compute_kernel_args, .defines = reduce_defines}
    );
diff --git a/tests/tt_metal/tt_metal/test_transpose_hc.cpp b/tests/tt_metal/tt_metal/test_transpose_hc.cpp
index 292f1c9eb83..29fc74eab09 100644
--- a/tests/tt_metal/tt_metal/test_transpose_hc.cpp
+++ b/tests/tt_metal/tt_metal/test_transpose_hc.cpp
@@ -90,16 +90,16 @@ int main(int argc, char **argv) {
     auto reader_kernel = tt_metal::CreateDataMovementKernel(
         program,
         multibank ?
-            "tt_metal/kernels/dataflow/transpose_hc_8bank.cpp" :
-            "tt_metal/kernels/dataflow/transpose_hc.cpp",
+            "tests/tt_metal/tt_metal/test_kernels/dataflow/transpose_hc_8bank.cpp" :
+            "tests/tt_metal/tt_metal/test_kernels/dataflow/transpose_hc.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default});
     auto unary_writer_kernel = tt_metal::CreateDataMovementKernel(
         program,
         multibank ?
-            "tt_metal/kernels/dataflow/writer_unary_8bank.cpp" :
-            "tt_metal/kernels/dataflow/writer_unary.cpp",
+            "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary_8bank.cpp" :
+            "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default});
@@ -109,7 +109,7 @@ int main(int argc, char **argv) {
     auto blank_binary_kernel = tt_metal::CreateComputeKernel(
         program,
-        "tt_metal/kernels/compute/eltwise_copy.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy.cpp",
         core,
         tt_metal::ComputeConfig{.compile_args = compute_kernel_args}
     );
diff --git a/tests/tt_metal/tt_metal/test_transpose_wh.cpp b/tests/tt_metal/tt_metal/test_transpose_wh.cpp
index c4a5d91c75e..4bfc3402f93 100644
--- a/tests/tt_metal/tt_metal/test_transpose_wh.cpp
+++ b/tests/tt_metal/tt_metal/test_transpose_wh.cpp
@@ -92,15 +92,15 @@ int main(int argc, char **argv) {
     auto reader_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        //"tt_metal/kernels/dataflow/reader_unary_transpose_wh.cpp",
-        "tt_metal/kernels/dataflow/reader_unary_transpose_wh_8bank.cpp",
+        //"tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_transpose_wh.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_transpose_wh_8bank.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default});
     auto unary_writer_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        //"tt_metal/kernels/dataflow/writer_unary.cpp",
-        "tt_metal/kernels/dataflow/writer_unary_8bank.cpp",
+        //"tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary_8bank.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default});
@@ -110,7 +110,7 @@ int main(int argc, char **argv) {
     auto reduce_w_compute_kernel = tt_metal::CreateComputeKernel(
         program,
-        "tt_metal/kernels/compute/transpose_wh.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/transpose_wh.cpp",
         core,
         tt_metal::ComputeConfig{.compile_args = compute_kernel_args}
     );
diff --git a/tests/tt_metal/tt_metal/test_unpack_tilize.cpp b/tests/tt_metal/tt_metal/test_unpack_tilize.cpp
index 86b811f3361..23b93c13058 100644
--- a/tests/tt_metal/tt_metal/test_unpack_tilize.cpp
+++ b/tests/tt_metal/tt_metal/test_unpack_tilize.cpp
@@ -114,13 +114,13 @@ int main(int argc, char **argv) {
     auto unary_reader_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/reader_unary_push_4.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_push_4.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default});
     auto unary_writer_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/writer_unary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default});
@@ -131,7 +131,7 @@ int main(int argc, char **argv) {
     auto eltwise_unary_kernel = tt_metal::CreateComputeKernel(
         program,
-        "tt_metal/kernels/compute/tilize.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/tilize.cpp",
         core,
         tt_metal::ComputeConfig{.compile_args = compute_kernel_args}
     );
diff --git a/tests/tt_metal/tt_metal/test_unpack_untilize.cpp b/tests/tt_metal/tt_metal/test_unpack_untilize.cpp
index dbc37237183..998307d1d92 100644
--- a/tests/tt_metal/tt_metal/test_unpack_untilize.cpp
+++ b/tests/tt_metal/tt_metal/test_unpack_untilize.cpp
@@ -128,13 +128,13 @@ int main(int argc, char **argv) {
     auto unary_reader_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/reader_unary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default});
     auto unary_writer_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/writer_unary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default});
@@ -145,7 +145,7 @@ int main(int argc, char **argv) {
     auto eltwise_unary_kernel = tt_metal::CreateComputeKernel(
         program,
-        "tt_metal/kernels/compute/untilize.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/untilize.cpp",
         core,
         tt_metal::ComputeConfig{.compile_args = compute_kernel_args}
     );
diff --git a/tests/tt_metal/tt_metal/test_untilize_eltwise_binary.cpp b/tests/tt_metal/tt_metal/test_untilize_eltwise_binary.cpp
index 86dcfed15e8..7fe39006d22 100644
--- a/tests/tt_metal/tt_metal/test_untilize_eltwise_binary.cpp
+++ b/tests/tt_metal/tt_metal/test_untilize_eltwise_binary.cpp
@@ -148,15 +148,15 @@ int main(int argc, char **argv) {
     auto binary_reader_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        multibank ? "tt_metal/kernels/dataflow/reader_dual_8bank.cpp"
-                  : "tt_metal/kernels/dataflow/reader_binary_diff_lengths.cpp",
+        multibank ? "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_dual_8bank.cpp"
+                  : "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_binary_diff_lengths.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default});
     auto unary_writer_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        multibank ? "tt_metal/kernels/dataflow/writer_unary_8bank.cpp"
-                  : "tt_metal/kernels/dataflow/writer_unary.cpp",
+        multibank ? "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary_8bank.cpp"
+                  : "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default});
@@ -168,7 +168,7 @@ int main(int argc, char **argv) {
     auto eltwise_binary_kernel = tt_metal::CreateComputeKernel(
         program,
-        "tt_metal/kernels/compute/untilA_elwbin_3m.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/untilA_elwbin_3m.cpp",
         core,
         tt_metal::ComputeConfig{.compile_args = compute_kernel_args, .defines = {{"ELTWISE_OP", op_id_to_op_define[eltwise_op]}}}
     );
diff --git a/tests/tt_metal/tt_metal/tt_dispatch/test_enqueue_program.cpp b/tests/tt_metal/tt_metal/tt_dispatch/test_enqueue_program.cpp
index 986e32b03c2..6eb2e8c3a3a 100644
--- a/tests/tt_metal/tt_metal/tt_dispatch/test_enqueue_program.cpp
+++ b/tests/tt_metal/tt_metal/tt_dispatch/test_enqueue_program.cpp
@@ -50,13 +50,13 @@ tt_metal::Program generate_eltwise_unary_program(Device *device) {
     auto unary_writer_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/writer_unary_8bank.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary_8bank.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default});
     auto unary_reader_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/reader_unary_8bank.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_8bank.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default});
@@ -67,7 +67,7 @@ tt_metal::Program generate_eltwise_unary_program(Device *device) {
     auto eltwise_binary_kernel = tt_metal::CreateComputeKernel(
         program,
-        "tt_metal/kernels/compute/eltwise_copy.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy.cpp",
         core,
         tt_metal::ComputeConfig{.compile_args = compute_kernel_args});
diff --git a/tests/tt_metal/tt_metal/unit_tests/basic/device.cpp b/tests/tt_metal/tt_metal/unit_tests/basic/device.cpp
index b2d3990fc5b..33c9ac4656c 100644
--- a/tests/tt_metal/tt_metal/unit_tests/basic/device.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/basic/device.cpp
@@ -298,7 +298,7 @@ TEST_F(DeviceFixture, ValidateKernelDoesNotTargetHarvestedCores) {
     }
     tt_metal::Program program = tt_metal::Program();
-    string kernel_name = "tests/tt_metal/tt_metal/test_kernels/ping_legal_l1s.cpp";
+    string kernel_name = "tests/tt_metal/tt_metal/test_kernels/misc/ping_legal_l1s.cpp";
     CoreCoord logical_target_core = CoreCoord({.x = 0, .y = 0});
     uint32_t intermediate_l1_addr = L1_UNRESERVED_BASE;
     uint32_t size_bytes = host_input.size() * sizeof(uint32_t);
diff --git a/tests/tt_metal/tt_metal/unit_tests/basic/initialize_semaphores.cpp b/tests/tt_metal/tt_metal/unit_tests/basic/initialize_semaphores.cpp
index a98382ba06a..95a0bfb3699 100644
--- a/tests/tt_metal/tt_metal/unit_tests/basic/initialize_semaphores.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/basic/initialize_semaphores.cpp
@@ -36,14 +36,14 @@ void initialize_and_compile_program(tt_metal::Device *device, tt_metal::Program
     auto unary_reader_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/reader_unary_push_4.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_push_4.cpp",
         core_range,
         tt_metal::DataMovementConfig{
             .processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default});
     auto unary_writer_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/writer_unary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp",
         core_range,
         tt_metal::DataMovementConfig{
             .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default});
@@ -54,7 +54,7 @@ void initialize_and_compile_program(tt_metal::Device *device, tt_metal::Program
     auto eltwise_unary_kernel = tt_metal::CreateComputeKernel(
         program,
-        "tt_metal/kernels/compute/eltwise_copy_3m.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy_3m.cpp",
         core_range,
         tt_metal::ComputeConfig{.compile_args = compute_kernel_args});
diff --git a/tests/tt_metal/tt_metal/unit_tests/basic/runtime_args.cpp b/tests/tt_metal/tt_metal/unit_tests/basic/runtime_args.cpp
index 4f4ca69bb88..3987f63408e 100644
--- a/tests/tt_metal/tt_metal/unit_tests/basic/runtime_args.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/basic/runtime_args.cpp
@@ -31,7 +31,7 @@ Program initialize_program_data_movement(Device *device, const CoreRangeSet &cor
     auto add_two_ints_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/riscv_draft/add_two_ints.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/riscv_draft/add_two_ints.cpp",
         core_range_set,
         tt_metal::DataMovementConfig{
             .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default});
@@ -51,7 +51,7 @@ Program initialize_program_compute(Device *device, const CoreRangeSet &core_rang
     auto compute_kernel_id = tt_metal::CreateComputeKernel(
         program,
-        "tests/tt_metal/tt_metal/test_kernels/increment_runtime_arg.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/increment_runtime_arg.cpp",
         core_range_set,
         tt_metal::ComputeConfig{.math_fidelity = MathFidelity::HiFi4, .fp32_dest_acc_en = fp32_dest_acc_en, .math_approx_mode = math_approx_mode, .compile_args = compute_args});
diff --git a/tests/tt_metal/tt_metal/unit_tests/buffer/test_banked.cpp b/tests/tt_metal/tt_metal/unit_tests/buffer/test_banked.cpp
index f59910052a5..bfb9953fceb 100644
--- a/tests/tt_metal/tt_metal/unit_tests/buffer/test_banked.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/buffer/test_banked.cpp
@@ -49,18 +49,18 @@ bool reader_cb_writer(Device* device, const BankedConfig& cfg, const bool banked
     std::vector reader_runtime_args = {};
     std::vector writer_runtime_args = {};
     if (banked_reader) {
-        reader_kernel_name = "tests/tt_metal/tt_metal/test_kernels/banked_reader.cpp";
+        reader_kernel_name = "tests/tt_metal/tt_metal/test_kernels/dataflow/banked_reader.cpp";
         input_page_size_bytes = cfg.page_size_bytes;
     } else {
-        reader_kernel_name = "tests/tt_metal/tt_metal/test_kernels/direct_reader_unary.cpp";
+        reader_kernel_name = "tests/tt_metal/tt_metal/test_kernels/dataflow/direct_reader_unary.cpp";
         input_page_size_bytes = cfg.size_bytes;
     }
     if (banked_writer) {
-        writer_kernel_name = "tests/tt_metal/tt_metal/test_kernels/banked_writer.cpp";
+        writer_kernel_name = "tests/tt_metal/tt_metal/test_kernels/dataflow/banked_writer.cpp";
         output_page_size_bytes = cfg.page_size_bytes;
     } else {
-        writer_kernel_name = "tests/tt_metal/tt_metal/test_kernels/direct_writer_unary.cpp";
+        writer_kernel_name = "tests/tt_metal/tt_metal/test_kernels/dataflow/direct_writer_unary.cpp";
         output_page_size_bytes = cfg.size_bytes;
     }
@@ -168,7 +168,7 @@ bool reader_datacopy_writer(Device* device, const BankedConfig& cfg) {
     auto reader_kernel = CreateDataMovementKernel(
         program,
-        "tests/tt_metal/tt_metal/test_kernels/banked_reader.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/banked_reader.cpp",
         cfg.logical_core,
         DataMovementConfig{
             .processor = DataMovementProcessor::RISCV_1,
@@ -177,7 +177,7 @@ bool reader_datacopy_writer(Device* device, const BankedConfig& cfg) {
     auto writer_kernel = CreateDataMovementKernel(
         program,
-        "tests/tt_metal/tt_metal/test_kernels/banked_writer.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/banked_writer.cpp",
         cfg.logical_core,
         DataMovementConfig{
             .processor = DataMovementProcessor::RISCV_0,
@@ -189,7 +189,7 @@ bool reader_datacopy_writer(Device* device, const BankedConfig& cfg) {
     };
     auto datacopy_kernel = CreateComputeKernel(
         program,
-        "tt_metal/kernels/compute/eltwise_copy.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy.cpp",
         cfg.logical_core,
         ComputeConfig{.compile_args = compute_kernel_args});
diff --git a/tests/tt_metal/tt_metal/unit_tests/buffer/test_banked_l1.cpp b/tests/tt_metal/tt_metal/unit_tests/buffer/test_banked_l1.cpp
index 615af0937b7..c709abb32d2 100644
--- a/tests/tt_metal/tt_metal/unit_tests/buffer/test_banked_l1.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/buffer/test_banked_l1.cpp
@@ -49,18 +49,18 @@ bool l1_reader_cb_writer_l1(Device* device, const BankedL1Config& cfg, const boo
     std::vector reader_runtime_args = {};
     std::vector writer_runtime_args = {};
     if (banked_reader) {
-        reader_kernel_name = "tests/tt_metal/tt_metal/test_kernels/banked_reader.cpp";
+        reader_kernel_name = "tests/tt_metal/tt_metal/test_kernels/dataflow/banked_reader.cpp";
         input_page_size_bytes = cfg.page_size_bytes;
     } else {
-        reader_kernel_name = "tests/tt_metal/tt_metal/test_kernels/direct_reader_unary.cpp";
+        reader_kernel_name = "tests/tt_metal/tt_metal/test_kernels/dataflow/direct_reader_unary.cpp";
         input_page_size_bytes = cfg.size_bytes;
     }
     if (banked_writer) {
-        writer_kernel_name = "tests/tt_metal/tt_metal/test_kernels/banked_writer.cpp";
+        writer_kernel_name = "tests/tt_metal/tt_metal/test_kernels/dataflow/banked_writer.cpp";
         output_page_size_bytes = cfg.page_size_bytes;
     } else {
-        writer_kernel_name = "tests/tt_metal/tt_metal/test_kernels/direct_writer_unary.cpp";
+        writer_kernel_name = "tests/tt_metal/tt_metal/test_kernels/dataflow/direct_writer_unary.cpp";
         output_page_size_bytes = cfg.size_bytes;
     }
@@ -168,7 +168,7 @@ bool l1_reader_datacopy_l1_writer(Device* device, const BankedL1Config& cfg) {
     auto reader_kernel = CreateDataMovementKernel(
         program,
-        "tests/tt_metal/tt_metal/test_kernels/banked_reader.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/banked_reader.cpp",
         cfg.logical_core,
         DataMovementConfig{
             .processor = DataMovementProcessor::RISCV_1,
@@ -177,7 +177,7 @@ bool l1_reader_datacopy_l1_writer(Device* device, const BankedL1Config& cfg) {
     auto writer_kernel = CreateDataMovementKernel(
         program,
-        "tests/tt_metal/tt_metal/test_kernels/banked_writer.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/banked_writer.cpp",
         cfg.logical_core,
         DataMovementConfig{
             .processor = DataMovementProcessor::RISCV_0,
@@ -189,7 +189,7 @@ bool l1_reader_datacopy_l1_writer(Device* device, const BankedL1Config& cfg) {
     };
     auto datacopy_kernel = CreateComputeKernel(
         program,
-        "tt_metal/kernels/compute/eltwise_copy.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy.cpp",
         cfg.logical_core,
         ComputeConfig{.compile_args = compute_kernel_args});
diff --git a/tests/tt_metal/tt_metal/unit_tests/buffer/test_simple_l1_buffer.cpp b/tests/tt_metal/tt_metal/unit_tests/buffer/test_simple_l1_buffer.cpp
index 0750a69053b..6b2864599c7 100644
--- a/tests/tt_metal/tt_metal/unit_tests/buffer/test_simple_l1_buffer.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/buffer/test_simple_l1_buffer.cpp
@@ -64,12 +64,12 @@ namespace tt::test::buffer::detail {
     auto reader_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/unit_tests/dram/direct_reader_unary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_reader_unary.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::NOC_0, .compile_args = {cb_index}});
     auto writer_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/unit_tests/dram/direct_writer_unary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_writer_unary.cpp",
         core,
         tt_metal::DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::NOC_1, .compile_args = {cb_index}});
diff --git a/tests/tt_metal/tt_metal/unit_tests/circular_buffer/circular_buffer_test_utils.hpp b/tests/tt_metal/tt_metal/unit_tests/circular_buffer/circular_buffer_test_utils.hpp
index 3d9839702a6..49b86d3b56d 100644
--- a/tests/tt_metal/tt_metal/unit_tests/circular_buffer/circular_buffer_test_utils.hpp
+++ b/tests/tt_metal/tt_metal/unit_tests/circular_buffer/circular_buffer_test_utils.hpp
@@ -22,14 +22,14 @@ struct CBConfig {
 inline void initialize_program(Program& program, const CoreRangeSet& cr_set) {
     auto dummy_reader_kernel = CreateDataMovementKernel(
-        program, "tt_metal/kernels/dataflow/blank.cpp", cr_set,
+        program, "tests/tt_metal/tt_metal/test_kernels/dataflow/blank.cpp", cr_set,
         DataMovementConfig{.processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default});
     auto dummy_writer_kernel = CreateDataMovementKernel(
-        program, "tt_metal/kernels/dataflow/blank.cpp", cr_set,
+        program, "tests/tt_metal/tt_metal/test_kernels/dataflow/blank.cpp", cr_set,
         DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default});
-    auto dummy_compute_kernel = CreateComputeKernel(program, "tt_metal/kernels/compute/blank.cpp", cr_set);
+    auto dummy_compute_kernel = CreateComputeKernel(program, "tests/tt_metal/tt_metal/test_kernels/compute/blank.cpp", cr_set);
 }
 } // end namespace basic_tests::circular_buffer
diff --git a/tests/tt_metal/tt_metal/unit_tests/circular_buffer/test_CircularBuffer_allocation.cpp b/tests/tt_metal/tt_metal/unit_tests/circular_buffer/test_CircularBuffer_allocation.cpp
index 3f981af3334..2a33d54a7f6 100644
--- a/tests/tt_metal/tt_metal/unit_tests/circular_buffer/test_CircularBuffer_allocation.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/circular_buffer/test_CircularBuffer_allocation.cpp
@@ -386,7 +386,7 @@ TEST_F(DeviceFixture, TestDataCopyWithUpdatedCircularBufferConfig) {
     auto reader_kernel = CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/unit_tests/dram/direct_reader_unary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_reader_unary.cpp",
         core,
         DataMovementConfig{
             .processor = DataMovementProcessor::RISCV_1,
@@ -395,7 +395,7 @@ TEST_F(DeviceFixture, TestDataCopyWithUpdatedCircularBufferConfig) {
     auto writer_kernel = CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/unit_tests/dram/direct_writer_unary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_writer_unary.cpp",
         core,
         DataMovementConfig{
             .processor = DataMovementProcessor::RISCV_0,
diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/binary/single_core_binary_compute.cpp b/tests/tt_metal/tt_metal/unit_tests/compute/binary/single_core_binary_compute.cpp
index a8d5f2e3855..9447c20f9e2 100644
--- a/tests/tt_metal/tt_metal/unit_tests/compute/binary/single_core_binary_compute.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/compute/binary/single_core_binary_compute.cpp
@@ -79,14 +79,14 @@ bool single_core_binary(tt_metal::Device* device, const SingleCoreBinaryConfig&
     auto reader_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/reader_binary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_binary.cpp",
         test_config.core,
         tt_metal::DataMovementConfig{
             .processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default});
     auto writer_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/writer_unary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp",
         test_config.core,
         tt_metal::DataMovementConfig{
             .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default});
@@ -98,7 +98,7 @@ bool single_core_binary(tt_metal::Device* device, const SingleCoreBinaryConfig&
         {"ELTWISE_OP", binary_op_name_to_op_kernel.at(test_config.binary_op)}};
     auto binary_kernel = tt_metal::CreateComputeKernel(
         program,
-        "tt_metal/kernels/compute/eltwise_binary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/eltwise_binary.cpp",
         test_config.core,
         tt_metal::ComputeConfig{.compile_args = compute_kernel_args, .defines = defines});
diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/matmul/single_core_matmul_compute.cpp b/tests/tt_metal/tt_metal/unit_tests/compute/matmul/single_core_matmul_compute.cpp
index 8aba23bcdc6..f982647a7b9 100644
--- a/tests/tt_metal/tt_metal/unit_tests/compute/matmul/single_core_matmul_compute.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/compute/matmul/single_core_matmul_compute.cpp
@@ -210,14 +210,14 @@ bool single_core_matmul(tt_metal::Device* device, const SingleCoreMatmulConfig&
     std::vector writer_rt_args;
     string writer_kernel_name;
     if (cfg.outputs_rm) {
-        writer_kernel_name = "tt_metal/kernels/dataflow/writer_unary.cpp";
+        writer_kernel_name = "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp";
         writer_rt_args = {
             (std::uint32_t)output_dram_byte_address,
             (std::uint32_t)output_dram_noc_xy.x,
             (std::uint32_t)output_dram_noc_xy.y,
             uint(cfg.M * cfg.N)};
     } else {
-        writer_kernel_name = "tt_metal/kernels/dataflow/writer_unswizzle.cpp";
+        writer_kernel_name = "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unswizzle.cpp";
         writer_rt_args = {
             (std::uint32_t)output_dram_byte_address,
             (std::uint32_t)output_dram_noc_xy.x,
@@ -243,7 +243,7 @@ bool single_core_matmul(tt_metal::Device* device, const SingleCoreMatmulConfig&
     auto reader_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/reader_matmul_blocked.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_blocked.cpp",
         cfg.core,
         tt_metal::DataMovementConfig{
             .processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default});
@@ -296,7 +296,7 @@ bool single_core_matmul(tt_metal::Device* device, const SingleCoreMatmulConfig&
     auto matmul_kernel = tt_metal::CreateComputeKernel(
         program,
-        "tt_metal/kernels/compute/matmul_large_block.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/matmul_large_block.cpp",
         cfg.core,
         tt_metal::ComputeConfig{.compile_args = compute_kernel_args});
@@ -412,7 +412,7 @@ bool single_tile_matmul(tt_metal::Device* device) {
     auto reader_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/compute/unit_tests/matmul/reader_binary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/reader_binary.cpp",
         core,
         tt_metal::DataMovementConfig{
             .processor = tt_metal::DataMovementProcessor::RISCV_1,
@@ -421,7 +421,7 @@ bool single_tile_matmul(tt_metal::Device* device) {
     auto writer_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/compute/unit_tests/matmul/writer_unary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/writer_unary.cpp",
         core,
         tt_metal::DataMovementConfig{
             .processor = tt_metal::DataMovementProcessor::RISCV_0,
@@ -430,7 +430,7 @@ bool single_tile_matmul(tt_metal::Device* device) {
     auto simple_matmul_kernel = tt_metal::CreateComputeKernel(
         program,
-        "tt_metal/kernels/compute/unit_tests/matmul/single_tile_compute.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/single_tile_compute.cpp",
         core,
         tt_metal::ComputeConfig{.compile_args = {in0_cb_index, in1_cb_index, out_cb_index}});
@@ -537,7 +537,7 @@ bool single_block_matmul(tt_metal::Device* device, uint32_t M, uint32_t K, uint3
     auto reader_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/compute/unit_tests/matmul/reader_binary_blocked.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/reader_binary_blocked.cpp",
         core,
         tt_metal::DataMovementConfig{
             .processor = tt_metal::DataMovementProcessor::RISCV_1,
@@ -546,7 +546,7 @@ bool single_block_matmul(tt_metal::Device* device, uint32_t M, uint32_t K, uint3
     auto writer_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/compute/unit_tests/matmul/writer_unary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/writer_unary.cpp",
         core,
         tt_metal::DataMovementConfig{
             .processor = tt_metal::DataMovementProcessor::RISCV_0,
@@ -555,7 +555,7 @@ bool single_block_matmul(tt_metal::Device* device, uint32_t M, uint32_t K, uint3
     auto simple_matmul_kernel = tt_metal::CreateComputeKernel(
         program,
-        "tt_metal/kernels/compute/unit_tests/matmul/multi_tile_compute.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/multi_tile_compute.cpp",
         core,
         tt_metal::ComputeConfig{
             .compile_args = {in0_cb_index, in1_cb_index, out_cb_index, M * K, K * N, M * N, M, N, K}});
@@ -682,7 +682,7 @@ bool blocked_matmul(tt_metal::Device* device, uint32_t M, uint32_t K, uint32_t N
     auto reader_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/compute/unit_tests/matmul/reader_binary_blocked.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/reader_binary_blocked.cpp",
         core,
         tt_metal::DataMovementConfig{
             .processor = tt_metal::DataMovementProcessor::RISCV_1,
@@ -691,7 +691,7 @@ bool blocked_matmul(tt_metal::Device* device, uint32_t M, uint32_t K, uint32_t N
     auto writer_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/compute/unit_tests/matmul/writer_unary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/writer_unary.cpp",
         core,
         tt_metal::DataMovementConfig{
             .processor = tt_metal::DataMovementProcessor::RISCV_0,
@@ -700,7 +700,7 @@ bool blocked_matmul(tt_metal::Device* device, uint32_t M, uint32_t K, uint32_t N
     auto simple_matmul_kernel = tt_metal::CreateComputeKernel(
         program,
-        "tt_metal/kernels/compute/unit_tests/matmul/multi_block_compute.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/multi_block_compute.cpp",
         core,
         tt_metal::ComputeConfig{
             .compile_args = {
diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/sfpu/sfpu_compute.cpp b/tests/tt_metal/tt_metal/unit_tests/compute/sfpu/sfpu_compute.cpp
index 859691dd580..aab0bb5c280 100644
--- a/tests/tt_metal/tt_metal/unit_tests/compute/sfpu/sfpu_compute.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/compute/sfpu/sfpu_compute.cpp
@@ -165,14 +165,14 @@ bool run_sfpu_all_same_buffer(tt_metal::Device* device, const SfpuConfig& test_c
     auto reader_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/reader_unary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary.cpp",
         test_config.cores,
         tt_metal::DataMovementConfig{
             .processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default});
     auto writer_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/writer_unary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp",
         test_config.cores,
         tt_metal::DataMovementConfig{
             .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default});
@@ -190,7 +190,7 @@ bool run_sfpu_all_same_buffer(tt_metal::Device* device, const SfpuConfig& test_c
     auto sfpu_kernel = tt_metal::CreateComputeKernel(
         program,
-        "tt_metal/kernels/compute/eltwise_sfpu.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/eltwise_sfpu.cpp",
         test_config.cores,
         tt_metal::ComputeConfig{
             .math_approx_mode = test_config.approx_mode,
diff --git a/tests/tt_metal/tt_metal/unit_tests/dram/direct.cpp b/tests/tt_metal/tt_metal/unit_tests/dram/direct.cpp
index 36d15dc87e5..7aaa893772e 100644
--- a/tests/tt_metal/tt_metal/unit_tests/dram/direct.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests/dram/direct.cpp
@@ -48,7 +48,7 @@ bool reader_only(
     auto reader_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/unit_tests/dram/direct_reader_dram_to_l1.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_reader_dram_to_l1.cpp",
         reader_core,
         tt_metal::DataMovementConfig{
             .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default});
@@ -111,7 +111,7 @@ bool writer_only(
     auto writer_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/unit_tests/dram/direct_writer_l1_to_dram.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_writer_l1_to_dram.cpp",
         writer_core,
         tt_metal::DataMovementConfig{
             .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default});
@@ -181,7 +181,7 @@ bool reader_writer(tt_metal::Device* device, const ReaderWriterConfig& test_conf
     auto reader_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/unit_tests/dram/direct_reader_unary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_reader_unary.cpp",
         test_config.core,
         tt_metal::DataMovementConfig{
             .processor = tt_metal::DataMovementProcessor::RISCV_1,
@@ -190,7 +190,7 @@ bool reader_writer(tt_metal::Device* device, const ReaderWriterConfig& test_conf
     auto writer_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/unit_tests/dram/direct_writer_unary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_writer_unary.cpp",
         test_config.core,
         tt_metal::DataMovementConfig{
             .processor = tt_metal::DataMovementProcessor::RISCV_0,
@@ -276,7 +276,7 @@ bool reader_datacopy_writer(tt_metal::Device* device, const ReaderDatacopyWriter
     auto reader_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/unit_tests/dram/direct_reader_unary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_reader_unary.cpp",
         test_config.core,
         tt_metal::DataMovementConfig{
             .processor = tt_metal::DataMovementProcessor::RISCV_1,
@@ -285,7 +285,7 @@ bool reader_datacopy_writer(tt_metal::Device* device, const ReaderDatacopyWriter
     auto writer_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/unit_tests/dram/direct_writer_unary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_writer_unary.cpp",
         test_config.core,
         tt_metal::DataMovementConfig{
             .processor = tt_metal::DataMovementProcessor::RISCV_0,
@@ -297,7 +297,7 @@ bool reader_datacopy_writer(tt_metal::Device* device, const ReaderDatacopyWriter
     };
     auto datacopy_kernel = tt_metal::CreateComputeKernel(
         program,
-        "tt_metal/kernels/compute/eltwise_copy.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy.cpp",
         test_config.core,
         tt_metal::ComputeConfig{.compile_args = compute_kernel_args});
diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueProgram.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueProgram.cpp
index 01378d826ef..64a3e21ecb7 100644
--- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueProgram.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueProgram.cpp
@@ -36,14 +36,14 @@ namespace local_test_functions {
 void initialize_dummy_kernels(Program& program, const CoreRangeSet& cr_set) {
     auto dummy_reader_kernel = CreateDataMovementKernel(
-        program, "tt_metal/kernels/dataflow/blank.cpp", cr_set,
+        program, "tests/tt_metal/tt_metal/test_kernels/dataflow/blank.cpp", cr_set,
         DataMovementConfig{.processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default});
     auto dummy_writer_kernel = CreateDataMovementKernel(
-        program, "tt_metal/kernels/dataflow/blank.cpp", cr_set,
+        program, "tests/tt_metal/tt_metal/test_kernels/dataflow/blank.cpp", cr_set,
         DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default});
-    auto dummy_compute_kernel = CreateComputeKernel(program, "tt_metal/kernels/compute/blank.cpp", cr_set);
+    auto dummy_compute_kernel = CreateComputeKernel(program, "tests/tt_metal/tt_metal/test_kernels/compute/blank.cpp", cr_set);
 }
 bool cb_config_successful(Device* device, const DummyProgramMultiCBConfig & program_config){
@@ -203,7 +203,7 @@ bool test_dummy_EnqueueProgram_with_runtime_args(Device* device, CommandQueue& c
     auto dummy_kernel1 = CreateDataMovementKernel(
         program, "tests/tt_metal/tt_metal/gtest_unit_tests/command_queue/test_kernels/runtime_args_kernel1.cpp", cr_set, DataMovementConfig{.processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default});
-    auto dummy_compute_kernel = CreateComputeKernel(program, "tt_metal/kernels/compute/blank.cpp", cr_set);
+    auto dummy_compute_kernel = CreateComputeKernel(program, "tests/tt_metal/tt_metal/test_kernels/compute/blank.cpp", cr_set);
     vector dummy_kernel0_args = {0, 1, 2, 3, 4, 5, 6, 7, 8};
     vector dummy_kernel1_args = {9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20};
@@ -297,7 +297,7 @@ TEST_F(CommandQueueFixture, TestArbiterDoesNotHang) {
     // Add an NCRISC blank manually, but in compile program, the BRISC blank will be
     // added separately
     auto dummy_reader_kernel = CreateDataMovementKernel(
-        program, "tt_metal/kernels/dataflow/unit_tests/command_queue/arbiter_hang.cpp", cr_set, DataMovementConfig{.processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default});
+        program, "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/arbiter_hang.cpp", cr_set, DataMovementConfig{.processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default});
     EnqueueProgram(*::detail::GLOBAL_CQ, program, false);
     Finish(*::detail::GLOBAL_CQ);
@@ -427,7 +427,7 @@ TEST_F(CommandQueueFixture, TestAutoInsertedBlankBriscKernelInDeviceDispatchMode
     // Add an NCRISC blank manually, but in compile program, the BRISC blank will be
     // added separately
     auto dummy_reader_kernel = CreateDataMovementKernel(
-        program, "tt_metal/kernels/dataflow/blank.cpp", cr_set,
+        program, "tests/tt_metal/tt_metal/test_kernels/dataflow/blank.cpp", cr_set,
         DataMovementConfig{.processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default});
     EnqueueProgram(*tt::tt_metal::detail::GLOBAL_CQ, program, false);
@@ -443,7 +443,7 @@ TEST_F(CommandQueueFixture, ComputeRuntimeArgs) {
     auto compute_kernel_id = CreateComputeKernel(
         program,
-        "tests/tt_metal/tt_metal/test_kernels/increment_runtime_arg.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/increment_runtime_arg.cpp",
         cr_set,
         tt::tt_metal::ComputeConfig{});
diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/compute/sfpu/sfpu_compute.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/compute/sfpu/sfpu_compute.cpp
index a3d052431f7..44884966a4c 100644
--- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/compute/sfpu/sfpu_compute.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/compute/sfpu/sfpu_compute.cpp
@@ -166,7 +166,7 @@ bool run_sfpu_all_same_buffer(tt_metal::Device* device, const SfpuConfig& test_c
     auto reader_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/reader_unary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary.cpp",
         test_config.cores,
         tt_metal::DataMovementConfig{
             .processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default});
@@ -174,7 +174,7 @@ bool run_sfpu_all_same_buffer(tt_metal::Device* device, const SfpuConfig& test_c
     // Enqueue apis only supported on gs so far
     auto writer_kernel = tt_metal::CreateDataMovementKernel(
         program,
-        "tt_metal/kernels/dataflow/writer_unary.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary.cpp",
         test_config.cores,
         tt_metal::DataMovementConfig{
             .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default});
@@ -192,7 +192,7 @@ bool run_sfpu_all_same_buffer(tt_metal::Device* device, const SfpuConfig& test_c
     auto sfpu_kernel = tt_metal::CreateComputeKernel(
         program,
-        "tt_metal/kernels/compute/eltwise_sfpu.cpp",
+        "tests/tt_metal/tt_metal/test_kernels/compute/eltwise_sfpu.cpp",
         test_config.cores,
         tt_metal::ComputeConfig{
             .math_approx_mode = test_config.approx_mode,
diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/pipelining/basic_pipeline.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/pipelining/basic_pipeline.cpp
index bd0458f7012..e78c56e36af 100644
--- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/pipelining/basic_pipeline.cpp
+++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/pipelining/basic_pipeline.cpp
@@ -102,9 +102,9 @@ void create_and_run_row_pipeline(tt_metal::Device* device, const PipelineRowConf
     for (int core_id = 0; core_id < num_cores; core_id++) {
         string receiver_kernel_name;
         if (core_id == 0) {
-            receiver_kernel_name = "tt_metal/kernels/dataflow/reader_first_stage.cpp";
+            receiver_kernel_name = "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_first_stage.cpp";
         } else {
-            receiver_kernel_name = "tt_metal/kernels/dataflow/receiver_intermediate_stage.cpp";
+            receiver_kernel_name = "tests/tt_metal/tt_metal/test_kernels/dataflow/receiver_intermediate_stage.cpp";
         }
         std::vector receiver_kernel_compile_time_args = {cb_index, block_size_tiles};
@@ -119,9 +119,9 @@ void create_and_run_row_pipeline(tt_metal::Device* device, const PipelineRowConf
         string sender_kernel_name;
         if (core_id == num_cores - 1) {
-            sender_kernel_name = "tt_metal/kernels/dataflow/writer_last_stage.cpp";
+            sender_kernel_name = "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_last_stage.cpp";
         } else {
-            sender_kernel_name = "tt_metal/kernels/dataflow/sender_intermediate_stage.cpp";
+            sender_kernel_name = "tests/tt_metal/tt_metal/test_kernels/dataflow/sender_intermediate_stage.cpp";
        }
        std::vector sender_kernel_compile_time_args = {cb_index, block_size_tiles};
        sender_kernels.push_back(tt_metal::CreateDataMovementKernel(
@@ -134,7 +134,7 @@ void create_and_run_row_pipeline(tt_metal::Device* device, const PipelineRowConf
                 .compile_args = sender_kernel_compile_time_args}));
         // Add blank compute kernel
-        tt_metal::CreateComputeKernel(program, "tt_metal/kernels/compute/blank.cpp", cores[core_id]);
+        tt_metal::CreateComputeKernel(program, "tests/tt_metal/tt_metal/test_kernels/compute/blank.cpp", cores[core_id]);
     }
     // TODO(agrebenisan): Once semaphores are properly allocated at 16B-aligned addresses, then