diff --git a/.clang-format b/.clang-format index 71699bcaa49..6d87dcb186f 100644 --- a/.clang-format +++ b/.clang-format @@ -1,7 +1,7 @@ --- Language: Cpp # BasedOnStyle: Google -AccessModifierOffset: -1 +AccessModifierOffset: -4 AlignAfterOpenBracket: AlwaysBreak AlignConsecutiveAssignments: false AlignConsecutiveDeclarations: false @@ -51,7 +51,7 @@ ConstructorInitializerAllOnOneLineOrOnePerLine: true ConstructorInitializerIndentWidth: 4 ContinuationIndentWidth: 4 Cpp11BracedListStyle: true -DerivePointerAlignment: true +DerivePointerAlignment: false DisableFormat: false ExperimentalAutoDetectBinPacking: false FixNamespaceComments: true @@ -93,7 +93,8 @@ PenaltyBreakString: 1000 PenaltyBreakTemplateDeclaration: 10 PenaltyExcessCharacter: 1000000 PenaltyReturnTypeOnItsOwnLine: 200 -PointerAlignment: Right +PointerAlignment: Left +ReferenceAlignment: Left RawStringFormats: - Language: Cpp Delimiters: @@ -123,7 +124,7 @@ RawStringFormats: CanonicalDelimiter: '' BasedOnStyle: google ReflowComments: true -SortIncludes: true +SortIncludes: false SortUsingDeclarations: true SpaceAfterCStyleCast: false SpaceAfterTemplateKeyword: true diff --git a/.github/workflows/publish-release-image-wrapper.yaml b/.github/workflows/publish-release-image-wrapper.yaml new file mode 100644 index 00000000000..d6b0e20fcf4 --- /dev/null +++ b/.github/workflows/publish-release-image-wrapper.yaml @@ -0,0 +1,12 @@ +name: "Create and Publish Release Docker Image" + +on: + workflow_call: + workflow_dispatch: + +jobs: + to_be_filled_out: + steps: + - name: This workflow will be filled out in https://github.com/tenstorrent/tt-metal/pull/15013 + run: | + echo "NOOP" diff --git a/CMakeLists.txt b/CMakeLists.txt index e17c82caf41..a6f780e25bc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -207,6 +207,7 @@ string(TOUPPER "$ENV{ARCH_NAME}" ARCH_NAME_DEF) add_compile_definitions(ARCH_${ARCH_NAME_DEF}) add_compile_options( -Werror + -Wno-deprecated-declarations -Wdelete-non-virtual-dtor 
-Wreturn-type -Wswitch @@ -226,9 +227,7 @@ add_compile_options( "$<$:-Wno-deprecated-this-capture>" "$<$:-Wno-deprecated-volatile>" "$<$:-Wno-deprecated-builtins>" - "$<$:-Wno-deprecated-declarations>" "$<$:-Wno-deprecated>" - "$<$:-Wno-deprecated-declarations>" "$<$:-Wno-attributes>" "$<$:-Wno-stringop-overread>" "$<$:-Wno-stringop-overflow>" diff --git a/CODEOWNERS b/CODEOWNERS index 557ef649ffe..ee732111fc3 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -53,7 +53,7 @@ tests/scripts/tgg/ @afuller-TT @ttmchiou tt_metal/ @abhullar-tt @pgkeller @aliuTT @tt-aho @tt-dma @tt-asaigal @ubcheema tt_metal/host_api.hpp @abhullar-tt @pgkeller @aliuTT @tt-aho @tt-dma @tt-asaigal @davorchap tt_metal/impl/device/ @abhullar-tt @pgkeller @aliuTT @tt-aho @tt-dma @tt-asaigal @ubcheema @davorchap @cfjchu -tt_metal/distributed/ @cfjchu @aliuTT @tt-asaigal +tt_metal/distributed/ @cfjchu @aliuTT @tt-asaigal @omilyutin-tt tt_metal/**/requirements*.txt @tt-rkim @TT-billteng @ttmchiou # metal - dispatch @@ -105,13 +105,13 @@ ttnn/cpp/ttnn/operations/ccl/ @SeanNijjar @cfjchu ttnn/cpp/ttnn/operations/pool/ @mywoodstock @shwetankTT @sankarmanoj-tt @pavlejosipovic ttnn/cpp/ttnn/operations/conv/ @mywoodstock @shwetankTT @sankarmanoj-tt @pavlejosipovic @bbradelTT ttnn/cpp/ttnn/operations/sliding_window/ @mywoodstock @sankarmanoj-tt @pavlejosipovic -ttnn/cpp/ttnn/operations/data_movement/ @ntarafdar @sjameelTT @jaykru-tt @yugi957 +ttnn/cpp/ttnn/operations/data_movement/ @ntarafdar @sjameelTT @jaykru-tt @yugi957 @jvegaTT @llongTT ttnn/cpp/ttnn/operations/matmul/ @TT-BrianLiu @bbradelTT @yugaoTT ttnn/cpp/ttnn/operations/experimental/matmul/ @TT-BrianLiu @bbradelTT @yugaoTT ttnn/cpp/ttnn/operations/eltwise/ @patrickroberts @yan-zaretskiy @eyonland -ttnn/cpp/ttnn/operations/reduction/ @SeanNijjar @ntarafdar @sjameelTT +ttnn/cpp/ttnn/operations/reduction/ @bbradelTT @asandhupatlaTT @sjameelTT ttnn/cpp/ttnn/operations/normalization/ @yugaoTT @tt-aho -ttnn/cpp/ttnn/operations/embedding/ @ntarafdar @tt-aho 
@TT-BrianLiu +ttnn/cpp/ttnn/operations/embedding/ @ntarafdar @tt-aho @TT-BrianLiu @yugi957 @sjameelTT @jaykru-tt @llongTT ttnn/cpp/ttnn/operations/embedding_backward/ @TT-BrianLiu @yan-zaretskiy ttnn/ttnn/operations/eltwise @patrickroberts @yan-zaretskiy @eyonland tests/ttnn/ @ayerofieiev-tt @dmakoviichuk-tt @rfurko-tt @cfjchu @TT-BrianLiu @razorback3 @dongjin-na @@ -122,12 +122,12 @@ tests/sweep_framework/ @xanderchin @jdesousa-TT @sjameelTT tests/sweep_framework/sweeps tests/sweep_framework/sweeps/eltwise/ @patrickroberts @yan-zaretskiy @eyonland tests/sweep_framework/sweeps/conv2d/ @nkpatel-tt @mywoodstock @shwetankTT @sankarmanoj-tt @pavlejosipovic -tests/sweep_framework/sweeps/data_movement/ @sjameelTT @ntarafdar @jaykru-tt @yugi957 +tests/sweep_framework/sweeps/data_movement/ @sjameelTT @ntarafdar @jaykru-tt @yugi957 @llongTT @jvegaTT # TTNN Distributed -ttnn/cpp/ttnn/distributed/ @cfjchu @ayerofieiev-tt @dmakoviichuk-tt -ttnn/ttnn/distributed/ @cfjchu @ayerofieiev-tt @dmakoviichuk-tt -tests/ttnn/distributed/ @cfjchu @ayerofieiev-tt @dmakoviichuk-tt +ttnn/cpp/ttnn/distributed/ @cfjchu @ayerofieiev-tt @dmakoviichuk-tt @omilyutin-tt +ttnn/ttnn/distributed/ @cfjchu @ayerofieiev-tt @dmakoviichuk-tt @omilyutin-tt +tests/ttnn/distributed/ @cfjchu @ayerofieiev-tt @dmakoviichuk-tt @omilyutin-tt # models /models/ @tt-rkim @uaydonat diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 921f8f8d16b..7b9fc84dca8 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -251,8 +251,8 @@ The new fangled way we run our tests is with Googletest. The way we generally structure our tests with this framework is to bundle it into a single executable. -You can use `--gtest_filter_test` to filter out the specific test you'd like. -For example, to build and run the `CommonFixture.DRAMLoopbackSingleCore` on +You can use `--gtest_filter` to filter out the specific test you'd like. 
+For example, to build and run the `DispatchFixture.TensixDRAMLoopbackSingleCore` on fast dispatch, you can 1. Build the unit tests: @@ -261,7 +261,7 @@ fast dispatch, you can ``` 2. Run the test: ``` - ./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter="CommonFixture.DRAMLoopbackSingleCore" + ./build/test/tt_metal/unit_tests_api --gtest_filter="DispatchFixture.TensixDRAMLoopbackSingleCore" ``` On slow dispatch, to run another specific test, the equivalent would be: @@ -270,7 +270,7 @@ On slow dispatch, to run another specific test, the equivalent would be: 2. Run with the slow dispatch mode: ``` export TT_METAL_SLOW_DISPATCH_MODE=1 - ./build/test/tt_metal/unit_tests/fast_dispatch --gtest_filter_test="BasicFixture.TestL1BuffersAllocatedTopDown" + ./build/test/tt_metal/unit_tests/unit_tests_api --gtest_filter="DeviceSingleCardBufferFixture.TestL1BuffersAllocatedTopDown" ``` We have split our tests into the two dispatch modes for less pollution of state diff --git a/METALIUM_GUIDE.md b/METALIUM_GUIDE.md index a68f5ba129e..cd02547193a 100644 --- a/METALIUM_GUIDE.md +++ b/METALIUM_GUIDE.md @@ -128,26 +128,26 @@ void MAIN { mm_init(); acquire_dst(); - cb_wait_front(tt::CB::c_in0, /* number of tiles */ 1); - cb_wait_front(tt::CB::c_in1, /* number of tiles */ 1); + cb_wait_front(tt::CBIndex::c_0, /* number of tiles */ 1); + cb_wait_front(tt::CBIndex::c_1, /* number of tiles */ 1); - matmul_tiles(tt::CB::c_in0, tt::CB::c_in1, 0, 0, 0, false); + matmul_tiles(tt::CBIndex::c_0, tt::CBIndex::c_1, 0, 0, 0, false); - cb_pop_front(tt::CB::c_in1, /* number of tiles */ 1); - cb_pop_front(tt::CB::c_in0, /* number of tiles */ 1); + cb_pop_front(tt::CBIndex::c_1, /* number of tiles */ 1); + cb_pop_front(tt::CBIndex::c_0, /* number of tiles */ 1); - cb_reserve_back(tt::CB::c_out0, /* number of tiles */ 1); - pack_tile(0, tt::CB::c_out0); - cb_push_back(tt::CB::c_out0, /* number of tiles */ 1); + cb_reserve_back(tt::CBIndex::c_16, /* number of tiles */ 1); + pack_tile(0, 
tt::CBIndex::c_16); + cb_push_back(tt::CBIndex::c_16, /* number of tiles */ 1); release_dst(); } } // namespace NAMESPACE ``` -It takes two matrix tiles from `tt::CB::c_in0` and `tt::CB::c_in0` L1 and +It takes two matrix tiles from `tt::CBIndex::c_0` and `tt::CBIndex::c_0` L1 and conducts a single-tile matrix multiplication. Finally, it packs the result to -`tt::CB::c_out0`. +`tt::CBIndex::c_16`. Note that tile registers are acquired by `acquire_dst()`, but actually we can use `tile_regs_..()` functions for the more fine-grained tile register lock @@ -299,23 +299,23 @@ namespace NAMESPACE { void MAIN { mm_init(); - cb_wait_front(tt::CB::c_in0, /* number of tiles */ 1); - cb_wait_front(tt::CB::c_in1, /* number of tiles */ 1); + cb_wait_front(tt::CBIndex::c_0, /* number of tiles */ 1); + cb_wait_front(tt::CBIndex::c_1, /* number of tiles */ 1); tile_regs_acquire(); - matmul_tiles(tt::CB::c_in0, tt::CB::c_in1, 0, 0, 0, false); + matmul_tiles(tt::CBIndex::c_0, tt::CBIndex::c_1, 0, 0, 0, false); tile_regs_commit(); - cb_pop_front(tt::CB::c_in1, /* number of tiles */ 1); - cb_pop_front(tt::CB::c_in0, /* number of tiles */ 1); + cb_pop_front(tt::CBIndex::c_1, /* number of tiles */ 1); + cb_pop_front(tt::CBIndex::c_0, /* number of tiles */ 1); tile_regs_wait(); - cb_reserve_back(tt::CB::c_out0, /* number of tiles */ 1); - pack_tile(0, tt::CB::c_out0); - cb_push_back(tt::CB::c_out0, /* number of tiles */ 1); + cb_reserve_back(tt::CBIndex::c_16, /* number of tiles */ 1); + pack_tile(0, tt::CBIndex::c_16); + cb_push_back(tt::CBIndex::c_16, /* number of tiles */ 1); tile_regs_release(); } @@ -367,9 +367,9 @@ void MAIN { uint32_t per_core_block_cnt = get_arg_val(0); uint32_t per_core_block_size = get_arg_val(1); // should be <= 8 in this kernel - constexpr auto cb_in0 = tt::CB::c_in0; - constexpr auto cb_in1 = tt::CB::c_in1; - constexpr auto cb_out0 = tt::CB::c_out0; + constexpr auto cb_in0 = tt::CBIndex::c_0; + constexpr auto cb_in1 = tt::CBIndex::c_1; + constexpr auto 
cb_out0 = tt::CBIndex::c_16; binary_op_init_common(cb_in0, cb_in1, cb_out0); add_tiles_init(); @@ -400,7 +400,7 @@ void MAIN { cb_pop_front(cb_in0, per_core_block_size); cb_pop_front(cb_in1, per_core_block_size); - // push a block of tiles to output CB + // push a block of tiles to output CBIndex cb_push_back(cb_out0, per_core_block_size); } diff --git a/docs/source/common/images/MFB-Fig12.png b/docs/source/common/images/MFB-Fig12.png deleted file mode 100644 index 59b3c51928d..00000000000 Binary files a/docs/source/common/images/MFB-Fig12.png and /dev/null differ diff --git a/docs/source/common/images/MFB-Fig3a.png b/docs/source/common/images/MFB-Fig3a.png deleted file mode 100644 index ebba343a18d..00000000000 Binary files a/docs/source/common/images/MFB-Fig3a.png and /dev/null differ diff --git a/docs/source/common/images/MFB-Fig11.png b/docs/source/common/images/MfB-Fig11.png similarity index 100% rename from docs/source/common/images/MFB-Fig11.png rename to docs/source/common/images/MfB-Fig11.png diff --git a/docs/source/common/images/MfB-Fig12.png b/docs/source/common/images/MfB-Fig12.png index 8b3c6dd3049..59b3c51928d 100644 Binary files a/docs/source/common/images/MfB-Fig12.png and b/docs/source/common/images/MfB-Fig12.png differ diff --git a/docs/source/common/images/MFB-Fig2.png b/docs/source/common/images/MfB-Fig2.png similarity index 100% rename from docs/source/common/images/MFB-Fig2.png rename to docs/source/common/images/MfB-Fig2.png diff --git a/docs/source/common/images/MfB-Fig3a.png b/docs/source/common/images/MfB-Fig3a.png new file mode 100644 index 00000000000..d3f5a12faa9 --- /dev/null +++ b/docs/source/common/images/MfB-Fig3a.png @@ -0,0 +1 @@ + diff --git a/docs/source/tt-metalium/tools/kernel_print.rst b/docs/source/tt-metalium/tools/kernel_print.rst index be009a0cd9b..932b4e8f613 100644 --- a/docs/source/tt-metalium/tools/kernel_print.rst +++ b/docs/source/tt-metalium/tools/kernel_print.rst @@ -83,7 +83,7 @@ Data from Circular Buffers can 
be printed using the ``TileSlice`` object. It can | print_untilized | bool | Whether to untilize the CB data while printing it (always done for block float formats), default ``true``. | +-----------------+---------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------+ -An example of how to print data from a CB (in this case, ``CB::c_intermed1``) is shown below. Note that sampling happens relative +An example of how to print data from a CB (in this case, ``CBIndex::c_25``) is shown below. Note that sampling happens relative to the current CB read or write pointer. This means that for printing a tile read from the front of the CB, the ``DPRINT`` call has to occur between the ``cb_wait_front`` and ``cb_pop_front`` calls. For printing a tile from the back of the CB, the ``DPRINT`` call has to occur between the ``cb_reserve_back`` and ``cb_push_back`` calls. Currently supported data @@ -94,15 +94,15 @@ formats for printing from CBs are ``DataFormat::Float32``, ``DataFormat::Float16 #include "debug/dprint.h" // required in all kernels using DPRINT void kernel_main() { - // Assuming the tile we want to print from CB::c_intermed1 is from the front the CB, print must happen after + // Assuming the tile we want to print from CBIndex::c_25 is from the front the CB, print must happen after // this call. If the tile is from the back of the CB, then print must happen after cb_reserve_back(). - cb_wait_front(CB::c_intermed1, 1); + cb_wait_front(CBIndex::c_25, 1); ... - // Extract a numpy slice `[0:32:16, 0:32:16]` from tile `0` from `CB::c_intermed1` and print it. - DPRINT << TSLICE(CB::c_intermed1, 0, SliceRange::hw0_32_16()) << ENDL(); + // Extract a numpy slice `[0:32:16, 0:32:16]` from tile `0` from `CBIndex::c_25` and print it. 
+ DPRINT << TSLICE(CBIndex::c_25, 0, SliceRange::hw0_32_16()) << ENDL(); // Note that since the MATH core does not have access to CBs, so this is an invalid print: - DPRINT_MATH({ DPRINT << TSLICE(CB::c_intermed1, 0, SliceRange::hw0_32_16()) << ENDL(); }); // Invalid + DPRINT_MATH({ DPRINT << TSLICE(CBIndex::c_25, 0, SliceRange::hw0_32_16()) << ENDL(); }); // Invalid // Print a full tile for (int32_t r = 0; r < 32; ++r) { @@ -118,5 +118,5 @@ formats for printing from CBs are ``DataFormat::Float32``, ``DataFormat::Float16 } ... - cb_pop_front(CB::c_intermed1, 1); + cb_pop_front(CBIndex::c_25, 1); } diff --git a/docs/source/tt-metalium/tt_metal/examples/eltwise_binary.rst b/docs/source/tt-metalium/tt_metal/examples/eltwise_binary.rst index 749834b28d7..ffd1ddd4da7 100644 --- a/docs/source/tt-metalium/tt_metal/examples/eltwise_binary.rst +++ b/docs/source/tt-metalium/tt_metal/examples/eltwise_binary.rst @@ -32,19 +32,19 @@ We already have set the circular buffers needed for compute data communication. .. 
code-block:: cpp - constexpr uint32_t src0_cb_index = CB::c_in0; + constexpr uint32_t src0_cb_index = CBIndex::c_0; constexpr uint32_t src0_cb_addr = 200 * 1024; constexpr uint32_t num_input_tiles = 2; constexpr uint32_t input_cb_size = num_input_tiles * single_tile_size; CircularBufferConfig cb_src0_config = CircularBufferConfig(input_cb_size, {{src0_cb_index, tt::DataFormat::Float16_b}}, src0_cb_addr).set_page_size(src0_cb_index, single_tile_size); CBHandle cb_src0 = v0::CreateCircularBuffer(program, core, cb_src0_config); - constexpr uint32_t src1_cb_index = CB::c_in1; + constexpr uint32_t src1_cb_index = CBIndex::c_1; constexpr uint32_t src1_cb_addr = 300 * 1024; CircularBufferConfig cb_src1_config = CircularBufferConfig(input_cb_size, {{src1_cb_index, tt::DataFormat::Float16_b}}, src1_cb_addr).set_page_size(src1_cb_index, single_tile_size); CBHandle cb_src1 = v0::CreateCircularBuffer(program, core, cb_src1_config); - constexpr uint32_t output_cb_index = CB::c_out0; + constexpr uint32_t output_cb_index = CBIndex::c_16; constexpr uint32_t output_cb_addr = 400 * 1024; constexpr uint32_t num_output_tiles = 2; constexpr uint32_t input_cb_size = num_input_tiles * single_tile_size; diff --git a/docs/source/tt-metalium/tt_metal/examples/eltwise_sfpu.rst b/docs/source/tt-metalium/tt_metal/examples/eltwise_sfpu.rst index 09640cbd571..8749e64a442 100644 --- a/docs/source/tt-metalium/tt_metal/examples/eltwise_sfpu.rst +++ b/docs/source/tt-metalium/tt_metal/examples/eltwise_sfpu.rst @@ -31,12 +31,12 @@ compute, and writer engines. .. 
code-block:: cpp - constexpr uint32_t src0_cb_index = CB::c_in0; + constexpr uint32_t src0_cb_index = CBIndex::c_0; constexpr uint32_t num_input_tiles = 2; CircularBufferConfig cb_src0_config = CircularBufferConfig(num_input_tiles * single_tile_size, {{src0_cb_index, tt::DataFormat::Float16_b}}).set_page_size(src0_cb_index, single_tile_size); CBHandle cb_src0 = tt_metal::v0::CreateCircularBuffer(program, core, cb_src0_config); - constexpr uint32_t output_cb_index = CB::c_out0; + constexpr uint32_t output_cb_index = CBIndex::c_16; constexpr uint32_t num_output_tiles = 2; CircularBufferConfig cb_output_config = CircularBufferConfig(num_output_tiles * single_tile_size, {{output_cb_index, tt::DataFormat::Float16_b}}).set_page_size(output_cb_index, single_tile_size); CBHandle cb_output = tt_metal::v0::CreateCircularBuffer(program, core, cb_output_config); diff --git a/docs/source/tt-metalium/tt_metal/examples/matmul_multi_core_optimizations/data_reuse.rst b/docs/source/tt-metalium/tt_metal/examples/matmul_multi_core_optimizations/data_reuse.rst index dd86afdcae3..e6495c354e3 100644 --- a/docs/source/tt-metalium/tt_metal/examples/matmul_multi_core_optimizations/data_reuse.rst +++ b/docs/source/tt-metalium/tt_metal/examples/matmul_multi_core_optimizations/data_reuse.rst @@ -40,8 +40,8 @@ In addition to our double-buffer config, we introduce a third circular buffer de .. code-block:: cpp - uint32_t output_cb_index = CB::c_out0; // output operands start at index 16 - uint32_t interm0_cb_index = 24; // Index for the intermediate circular buffer + uint32_t output_cb_index = CBIndex::c_16; + uint32_t interm0_cb_index = CBIndex::c_24; // Index for the intermediate circular buffer std::map output_cb_data_format_spec { {output_cb_index, cb_data_format}, // Output buffer configuration {interm0_cb_index, cb_data_format} // Intermediate buffer configuration @@ -173,16 +173,16 @@ a. **Preparing the Intermediate Buffer**: .. 
code-block:: cpp - cb_reserve_back(tt::CB::c_intermed0, out_subblock_num_tiles); + cb_reserve_back(tt::CBIndex::c_24, out_subblock_num_tiles); - **Storing Partial Results**: Partial results are stored via a packing mechanism with ``pack_tile(...)`` into the above reserved space. .. code-block:: cpp for (uint32_t i = 0; i < out_subblock_num_tiles; i++) { - pack_tile(i, tt::CB::c_intermed0); + pack_tile(i, tt::CBIndex::c_24); } - cb_push_back(tt::CB::c_intermed0, out_subblock_num_tiles); + cb_push_back(tt::CBIndex::c_24, out_subblock_num_tiles); b. **Computing with Partial Results**: @@ -191,11 +191,11 @@ b. **Computing with Partial Results**: .. code-block:: cpp if (enable_reload) { - cb_wait_front(tt::CB::c_intermed0, out_subblock_num_tiles); + cb_wait_front(tt::CBIndex::c_24, out_subblock_num_tiles); for (uint32_t i = 0; i < out_subblock_num_tiles; i++) { - copy_tile(tt::CB::c_intermed0, i, i); + copy_tile(tt::CBIndex::c_24, i, i); } - cb_pop_front(tt::CB::c_intermed0, out_subblock_num_tiles); + cb_pop_front(tt::CBIndex::c_24, out_subblock_num_tiles); } - **Execution with `matmul_tiles`**: Now we are ready to compute partial results and integrate them back into the computation stream (or for the last block of computation, culminate our data reuse to produce the final output tensor). We call the ``matmul_tiles(...)`` function to execute our matmul on the core's subblocks of tiles. @@ -211,7 +211,7 @@ b. 
**Computing with Partial Results**: for (uint32_t inner_dim = 0; inner_dim < in0_block_w; inner_dim++) { int in0_index = in0_index_subblock_offset + in0_index_h_offset + inner_dim; int in1_index = in1_index_subblock_offset + in1_index_inner_dim_offset + w; - matmul_tiles(tt::CB::c_in0, tt::CB::c_in1, in0_index, in1_index, dst_index, false /* transpose */); + matmul_tiles(tt::CBIndex::c_0, tt::CBIndex::c_1, in0_index, in1_index, dst_index, false /* transpose */); in1_index_inner_dim_offset += in1_per_core_w; } dst_index++; diff --git a/docs/source/tt-metalium/tt_metal/examples/matmul_single_core.rst b/docs/source/tt-metalium/tt_metal/examples/matmul_single_core.rst index 3c6984e009b..94515c87f19 100644 --- a/docs/source/tt-metalium/tt_metal/examples/matmul_single_core.rst +++ b/docs/source/tt-metalium/tt_metal/examples/matmul_single_core.rst @@ -147,18 +147,18 @@ double buffering.. .. code-block:: cpp - uint32_t src0_cb_index = CB::c_in0; //0 + uint32_t src0_cb_index = CBIndex::c_0; //0 uint32_t num_input_tiles = 2; tt_metal::CircularBufferConfig cb_src0_config = tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size, {{src0_cb_index, cb_data_format}}) .set_page_size(src0_cb_index, single_tile_size); auto cb_src0 = tt_metal::v0::CreateCircularBuffer(program, core, cb_src0_config); - uint32_t src1_cb_index = CB::c_in1; // 1 + uint32_t src1_cb_index = CBIndex::c_1; // 1 tt_metal::CircularBufferConfig cb_src1_config = tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size, {{src1_cb_index, cb_data_format}}) .set_page_size(src1_cb_index, single_tile_size); auto cb_src1 = tt_metal::v0::CreateCircularBuffer(program, core, cb_src1_config); - uint32_t output_cb_index = CB::c_out0; // output operands start at index 16 + uint32_t output_cb_index = tt::CBIndex::c_16; uint32_t num_output_tiles = 2; tt_metal::CircularBufferConfig cb_output_config = tt_metal::CircularBufferConfig(num_output_tiles * single_tile_size, {{output_cb_index, cb_data_format}}) 
.set_page_size(output_cb_index, single_tile_size); diff --git a/models/demos/distilbert/tests/test_perf_distilbert.py b/models/demos/distilbert/tests/test_perf_distilbert.py index 8cb8c99e59f..f3b0a6373fa 100644 --- a/models/demos/distilbert/tests/test_perf_distilbert.py +++ b/models/demos/distilbert/tests/test_perf_distilbert.py @@ -152,7 +152,7 @@ def test_distilbert_perf_device(batch_size, test, reset_seeds): margin = 0.03 num_iterations = 1 if is_grayskull(): - expected_perf = 40.8772 + expected_perf = 57.3 elif is_wormhole_b0(): expected_perf = 103.884 diff --git a/models/demos/llama3/tt/llama_attention.py b/models/demos/llama3/tt/llama_attention.py index 86c3865c57d..d630e91a3bd 100644 --- a/models/demos/llama3/tt/llama_attention.py +++ b/models/demos/llama3/tt/llama_attention.py @@ -356,7 +356,7 @@ def forward_decode( if self.is_multichip and not self.use_fused_all_gather_matmul: dense_out_reduced = ttnn.reduce_scatter( dense_out_sharded, - scatter_dim=3, + dim=3, math_op=ttnn.ReduceType.Sum, num_links=1, memory_config=self.model_config[ @@ -532,7 +532,7 @@ def forward_prefill(self, x_11SH, rot_mats, transformation_mats, user_id: int = if self.is_multichip and not self.use_fused_all_gather_matmul: dense_out_reduced = ttnn.reduce_scatter( output_11SH, - scatter_dim=3, + dim=3, math_op=ttnn.ReduceType.Sum, num_links=1, memory_config=ttnn.DRAM_MEMORY_CONFIG, diff --git a/models/demos/llama3/tt/llama_mlp.py b/models/demos/llama3/tt/llama_mlp.py index c36f0a0845b..1d18953b5d4 100644 --- a/models/demos/llama3/tt/llama_mlp.py +++ b/models/demos/llama3/tt/llama_mlp.py @@ -133,7 +133,7 @@ def forward(self, x: ttnn.Tensor, mode) -> ttnn.Tensor: if self.args.is_multichip: w2_out_reduced = ttnn.reduce_scatter( w2_out, - scatter_dim=3, + dim=3, math_op=ttnn.ReduceType.Sum, num_links=1, memory_config=w2_out.memory_config(), diff --git a/models/demos/llama3/tt/multimodal/llama_cross_attention.py b/models/demos/llama3/tt/multimodal/llama_cross_attention.py index 
71fe78f6de9..d7032fd59ba 100644 --- a/models/demos/llama3/tt/multimodal/llama_cross_attention.py +++ b/models/demos/llama3/tt/multimodal/llama_cross_attention.py @@ -271,7 +271,7 @@ def forward_decode(self, x_11SH, xattn_mask, full_text_row_masked_out_mask_1NSH, if self.is_multichip: output = ttnn.reduce_scatter( output, - scatter_dim=3, + dim=3, math_op=ttnn.ReduceType.Sum, num_links=1, memory_config=ttnn.DRAM_MEMORY_CONFIG, @@ -357,7 +357,7 @@ def forward_prefill( if self.is_multichip: # TODO use_fused_all_gather_matmul dense_out_reduced = ttnn.reduce_scatter( output, - scatter_dim=3, + dim=3, math_op=ttnn.ReduceType.Sum, num_links=1, memory_config=ttnn.DRAM_MEMORY_CONFIG, diff --git a/models/demos/qwen/tt/qwen_attention.py b/models/demos/qwen/tt/qwen_attention.py index ba598cc96c1..0e80c47b228 100644 --- a/models/demos/qwen/tt/qwen_attention.py +++ b/models/demos/qwen/tt/qwen_attention.py @@ -414,7 +414,7 @@ def forward_decode( if self.is_multichip and not self.use_fused_all_gather_matmul: dense_out_reduced = ttnn.reduce_scatter( dense_out, - scatter_dim=3, + dim=3, math_op=ttnn.ReduceType.Sum, num_links=1, memory_config=ttnn.L1_MEMORY_CONFIG, @@ -598,7 +598,7 @@ def forward_prefill(self, x_11SH, rot_mats, transformation_mats, user_id: int = if self.is_multichip and not self.use_fused_all_gather_matmul: dense_out_reduced = ttnn.reduce_scatter( output_11SH, - scatter_dim=3, + dim=3, math_op=ttnn.ReduceType.Sum, num_links=1, memory_config=ttnn.DRAM_MEMORY_CONFIG, diff --git a/models/demos/qwen/tt/qwen_mlp.py b/models/demos/qwen/tt/qwen_mlp.py index e07d4943d1c..ad500853920 100644 --- a/models/demos/qwen/tt/qwen_mlp.py +++ b/models/demos/qwen/tt/qwen_mlp.py @@ -142,7 +142,7 @@ def forward(self, x: ttnn.Tensor, mode) -> ttnn.Tensor: if self.args.is_multichip: w2_out_reduced = ttnn.reduce_scatter( w2_out, - scatter_dim=3, + dim=3, math_op=ttnn.ReduceType.Sum, num_links=1, memory_config=ttnn.DRAM_MEMORY_CONFIG if mode == "prefill" else ttnn.L1_MEMORY_CONFIG, diff 
--git a/models/demos/t3000/falcon40b/tt/falcon_mlp.py b/models/demos/t3000/falcon40b/tt/falcon_mlp.py index 1788c3ac6b6..5101b309d4d 100644 --- a/models/demos/t3000/falcon40b/tt/falcon_mlp.py +++ b/models/demos/t3000/falcon40b/tt/falcon_mlp.py @@ -124,7 +124,7 @@ def fwd_decode(self, x: List[ttnn.Tensor]) -> List[ttnn.Tensor]: hidden_states = ttnn.get_device_tensors( ttnn.reduce_scatter( ttnn.aggregate_as_tensor(hidden_states), - scatter_dim=3, + dim=3, math_op=ttnn.ReduceType.Sum, num_links=1, # only unidirectional supported for now memory_config=self.model_config["DEFAULT_MEMCFG"], @@ -200,7 +200,7 @@ def fwd_prefill(self, x: List[ttnn.Tensor]) -> List[ttnn.Tensor]: hidden_states = ttnn.get_device_tensors( ttnn.reduce_scatter( ttnn.aggregate_as_tensor(hidden_states), - scatter_dim=3, + dim=3, math_op=ttnn.ReduceType.Sum, num_links=1, # only one link supported for now memory_config=self.model_config["DEFAULT_MEMCFG"], diff --git a/models/demos/t3000/llama2_70b/tt/llama_mlp_optimized.py b/models/demos/t3000/llama2_70b/tt/llama_mlp_optimized.py index 2861253da1a..a185fc605f0 100644 --- a/models/demos/t3000/llama2_70b/tt/llama_mlp_optimized.py +++ b/models/demos/t3000/llama2_70b/tt/llama_mlp_optimized.py @@ -219,7 +219,7 @@ def prefill_forward(self, x: List[ttnn.Tensor]) -> List[ttnn.Tensor]: hidden_states_reduced = ttnn.reduce_scatter( hidden_states_mm, - scatter_dim=3, + dim=3, math_op=ttnn.ReduceType.Sum, num_links=1, memory_config=ttnn.DRAM_MEMORY_CONFIG, @@ -268,7 +268,7 @@ def decode_forward(self, x: List[ttnn.Tensor]) -> List[ttnn.Tensor]: hidden_states_reduced = ttnn.reduce_scatter( hidden_states, - scatter_dim=3, + dim=3, math_op=ttnn.ReduceType.Sum, num_links=1, memory_config=self.model_config["RESIDUAL_16_CORES_OUTPUT_MEMCFG"], diff --git a/models/demos/tg/llama3_70b/tt/llama_common.py b/models/demos/tg/llama3_70b/tt/llama_common.py index 1b16fde6a60..9824afbc44c 100644 --- a/models/demos/tg/llama3_70b/tt/llama_common.py +++ 
b/models/demos/tg/llama3_70b/tt/llama_common.py @@ -93,7 +93,7 @@ def tt_composite_sharded_all_reduce( input_mem_cfg = input_tensor.memory_config() reduce_scattered_tensor = ttnn.reduce_scatter( input_tensor, - scatter_dim=dim, + dim=dim, math_op=ttnn.ReduceType.Sum, num_links=num_links, cluster_axis=cluster_axis, diff --git a/models/demos/vgg/tests/test_perf_vgg.py b/models/demos/vgg/tests/test_perf_vgg.py index c83f1388881..b6f2af0e230 100644 --- a/models/demos/vgg/tests/test_perf_vgg.py +++ b/models/demos/vgg/tests/test_perf_vgg.py @@ -137,10 +137,10 @@ def test_perf_device_bare_metal_vgg(batch_size, model_name): margin = 0.03 if model_name == "ttnn_vgg11": - expected_perf = 132.2436 if is_grayskull() else 283.289 + expected_perf = 168 if is_grayskull() else 283.289 command = f"pytest tests/ttnn/integration_tests/vgg/test_ttnn_vgg11.py" else: - expected_perf = 116.1459 if is_grayskull() else 201.3867 + expected_perf = 144 if is_grayskull() else 201.3867 command = f"pytest tests/ttnn/integration_tests/vgg/test_ttnn_vgg16.py" cols = ["DEVICE FW", "DEVICE KERNEL", "DEVICE BRISC KERNEL"] diff --git a/models/experimental/functional_unet/tests/test_unet_perf.py b/models/experimental/functional_unet/tests/test_unet_perf.py index 930658606ca..e28e591728e 100644 --- a/models/experimental/functional_unet/tests/test_unet_perf.py +++ b/models/experimental/functional_unet/tests/test_unet_perf.py @@ -34,7 +34,7 @@ @pytest.mark.models_device_performance_bare_metal @pytest.mark.parametrize( "batch, groups, expected_device_perf_fps", - ((2, 1, 779.0),), + ((2, 1, 766.0),), ) def test_unet_perf_device(batch: int, groups: int, expected_device_perf_fps: float): command = f"pytest models/experimental/functional_unet/tests/test_unet_model.py::test_unet_model[device_params0-{groups}-{batch}]" diff --git a/tech_reports/prog_examples/add_2_integers_in_compute/add_2_integers_in_compute.md b/tech_reports/prog_examples/add_2_integers_in_compute/add_2_integers_in_compute.md index 
6625118f3d5..d92e7d5cd74 100644 --- a/tech_reports/prog_examples/add_2_integers_in_compute/add_2_integers_in_compute.md +++ b/tech_reports/prog_examples/add_2_integers_in_compute/add_2_integers_in_compute.md @@ -61,16 +61,16 @@ uint32_t dst_dram_noc_y = dst_dram_noc_coord.y; For this example, we will also specify the NoC coordinates to pass into the kernel functions as runtime arguments. We will use this to ensure that the kernels will access the data at the correct NoC addresses. ``` cpp -constexpr uint32_t src0_cb_index = CB::c_in0; +constexpr uint32_t src0_cb_index = CBIndex::c_0; constexpr uint32_t num_input_tiles = 1; CircularBufferConfig cb_src0_config = CircularBufferConfig(num_input_tiles * single_tile_size, {{src0_cb_index, tt::DataFormat::Float16_b}}).set_page_size(src0_cb_index, single_tile_size); CBHandle cb_src0 = tt_metal::CreateCircularBuffer(program, core, cb_src0_config); -constexpr uint32_t src1_cb_index = CB::c_in1; +constexpr uint32_t src1_cb_index = CBIndex::c_1; CircularBufferConfig cb_src1_config = CircularBufferConfig(num_input_tiles * single_tile_size, {{src1_cb_index, tt::DataFormat::Float16_b}}).set_page_size(src1_cb_index, single_tile_size); CBHandle cb_src1 = tt_metal::CreateCircularBuffer(program, core, cb_src1_config); -constexpr uint32_t output_cb_index = CB::c_out0; +constexpr uint32_t output_cb_index = CBIndex::c_16; constexpr uint32_t num_output_tiles = 1; CircularBufferConfig cb_output_config = CircularBufferConfig(num_output_tiles * single_tile_size, {{output_cb_index, tt::DataFormat::Float16_b}}).set_page_size(output_cb_index, single_tile_size); CBHandle cb_output = tt_metal::CreateCircularBuffer(program, core, cb_output_config); @@ -194,7 +194,7 @@ In the compute kernel, a single tile is read from each of the circular buffers c ``` cpp uint64_t dst_noc_addr = get_noc_addr(dst_dram_noc_x, dst_dram_noc_y, dst_addr); -constexpr uint32_t cb_id_out0 = tt::CB::c_out0; +constexpr uint32_t cb_id_out0 = tt::CBIndex::c_16; uint32_t 
ublock_size_bytes = get_tile_size(cb_id_out0); uint32_t l1_read_addr = get_read_ptr(cb_id_out0); diff --git a/tech_reports/prog_examples/add_2_integers_in_riscv/add_2_integers_in_riscv.md b/tech_reports/prog_examples/add_2_integers_in_riscv/add_2_integers_in_riscv.md index 583f03902af..52e5e556b1f 100644 --- a/tech_reports/prog_examples/add_2_integers_in_riscv/add_2_integers_in_riscv.md +++ b/tech_reports/prog_examples/add_2_integers_in_riscv/add_2_integers_in_riscv.md @@ -62,11 +62,11 @@ On the host side, we set initialize the source data. In this case, they are repr ## Set up circular buffers for input ``` cpp -constexpr uint32_t src0_cb_index = CB::c_in0; +constexpr uint32_t src0_cb_index = CBIndex::c_0; CircularBufferConfig cb_src0_config = CircularBufferConfig(single_tile_size, {{src0_cb_index, tt::DataFormat::Float16_b}}).set_page_size(src0_cb_index, single_tile_size); CBHandle cb_src0 = tt_metal::CreateCircularBuffer(program, core, cb_src0_config); -constexpr uint32_t src1_cb_index = CB::c_in1; +constexpr uint32_t src1_cb_index = CBIndex::c_1; CircularBufferConfig cb_src1_config = CircularBufferConfig(single_tile_size, {{src1_cb_index, tt::DataFormat::Float16_b}}).set_page_size(src1_cb_index, single_tile_size); CBHandle cb_src1 = tt_metal::CreateCircularBuffer(program, core, cb_src1_config); ``` @@ -104,8 +104,8 @@ uint64_t src0_dram_noc_addr = get_noc_addr(src0_dram_noc_x, src0_dram_noc_y, src uint64_t src1_dram_noc_addr = get_noc_addr(src1_dram_noc_x, src1_dram_noc_y, src1_dram); uint64_t dst_dram_noc_addr = get_noc_addr(dst_dram_noc_x, dst_dram_noc_y, dst_dram); -constexpr uint32_t cb_id_in0 = tt::CB::c_in0; // index=0 -constexpr uint32_t cb_id_in1 = tt::CB::c_in1; // index=1 +constexpr uint32_t cb_id_in0 = tt::CBIndex::c_0; // index=0 +constexpr uint32_t cb_id_in1 = tt::CBIndex::c_1; // index=1 // single-tile ublocks uint32_t ublock_size_bytes_0 = get_tile_size(cb_id_in0); diff --git a/tech_reports/prog_examples/eltwise_binary/eltwise_binary.md 
b/tech_reports/prog_examples/eltwise_binary/eltwise_binary.md index b15bbb139ba..138239f6518 100644 --- a/tech_reports/prog_examples/eltwise_binary/eltwise_binary.md +++ b/tech_reports/prog_examples/eltwise_binary/eltwise_binary.md @@ -20,19 +20,19 @@ In terms of DRAM buffers, We just need a new buffer for a 2nd source, because we We already have set the circular buffers needed for compute data communication. ``` cpp -constexpr uint32_t src0_cb_index = CB::c_in0; +constexpr uint32_t src0_cb_index = CBIndex::c_0; constexpr uint32_t src0_cb_addr = 200 * 1024; constexpr uint32_t num_input_tiles = 2; constexpr uint32_t input_cb_size = num_input_tiles * single_tile_size; CircularBufferConfig cb_src0_config = CircularBufferConfig(input_cb_size, {{src0_cb_index, tt::DataFormat::Float16_b}}, src0_cb_addr).set_page_size(src0_cb_index, single_tile_size); CBHandle cb_src0 = CreateCircularBuffer(program, core, cb_src0_config); -constexpr uint32_t src1_cb_index = CB::c_in1; +constexpr uint32_t src1_cb_index = CBIndex::c_1; constexpr uint32_t src1_cb_addr = 300 * 1024; CircularBufferConfig cb_src1_config = CircularBufferConfig(input_cb_size, {{src1_cb_index, tt::DataFormat::Float16_b}}, src1_cb_addr).set_page_size(src1_cb_index, single_tile_size); CBHandle cb_src1 = CreateCircularBuffer(program, core, cb_src1_config); -constexpr uint32_t output_cb_index = CB::c_out0; +constexpr uint32_t output_cb_index = CBIndex::c_16; constexpr uint32_t output_cb_addr = 400 * 1024; constexpr uint32_t num_output_tiles = 2; constexpr uint32_t input_cb_size = num_input_tiles * single_tile_size; diff --git a/tech_reports/prog_examples/eltwise_sfpu/eltwise_sfpu.md b/tech_reports/prog_examples/eltwise_sfpu/eltwise_sfpu.md index d4c2622e391..c7964729a86 100644 --- a/tech_reports/prog_examples/eltwise_sfpu/eltwise_sfpu.md +++ b/tech_reports/prog_examples/eltwise_sfpu/eltwise_sfpu.md @@ -17,12 +17,12 @@ To build and execute, you may use the following commands. 
Note that we include t The number of buffers we're using in DRAM will stay the same. However, we need to declare some circular buffers to enable data transfer between the reader, compute, and writer engines. ``` cpp -constexpr uint32_t src0_cb_index = CB::c_in0; +constexpr uint32_t src0_cb_index = CBIndex::c_0; constexpr uint32_t num_input_tiles = 2; CircularBufferConfig cb_src0_config = CircularBufferConfig(num_input_tiles * single_tile_size, {{src0_cb_index, tt::DataFormat::Float16_b}}).set_page_size(src0_cb_index, single_tile_size); CBHandle cb_src0 = tt_metal::CreateCircularBuffer(program, core, cb_src0_config); -constexpr uint32_t output_cb_index = CB::c_out0; +constexpr uint32_t output_cb_index = CBIndex::c_16; constexpr uint32_t num_output_tiles = 2; CircularBufferConfig cb_output_config = CircularBufferConfig(num_output_tiles * single_tile_size, {{output_cb_index, tt::DataFormat::Float16_b}}).set_page_size(output_cb_index, single_tile_size); CBHandle cb_output = tt_metal::CreateCircularBuffer(program, core, cb_output_config); diff --git a/tech_reports/prog_examples/matmul_multi_core_optimized/data_reuse.md b/tech_reports/prog_examples/matmul_multi_core_optimized/data_reuse.md index 96526b78a0e..8295e204ae5 100644 --- a/tech_reports/prog_examples/matmul_multi_core_optimized/data_reuse.md +++ b/tech_reports/prog_examples/matmul_multi_core_optimized/data_reuse.md @@ -2,8 +2,8 @@ ## Fine-Grained Block Size Control -Advanced matrix dimension controls are found in the Programming Example's matmul_common directory, namely Block Matrix Multiply Ops (bmm_op.hpp). -Including this header allows us advanced dynamic means of defining and retrieving matrix parameters. +Advanced matrix dimension controls are found in the Programming Example's matmul_common directory, namely Block Matrix Multiply Ops (bmm_op.hpp). +Including this header allows us advanced dynamic means of defining and retrieving matrix parameters. 
Our matmul kernels that work out-of-the-box perform on row-major and tile-major layouts, so you have the power to define your own outer-dimensional tile sizes, desired core grid dimensions, as well as your own input block width, all depending on your problem at hand. In our reuse example, we can employ the `get_large_matmul_params(...)` function and pass our inputs as described above. By doing so, we let METALIUM\'s bmm op utility functions do the heavy lifting for us mathematically, and calculate our matmul\'s exact work-per-core size and work output size seamlessly. (You can consult the header for the prime factorization method used, plus many other details). @@ -16,7 +16,7 @@ uint32_t out_subblock_h = std::get<2(matmul_params); uint32_t out_subblock_w = std::get<3(matmul_params); ``` -Take note of the example\'s use of \"subblocks\" above. Recall that until now, we have optimized matmul by dividing matrices into blocks and subdivided those into tiles, which are laid out neatly on our compute cores. +Take note of the example\'s use of \"subblocks\" above. Recall that until now, we have optimized matmul by dividing matrices into blocks and subdivided those into tiles, which are laid out neatly on our compute cores. A key optimization here in [matmul_multicore_reuse] is the introduction of an intermediate subdivision of blocks, called subblocks. Below are some optimal subblock layouts already provided for you in the header, which run efficiently on our hardware. ``` cpp @@ -34,11 +34,11 @@ constexpr std::arraybuffer_type() == tt_metal::BufferType::DRAM ? 
1 : 0; -uint32_t input_cb_index = CB::c_in0; +uint32_t input_cb_index = CBIndex::c_0; CircularBufferConfig input_cb_config = CircularBufferConfig(shard_size * input_unit_size, {{input_cb_index, cb_data_format}}) .set_page_size(input_cb_index, input_unit_size); auto cb_input = tt_metal::CreateCircularBuffer(program, cores, input_cb_config); ``` -Across each core, the `CircularBuffer` indicated by the index corresponding to `CB::c_in0` will be used to store the data. Through the `CircularBufferConfig` object, we specify the total size of the buffer, which is dependent on the shard and data size, and we also specify the page size. +Across each core, the `CircularBuffer` indicated by the index corresponding to `CBIndex::c_0` will be used to store the data. Through the `CircularBufferConfig` object, we specify the total size of the buffer, which is dependent on the shard and data size, and we also specify the page size. The corresponding `CircularBuffer` objects are then allocated with this configuration across each of the designated cores. # Create data movement kernels for sharding diff --git a/tests/scripts/run_cpp_unit_tests.sh b/tests/scripts/run_cpp_unit_tests.sh index 7da1c173021..ff24af920f8 100755 --- a/tests/scripts/run_cpp_unit_tests.sh +++ b/tests/scripts/run_cpp_unit_tests.sh @@ -9,19 +9,25 @@ fi kernel_path="/tmp/kernels" mkdir -p $kernel_path -TT_METAL_KERNEL_PATH=$kernel_path ./build/test/tt_metal/test_kernel_path_env_var +TT_METAL_KERNEL_PATH=$kernel_path ./build/test/tt_metal/unit_tests_api --gtest_filter=CompileProgramWithKernelPathEnvVarFixture.* rm -rf $kernel_path +./build/test/tt_metal/unit_tests_api +./build/test/tt_metal/unit_tests_debug_tools +./build/test/tt_metal/unit_tests_device +./build/test/tt_metal/unit_tests_dispatch +./build/test/tt_metal/unit_tests_eth +./build/test/tt_metal/unit_tests_llk +./build/test/tt_metal/unit_tests_stl + if [[ ! 
-z "$TT_METAL_SLOW_DISPATCH_MODE" ]]; then - ./build/test/tt_metal/unit_tests env python tests/scripts/run_tt_metal.py --dispatch-mode slow env python tests/scripts/run_tt_eager.py --dispatch-mode slow else - ./build/test/tt_metal/unit_tests_fast_dispatch - TT_METAL_GTEST_NUM_HW_CQS=2 ./build/test/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue --gtest_filter=MultiCommandQueueSingleDeviceFixture.* + TT_METAL_GTEST_NUM_HW_CQS=2 ./build/test/tt_metal/unit_tests_dispatch --gtest_filter=MultiCommandQueue*Fixture.* # Enable this on BH after #14613 if [[ "$ARCH_NAME" == "wormhole_b0" ]]; then - TT_METAL_GTEST_ETH_DISPATCH=1 ./build/test/tt_metal/unit_tests_fast_dispatch + TT_METAL_GTEST_ETH_DISPATCH=1 ./build/test/tt_metal/unit_tests_dispatch fi env python tests/scripts/run_tt_eager.py --dispatch-mode fast env python tests/scripts/run_tt_metal.py --dispatch-mode fast diff --git a/tests/scripts/run_testpoint_perprocess.py b/tests/scripts/run_testpoint_perprocess.py index 8a2feb156bb..3d2c4a88e9d 100755 --- a/tests/scripts/run_testpoint_perprocess.py +++ b/tests/scripts/run_testpoint_perprocess.py @@ -13,7 +13,7 @@ DEBUG = False TT_METAL_HOME = os.environ["TT_METAL_HOME"] -DEFAULT_GTEST = f"{TT_METAL_HOME}/build/test/tt_metal/unit_tests" +DEFAULT_GTEST = f"{TT_METAL_HOME}/build/test/tt_metal/unit_tests_api" def extract_list_of_test_points(args: argparse.Namespace): diff --git a/tests/scripts/run_tests.sh b/tests/scripts/run_tests.sh index 517503b2646..6662f2f7b2c 100755 --- a/tests/scripts/run_tests.sh +++ b/tests/scripts/run_tests.sh @@ -76,7 +76,7 @@ run_frequent_api_pipeline_tests() { local dispatch_mode=$3 if [[ $dispatch_mode == "slow" ]]; then - TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests_frequent + TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests_dispatch --gtest_filter=DispatchStress.TensixRunManyTimes echo "Running Python API unit tests in SD for frequent..." 
./tests/scripts/run_python_api_unit_tests.sh fi diff --git a/tests/scripts/run_tools_tests.sh b/tests/scripts/run_tools_tests.sh index d86be0f8c0e..7283788336c 100755 --- a/tests/scripts/run_tools_tests.sh +++ b/tests/scripts/run_tools_tests.sh @@ -12,7 +12,7 @@ if [[ -z "$TT_METAL_SLOW_DISPATCH_MODE" ]] ; then echo "Running watcher dump tool tests..." # Run a test that populates basic fields but not watcher fields - ./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter=*PrintHanging + ./build/test/tt_metal/unit_tests_debug_tools --gtest_filter=*PrintHanging # Run dump tool w/ minimum data - no error expected. ./build/tools/watcher_dump -d=0 -w -c @@ -22,7 +22,7 @@ if [[ -z "$TT_METAL_SLOW_DISPATCH_MODE" ]] ; then echo "Watcher dump minimal test - Pass" # Now run with all watcher features, expect it to throw. - ./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter=*WatcherAssertBrisc + ./build/test/tt_metal/unit_tests_debug_tools --gtest_filter=*WatcherAssertBrisc ./build/tools/watcher_dump -d=0 -w &> tmp.log || { echo "Above failure is expected."; } # Verify the error we expect showed up in the program output. @@ -30,7 +30,7 @@ if [[ -z "$TT_METAL_SLOW_DISPATCH_MODE" ]] ; then echo "Watcher dump all data test - Pass" # Check that stack dumping is working - ./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter=*TestWatcherRingBufferBrisc + ./build/test/tt_metal/unit_tests_debug_tools --gtest_filter=*TestWatcherRingBufferBrisc ./build/tools/watcher_dump -d=0 -w grep "brisc highest stack usage:" generated/watcher/watcher.log > /dev/null || { echo "Error: couldn't find stack usage in watcher log after dump." 
; exit 1; } echo "Watcher stack usage test - Pass" diff --git a/tests/scripts/t3000/run_t3000_unit_tests.sh b/tests/scripts/t3000/run_t3000_unit_tests.sh index 6e89ceff603..6b33b853a07 100755 --- a/tests/scripts/t3000/run_t3000_unit_tests.sh +++ b/tests/scripts/t3000/run_t3000_unit_tests.sh @@ -8,13 +8,13 @@ run_t3000_ttmetal_tests() { echo "LOG_METAL: Running run_t3000_ttmetal_tests" - TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests --gtest_filter="DeviceFixture.EthKernelsDirectSendAllConnectedChips" ; fail+=$? - TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests --gtest_filter="DeviceFixture.EthKernelsSendInterleavedBufferAllConnectedChips" ; fail+=$? - TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests --gtest_filter="DeviceFixture.EthKernelsDirectRingGatherAllChips" ; fail+=$? - TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests --gtest_filter="DeviceFixture.EthKernelsInterleavedRingGatherAllChips" ; fail+=$? - TT_METAL_ENABLE_REMOTE_CHIP=1 ./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter="CommandQueueSingleCardFixture.*" ; fail+=$? - ./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter="CommandQueueMultiDeviceFixture.*" ; fail+=$? - ./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter="DPrintFixture.*:WatcherFixture.*" ; fail+=$? + TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests_eth --gtest_filter="DeviceFixture.ActiveEthKernelsDirectSendAllConnectedChips" ; fail+=$? + TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests_eth --gtest_filter="DeviceFixture.ActiveEthKernelsSendInterleavedBufferAllConnectedChips" ; fail+=$? + TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests_eth --gtest_filter="DeviceFixture.ActiveEthKernelsDirectRingGatherAllChips" ; fail+=$? + TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests_eth --gtest_filter="DeviceFixture.ActiveEthKernelsInterleavedRingGatherAllChips" ; fail+=$? 
+ TT_METAL_ENABLE_REMOTE_CHIP=1 ./build/test/tt_metal/unit_tests_dispatch --gtest_filter="CommandQueueSingleCard*Fixture.*" ; fail+=$? + ./build/test/tt_metal/unit_tests_dispatch --gtest_filter="CommandQueueMultiDevice*Fixture.*" ; fail+=$? + ./build/test/tt_metal/unit_tests_debug_tools --gtest_filter="DPrintFixture.*:WatcherFixture.*" ; fail+=$? # Record the end time end_time=$(date +%s) diff --git a/tests/scripts/tg/run_tg_unit_tests.sh b/tests/scripts/tg/run_tg_unit_tests.sh index 52ad5748558..669f6383b3a 100755 --- a/tests/scripts/tg/run_tg_unit_tests.sh +++ b/tests/scripts/tg/run_tg_unit_tests.sh @@ -5,11 +5,11 @@ run_tg_tests() { echo "LOG_METAL: running run_tg_unit_tests" - TT_METAL_ENABLE_REMOTE_CHIP=1 ./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter="CommandQueueSingleCardFixture.*" + TT_METAL_ENABLE_REMOTE_CHIP=1 ./build/test/tt_metal/unit_tests_dispatch --gtest_filter="CommandQueueSingleCard*Fixture.*" ./build/test/ttnn/galaxy_unit_tests_ttnn - TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests_galaxy --gtest_filter="GalaxyFixture.*:TGFixture.*" - ./build/test/tt_metal/unit_tests_galaxy --gtest_filter="GalaxyFixture.*:TGFixture.*" - TT_METAL_GTEST_NUM_HW_CQS=2 ./build/test/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue --gtest_filter="MultiCommandQueueMultiDeviceFixture.*" + TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests_device --gtest_filter="GalaxyFixture.*:TGFixture.*" + ./build/test/tt_metal/unit_tests_device --gtest_filter="GalaxyFixture.*:TGFixture.*" + TT_METAL_GTEST_NUM_HW_CQS=2 ./build/test/tt_metal/unit_tests_dispatch --gtest_filter="MultiCommandQueueMultiDevice*Fixture.*" } diff --git a/tests/scripts/tgg/run_tgg_unit_tests.sh b/tests/scripts/tgg/run_tgg_unit_tests.sh index 08f8f08c421..0eb73d5e823 100755 --- a/tests/scripts/tgg/run_tgg_unit_tests.sh +++ b/tests/scripts/tgg/run_tgg_unit_tests.sh @@ -5,10 +5,10 @@ run_tgg_tests() { echo "LOG_METAL: running run_tgg_unit_tests" - 
TT_METAL_ENABLE_REMOTE_CHIP=1 ./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter="CommandQueueSingleCardFixture.*" + TT_METAL_ENABLE_REMOTE_CHIP=1 ./build/test/tt_metal/unit_tests_dispatch --gtest_filter="CommandQueueSingleCard*Fixture.*" ./build/test/ttnn/galaxy_unit_tests_ttnn - TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests_galaxy --gtest_filter="GalaxyFixture.*:TGGFixture.*" - ./build/test/tt_metal/unit_tests_galaxy --gtest_filter="GalaxyFixture.*:TGGFixture.*" + TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests_device --gtest_filter="GalaxyFixture.*:TGGFixture.*" + ./build/test/tt_metal/unit_tests_device --gtest_filter="GalaxyFixture.*:TGGFixture.*" pytest -s tests/ttnn/distributed/test_mesh_device_TGG.py } diff --git a/tests/sweep_framework/sweeps/data_movement/expand/expand_pytorch2.py b/tests/sweep_framework/sweeps/data_movement/expand/expand_pytorch2.py index 22837badf2f..9eef68842af 100644 --- a/tests/sweep_framework/sweeps/data_movement/expand/expand_pytorch2.py +++ b/tests/sweep_framework/sweeps/data_movement/expand/expand_pytorch2.py @@ -309,4 +309,18 @@ def run( *, device, ): - raise Exception("Expand is not supported, TODO: implement via recursive concat with itself") + torch_tensor = torch_random(expand_specs["shape"], -10, 10, dtype=torch.bfloat16) + expanded_tensor = torch_tensor.expand(expand_specs["size"]) + + ttnn_tensor = ttnn.from_torch(torch_tensor, device=device, layout=layout, dtype=dtype) + + start_time = start_measuring_time() + expanded_ttnn_tensor = ttnn.expand(ttnn_tensor, expand_specs["size"]) + e2e_perf = stop_measuring_time(start_time) + + ttnn_output_tensor = ttnn.to_torch(expanded_ttnn_tensor) + + result = check_with_pcc(expanded_tensor, ttnn_output_tensor, 0.999) + + return [result, e2e_perf] + # raise Exception("Expand is not supported, TODO: implement via recursive concat with itself") diff --git a/tests/sweep_framework/sweeps/eltwise/unary/exp/exp.py 
b/tests/sweep_framework/sweeps/eltwise/unary/exp/exp.py index ac6dbc0b8c2..93e560e6124 100644 --- a/tests/sweep_framework/sweeps/eltwise/unary/exp/exp.py +++ b/tests/sweep_framework/sweeps/eltwise/unary/exp/exp.py @@ -27,20 +27,13 @@ "nightly": { "input_shape": gen_shapes([1, 1, 32, 32], [6, 12, 256, 256], [1, 1, 32, 32], 16) + gen_shapes([1, 32, 32], [12, 256, 256], [1, 32, 32], 16) - + gen_shapes([32, 32], [256, 256], [32, 32], 32), + + gen_shapes([32, 32], [256, 256], [32, 32], 32) + + gen_shapes([1, 1, 32, 32], [6, 12, 256, 256], [1, 1, 32, 32], 32), "input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b], "input_a_layout": [ttnn.TILE_LAYOUT], "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG], "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG], - "use_safe_nums": [True], - }, - "xfail": { - "input_shape": gen_shapes([1, 1, 32, 32], [6, 12, 256, 256], [1, 1, 32, 32], 32), - "input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b], - "input_a_layout": [ttnn.TILE_LAYOUT], - "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG], - "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG], - "use_safe_nums": [False], + "use_safe_nums": [True, False], }, } diff --git a/tests/sweep_framework/sweeps/eltwise/unary/hardsigmoid/hardsigmoid.py b/tests/sweep_framework/sweeps/eltwise/unary/hardsigmoid/hardsigmoid.py index f69b2636bf2..ca8e8d915b7 100644 --- a/tests/sweep_framework/sweeps/eltwise/unary/hardsigmoid/hardsigmoid.py +++ b/tests/sweep_framework/sweeps/eltwise/unary/hardsigmoid/hardsigmoid.py @@ -35,8 +35,8 @@ # If invalidated, the vector will still be stored but will be skipped. # Returns False, None if the vector is valid, and True, str with a reason for invalidation if it is invalid. 
def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]: - if test_vector["input_a_layout"] == ttnn.ROW_MAJOR_LAYOUT or test_vector["input_a_dtype"] == ttnn.bfloat8_b: - return True, "Row Major layout and bfloat8_b are not supported" + if test_vector["input_a_layout"] == ttnn.ROW_MAJOR_LAYOUT: + return True, "Row Major layout is not supported" return False, None diff --git a/tests/sweep_framework/sweeps/eltwise/unary/hardswish/hardswish.py b/tests/sweep_framework/sweeps/eltwise/unary/hardswish/hardswish.py index ee4b0218499..14e6f86afbc 100644 --- a/tests/sweep_framework/sweeps/eltwise/unary/hardswish/hardswish.py +++ b/tests/sweep_framework/sweeps/eltwise/unary/hardswish/hardswish.py @@ -35,8 +35,8 @@ # If invalidated, the vector will still be stored but will be skipped. # Returns False, None if the vector is valid, and True, str with a reason for invalidation if it is invalid. def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]: - if test_vector["input_a_layout"] == ttnn.ROW_MAJOR_LAYOUT or test_vector["input_a_dtype"] == ttnn.bfloat8_b: - return True, "Row Major layout and bfloat8_b are not supported" + if test_vector["input_a_layout"] == ttnn.ROW_MAJOR_LAYOUT: + return True, "Row Major layout is not supported" return False, None diff --git a/tests/tt_eager/ops/kernel/eltwise_sfpu.cpp b/tests/tt_eager/ops/kernel/eltwise_sfpu.cpp index d65c4bdf818..9db91978156 100644 --- a/tests/tt_eager/ops/kernel/eltwise_sfpu.cpp +++ b/tests/tt_eager/ops/kernel/eltwise_sfpu.cpp @@ -16,29 +16,29 @@ void MAIN { - init_sfpu(tt::CB::c_in0); + init_sfpu(tt::CBIndex::c_0, tt::CBIndex::c_16); uint32_t block_index = 0; - cb_reserve_back(tt::CB::c_out0, per_core_block_dim); + cb_reserve_back(tt::CBIndex::c_16, per_core_block_dim); uint32_t tile_index = 0; acquire_dst(); // Pop tile after tile, copy to DST and pack - cb_wait_front(tt::CB::c_in0, 1); + cb_wait_front(tt::CBIndex::c_0, 1); - copy_tile(tt::CB::c_in0, 0, 0); + copy_tile(tt::CBIndex::c_0, 0, 0); 
for(uint32_t i=0; i < tile_factor; i++) { #ifdef SFPU_OP_CHAIN_0 SFPU_OP_CHAIN_0 #endif } - pack_tile(0, tt::CB::c_out0); + pack_tile(0, tt::CBIndex::c_16); - cb_pop_front(tt::CB::c_in0, 1); + cb_pop_front(tt::CBIndex::c_0, 1); release_dst(); - cb_push_back(tt::CB::c_out0, per_core_block_dim); + cb_push_back(tt::CBIndex::c_16, per_core_block_dim); } } diff --git a/tests/tt_eager/ops/test_average_pool.cpp b/tests/tt_eager/ops/test_average_pool.cpp index 969f4941d50..1f0f8f11b30 100644 --- a/tests/tt_eager/ops/test_average_pool.cpp +++ b/tests/tt_eager/ops/test_average_pool.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "ttnn/operations/pool/avgpool/avg_pool.hpp" +#include "ttnn/operations/pool/global_avg_pool/global_avg_pool.hpp" #include "ttnn/operations/experimental/auto_format/auto_format.hpp" #include "ttnn/operations/numpy/functions.hpp" @@ -23,7 +23,7 @@ Tensor run_avg_pool_2d_resnet(tt::tt_metal::LegacyShape& tensor_shape, Device* d if (!AutoFormat::check_input_tensor_format(input_tensor, padded_input_shape)) { padded_input_tensor = AutoFormat::format_input_tensor(input_tensor, device, padded_input_shape, 0, Layout::TILE); // pad with 0s } - auto device_output = avg_pool2d(padded_input_tensor); + auto device_output = global_avg_pool2d(padded_input_tensor); return device_output.cpu(); }; diff --git a/tests/tt_eager/ops/test_sfpu.cpp b/tests/tt_eager/ops/test_sfpu.cpp index 2773327468e..ee0408f8d98 100644 --- a/tests/tt_eager/ops/test_sfpu.cpp +++ b/tests/tt_eager/ops/test_sfpu.cpp @@ -104,7 +104,7 @@ bool run_sfpu_test(string sfpu_name,int tile_factor=1,bool use_DRAM=true) { // input CB is larger than the output CB, to test the backpressure from the output CB all the way into the input CB // CB_out size = 1 forces the serialization of packer and writer kernel, generating backpressure to math kernel, input CB and reader - uint32_t src0_cb_index = 0; + uint32_t src0_cb_index = tt::CBIndex::c_0; uint32_t num_input_tiles = 8; 
tt_metal::CircularBufferConfig src_cb_config = tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size, {{src0_cb_index, tt::DataFormat::Float16_b}}) .set_page_size(src0_cb_index, single_tile_size); @@ -112,7 +112,7 @@ bool run_sfpu_test(string sfpu_name,int tile_factor=1,bool use_DRAM=true) { // no need for c_in2 buffer since scaler=0 in the reader kernel - uint32_t ouput_cb_index = 16; // output operands start at index 16 + uint32_t ouput_cb_index = tt::CBIndex::c_16; uint32_t num_output_tiles = 1; tt_metal::CircularBufferConfig output_cb_config = tt_metal::CircularBufferConfig(num_output_tiles * single_tile_size, {{ouput_cb_index, tt::DataFormat::Float16_b}}) .set_page_size(ouput_cb_index, single_tile_size); diff --git a/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_transpose.py b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_transpose.py index 95818476b8c..4f679c70880 100644 --- a/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_transpose.py +++ b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_transpose.py @@ -14,6 +14,7 @@ shape_wh = [ [[1, 1, 32, 32]], # Single core [[3, 1, 320, 384]], # Multi core + [[1, 1024, 5, 1280]], # Non page-aligned ] diff --git a/tests/tt_eager/python_api_testing/trace_testing/misc/test_average_pool.py b/tests/tt_eager/python_api_testing/trace_testing/misc/test_average_pool.py index 976d988cbaa..7784996ce58 100644 --- a/tests/tt_eager/python_api_testing/trace_testing/misc/test_average_pool.py +++ b/tests/tt_eager/python_api_testing/trace_testing/misc/test_average_pool.py @@ -64,7 +64,7 @@ def test_run_average_pool(act_shape, dtype, device, use_program_cache, enable_as ttact_res = ttact.to(device) def run_ops(ttact_res): - return ttnn.avg_pool2d(ttact_res) + return ttnn.global_avg_pool2d(ttact_res) # Compile run_ops(ttact_res) diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_average_pool.py 
b/tests/tt_eager/python_api_testing/unit_testing/misc/test_average_pool.py index 924dc16e04c..44a2de6d048 100644 --- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_average_pool.py +++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_average_pool.py @@ -44,7 +44,7 @@ def test_run_average_pool(act_shape, dtype, device): ttact = ttact.pad_to_tile(0.0) ttact = ttact.to(device) - out = ttnn.avg_pool2d(ttact) + out = ttnn.global_avg_pool2d(ttact) out = out.cpu().to(ttnn.ROW_MAJOR_LAYOUT) out_shape = [batch_size, 1, 1, channels] diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_tensor.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_tensor.py deleted file mode 100644 index f7b7ab08fa9..00000000000 --- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_tensor.py +++ /dev/null @@ -1,181 +0,0 @@ -# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. - -# SPDX-License-Identifier: Apache-2.0 - -import pytest - -import os -import pathlib - -import torch -import numpy as np - -import ttnn - -tt_dtype_to_torch_dtype = { - ttnn.uint16: torch.int16, - ttnn.uint32: torch.int32, - ttnn.float32: torch.float, - ttnn.bfloat16: torch.bfloat16, - ttnn.bfloat8_b: torch.float, - ttnn.bfloat4_b: torch.float, -} - - -@pytest.mark.parametrize("shape", [(2, 3, 64, 96)]) -@pytest.mark.parametrize( - "tt_dtype", - [ - ttnn.uint32, - ttnn.uint16, - ttnn.float32, - ttnn.bfloat16, - ttnn.bfloat8_b, - ttnn.bfloat4_b, - ], -) -def test_tensor_conversion_between_torch_and_tt(shape, tt_dtype, device): - torch.manual_seed(0) - - dtype = tt_dtype_to_torch_dtype[tt_dtype] - - if dtype in {torch.int16, torch.int32}: - torch_tensor = torch.randint(torch.iinfo(dtype).min, torch.iinfo(dtype).max, shape, dtype=dtype) - else: - torch_tensor = torch.rand(shape, dtype=dtype) - - tt_tensor = ttnn.Tensor(torch_tensor, tt_dtype) - if tt_dtype in { - ttnn.bfloat16, - ttnn.float32, - ttnn.uint32, - ttnn.uint16, - }: - assert tt_tensor.storage_type() == 
ttnn.StorageType.BORROWED - else: - assert tt_tensor.storage_type() == ttnn.StorageType.OWNED - - if tt_dtype in {ttnn.bfloat8_b, ttnn.bfloat4_b}: - tt_tensor = tt_tensor.to(ttnn.TILE_LAYOUT) - - if tt_dtype in { - ttnn.float32, - ttnn.bfloat16, - ttnn.bfloat8_b, - ttnn.bfloat4_b, - ttnn.uint32, - ttnn.uint16, - }: - tt_tensor = tt_tensor.to(device) - tt_tensor = tt_tensor.cpu() - - if tt_dtype in { - ttnn.bfloat8_b, - ttnn.bfloat4_b, - }: - tt_tensor = tt_tensor.to(ttnn.ROW_MAJOR_LAYOUT) - - torch_tensor_after_round_trip = tt_tensor.to_torch() - - assert torch_tensor.dtype == torch_tensor_after_round_trip.dtype - assert torch_tensor.shape == torch_tensor_after_round_trip.shape - - allclose_kwargs = {} - if tt_dtype == ttnn.bfloat8_b: - allclose_kwargs = dict(atol=1e-2) - elif tt_dtype == ttnn.bfloat4_b: - allclose_kwargs = dict(atol=0.2) - - passing = torch.allclose(torch_tensor, torch_tensor_after_round_trip, **allclose_kwargs) - assert passing - - -tt_dtype_to_np_dtype = { - ttnn.uint16: np.int16, - ttnn.uint32: np.int32, - ttnn.float32: np.float32, - ttnn.bfloat16: np.float32, - ttnn.bfloat8_b: np.float32, -} - - -@pytest.mark.parametrize("shape", [(2, 3, 64, 96)]) -@pytest.mark.parametrize( - "tt_dtype", - [ - ttnn.uint32, - ttnn.uint16, - ttnn.float32, - # ttnn.bfloat16, - ], -) -def test_tensor_conversion_between_torch_and_np(shape, tt_dtype, device): - dtype = tt_dtype_to_np_dtype[tt_dtype] - - if dtype in {np.int16, np.int32}: - np_tensor = np.random.randint(np.iinfo(dtype).min, np.iinfo(dtype).max, shape, dtype=dtype) - else: - np_tensor = np.random.random(shape).astype(dtype=dtype) - - tt_tensor = ttnn.Tensor(np_tensor, tt_dtype) - if tt_dtype in {ttnn.float32, ttnn.uint32, ttnn.uint16}: - assert tt_tensor.storage_type() == ttnn.StorageType.BORROWED - - if tt_dtype in { - ttnn.float32, - ttnn.bfloat16, - ttnn.bfloat8_b, - ttnn.uint32, - ttnn.uint16, - }: - tt_tensor = tt_tensor.to(device) - tt_tensor = tt_tensor.cpu() - - np_tensor_after_round_trip = 
tt_tensor.to_numpy() - - assert np_tensor.dtype == np_tensor_after_round_trip.dtype - assert np_tensor.shape == np_tensor_after_round_trip.shape - - passing = np.allclose(np_tensor, np_tensor_after_round_trip) - assert passing - - -@pytest.mark.parametrize("shape", [(2, 3, 64, 96)]) -@pytest.mark.parametrize( - "tt_dtype", - [ - ttnn.uint16, - ttnn.uint32, - ttnn.float32, - ttnn.bfloat16, - ttnn.bfloat8_b, - ttnn.bfloat4_b, - ], -) -def test_serialization(tmp_path, shape, tt_dtype): - torch.manual_seed(0) - - dtype = tt_dtype_to_torch_dtype[tt_dtype] - - if dtype in {torch.int16, torch.int32}: - torch_tensor = torch.randint(0, 1024, shape, dtype=dtype) - else: - torch_tensor = torch.rand(shape, dtype=dtype) - - tt_tensor = ttnn.Tensor(torch_tensor, tt_dtype) - - file_name = tmp_path / pathlib.Path("tensor.bin") - ttnn.dump_tensor(str(file_name), tt_tensor) - torch_tensor_from_file = ttnn.load_tensor(str(file_name)).to_torch() - - assert torch_tensor.dtype == torch_tensor_from_file.dtype - assert torch_tensor.shape == torch_tensor_from_file.shape - - allclose_kwargs = {} - if tt_dtype == ttnn.bfloat8_b: - allclose_kwargs = dict(atol=1e-2) - elif tt_dtype == ttnn.bfloat4_b: - allclose_kwargs = dict(atol=0.2) - - passing = torch.allclose(torch_tensor, torch_tensor_from_file, **allclose_kwargs) - assert passing diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_transpose.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_transpose.py index f980fd1d4e4..489b25ba5e9 100644 --- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_transpose.py +++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_transpose.py @@ -9,7 +9,7 @@ import ttnn from loguru import logger -from models.utility_functions import is_grayskull, is_blackhole +from models.utility_functions import is_grayskull, is_blackhole, torch_random from tests.tt_eager.python_api_testing.sweep_tests.comparison_funcs import comp_pcc, comp_equal from models.utility_functions 
import skip_for_grayskull, skip_for_blackhole from tests.ttnn.utils_for_testing import assert_with_pcc @@ -644,7 +644,7 @@ def test_transpose_bfloat8_b(device, shape, swap_dims): ) @pytest.mark.parametrize( "shape", - [(1, 32, 12, 100), (1, 12, 32, 100), (1, 35, 7, 7), (1, 1, 1, 1)], + [(1, 32, 12, 100), (1, 12, 32, 100), (1, 35, 7, 7), (1, 1, 1, 1), (1, 12, 32, 100)], ) def test_transpose_hc(dtype, shape, device): if is_grayskull() and dtype == ttnn.float32: @@ -691,7 +691,7 @@ def test_transpose_2D(dtype, shape, layout, device): ) @pytest.mark.parametrize( "shape", - [[32, 1, 32], [32, 1, 12], [1, 1, 35], [1, 16, 32], [2, 34, 8]], + [[32, 1, 32], [32, 1, 12], [1, 1, 35], [1, 16, 32], [2, 34, 8], (32, 12, 100), (6, 33, 34)], ) @pytest.mark.parametrize( "layout", @@ -699,7 +699,14 @@ def test_transpose_2D(dtype, shape, layout, device): ) @pytest.mark.parametrize( "dims", - [[0, 1], [0, 2], [2, 1], [-3, -2], [-3, -1], [-2, -1]], + [ + [0, 1], + [0, 2], + [2, 1], + [-3, -2], + [-3, -1], + [-2, -1], + ], ) def test_transpose_3D(dtype, shape, layout, dims, device): torch.manual_seed(2005) @@ -750,14 +757,14 @@ def test_transpose_4d_wh_tile(shape, device): @pytest.mark.parametrize( "config", [ - [[64, 4, 49, 32], [-2, -1], ttnn.ROW_MAJOR_LAYOUT], # Page size must be divisible by sizeof(uint32_t) [[1, 1370, 1, 3, 1280], [0, -2], ttnn.TILE_LAYOUT], # untilize doesn't work with 4D - [[12, 3], [0, 1], ttnn.ROW_MAJOR_LAYOUT], # need tensor for this one + [[1, 50, 1, 3, 768], [0, -2], ttnn.TILE_LAYOUT], # untilize doesn't work with 4D + [[21843, 768], [0, 1], ttnn.ROW_MAJOR_LAYOUT], # circular buffer overflow ], ) @pytest.mark.parametrize("memory_config", [ttnn.L1_MEMORY_CONFIG, ttnn.DRAM_MEMORY_CONFIG]) def test_transpose_failures(config, memory_config, device): - pytest.skip("Failures to fix after #13217 and #13005 are in - 5D, HC PCC issue and unaligned RM tensor") + pytest.skip("Failing pytorch 2.0 trace sweeps") torch.manual_seed(2005) torch_input = 
torch.randn(config[0], dtype=torch.bfloat16) torch_output = torch_input.transpose(config[1][0], config[1][1]) @@ -793,6 +800,8 @@ def test_transpose_failures(config, memory_config, device): [[1, 9, 8, 14], [1, 2], ttnn.ROW_MAJOR_LAYOUT], # unaligned RM that fallsback to tiled [[1, 9, 8, 2], [1, 2], ttnn.ROW_MAJOR_LAYOUT], # unaligned RM that fallsback to tiled [[1, 2, 8, 2], [1, 2], ttnn.ROW_MAJOR_LAYOUT], # unaligned RM that fallsback to tiled + [[64, 4, 49, 32], [-2, -1], ttnn.ROW_MAJOR_LAYOUT], # Page size must be divisible by sizeof(uint32_t) + [[12, 3], [0, 1], ttnn.ROW_MAJOR_LAYOUT], # need tensor for this one [ [1, 8, 4096, 40], [1, 2], @@ -943,3 +952,62 @@ def test_transpose_unpadded(shape, dims, layout, dtype, pad_value, device): assert ttnn.to_torch(a) == float("-inf") tt_output = ttnn.to_torch(tt_output) assert_with_pcc(torch_output, tt_output, 0.9999) + + +@pytest.mark.parametrize("b", [1]) +@pytest.mark.parametrize("h", [18]) +@pytest.mark.parametrize("w", [65]) +@pytest.mark.parametrize("dim0", [1]) +@pytest.mark.parametrize("dim1", [2]) +def test_transpose_forge_llama(device, b, h, w, dim0, dim1): + torch.manual_seed(2005) + + torch_input_tensor = torch_random((b, h, w), -0.1, 0.1, dtype=torch.bfloat16) + torch_output_tensor = torch_input_tensor.transpose(dim0, dim1) + + input_tensor = ttnn.to_device(ttnn.from_torch(torch_input_tensor), device, memory_config=ttnn.DRAM_MEMORY_CONFIG) + input_tensor = ttnn.to_layout(input_tensor, layout=ttnn.TILE_LAYOUT) + output_tensor = ttnn.transpose(input_tensor, dim0, dim1, memory_config=ttnn.DRAM_MEMORY_CONFIG) + output_tensor = ttnn.from_device(output_tensor) + output_tensor = ttnn.to_layout(output_tensor, layout=ttnn.ROW_MAJOR_LAYOUT) + output_tensor = ttnn.to_torch(output_tensor) + + assert_with_pcc(torch_output_tensor, output_tensor) + + +@pytest.mark.parametrize("b", [1]) +@pytest.mark.parametrize("h", [2]) +@pytest.mark.parametrize("w", [3]) +@pytest.mark.parametrize("dim0", [-1]) 
+@pytest.mark.parametrize("dim1", [-2]) +def test_transpose_forge_basic(device, b, h, w, dim0, dim1): + torch.manual_seed(2005) + torch_input_tensor = torch_random((1, b, h, w), -0.1, 0.1, dtype=torch.bfloat16) + torch_output_tensor = torch_input_tensor.transpose(dim0, dim1) + input_tensor = ttnn.to_device(ttnn.from_torch(torch_input_tensor), device, memory_config=ttnn.DRAM_MEMORY_CONFIG) + input_tensor = ttnn.to_layout(input_tensor, layout=ttnn.TILE_LAYOUT) + output_tensor = ttnn.transpose(input_tensor, dim0, dim1, memory_config=ttnn.DRAM_MEMORY_CONFIG) + output_tensor = ttnn.from_device(output_tensor) + output_tensor = ttnn.to_layout(output_tensor, layout=ttnn.ROW_MAJOR_LAYOUT) + output_tensor = ttnn.to_torch(output_tensor) + + assert_with_pcc(torch_output_tensor, output_tensor) + + +@pytest.mark.parametrize("b", [6]) +@pytest.mark.parametrize("h", [33]) +@pytest.mark.parametrize("w", [34]) +@pytest.mark.parametrize("dim0", [1]) +@pytest.mark.parametrize("dim1", [0]) +def test_transpose_forge_hc(device, b, h, w, dim0, dim1): + torch.manual_seed(2005) + torch_input_tensor = torch_random((1, b, h, w), -0.1, 0.1, dtype=torch.bfloat16) + torch_output_tensor = torch_input_tensor.transpose(dim0, dim1) + input_tensor = ttnn.to_device(ttnn.from_torch(torch_input_tensor), device, memory_config=ttnn.DRAM_MEMORY_CONFIG) + input_tensor = ttnn.to_layout(input_tensor, layout=ttnn.TILE_LAYOUT) + output_tensor = ttnn.transpose(input_tensor, dim0, dim1, memory_config=ttnn.DRAM_MEMORY_CONFIG) + output_tensor = ttnn.from_device(output_tensor) + output_tensor = ttnn.to_layout(output_tensor, layout=ttnn.ROW_MAJOR_LAYOUT) + output_tensor = ttnn.to_torch(output_tensor) + + assert_with_pcc(torch_output_tensor, output_tensor) diff --git a/tests/tt_eager/tensors/test_async_tensor_apis.cpp b/tests/tt_eager/tensors/test_async_tensor_apis.cpp index 0418df6b535..95a47a7f382 100644 --- a/tests/tt_eager/tensors/test_async_tensor_apis.cpp +++ b/tests/tt_eager/tensors/test_async_tensor_apis.cpp 
@@ -14,7 +14,7 @@ #include "ttnn/tensor/tensor.hpp" #include "ttnn/tensor/tensor_impl.hpp" #include "ttnn/tensor/types.hpp" -#include "tests/tt_metal/tt_metal/unit_tests_common/common/common_fixture.hpp" +#include "tests/tt_metal/tt_metal/common/dispatch_fixture.hpp" #include "tt_metal/host_api.hpp" #include "ttnn/operations/numpy/functions.hpp" @@ -37,7 +37,7 @@ uint32_t get_device_buffer_address(const Tensor& tensor) { } } -TEST_F(CommonFixture, TestTensorOwnershipSanity) { +TEST_F(DispatchFixture, TestTensorOwnershipSanity) { // Sanity test tensor read, write and update paths with synchronous // Ensure that tensor data is copied and owned as expected Device* device = this->devices_[0]; @@ -112,7 +112,7 @@ TEST_F(CommonFixture, TestTensorOwnershipSanity) { EXPECT_EQ(readback_tensor.get_shape(), ttnn::Shape(tt::tt_metal::LegacyShape({1, 1, 32, 128}))); } -TEST_F(CommonFixture, TestAsyncEltwiseBinary) { +TEST_F(DispatchFixture, TestAsyncEltwiseBinary) { Device* device = this->devices_[0]; device->enable_async(true); // Populate these in first loop and verify that deallocation worked - addresses should be identical across loops @@ -169,7 +169,7 @@ TEST_F(CommonFixture, TestAsyncEltwiseBinary) { Tensor tensor_identity_copy_function(const Tensor& tensor) { return tensor; } -TEST_F(CommonFixture, TestAsyncRefCountManager) { +TEST_F(DispatchFixture, TestAsyncRefCountManager) { Device* device = this->devices_[0]; device->enable_async(true); @@ -226,7 +226,7 @@ TEST_F(CommonFixture, TestAsyncRefCountManager) { device->enable_async(false); } -TEST_F(CommonFixture, TestTensorAsyncDataMovement) { +TEST_F(DispatchFixture, TestTensorAsyncDataMovement) { // Test 2 data paths here (resembles async mode): // 1. Main -> Worker: Create a tensor in the main thread. 
Ensure that it is accessible in the worker thread even // after its destroyed diff --git a/tests/tt_metal/tt_metal/CMakeLists.txt b/tests/tt_metal/tt_metal/CMakeLists.txt index 936c98f4f2d..54ede4a02c5 100644 --- a/tests/tt_metal/tt_metal/CMakeLists.txt +++ b/tests/tt_metal/tt_metal/CMakeLists.txt @@ -35,9 +35,7 @@ set(TT_METAL_TESTS_SRCS test_core_range_set.cpp test_compile_sets_kernel_binaries.cpp test_compile_program.cpp - test_kernel_path_env_var.cpp test_clean_init.cpp - test_create_kernel_from_string.cpp ) foreach(TEST_SRC ${TT_METAL_TESTS_SRCS}) @@ -62,21 +60,27 @@ foreach(TEST_SRC ${TT_METAL_TESTS_SRCS}) list(APPEND METAL_TEST_TARGETS ${TEST}) endforeach() -add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/unit_tests_common) -add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/unit_tests) -add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/unit_tests_fast_dispatch) -add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/unit_tests_fast_dispatch_single_chip_multi_queue) -add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/unit_tests_frequent) +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/api) +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/debug_tools) +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/device) +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/dispatch) +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/eth) +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/integration) +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/llk) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/perf_microbenchmark) +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/stl) add_custom_target( metal_tests DEPENDS ${METAL_TEST_TARGETS} - unit_tests - unit_tests_fast_dispatch - unit_tests_fast_dispatch_single_chip_multi_queue - unit_tests_frequent metal_perf_microbenchmark_tests - unit_tests_galaxy + unit_tests_api + unit_tests_debug_tools + unit_tests_device + unit_tests_dispatch + unit_tests_eth + unit_tests_integration + unit_tests_llk + unit_tests_stl ) diff --git a/tests/tt_metal/tt_metal/README.md b/tests/tt_metal/tt_metal/README.md 
new file mode 100644 index 00000000000..00336f8653b --- /dev/null +++ b/tests/tt_metal/tt_metal/README.md @@ -0,0 +1,101 @@ +In order to keep our test suite clean, organized and searchable, please follow the guidelines provided below when adding new tests, modifying existing tests or deleting outdated tests. + + + +Table of Contents +================= + +- [Table of Contents](#table-of-contents) + - [Test Naming](#test-naming) + - [Test Organization](#test-organization) + - [Fixture Naming](#fixture-naming) + - [Fixture Organization](#fixture-organization) + - [File Naming](#file-naming) + - [File Organization](#file-organization) + - [api/](#api) + - [debug_tools/](#debug_tools) + - [device/](#device) + - [dispatch/](#dispatch) + - [eth/](#eth) + - [integration/](#integration) + - [llk/](#llk) + - [stl/](#stl) + - [test_kernels/](#test_kernels) + - [common/](#common) + + + + + +## Test Naming +Prefix test names with the core type(s) that the test is using: + - If it's using Tensix cores, prefix it with `Tensix` + - If it's using active ethernet cores, prefix it with `ActiveEth` + - If it's using idle ethernet cores, prefix it with `IdleEth` + - If it's using both active and idle ethernet cores, prefix it with `Eth` + - If it's using multiple core types, prefix it with each core type, eg. `TensixActiveEth`, `TensixIdleEth`, `TensixEth`, etc. + - If it isn't using any core type, don't prefix it with anything + +## Test Organization +Every test should belong to either a test suite or a test fixture. Use the TEST macro for tests in test suites and the TEST_F or TEST_P macros for tests in test fixtures. + +Test suites are ideal for grouping related tests that don’t require shared code. Test fixtures are better suited for related tests that need shared code, which can be defined in the fixture. + +Keep related tests grouped together to make it easier to understand the overall test coverage. + +## Fixture Naming +All fixture names should end in `Fixture`.
+ +## Fixture Organization +Before creating a new fixture, check if an existing fixture meets your needs. If you need to create a new fixture, consider subclassing an existing fixture to avoid duplicating functionality already provided by another fixture. + +## File Naming +File names should include specific prefixes or suffixes based on their content: + - Files that contain fixtures should have their names end with `_fixture` + - Files that contain helper functions and/or test utilities should have their names end with `_test_utils` + - Files that contain tests should have their names start with `test_` + +## File Organization +Place test utility files and fixture files as close as possible to the files that rely on them. For example, if you have a test file `test_A.cpp` in `tests/tt_metal/tt_metal/dispatch/dispatch_buffer/` and another test file `test_B.cpp` in `tests/tt_metal/tt_metal/dispatch/dispatch_program/`, and both need to use a fixture file `C_fixture.hpp`, it is logical to place `C_fixture.hpp` in `tests/tt_metal/tt_metal/dispatch/`. This ensures the fixture is easily accessible to the relevant test files while avoiding unnecessary clutter in a more generic directory like `tests/tt_metal/tt_metal/common/`. + +Tests using Google Test should be placed in one of the directories listed below that best aligns with their purpose. If multiple directories seem suitable, use your best judgment to select the most appropriate one. 
+ +__Important note: only tests that use Google Test should be placed in the following directories.__ + +### `api/` + - Contains tests that explicitly test `tt-metal`'s API + - Contains tests that read from and/or write to the device + +### `debug_tools/` + - Contains tests for DPrint and Watcher + +### `device/` + - Contains tests for device initialization and teardown + - Contains tests that check device-specific properties + +### `dispatch/` + - Contains tests that explicitly test for properties relating to dispatch + - Contains both slow dispatch and fast dispatch tests + +### `eth/` + - Contains tests that check ethernet communication between multiple devices + - Contains tests that explicitly test ethernet properties on a single device + +### `integration/` + - Contains tests for real-world use cases, eg. matmul, etc + +### `llk/` + - Contains tests for compute Low-Level Kernel (LLK) API + - Tests don't cover individual compute LLK calls, but cover testing LLK API calls as these are used in compute kernels + +### `stl/` + - Contains tests which test custom data structures and algorithms used in `tt-metal` + - None of the tests in this directory should run on the device + +The following directories should be reserved for files that support testing but should not contain actual tests themselves. 
+ +### `test_kernels/` + - Contains kernels that are used in tests + +### `common/` + - Contains test fixtures and utilities shared across multiple directories listed above diff --git a/tests/tt_metal/tt_metal/api/CMakeLists.txt b/tests/tt_metal/tt_metal/api/CMakeLists.txt new file mode 100644 index 00000000000..fc95afeb92e --- /dev/null +++ b/tests/tt_metal/tt_metal/api/CMakeLists.txt @@ -0,0 +1,60 @@ +set(UNIT_TESTS_API_SRC + ${CMAKE_CURRENT_SOURCE_DIR}/allocator/test_free_list_allocator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/allocator/test_l1_banking_allocator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/circular_buffer/test_CircularBuffer_allocation.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/circular_buffer/test_CircularBuffer_creation.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/circular_buffer/test_CircularBuffer_non_blocking.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/core_coord/test_CoreRange_adjacent.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/core_coord/test_CoreRange_contains.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/core_coord/test_CoreRange_intersects.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/core_coord/test_CoreRange_iterator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/core_coord/test_CoreRange_merge.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/core_coord/test_CoreRangeSet_construct.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/core_coord/test_CoreRangeSet_contains.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/core_coord/test_CoreRangeSet_intersects.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/core_coord/test_CoreRangeSet_merge.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_banked.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_bit_utils.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_CommandQueue.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_direct.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_dram_to_l1_multicast.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_dram.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_global_semaphores.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_kernel_creation.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_noc.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_runtime_args.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_semaphores.cpp + 
${CMAKE_CURRENT_SOURCE_DIR}/test_sharded_l1_buffer.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_simple_dram_buffer.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_simple_l1_buffer.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_soc_descriptor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_tilize_untilize.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_worker_config_buffer.cpp +) + +add_executable(unit_tests_api ${UNIT_TESTS_API_SRC}) +TT_ENABLE_UNITY_BUILD(unit_tests_api) + +target_link_libraries( + unit_tests_api + PRIVATE + test_metal_common_libs + Boost::smart_ptr +) +target_include_directories( + unit_tests_api + PRIVATE + ${PROJECT_SOURCE_DIR} + ${PROJECT_SOURCE_DIR}/tt_metal + ${PROJECT_SOURCE_DIR}/tt_metal/common + ${PROJECT_SOURCE_DIR}/tests + ${PROJECT_SOURCE_DIR}/tests/tt_metal/tt_metal/common + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/common +) +set_target_properties( + unit_tests_api + PROPERTIES + RUNTIME_OUTPUT_DIRECTORY + ${PROJECT_BINARY_DIR}/test/tt_metal +) diff --git a/tests/tt_metal/tt_metal/unit_tests/allocator/test_free_list_allocator.cpp b/tests/tt_metal/tt_metal/api/allocator/test_free_list_allocator.cpp similarity index 97% rename from tests/tt_metal/tt_metal/unit_tests/allocator/test_free_list_allocator.cpp rename to tests/tt_metal/tt_metal/api/allocator/test_free_list_allocator.cpp index d7b5ffdf52f..e16965f0d31 100644 --- a/tests/tt_metal/tt_metal/unit_tests/allocator/test_free_list_allocator.cpp +++ b/tests/tt_metal/tt_metal/api/allocator/test_free_list_allocator.cpp @@ -4,13 +4,12 @@ #include -#include "basic_fixture.hpp" -#include "tt_metal/host_api.hpp" +#include "host_api.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/impl/allocator/algorithms/free_list.hpp" // TODO: Add a variant with randomized allocations and deallocations -TEST_F(BasicFixture, TestDirectedSeriesOfAllocDealloc) { +TEST(FreeListAllocator, TestDirectedSeriesOfAllocDealloc) { constexpr uint32_t max_size_bytes = 1024; constexpr uint32_t min_allocation_size_bytes = 32; 
constexpr uint32_t alignment = 32; @@ -132,7 +131,7 @@ TEST_F(BasicFixture, TestDirectedSeriesOfAllocDealloc) { EXPECT_EQ(addr_20.value(), 64); } -TEST_F(BasicFixture, TestResizeAllocator) { +TEST(FreeListAllocator, TestResizeAllocator) { constexpr uint32_t max_size_bytes = 1024; constexpr uint32_t min_allocation_size_bytes = 32; constexpr uint32_t alignment = 32; @@ -184,7 +183,7 @@ TEST_F(BasicFixture, TestResizeAllocator) { EXPECT_EQ(addr_6.value(), 32); } -TEST_F(BasicFixture, TestDirectedResizeAllocator) { +TEST(FreeListAllocator, TestDirectedResizeAllocator) { constexpr uint32_t max_size_bytes = 1024; constexpr uint32_t min_allocation_size_bytes = 32; constexpr uint32_t alignment = 32; diff --git a/tests/tt_metal/tt_metal/unit_tests/allocator/test_l1_banking_allocator.cpp b/tests/tt_metal/tt_metal/api/allocator/test_l1_banking_allocator.cpp similarity index 92% rename from tests/tt_metal/tt_metal/unit_tests/allocator/test_l1_banking_allocator.cpp rename to tests/tt_metal/tt_metal/api/allocator/test_l1_banking_allocator.cpp index 27134acd303..738e79c3fd4 100644 --- a/tests/tt_metal/tt_metal/unit_tests/allocator/test_l1_banking_allocator.cpp +++ b/tests/tt_metal/tt_metal/api/allocator/test_l1_banking_allocator.cpp @@ -4,9 +4,7 @@ #include -#include "basic_fixture.hpp" #include "device_fixture.hpp" -#include "tt_metal/common/core_descriptor.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/host_api.hpp" @@ -26,7 +24,7 @@ uint64_t get_alloc_limit(const tt::tt_metal::Device *device) { } // namespace unit_tests::test_l1_banking_allocator -TEST_F(DeviceSingleCardFixture, TestL1BuffersAllocatedTopDown) { +TEST_F(DeviceSingleCardBufferFixture, TestL1BuffersAllocatedTopDown) { std::vector alloc_sizes = {32 * 1024, 64 * 1024, 128 * 1024}; size_t total_size_bytes = 0; @@ -50,7 +48,7 @@ TEST_F(DeviceSingleCardFixture, TestL1BuffersAllocatedTopDown) { buffers.clear(); } -TEST_F(DeviceSingleCardFixture, TestL1BuffersDoNotGrowBeyondBankSize) { 
+TEST_F(DeviceSingleCardBufferFixture, TestL1BuffersDoNotGrowBeyondBankSize) { uint64_t alloc_limit = unit_tests::test_l1_banking_allocator::get_alloc_limit(this->device_); tt::tt_metal::InterleavedBufferConfig l1_config{ diff --git a/tests/tt_metal/tt_metal/unit_tests/buffer/test_buffer_utils.cpp b/tests/tt_metal/tt_metal/api/buffer_test_utils.hpp similarity index 76% rename from tests/tt_metal/tt_metal/unit_tests/buffer/test_buffer_utils.cpp rename to tests/tt_metal/tt_metal/api/buffer_test_utils.hpp index 0ffef3f73dc..a883090e09d 100644 --- a/tests/tt_metal/tt_metal/unit_tests/buffer/test_buffer_utils.cpp +++ b/tests/tt_metal/tt_metal/api/buffer_test_utils.hpp @@ -2,24 +2,25 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "test_buffer_utils.hpp" -#include "tt_metal/detail/tt_metal.hpp" +#pragma once + +#include "host_api.hpp" namespace tt::test::buffer::detail { -void writeL1Backdoor(tt::tt_metal::Device* device, CoreCoord coord, uint32_t address, std::vector& data) { +inline void writeL1Backdoor(tt::tt_metal::Device* device, CoreCoord coord, uint32_t address, std::vector& data) { tt::log_info("{} -- coord={} address={}", __FUNCTION__, coord.str(), address); tt_metal::detail::WriteToDeviceL1(device, coord, address, data); } -void readL1Backdoor( +inline void readL1Backdoor( tt::tt_metal::Device* device, CoreCoord coord, uint32_t address, uint32_t byte_size, std::vector& data) { tt::log_info("{} -- coord={} address={} byte_size={}", __FUNCTION__, coord.str(), address, byte_size); tt_metal::detail::ReadFromDeviceL1(device, coord, address, byte_size, data); } -void writeDramBackdoor(tt::tt_metal::Device* device, uint32_t channel, uint32_t address, std::vector& data) { +inline void writeDramBackdoor(tt::tt_metal::Device* device, uint32_t channel, uint32_t address, std::vector& data) { tt::log_info("{} -- channel={} address={}", __FUNCTION__, channel, address); tt_metal::detail::WriteToDeviceDRAMChannel(device, channel, address, data); } -void 
readDramBackdoor( +inline void readDramBackdoor( tt::tt_metal::Device* device, uint32_t channel, uint32_t address, uint32_t byte_size, std::vector& data) { tt::log_info("{} -- channel={} address={} byte_size={}", __FUNCTION__, channel, address, byte_size); tt_metal::detail::ReadFromDeviceDRAMChannel(device, channel, address, byte_size, data); diff --git a/tests/tt_metal/tt_metal/unit_tests/circular_buffer/circular_buffer_test_utils.hpp b/tests/tt_metal/tt_metal/api/circular_buffer/circular_buffer_test_utils.hpp similarity index 100% rename from tests/tt_metal/tt_metal/unit_tests/circular_buffer/circular_buffer_test_utils.hpp rename to tests/tt_metal/tt_metal/api/circular_buffer/circular_buffer_test_utils.hpp diff --git a/tests/tt_metal/tt_metal/unit_tests/circular_buffer/test_CircularBuffer_allocation.cpp b/tests/tt_metal/tt_metal/api/circular_buffer/test_CircularBuffer_allocation.cpp similarity index 97% rename from tests/tt_metal/tt_metal/unit_tests/circular_buffer/test_CircularBuffer_allocation.cpp rename to tests/tt_metal/tt_metal/api/circular_buffer/test_CircularBuffer_allocation.cpp index 1df5ec9cdfd..04182323c6d 100644 --- a/tests/tt_metal/tt_metal/unit_tests/circular_buffer/test_CircularBuffer_allocation.cpp +++ b/tests/tt_metal/tt_metal/api/circular_buffer/test_CircularBuffer_allocation.cpp @@ -6,7 +6,6 @@ #include "gtest/gtest.h" #include "circular_buffer_test_utils.hpp" #include "tt_metal/host_api.hpp" -#include "tt_metal/impl/buffers/circular_buffer.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "common/bfloat16.hpp" @@ -39,7 +38,7 @@ void validate_cb_address(Program &program, Device *device, const CoreRangeSet &c } } -TEST_F(DeviceFixture, TestCircularBuffersSequentiallyPlaced) { +TEST_F(DeviceFixture, TensixTestCircularBuffersSequentiallyPlaced) { for (unsigned int id = 0; id < num_devices_; id++) { Program program; CBConfig cb_config; @@ -66,7 +65,7 @@ TEST_F(DeviceFixture, TestCircularBuffersSequentiallyPlaced) { } } -TEST_F(DeviceFixture, 
TestCircularBufferSequentialAcrossAllCores) { +TEST_F(DeviceFixture, TensixTestCircularBufferSequentialAcrossAllCores) { for (unsigned int id = 0; id < num_devices_; id++) { Program program; CBConfig cb_config; @@ -108,7 +107,7 @@ TEST_F(DeviceFixture, TestCircularBufferSequentialAcrossAllCores) { } } -TEST_F(DeviceFixture, TestValidCircularBufferAddress) { +TEST_F(DeviceFixture, TensixTestValidCircularBufferAddress) { for (unsigned int id = 0; id < num_devices_; id++) { Program program; CBConfig cb_config; @@ -149,7 +148,7 @@ TEST_F(DeviceFixture, TestValidCircularBufferAddress) { } } -TEST_F(DeviceFixture, TestCircularBuffersAndL1BuffersCollision) { +TEST_F(DeviceFixture, TensixTestCircularBuffersAndL1BuffersCollision) { for (unsigned int id = 0; id < num_devices_; id++) { Program program; uint32_t page_size = TileSize(tt::DataFormat::Float16_b); @@ -181,7 +180,7 @@ TEST_F(DeviceFixture, TestCircularBuffersAndL1BuffersCollision) { } } -TEST_F(DeviceFixture, TestValidUpdateCircularBufferSize) { +TEST_F(DeviceFixture, TensixTestValidUpdateCircularBufferSize) { for (unsigned int id = 0; id < num_devices_; id++) { Program program; CBConfig cb_config; @@ -215,7 +214,7 @@ TEST_F(DeviceFixture, TestValidUpdateCircularBufferSize) { } } -TEST_F(DeviceFixture, TestInvalidUpdateCircularBufferSize) { +TEST_F(DeviceFixture, TensixTestInvalidUpdateCircularBufferSize) { for (unsigned int id = 0; id < num_devices_; id++) { Program program; CBConfig cb_config; @@ -245,7 +244,7 @@ TEST_F(DeviceFixture, TestInvalidUpdateCircularBufferSize) { } } -TEST_F(DeviceFixture, TestUpdateCircularBufferAddress) { +TEST_F(DeviceFixture, TensixTestUpdateCircularBufferAddress) { for (unsigned int id = 0; id < num_devices_; id++) { Program program; CBConfig cb_config; @@ -284,7 +283,7 @@ TEST_F(DeviceFixture, TestUpdateCircularBufferAddress) { } } -TEST_F(DeviceFixture, TestUpdateCircularBufferPageSize) { +TEST_F(DeviceFixture, TensixTestUpdateCircularBufferPageSize) { for (unsigned int id = 0; 
id < num_devices_; id++) { Device *device = this->devices_.at(id); Program program; @@ -360,7 +359,7 @@ TEST_F(DeviceFixture, TestUpdateCircularBufferPageSize) { } } -TEST_F(DeviceFixture, TestDataCopyWithUpdatedCircularBufferConfig) { +TEST_F(DeviceFixture, TensixTestDataCopyWithUpdatedCircularBufferConfig) { for (unsigned int id = 0; id < num_devices_; id++) { Program program; CoreCoord core(0, 0); diff --git a/tests/tt_metal/tt_metal/unit_tests/circular_buffer/test_CircularBuffer_creation.cpp b/tests/tt_metal/tt_metal/api/circular_buffer/test_CircularBuffer_creation.cpp similarity index 80% rename from tests/tt_metal/tt_metal/unit_tests/circular_buffer/test_CircularBuffer_creation.cpp rename to tests/tt_metal/tt_metal/api/circular_buffer/test_CircularBuffer_creation.cpp index 984dba24740..a54a77c2d1f 100644 --- a/tests/tt_metal/tt_metal/unit_tests/circular_buffer/test_CircularBuffer_creation.cpp +++ b/tests/tt_metal/tt_metal/api/circular_buffer/test_CircularBuffer_creation.cpp @@ -46,7 +46,7 @@ bool test_cb_config_written_to_core(Program &program, Device *device, const Core return pass; } -TEST_F(DeviceFixture, TestCreateCircularBufferAtValidIndices) { +TEST_F(DeviceFixture, TensixTestCreateCircularBufferAtValidIndices) { CBConfig cb_config; CoreRange cr({0, 0}, {0, 1}); @@ -68,12 +68,21 @@ TEST_F(DeviceFixture, TestCreateCircularBufferAtValidIndices) { {16, cb_config.data_format}, {24, cb_config.data_format} }; - CircularBufferConfig config = CircularBufferConfig(cb_config.page_size, data_format_spec) - .set_page_size(0, cb_config.page_size) - .set_page_size(2, cb_config.page_size) - .set_page_size(16, cb_config.page_size) - .set_page_size(24, cb_config.page_size); - auto cb = CreateCircularBuffer(program, cr_set, config); + CircularBufferConfig expected_config = CircularBufferConfig(cb_config.page_size, data_format_spec) + .set_page_size(tt::CBIndex::c_0, cb_config.page_size) + .set_page_size(tt::CBIndex::c_2, cb_config.page_size) + 
.set_page_size(tt::CBIndex::c_16, cb_config.page_size) + .set_page_size(tt::CBIndex::c_24, cb_config.page_size); + + CircularBufferConfig actual_config = CircularBufferConfig(cb_config.page_size); + actual_config.index(tt::CBIndex::c_0).set_page_size(cb_config.page_size).set_data_format(cb_config.data_format); + actual_config.index(tt::CBIndex::c_2).set_page_size(cb_config.page_size).set_data_format(cb_config.data_format); + actual_config.index(tt::CBIndex::c_16).set_page_size(cb_config.page_size).set_data_format(cb_config.data_format); + actual_config.index(tt::CBIndex::c_24).set_page_size(cb_config.page_size).set_data_format(cb_config.data_format); + + EXPECT_TRUE(actual_config == expected_config); + + auto cb = CreateCircularBuffer(program, cr_set, actual_config); for (unsigned int id = 0; id < num_devices_; id++) { detail::CompileProgram(devices_.at(id), program); @@ -95,7 +104,7 @@ TEST_F(DeviceFixture, TestCreateCircularBufferWithMismatchingConfig) { EXPECT_ANY_THROW(CircularBufferConfig(cb_config.page_size, {{0, cb_config.data_format}}).set_page_size(1, cb_config.page_size)); } -TEST_F(DeviceFixture, TestCreateCircularBufferAtOverlappingIndex) { +TEST_F(DeviceFixture, TensixTestCreateCircularBufferAtOverlappingIndex) { Program program; CBConfig cb_config; diff --git a/tests/tt_metal/tt_metal/unit_tests/circular_buffer/test_CircularBuffer_non_blocking.cpp b/tests/tt_metal/tt_metal/api/circular_buffer/test_CircularBuffer_non_blocking.cpp similarity index 98% rename from tests/tt_metal/tt_metal/unit_tests/circular_buffer/test_CircularBuffer_non_blocking.cpp rename to tests/tt_metal/tt_metal/api/circular_buffer/test_CircularBuffer_non_blocking.cpp index c10b6c6e0db..0a4b0849ca4 100644 --- a/tests/tt_metal/tt_metal/unit_tests/circular_buffer/test_CircularBuffer_non_blocking.cpp +++ b/tests/tt_metal/tt_metal/api/circular_buffer/test_CircularBuffer_non_blocking.cpp @@ -61,7 +61,7 @@ std::vector generate_rt_args(uint32_t master_semaphore, uint32_t slave return 
rt_args; } -TEST_F(DeviceFixture, TestCircularBufferNonBlockingAPIs) { +TEST_F(DeviceFixture, TensixTestCircularBufferNonBlockingAPIs) { Program program; Device *device = devices_.at(0); diff --git a/tests/tt_metal/tt_metal/api/compile_program_with_kernel_path_env_var_fixture.hpp b/tests/tt_metal/tt_metal/api/compile_program_with_kernel_path_env_var_fixture.hpp new file mode 100644 index 00000000000..bf1f6403018 --- /dev/null +++ b/tests/tt_metal/tt_metal/api/compile_program_with_kernel_path_env_var_fixture.hpp @@ -0,0 +1,107 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include "host_api.hpp" +#include "logger.hpp" + +using namespace tt; + +class CompileProgramWithKernelPathEnvVarFixture : public ::testing::Test { + protected: + void SetUp() override { + if (!this->are_preconditions_satisfied()) { + GTEST_SKIP(); + } + + const chip_id_t device_id = 0; + this->device_ = CreateDevice(device_id); + this->program_ = CreateProgram(); + } + + void TearDown() override { + if (!IsSkipped()) { + CloseDevice(this->device_); + } + } + + void create_kernel(const string &kernel_file) { + CoreCoord core(0, 0); + tt_metal::CreateKernel( + this->program_, + kernel_file, + core, + tt_metal::DataMovementConfig{.processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default}); + } + + void setup_kernel_dir(const string &orig_kernel_file, const string &new_kernel_file) { + const string &kernel_dir = llrt::OptionsG.get_kernel_dir(); + const std::filesystem::path &kernel_file_path_under_kernel_dir(kernel_dir + new_kernel_file); + const std::filesystem::path &dirs_under_kernel_dir = kernel_file_path_under_kernel_dir.parent_path(); + std::filesystem::create_directories(dirs_under_kernel_dir); + + const string &metal_root = llrt::OptionsG.get_root_dir(); + const std::filesystem::path &kernel_file_path_under_metal_root(metal_root + orig_kernel_file); + 
std::filesystem::copy(kernel_file_path_under_metal_root, kernel_file_path_under_kernel_dir); + } + + void cleanup_kernel_dir() { + const string &kernel_dir = llrt::OptionsG.get_kernel_dir(); + for (const std::filesystem::directory_entry &entry : std::filesystem::directory_iterator(kernel_dir)) { + std::filesystem::remove_all(entry); + } + } + + Device *device_; + Program program_; + + private: + bool are_preconditions_satisfied() { + return this->are_env_vars_set() && this->is_kernel_dir_valid(); + } + + bool are_env_vars_set() { + bool are_set = true; + if (!llrt::OptionsG.is_root_dir_specified()) { + log_info(LogTest, "Skipping test: TT_METAL_HOME must be set"); + are_set = false; + } + if (!llrt::OptionsG.is_kernel_dir_specified()) { + log_info(LogTest, "Skipping test: TT_METAL_KERNEL_PATH must be set"); + are_set = false; + } + return are_set; + } + + bool is_kernel_dir_valid() { + bool is_valid = true; + const string &kernel_dir = llrt::OptionsG.get_kernel_dir(); + if (!this->does_path_exist(kernel_dir) || !this->is_path_a_directory(kernel_dir) || + !this->is_dir_empty(kernel_dir)) { + log_info(LogTest, "Skipping test: TT_METAL_KERNEL_PATH must be an existing, empty directory"); + is_valid = false; + } + return is_valid; + } + + bool does_path_exist(const string &path) { + const std::filesystem::path &file_path(path); + return std::filesystem::exists(file_path); + } + + bool is_path_a_directory(const string &path) { + TT_FATAL(this->does_path_exist(path), "{} does not exist", path); + const std::filesystem::path &file_path(path); + return std::filesystem::is_directory(file_path); + } + + bool is_dir_empty(const string &path) { + TT_FATAL(this->does_path_exist(path), "{} does not exist", path); + TT_FATAL(this->is_path_a_directory(path), "{} is not a directory", path); + const std::filesystem::path &file_path(path); + return std::filesystem::is_empty(file_path); + } +}; diff --git a/tests/tt_metal/tt_metal/unit_tests/common/core_coord_fixture.hpp 
b/tests/tt_metal/tt_metal/api/core_coord/core_coord_fixture.hpp similarity index 89% rename from tests/tt_metal/tt_metal/unit_tests/common/core_coord_fixture.hpp rename to tests/tt_metal/tt_metal/api/core_coord/core_coord_fixture.hpp index 596c5f44a73..5b2148498df 100644 --- a/tests/tt_metal/tt_metal/unit_tests/common/core_coord_fixture.hpp +++ b/tests/tt_metal/tt_metal/api/core_coord/core_coord_fixture.hpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. // // SPDX-License-Identifier: Apache-2.0 @@ -6,9 +6,8 @@ #include "gtest/gtest.h" #include "tt_metal/host_api.hpp" -#include "tt_metal/test_utils/env_vars.hpp" -class CoreCoordHarness : public ::testing::Test { +class CoreCoordFixture : public ::testing::Test { protected: CoreRange cr1 = CoreRange({0, 0}, {1, 1}); CoreRange cr2 = CoreRange({3, 3}, {5, 4}); diff --git a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRangeSet_construct.cpp b/tests/tt_metal/tt_metal/api/core_coord/test_CoreRangeSet_construct.cpp similarity index 86% rename from tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRangeSet_construct.cpp rename to tests/tt_metal/tt_metal/api/core_coord/test_CoreRangeSet_construct.cpp index 04cdc9f15c2..a1aedf9403e 100644 --- a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRangeSet_construct.cpp +++ b/tests/tt_metal/tt_metal/api/core_coord/test_CoreRangeSet_construct.cpp @@ -8,7 +8,7 @@ namespace basic_tests::CoreRangeSet{ -TEST_F(CoreCoordHarness, TestCoreRangeSetValidConstruct) +TEST_F(CoreCoordFixture, TestCoreRangeSetValidConstruct) { EXPECT_NO_THROW(::CoreRangeSet(std::vector{this->sc1, this->cr2})); EXPECT_NO_THROW(::CoreRangeSet(std::vector{this->cr1, this->cr2})); @@ -17,7 +17,7 @@ TEST_F(CoreCoordHarness, TestCoreRangeSetValidConstruct) EXPECT_EQ(valid_ranges.ranges().size(), 2); } -TEST_F(CoreCoordHarness, TestCoreRangeSetInvalidConstruct) +TEST_F(CoreCoordFixture, TestCoreRangeSetInvalidConstruct) { 
::CoreRange overlapping_range({1, 2}, {3, 3}); EXPECT_ANY_THROW(::CoreRangeSet(std::vector{this->cr1, this->cr2, overlapping_range})); diff --git a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRangeSet_contains.cpp b/tests/tt_metal/tt_metal/api/core_coord/test_CoreRangeSet_contains.cpp similarity index 96% rename from tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRangeSet_contains.cpp rename to tests/tt_metal/tt_metal/api/core_coord/test_CoreRangeSet_contains.cpp index a76802a2e35..6a3eefbf03f 100644 --- a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRangeSet_contains.cpp +++ b/tests/tt_metal/tt_metal/api/core_coord/test_CoreRangeSet_contains.cpp @@ -10,7 +10,7 @@ namespace basic_tests::CoreRangeSet { -TEST_F(CoreCoordHarness, TestCoreRangeSetContains) { +TEST_F(CoreCoordFixture, TestCoreRangeSetContains) { // Contains CoreCoord EXPECT_TRUE(::CoreRangeSet(this->cr1).contains(this->cr5.start_coord)); EXPECT_TRUE(::CoreRangeSet(this->cr5).contains(this->cr1.end_coord)); @@ -33,7 +33,7 @@ TEST_F(CoreCoordHarness, TestCoreRangeSetContains) { EXPECT_TRUE(::CoreRangeSet(this->cr12).contains(::CoreRangeSet(std::vector{this->sc6, this->cr11}))); } -TEST_F(CoreCoordHarness, TestCoreRangeSetNotContains) { +TEST_F(CoreCoordFixture, TestCoreRangeSetNotContains) { // Not Contains CoreCoord EXPECT_FALSE(::CoreRangeSet(this->cr1).contains(this->cr2.start_coord)); EXPECT_FALSE( diff --git a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRangeSet_intersects.cpp b/tests/tt_metal/tt_metal/api/core_coord/test_CoreRangeSet_intersects.cpp similarity index 96% rename from tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRangeSet_intersects.cpp rename to tests/tt_metal/tt_metal/api/core_coord/test_CoreRangeSet_intersects.cpp index 3815de620e2..fb1f406412f 100644 --- a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRangeSet_intersects.cpp +++ b/tests/tt_metal/tt_metal/api/core_coord/test_CoreRangeSet_intersects.cpp @@ -10,7 +10,7 @@ 
namespace basic_tests::CoreRangeSet { -TEST_F(CoreCoordHarness, TestCoreRangeSetIntersects) { +TEST_F(CoreCoordFixture, TestCoreRangeSetIntersects) { // Intersects CoreCoord EXPECT_TRUE(::CoreRangeSet(this->cr1).intersects(this->cr5.start_coord)); EXPECT_TRUE(::CoreRangeSet(this->cr5).intersects(this->cr1.end_coord)); @@ -32,7 +32,7 @@ TEST_F(CoreCoordHarness, TestCoreRangeSetIntersects) { EXPECT_TRUE(::CoreRangeSet(this->sc2).intersects(::CoreRangeSet(std::vector{this->cr7, this->cr1}))); } -TEST_F(CoreCoordHarness, TestCoreRangeSetNotIntersects) { +TEST_F(CoreCoordFixture, TestCoreRangeSetNotIntersects) { // Not Intersects CoreCoord EXPECT_FALSE(::CoreRangeSet(this->cr1).intersects(this->cr2.start_coord)); EXPECT_FALSE( diff --git a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRangeSet_merge.cpp b/tests/tt_metal/tt_metal/api/core_coord/test_CoreRangeSet_merge.cpp similarity index 73% rename from tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRangeSet_merge.cpp rename to tests/tt_metal/tt_metal/api/core_coord/test_CoreRangeSet_merge.cpp index 778e5083538..2adfb440ba2 100644 --- a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRangeSet_merge.cpp +++ b/tests/tt_metal/tt_metal/api/core_coord/test_CoreRangeSet_merge.cpp @@ -11,43 +11,43 @@ namespace basic_tests::CoreRangeSet { -TEST_F(CoreCoordHarness, TestCoreRangeSetMergeNoSolution) { +TEST_F(CoreCoordFixture, TestCoreRangeSetMergeNoSolution) { EXPECT_EQ(::CoreRangeSet(sc1).merge(std::set{sc3}), ::CoreRangeSet(std::set{sc1, sc3})); EXPECT_EQ(::CoreRangeSet(cr1).merge(std::set{cr2}), ::CoreRangeSet(std::set{cr1, cr2})); EXPECT_EQ(::CoreRangeSet(cr1).merge(std::set{cr1, cr2}), ::CoreRangeSet(std::set{cr1, cr2})); EXPECT_EQ(::CoreRangeSet(cr1).merge(std::set{cr2}).merge(std::set{cr3}), ::CoreRangeSet(std::set{cr1, cr2, cr3})); } -TEST_F(CoreCoordHarness, TestCoreRangeSetMergeCoreCoord) { +TEST_F(CoreCoordFixture, TestCoreRangeSetMergeCoreCoord) { ::CoreRangeSet empty_crs; 
EXPECT_EQ(empty_crs.merge(std::set{this->sc1}).ranges().size(), 1); EXPECT_EQ(::CoreRangeSet(cr1).merge(std::set{sc3, sc4}), ::CoreRangeSet(std::set{cr16})); EXPECT_EQ(::CoreRangeSet(cr1).merge(std::set{sc3}).merge(std::set{sc4}), ::CoreRangeSet(std::set{cr16})); - CoreRange rect({0, 0}, {4, 2}); - std::set rect_pts; + ::CoreRange rect({0, 0}, {4, 2}); + std::set<::CoreRange> rect_pts; for (unsigned y = rect.start_coord.y; y <= rect.end_coord.y; y++) { for (unsigned x = rect.start_coord.x; x <= rect.end_coord.x; x++) { - rect_pts.insert(CoreRange({x, y}, {x, y})); + rect_pts.insert(::CoreRange({x, y}, {x, y})); } } EXPECT_EQ(empty_crs.merge(rect_pts), ::CoreRangeSet(std::set{rect})); // upside-down "T" - rect_pts.insert({CoreRange({2, 0}, {3, 5})}); - EXPECT_EQ(empty_crs.merge(rect_pts), ::CoreRangeSet(std::set{rect, CoreRange({2, 3}, {3, 5})})); + rect_pts.insert({::CoreRange({2, 0}, {3, 5})}); + EXPECT_EQ(empty_crs.merge(rect_pts), ::CoreRangeSet(std::set{rect, ::CoreRange({2, 3}, {3, 5})})); // "H", sub-optimal currently, should be reduced down to 3 CRs instead of 5 EXPECT_EQ( - empty_crs.merge(std::vector{CoreRange{{0, 0}, {1, 5}}, CoreRange{{3, 0}, {4, 5}}, CoreRange{{0, 2}, {4, 3}}}), + empty_crs.merge(std::vector{::CoreRange{{0, 0}, {1, 5}}, ::CoreRange{{3, 0}, {4, 5}}, ::CoreRange{{0, 2}, {4, 3}}}), ::CoreRangeSet(std::set{ - CoreRange{{0, 0}, {1, 1}}, - CoreRange{{0, 2}, {4, 3}}, - CoreRange{{0, 4}, {1, 5}}, - CoreRange{{3, 0}, {4, 1}}, - CoreRange{{3, 4}, {4, 5}}})); + ::CoreRange{{0, 0}, {1, 1}}, + ::CoreRange{{0, 2}, {4, 3}}, + ::CoreRange{{0, 4}, {1, 5}}, + ::CoreRange{{3, 0}, {4, 1}}, + ::CoreRange{{3, 4}, {4, 5}}})); } -TEST_F(CoreCoordHarness, TestCoreRangeSetMergeCoreRange) { +TEST_F(CoreCoordFixture, TestCoreRangeSetMergeCoreRange) { EXPECT_EQ(::CoreRangeSet(cr1).merge(std::set{cr1}), ::CoreRangeSet(std::set{cr1})); EXPECT_EQ(::CoreRangeSet(cr7).merge(std::set{cr6}).merge(std::set{cr4}), ::CoreRangeSet(std::set{cr8})); EXPECT_EQ( diff --git 
a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_adjacent.cpp b/tests/tt_metal/tt_metal/api/core_coord/test_CoreRange_adjacent.cpp similarity index 91% rename from tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_adjacent.cpp rename to tests/tt_metal/tt_metal/api/core_coord/test_CoreRange_adjacent.cpp index f08976402d6..f9b386a71ea 100644 --- a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_adjacent.cpp +++ b/tests/tt_metal/tt_metal/api/core_coord/test_CoreRange_adjacent.cpp @@ -10,7 +10,7 @@ namespace basic_tests::CoreRange{ -TEST_F(CoreCoordHarness, TestCoreRangeAdjacent) +TEST_F(CoreCoordFixture, TestCoreRangeAdjacent) { EXPECT_TRUE ( this->cr1.adjacent(this->cr9) ); EXPECT_TRUE ( this->cr9.adjacent(this->cr1) ); @@ -23,7 +23,7 @@ TEST_F(CoreCoordHarness, TestCoreRangeAdjacent) } -TEST_F(CoreCoordHarness, TestCoreRangeNotAdjacent){ +TEST_F(CoreCoordFixture, TestCoreRangeNotAdjacent){ EXPECT_FALSE ( this->cr2.adjacent(this->cr3)); EXPECT_FALSE ( this->cr1.adjacent(this->cr6)); EXPECT_FALSE ( this->cr1.adjacent(this->cr11)); diff --git a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_contains.cpp b/tests/tt_metal/tt_metal/api/core_coord/test_CoreRange_contains.cpp similarity index 94% rename from tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_contains.cpp rename to tests/tt_metal/tt_metal/api/core_coord/test_CoreRange_contains.cpp index a9369eb7445..905b59123fe 100644 --- a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_contains.cpp +++ b/tests/tt_metal/tt_metal/api/core_coord/test_CoreRange_contains.cpp @@ -10,7 +10,7 @@ namespace basic_tests::CoreRange { -TEST_F(CoreCoordHarness, TestCoreRangeContains) { +TEST_F(CoreCoordFixture, TestCoreRangeContains) { // Contains CoreCoord EXPECT_TRUE(this->cr1.contains(this->sc1.start_coord)); EXPECT_TRUE(this->cr1.contains(this->cr1.start_coord)); @@ -25,7 +25,7 @@ TEST_F(CoreCoordHarness, TestCoreRangeContains) { 
EXPECT_TRUE(this->cr4.contains(::CoreRangeSet(std::vector{this->cr1, this->cr2, this->cr3}))); } -TEST_F(CoreCoordHarness, TestCoreRangeNotContains) { +TEST_F(CoreCoordFixture, TestCoreRangeNotContains) { // Not Contains CoreCoord EXPECT_FALSE(this->sc1.contains(this->cr1.start_coord)); EXPECT_FALSE(this->sc1.contains(this->sc2.start_coord)); diff --git a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_intersects.cpp b/tests/tt_metal/tt_metal/api/core_coord/test_CoreRange_intersects.cpp similarity index 90% rename from tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_intersects.cpp rename to tests/tt_metal/tt_metal/api/core_coord/test_CoreRange_intersects.cpp index 29eeeb591b8..409bf123f26 100644 --- a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_intersects.cpp +++ b/tests/tt_metal/tt_metal/api/core_coord/test_CoreRange_intersects.cpp @@ -8,7 +8,7 @@ namespace basic_tests::CoreRange { -TEST_F(CoreCoordHarness, TestCoreRangeIntersects) { +TEST_F(CoreCoordFixture, TestCoreRangeIntersects) { EXPECT_TRUE(this->cr1.intersects(this->cr5)); EXPECT_EQ(this->cr1.intersection(this->cr5).value(), ::CoreRange({1, 0}, {1, 1})); @@ -25,7 +25,7 @@ TEST_F(CoreCoordHarness, TestCoreRangeIntersects) { EXPECT_EQ(this->cr7.intersection(this->cr8).value(), this->cr7); } -TEST_F(CoreCoordHarness, TestCoreRangeNotIntersects) { +TEST_F(CoreCoordFixture, TestCoreRangeNotIntersects) { EXPECT_FALSE(this->cr1.intersects(this->cr2)); EXPECT_FALSE(this->sc1.intersects(this->cr2)); EXPECT_FALSE(this->cr1.intersects(this->cr7)); diff --git a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_iterator.cpp b/tests/tt_metal/tt_metal/api/core_coord/test_CoreRange_iterator.cpp similarity index 97% rename from tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_iterator.cpp rename to tests/tt_metal/tt_metal/api/core_coord/test_CoreRange_iterator.cpp index 5729e1a6c4b..7b41528637c 100644 --- 
a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_iterator.cpp +++ b/tests/tt_metal/tt_metal/api/core_coord/test_CoreRange_iterator.cpp @@ -11,7 +11,7 @@ using std::vector; namespace basic_tests::CoreRange { -TEST_F(CoreCoordHarness, TestCoreRangeIterator) +TEST_F(CoreCoordFixture, TestCoreRangeIterator) { vector cores_in_core_range; diff --git a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_merge.cpp b/tests/tt_metal/tt_metal/api/core_coord/test_CoreRange_merge.cpp similarity index 92% rename from tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_merge.cpp rename to tests/tt_metal/tt_metal/api/core_coord/test_CoreRange_merge.cpp index db8a1b2c7ad..a3da8c8b90d 100644 --- a/tests/tt_metal/tt_metal/unit_tests/core_coord/test_CoreRange_merge.cpp +++ b/tests/tt_metal/tt_metal/api/core_coord/test_CoreRange_merge.cpp @@ -10,7 +10,7 @@ namespace basic_tests::CoreRange{ -TEST_F(CoreCoordHarness, TestCoreRangeMerge) +TEST_F(CoreCoordFixture, TestCoreRangeMerge) { EXPECT_EQ ( this->sc1.merge(this->sc1).value(), this->sc1 ); EXPECT_EQ ( this->cr4.merge(this->cr5).value(), this->cr6 ); @@ -25,7 +25,7 @@ TEST_F(CoreCoordHarness, TestCoreRangeMerge) } -TEST_F(CoreCoordHarness, TestCoreRangeNotMergeable){ +TEST_F(CoreCoordFixture, TestCoreRangeNotMergeable){ EXPECT_FALSE ( this->cr1.merge(this->cr3)); EXPECT_FALSE ( this->cr2.merge(this->cr3)); EXPECT_FALSE ( this->cr1.merge(this->cr6)); diff --git a/tests/tt_metal/tt_metal/api/test_CommandQueue.cpp b/tests/tt_metal/tt_metal/api/test_CommandQueue.cpp new file mode 100644 index 00000000000..54a015a8146 --- /dev/null +++ b/tests/tt_metal/tt_metal/api/test_CommandQueue.cpp @@ -0,0 +1,151 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include "gtest/gtest.h" + +#include "command_queue_fixture.hpp" +#include "tt_metal/host_api.hpp" +#include "tt_metal/common/scoped_timer.hpp" +#include "tt_metal/impl/device/device.hpp" +#include "tt_metal/impl/buffers/circular_buffer.hpp" +#include "tt_metal/test_utils/stimulus.hpp" + +using namespace tt::tt_metal; + +namespace host_tests { + +TEST_F(CommandQueueMultiDeviceFixture, DISABLED_TestAccessCommandQueue) { + for (unsigned int device_id = 0; device_id < num_devices_; device_id++) { + EXPECT_NO_THROW(devices_[device_id]->command_queue()); + } +} + +TEST_F(CommandQueueFixture, TestCannotAccessCommandQueueForClosedDevice) { + EXPECT_NO_THROW(device_->command_queue()); + CloseDevice(device_); + EXPECT_ANY_THROW(device_->command_queue()); +} + +TEST_F(CommandQueueFixture, DISABLED_TensixTestAsyncAssertForDeprecatedAPI) { + auto &command_queue = this->device_->command_queue(); + auto current_mode = CommandQueue::default_mode(); + command_queue.set_mode(CommandQueue::CommandQueueMode::ASYNC); + Program program; + CoreCoord core = {0, 0}; + uint32_t buf_size = 4096; + uint32_t page_size = 4096; + auto dummy_kernel = CreateKernel( + program, + "tt_metal/kernels/dataflow/reader_binary_diff_lengths.cpp", + core, + DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); + auto src0 = Buffer::create(this->device_, buf_size, page_size, BufferType::DRAM); + std::vector runtime_args = {src0->address()}; + try { + SetRuntimeArgs(program, dummy_kernel, core, runtime_args); + } catch (std::runtime_error &e) { + std::string expected = + "This variant of SetRuntimeArgs can only be called when Asynchronous SW Command Queues are disabled for " + "Fast Dispatch."; + const string error = string(e.what()); + EXPECT_TRUE(error.find(expected) != std::string::npos); + } + command_queue.set_mode(current_mode); +} + +TEST_F(CommandQueueProgramFixture, TensixTestAsyncCommandQueueSanityAndProfile) { + 
auto& command_queue = this->device_->command_queue(); + auto current_mode = CommandQueue::default_mode(); + command_queue.set_mode(CommandQueue::CommandQueueMode::ASYNC); + Program program; + + CoreRange cr({0, 0}, {0, 0}); + CoreRangeSet cr_set({cr}); + // Add an NCRISC blank manually, but in compile program, the BRISC blank will be + // added separately + auto dummy_reader_kernel = CreateKernel( + program, + "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/arbiter_hang.cpp", + cr_set, + DataMovementConfig{.processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default}); + // Use scoper timer to benchmark time for pushing 2 commands + { + tt::ScopedTimer timer("AsyncCommandQueue"); + EnqueueProgram(command_queue, program, false); + Finish(command_queue); + } + command_queue.set_mode(current_mode); +} + +TEST_F(CommandQueueBufferFixture, DISABLED_TensixTestAsyncCBAllocation) { + // Test asynchronous allocation of buffers and their assignment to CBs + auto& command_queue = this->device_->command_queue(); + auto current_mode = CommandQueue::default_mode(); + command_queue.set_mode(CommandQueue::CommandQueueMode::ASYNC); + Program program; + + const uint32_t num_pages = 1; + const uint32_t page_size = detail::TileSize(tt::DataFormat::Float16_b); + const tt::DataFormat data_format = tt::DataFormat::Float16_b; + + auto buffer_size = page_size; + tt::tt_metal::InterleavedBufferConfig buff_config{ + .device=this->device_, + .size = buffer_size, + .page_size = buffer_size, + .buffer_type = tt::tt_metal::BufferType::L1 + }; + // Asynchronously allocate an L1 Buffer + auto l1_buffer = CreateBuffer(buff_config); + CoreRange cr({0, 0}, {0, 2}); + CoreRangeSet cr_set({cr}); + std::vector buffer_indices = {16, 24}; + + CircularBufferConfig config1 = CircularBufferConfig(page_size, {{buffer_indices[0], data_format}, {buffer_indices[1], data_format}}, *l1_buffer) + .set_page_size(buffer_indices[0], page_size) + .set_page_size(buffer_indices[1], 
page_size); + // Asynchronously assign the L1 Buffer to the CB + auto multi_core_cb = CreateCircularBuffer(program, cr_set, config1); + auto cb_ptr = detail::GetCircularBuffer(program, multi_core_cb); + Finish(this->device_->command_queue()); + // Addresses should match + EXPECT_EQ(cb_ptr->address(), l1_buffer->address()); + // Asynchronously allocate a new L1 buffer + auto l1_buffer_2 = CreateBuffer(buff_config); + // Asynchronously update CB address to match new L1 buffer + UpdateDynamicCircularBufferAddress(program, multi_core_cb, *l1_buffer_2); + Finish(this->device_->command_queue()); + // Addresses should match + EXPECT_EQ(cb_ptr->address(), l1_buffer_2->address()); + command_queue.set_mode(current_mode); +} + +TEST_F(CommandQueueMultiDeviceFixture, DISABLED_TestDirectedLoopbackToUniqueHugepage) { + std::unordered_map> golden_data; + + const uint32_t byte_size = 2048 * 16; + const uint64_t address = 0; + + for (chip_id_t device_id = 0; device_id < num_devices_; device_id++) { + std::vector data = + tt::test_utils::generate_uniform_random_vector(0, UINT32_MAX, byte_size / sizeof(uint32_t)); + + chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device_id); + uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device_id); + tt::Cluster::instance().write_sysmem( + data.data(), data.size() * sizeof(uint32_t), address, mmio_device_id, channel); + + golden_data[device_id] = data; + } + + std::vector readback_data; + readback_data.resize(byte_size / sizeof(uint32_t)); + for (chip_id_t device_id = 0; device_id < num_devices_; device_id++) { + chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device_id); + uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device_id); + tt::Cluster::instance().read_sysmem(readback_data.data(), byte_size, address, mmio_device_id, channel); + EXPECT_EQ(readback_data, golden_data.at(device_id)); + } +} +} // namespace host_tests diff 
--git a/tests/tt_metal/tt_metal/unit_tests/buffer/test_banked.cpp b/tests/tt_metal/tt_metal/api/test_banked.cpp similarity index 93% rename from tests/tt_metal/tt_metal/unit_tests/buffer/test_banked.cpp rename to tests/tt_metal/tt_metal/api/test_banked.cpp index 9f1d68e7440..0479cf75db4 100644 --- a/tests/tt_metal/tt_metal/unit_tests/buffer/test_banked.cpp +++ b/tests/tt_metal/tt_metal/api/test_banked.cpp @@ -264,14 +264,14 @@ detail::LaunchProgram(device, program); } // end namespace local_test_functions -TEST_F(DeviceFixture, TestSingleCoreSingleTileBankedL1ReaderOnly) { +TEST_F(DeviceFixture, TensixTestSingleCoreSingleTileBankedL1ReaderOnly) { for (unsigned int id = 0; id < num_devices_; id++) { BankedConfig test_config; EXPECT_TRUE(local_test_functions::reader_cb_writer(this->devices_.at(id), test_config, true, false)); } } -TEST_F(DeviceFixture, TestSingleCoreMultiTileBankedL1ReaderOnly) { +TEST_F(DeviceFixture, TensixTestSingleCoreMultiTileBankedL1ReaderOnly) { for (unsigned int id = 0; id < num_devices_; id++) { BankedConfig test_config; TT_FATAL(this->devices_.at(id)->num_banks(BufferType::L1) % 2 == 0, "Error"); @@ -289,7 +289,7 @@ TEST_F(DeviceFixture, TestSingleCoreMultiTileBankedL1ReaderOnly) { } } -TEST_F(DeviceFixture, TestSingleCoreSingleTileBankedDramReaderOnly) { +TEST_F(DeviceFixture, TensixTestSingleCoreSingleTileBankedDramReaderOnly) { for (unsigned int id = 0; id < num_devices_; id++) { BankedConfig test_config; test_config.input_buffer_type = BufferType::DRAM; @@ -298,7 +298,7 @@ TEST_F(DeviceFixture, TestSingleCoreSingleTileBankedDramReaderOnly) { } } -TEST_F(DeviceFixture, TestSingleCoreMultiTileBankedDramReaderOnly) { +TEST_F(DeviceFixture, TensixTestSingleCoreMultiTileBankedDramReaderOnly) { for (unsigned int id = 0; id < num_devices_; id++) { BankedConfig test_config; TT_FATAL(this->devices_.at(id)->num_banks(BufferType::DRAM) % 2 == 0, "Error"); @@ -318,14 +318,14 @@ TEST_F(DeviceFixture, TestSingleCoreMultiTileBankedDramReaderOnly) { } 
} -TEST_F(DeviceFixture, TestSingleCoreSingleTileBankedL1WriterOnly) { +TEST_F(DeviceFixture, TensixTestSingleCoreSingleTileBankedL1WriterOnly) { for (unsigned int id = 0; id < num_devices_; id++) { BankedConfig test_config; EXPECT_TRUE(local_test_functions::reader_cb_writer(this->devices_.at(id), test_config, false, true)); } } -TEST_F(DeviceFixture, TestSingleCoreMultiTileBankedL1WriterOnly) { +TEST_F(DeviceFixture, TensixTestSingleCoreMultiTileBankedL1WriterOnly) { for (unsigned int id = 0; id < num_devices_; id++) { BankedConfig test_config; TT_FATAL(this->devices_.at(id)->num_banks(BufferType::L1) % 2 == 0, "Error"); @@ -343,7 +343,7 @@ TEST_F(DeviceFixture, TestSingleCoreMultiTileBankedL1WriterOnly) { } } -TEST_F(DeviceFixture, TestSingleCoreSingleTileBankedDramWriterOnly) { +TEST_F(DeviceFixture, TensixTestSingleCoreSingleTileBankedDramWriterOnly) { for (unsigned int id = 0; id < num_devices_; id++) { BankedConfig test_config; test_config.input_buffer_type = BufferType::DRAM; @@ -352,7 +352,7 @@ TEST_F(DeviceFixture, TestSingleCoreSingleTileBankedDramWriterOnly) { } } -TEST_F(DeviceFixture, TestSingleCoreMultiTileBankedDramWriterOnly) { +TEST_F(DeviceFixture, TensixTestSingleCoreMultiTileBankedDramWriterOnly) { for (unsigned int id = 0; id < num_devices_; id++) { BankedConfig test_config; TT_FATAL(this->devices_.at(id)->num_banks(BufferType::DRAM) % 2 == 0, "Error"); @@ -372,14 +372,14 @@ TEST_F(DeviceFixture, TestSingleCoreMultiTileBankedDramWriterOnly) { } } -TEST_F(DeviceFixture, TestSingleCoreSingleTileBankedL1ReaderAndWriter) { +TEST_F(DeviceFixture, TensixTestSingleCoreSingleTileBankedL1ReaderAndWriter) { for (unsigned int id = 0; id < num_devices_; id++) { BankedConfig test_config; EXPECT_TRUE(local_test_functions::reader_cb_writer(this->devices_.at(id), test_config, true, true)); } } -TEST_F(DeviceFixture, TestSingleCoreMultiTileBankedL1ReaderAndWriter) { +TEST_F(DeviceFixture, TensixTestSingleCoreMultiTileBankedL1ReaderAndWriter) { for (unsigned int 
id = 0; id < num_devices_; id++) { BankedConfig test_config; size_t num_tiles = this->devices_.at(id)->num_banks(BufferType::L1); @@ -397,7 +397,7 @@ TEST_F(DeviceFixture, TestSingleCoreMultiTileBankedL1ReaderAndWriter) { } } -TEST_F(DeviceFixture, TestSingleCoreSingleTileBankedDramReaderAndWriter) { +TEST_F(DeviceFixture, TensixTestSingleCoreSingleTileBankedDramReaderAndWriter) { for (unsigned int id = 0; id < num_devices_; id++) { BankedConfig test_config; test_config.input_buffer_type = BufferType::DRAM; @@ -406,7 +406,7 @@ TEST_F(DeviceFixture, TestSingleCoreSingleTileBankedDramReaderAndWriter) { } } -TEST_F(DeviceFixture, TestSingleCoreMultiTileBankedDramReaderAndWriter) { +TEST_F(DeviceFixture, TensixTestSingleCoreMultiTileBankedDramReaderAndWriter) { for (unsigned int id = 0; id < num_devices_; id++) { BankedConfig test_config; size_t num_tiles = this->devices_.at(id)->num_banks(BufferType::L1); @@ -426,7 +426,7 @@ TEST_F(DeviceFixture, TestSingleCoreMultiTileBankedDramReaderAndWriter) { } } -TEST_F(DeviceFixture, TestSingleCoreSingleTileBankedDramReaderAndL1Writer) { +TEST_F(DeviceFixture, TensixTestSingleCoreSingleTileBankedDramReaderAndL1Writer) { for (unsigned int id = 0; id < num_devices_; id++) { BankedConfig test_config; test_config.input_buffer_type = BufferType::DRAM; @@ -434,7 +434,7 @@ TEST_F(DeviceFixture, TestSingleCoreSingleTileBankedDramReaderAndL1Writer) { } } -TEST_F(DeviceFixture, TestSingleCoreMultiTileBankedDramReaderAndL1Writer) { +TEST_F(DeviceFixture, TensixTestSingleCoreMultiTileBankedDramReaderAndL1Writer) { for (unsigned int id = 0; id < num_devices_; id++) { BankedConfig test_config; test_config.input_buffer_type = BufferType::DRAM; @@ -454,7 +454,7 @@ TEST_F(DeviceFixture, TestSingleCoreMultiTileBankedDramReaderAndL1Writer) { } } -TEST_F(DeviceFixture, TestSingleCoreSingleTileBankedL1ReaderAndDramWriter) { +TEST_F(DeviceFixture, TensixTestSingleCoreSingleTileBankedL1ReaderAndDramWriter) { for (unsigned int id = 0; id < 
num_devices_; id++) { BankedConfig test_config; test_config.output_buffer_type = BufferType::DRAM; @@ -462,7 +462,7 @@ TEST_F(DeviceFixture, TestSingleCoreSingleTileBankedL1ReaderAndDramWriter) { } } -TEST_F(DeviceFixture, TestSingleCoreMultiTileBankedL1ReaderAndDramWriter) { +TEST_F(DeviceFixture, TensixTestSingleCoreMultiTileBankedL1ReaderAndDramWriter) { for (unsigned int id = 0; id < num_devices_; id++) { BankedConfig test_config; test_config.output_buffer_type = BufferType::DRAM; @@ -482,7 +482,7 @@ TEST_F(DeviceFixture, TestSingleCoreMultiTileBankedL1ReaderAndDramWriter) { } } -TEST_F(DeviceFixture, TestSingleCoreMultiTileBankedL1ReaderDataCopyL1Writer) { +TEST_F(DeviceFixture, TensixTestSingleCoreMultiTileBankedL1ReaderDataCopyL1Writer) { for (unsigned int id = 0; id < num_devices_; id++) { BankedConfig test_config; size_t num_tiles = this->devices_.at(id)->num_banks(BufferType::L1); @@ -501,7 +501,7 @@ TEST_F(DeviceFixture, TestSingleCoreMultiTileBankedL1ReaderDataCopyL1Writer) { } } -TEST_F(DeviceFixture, TestSingleCoreMultiTileBankedDramReaderDataCopyDramWriter) { +TEST_F(DeviceFixture, TensixTestSingleCoreMultiTileBankedDramReaderDataCopyDramWriter) { for (unsigned int id = 0; id < num_devices_; id++) { BankedConfig test_config; size_t num_tiles = this->devices_.at(id)->num_banks(BufferType::DRAM); @@ -521,7 +521,7 @@ TEST_F(DeviceFixture, TestSingleCoreMultiTileBankedDramReaderDataCopyDramWriter) } } -TEST_F(DeviceFixture, TestSingleCoreMultiTileBankedL1ReaderDataCopyDramWriter) { +TEST_F(DeviceFixture, TensixTestSingleCoreMultiTileBankedL1ReaderDataCopyDramWriter) { for (unsigned int id = 0; id < num_devices_; id++) { BankedConfig test_config; size_t num_tiles = this->devices_.at(id)->num_banks(BufferType::L1); @@ -542,7 +542,7 @@ TEST_F(DeviceFixture, TestSingleCoreMultiTileBankedL1ReaderDataCopyDramWriter) { } } -TEST_F(DeviceFixture, TestSingleCoreMultiTileBankedDramReaderDataCopyL1Writer) { +TEST_F(DeviceFixture, 
TensixTestSingleCoreMultiTileBankedDramReaderDataCopyL1Writer) { for (unsigned int id = 0; id < num_devices_; id++) { BankedConfig test_config; size_t num_tiles = this->devices_.at(id)->num_banks(BufferType::L1); diff --git a/tests/tt_metal/tt_metal/unit_tests_common/common/test_bit_utils.cpp b/tests/tt_metal/tt_metal/api/test_bit_utils.cpp similarity index 95% rename from tests/tt_metal/tt_metal/unit_tests_common/common/test_bit_utils.cpp rename to tests/tt_metal/tt_metal/api/test_bit_utils.cpp index 99f62def780..badbcaa7bca 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/common/test_bit_utils.cpp +++ b/tests/tt_metal/tt_metal/api/test_bit_utils.cpp @@ -6,7 +6,7 @@ #include #include -TEST(NoFixture, ExtractBitArray) { +TEST(Host, ExtractBitArray) { uint32_t src[4] = {0x12345678, 0x9abcdef0, 0x13579bdf, 0x2468ace0}; // 1. Extract the 20-bit elements from the 32-bit source array. uint32_t dest[4]; @@ -25,7 +25,7 @@ TEST(NoFixture, ExtractBitArray) { EXPECT_EQ(dest[3], 0x9abc); } -TEST(NoFixture, PackBitArray) { +TEST(Host, PackBitArray) { uint32_t src[8] = { 1, 2, 3, 4, 5, 6, 7, 7 }; uint32_t dest[8]; @@ -56,7 +56,7 @@ TEST(NoFixture, PackBitArray) { EXPECT_EQ(dest[0], expected); } -TEST(NoFixture, PackExtractBitArray) { +TEST(Host, PackExtractBitArray) { uint32_t src[8] = { 1, 2, 3, 4, 5, 6, 7, 7 }; for (uint num_pack_bits = 3; num_pack_bits <= 31; num_pack_bits++) { @@ -70,7 +70,7 @@ TEST(NoFixture, PackExtractBitArray) { } } -TEST(NoFixture, ExtractPackBitArray) { +TEST(Host, ExtractPackBitArray) { uint32_t src[4] = { 0x12345678, 0x9abcdef0, 0x13579bdf, 0x2468ace0 }; // Compute the number of 3-bit elements that can be packed into 4 x 32-bit elements diff --git a/tests/tt_metal/tt_metal/unit_tests/dram/direct.cpp b/tests/tt_metal/tt_metal/api/test_direct.cpp similarity index 98% rename from tests/tt_metal/tt_metal/unit_tests/dram/direct.cpp rename to tests/tt_metal/tt_metal/api/test_direct.cpp index 5c86e8feadf..3defe49c6ec 100644 --- 
a/tests/tt_metal/tt_metal/unit_tests/dram/direct.cpp +++ b/tests/tt_metal/tt_metal/api/test_direct.cpp @@ -372,7 +372,7 @@ bool reader_datacopy_writer(tt_metal::Device* device, const ReaderDatacopyWriter } } // namespace unit_tests::dram::direct -TEST_F(DeviceFixture, SingleCoreDirectDramReaderOnly) { +TEST_F(DeviceFixture, TensixSingleCoreDirectDramReaderOnly) { for (unsigned int id = 0; id < num_devices_; id++) { uint32_t l1_unreserved_base = devices_.at(id)->get_base_allocator_addr(HalMemType::L1); ASSERT_TRUE( @@ -383,7 +383,7 @@ TEST_F(DeviceFixture, SingleCoreDirectDramReaderOnly) { unit_tests::dram::direct::reader_only(devices_.at(id), 16 * 1024, l1_unreserved_base, CoreCoord(0, 0))); } } -TEST_F(DeviceFixture, SingleCoreDirectDramWriterOnly) { +TEST_F(DeviceFixture, TensixSingleCoreDirectDramWriterOnly) { for (unsigned int id = 0; id < num_devices_; id++) { uint32_t l1_unreserved_base = devices_.at(id)->get_base_allocator_addr(HalMemType::L1); ASSERT_TRUE( @@ -394,7 +394,7 @@ TEST_F(DeviceFixture, SingleCoreDirectDramWriterOnly) { unit_tests::dram::direct::writer_only(devices_.at(id), 16 * 1024, l1_unreserved_base, CoreCoord(0, 0))); } } -TEST_F(DeviceFixture, SingleCoreDirectDramReaderWriter) { +TEST_F(DeviceFixture, TensixSingleCoreDirectDramReaderWriter) { unit_tests::dram::direct::ReaderWriterConfig test_config = { .num_tiles = 1, .tile_byte_size = 2 * 32 * 32, @@ -409,7 +409,7 @@ TEST_F(DeviceFixture, SingleCoreDirectDramReaderWriter) { ASSERT_TRUE(unit_tests::dram::direct::reader_writer(devices_.at(id), test_config)); } } -TEST_F(DeviceFixture, SingleCoreDirectDramReaderDatacopyWriter) { +TEST_F(DeviceFixture, TensixSingleCoreDirectDramReaderDatacopyWriter) { unit_tests::dram::direct::ReaderDatacopyWriterConfig test_config = { .num_tiles = 1, .tile_byte_size = 2 * 32 * 32, diff --git a/tests/tt_metal/tt_metal/unit_tests_common/dram/test_dram.cpp b/tests/tt_metal/tt_metal/api/test_dram.cpp similarity index 93% rename from 
tests/tt_metal/tt_metal/unit_tests_common/dram/test_dram.cpp rename to tests/tt_metal/tt_metal/api/test_dram.cpp index 9b2f241bb72..463a6f4dfb1 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/dram/test_dram.cpp +++ b/tests/tt_metal/tt_metal/api/test_dram.cpp @@ -1,7 +1,7 @@ // SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. // // SPDX-License-Identifier: Apache-2.0 -#include "tests/tt_metal/tt_metal/unit_tests_common/common/common_fixture.hpp" +#include "dispatch_fixture.hpp" #include "gtest/gtest.h" #include "tt_metal/host_api.hpp" #include "tt_metal/detail/tt_metal.hpp" @@ -24,7 +24,7 @@ struct DRAMConfig{ tt_metal::DataMovementConfig data_movement_cfg; }; -bool dram_single_core_db (CommonFixture* fixture, tt_metal::Device *device){ +bool dram_single_core_db (DispatchFixture* fixture, tt_metal::Device *device){ tt_metal::Program program = tt_metal::CreateProgram(); CoreCoord core = {0, 0}; @@ -90,7 +90,7 @@ bool dram_single_core_db (CommonFixture* fixture, tt_metal::Device *device){ return input_vec == result_vec; } -bool dram_single_core (CommonFixture* fixture, tt_metal::Device *device, const DRAMConfig &cfg, std::vector src_vec){ +bool dram_single_core (DispatchFixture* fixture, tt_metal::Device *device, const DRAMConfig &cfg, std::vector src_vec){ // Create a program tt_metal::Program program = CreateProgram(); @@ -139,7 +139,7 @@ bool dram_single_core (CommonFixture* fixture, tt_metal::Device *device, const D } } -TEST_F(CommonFixture, DRAMLoopbackSingleCore){ +TEST_F(DispatchFixture, TensixDRAMLoopbackSingleCore){ uint32_t buffer_size = 2 * 1024 * 25; std::vector src_vec = create_random_vector_of_bfloat16( buffer_size, 100, std::chrono::system_clock::now().time_since_epoch().count()); @@ -155,8 +155,8 @@ TEST_F(CommonFixture, DRAMLoopbackSingleCore){ } } -TEST_F(CommonFixture, DRAMLoopbackSingleCoreDB){ - if (!getenv("TT_METAL_SLOW_DISPATCH_MODE")){ +TEST_F(DispatchFixture, TensixDRAMLoopbackSingleCoreDB){ + if (!this->IsSlowDispatch()){ 
tt::log_info(tt::LogTest, "This test is only supported in slow dispatch mode"); GTEST_SKIP(); } diff --git a/tests/tt_metal/tt_metal/unit_tests_common/dram/test_dram_to_l1_multicast.cpp b/tests/tt_metal/tt_metal/api/test_dram_to_l1_multicast.cpp similarity index 94% rename from tests/tt_metal/tt_metal/unit_tests_common/dram/test_dram_to_l1_multicast.cpp rename to tests/tt_metal/tt_metal/api/test_dram_to_l1_multicast.cpp index 39bac896ea0..3f770ff57ca 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/dram/test_dram_to_l1_multicast.cpp +++ b/tests/tt_metal/tt_metal/api/test_dram_to_l1_multicast.cpp @@ -1,7 +1,7 @@ // SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. // // SPDX-License-Identifier: Apache-2.0 -#include "tests/tt_metal/tt_metal/unit_tests_common/common/common_fixture.hpp" +#include "dispatch_fixture.hpp" #include "gtest/gtest.h" #include "tt_metal/host_api.hpp" #include "tt_metal/detail/tt_metal.hpp" @@ -23,7 +23,7 @@ struct DRAMtoL1MulticastConfig{ CoreCoord exclude_direction; }; -bool dram_to_l1_multicast(CommonFixture* fixture, tt_metal::Device *device, const DRAMtoL1MulticastConfig &cfg){ +bool dram_to_l1_multicast(DispatchFixture* fixture, tt_metal::Device *device, const DRAMtoL1MulticastConfig &cfg){ bool pass = true; tt_metal::Program program = tt_metal::CreateProgram(); @@ -123,7 +123,7 @@ bool dram_to_l1_multicast(CommonFixture* fixture, tt_metal::Device *device, cons } } -TEST_F(CommonFixture, DRAMtoL1Multicast){ +TEST_F(DispatchFixture, TensixDRAMtoL1Multicast){ unit_tests_common::dram::test_dram_to_l1_multicast::DRAMtoL1MulticastConfig test_config = { .dest_buffer_addr = 200 * 1024, .target_grid_offset = 1, @@ -133,7 +133,7 @@ TEST_F(CommonFixture, DRAMtoL1Multicast){ ASSERT_TRUE(unit_tests_common::dram::test_dram_to_l1_multicast::dram_to_l1_multicast(this, devices_.at(id), test_config)); } } -TEST_F(CommonFixture, DRAMtoL1MulticastLoopbackSrc){ +TEST_F(DispatchFixture, TensixDRAMtoL1MulticastLoopbackSrc){ 
unit_tests_common::dram::test_dram_to_l1_multicast::DRAMtoL1MulticastConfig test_config = { .dest_buffer_addr = 500 * 1024, .target_grid_offset = 0, @@ -143,7 +143,7 @@ TEST_F(CommonFixture, DRAMtoL1MulticastLoopbackSrc){ ASSERT_TRUE(unit_tests_common::dram::test_dram_to_l1_multicast::dram_to_l1_multicast(this, devices_.at(id), test_config)); } } -TEST_F(CommonFixture, DRAMtoL1MulticastExcludeRegionUpLeft){ +TEST_F(DispatchFixture, TensixDRAMtoL1MulticastExcludeRegionUpLeft){ unit_tests_common::dram::test_dram_to_l1_multicast::DRAMtoL1MulticastConfig test_config = { .dest_buffer_addr = 200 * 1024, .target_grid_offset = 0, //source core is in exclusion zone, don't count twice @@ -160,7 +160,7 @@ TEST_F(CommonFixture, DRAMtoL1MulticastExcludeRegionUpLeft){ } } -TEST_F(CommonFixture, DRAMtoL1MulticastExcludeRegionUpRight){ +TEST_F(DispatchFixture, TensixDRAMtoL1MulticastExcludeRegionUpRight){ unit_tests_common::dram::test_dram_to_l1_multicast::DRAMtoL1MulticastConfig test_config = { .dest_buffer_addr = 200 * 1024, .target_grid_offset = 1, @@ -177,7 +177,7 @@ TEST_F(CommonFixture, DRAMtoL1MulticastExcludeRegionUpRight){ } } -TEST_F(CommonFixture, DRAMtoL1MulticastExcludeRegionDownLeft){ +TEST_F(DispatchFixture, TensixDRAMtoL1MulticastExcludeRegionDownLeft){ unit_tests_common::dram::test_dram_to_l1_multicast::DRAMtoL1MulticastConfig test_config = { .dest_buffer_addr = 200 * 1024, .target_grid_offset = 1, @@ -194,7 +194,7 @@ TEST_F(CommonFixture, DRAMtoL1MulticastExcludeRegionDownLeft){ } } -TEST_F(CommonFixture, DRAMtoL1MulticastExcludeRegionDownRight){ +TEST_F(DispatchFixture, TensixDRAMtoL1MulticastExcludeRegionDownRight){ unit_tests_common::dram::test_dram_to_l1_multicast::DRAMtoL1MulticastConfig test_config = { .dest_buffer_addr = 200 * 1024, .target_grid_offset = 1, diff --git a/tests/tt_metal/tt_metal/unit_tests/global_semaphore/test_global_semaphores.cpp b/tests/tt_metal/tt_metal/api/test_global_semaphores.cpp similarity index 100% rename from 
tests/tt_metal/tt_metal/unit_tests/global_semaphore/test_global_semaphores.cpp rename to tests/tt_metal/tt_metal/api/test_global_semaphores.cpp diff --git a/tests/tt_metal/tt_metal/api/test_kernel_creation.cpp b/tests/tt_metal/tt_metal/api/test_kernel_creation.cpp new file mode 100644 index 00000000000..19e8b4826af --- /dev/null +++ b/tests/tt_metal/tt_metal/api/test_kernel_creation.cpp @@ -0,0 +1,107 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include "gtest/gtest.h" +#include "common/core_coord.hpp" +#include "dispatch_fixture.hpp" +#include "tt_metal/detail/tt_metal.hpp" +#include "host_api.hpp" +#include "compile_program_with_kernel_path_env_var_fixture.hpp" + +using namespace tt; + +// Ensures we can successfully create kernels on available compute grid +TEST_F(DispatchFixture, TensixCreateKernelsOnComputeCores) { + for (unsigned int id = 0; id < this->devices_.size(); id++) { + tt_metal::Program program = CreateProgram(); + CoreCoord compute_grid = this->devices_.at(id)->compute_with_storage_grid_size(); + EXPECT_NO_THROW( + auto test_kernel = tt_metal::CreateKernel( + program, + "tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy.cpp", + CoreRange(CoreCoord(0, 0), CoreCoord(compute_grid.x, compute_grid.y)), + DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default});); + } +} + +TEST_F(DispatchFixture, DISABLED_TensixCreateKernelsOnStorageCores) { + for (unsigned int id = 0; id < this->devices_.size(); id++) { + if (this->devices_.at(id)->storage_only_cores().empty()) { + GTEST_SKIP() << "This test only runs on devices with storage only cores"; + } + tt_metal::Program program = CreateProgram(); + const std::set& storage_only_cores = this->devices_.at(id)->storage_only_cores(); + std::set storage_only_core_ranges; + for (CoreCoord core : storage_only_cores) { + storage_only_core_ranges.emplace(core); + } + CoreRangeSet 
storage_core_range_set(storage_only_core_ranges); + EXPECT_ANY_THROW( + auto test_kernel = tt_metal::CreateKernel( + program, + "tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy.cpp", + storage_core_range_set, + DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default});); + } +} + +TEST_F(DispatchFixture, DISABLED_TensixIdleEthCreateKernelsOnDispatchCores) { + if (this->IsSlowDispatch()) { + GTEST_SKIP() << "This test is only supported in fast dispatch mode"; + } + for (unsigned int id = 0; id < this->devices_.size(); id++) { + tt_metal::Program program = CreateProgram(); + Device* device = this->devices_.at(id); + CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(device->id()); + std::vector dispatch_cores = tt::get_logical_dispatch_cores(device->id(), device->num_hw_cqs(), dispatch_core_type); + std::set dispatch_core_ranges; + for (CoreCoord core : dispatch_cores) { + dispatch_core_ranges.emplace(core); + } + CoreRangeSet dispatch_core_range_set(dispatch_core_ranges); + if (dispatch_core_type == CoreType::WORKER) { + EXPECT_ANY_THROW( + auto test_kernel = tt_metal::CreateKernel( + program, + "tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy.cpp", + CoreRangeSet(dispatch_core_range_set), + DataMovementConfig{.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default});); + } else if (dispatch_core_type == CoreType::ETH) { + EXPECT_ANY_THROW(auto test_kernel = tt_metal::CreateKernel( + program, + "tests/tt_metal/tt_metal/test_kernels/misc/erisc_print.cpp", + CoreRangeSet(dispatch_core_range_set), + EthernetConfig{.eth_mode = Eth::IDLE, .noc = tt_metal::NOC::NOC_0});); + } + } +} + +TEST_F(CompileProgramWithKernelPathEnvVarFixture, TensixKernelUnderMetalRootDir) { + const string &kernel_file = "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_push_4.cpp"; + create_kernel(kernel_file); + 
detail::CompileProgram(this->device_, this->program_); +} + +TEST_F(CompileProgramWithKernelPathEnvVarFixture, TensixKernelUnderKernelRootDir) { + const string &orig_kernel_file = "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_push_4.cpp"; + const string &new_kernel_file = "tests/tt_metal/tt_metal/test_kernels/dataflow/new_kernel.cpp"; + this->setup_kernel_dir(orig_kernel_file, new_kernel_file); + this->create_kernel(new_kernel_file); + detail::CompileProgram(this->device_, this->program_); + this->cleanup_kernel_dir(); +} + +TEST_F(CompileProgramWithKernelPathEnvVarFixture, TensixKernelUnderMetalRootDirAndKernelRootDir) { + const string &kernel_file = "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_push_4.cpp"; + this->setup_kernel_dir(kernel_file, kernel_file); + this->create_kernel(kernel_file); + detail::CompileProgram(this->device_, this->program_); + this->cleanup_kernel_dir(); +} + +TEST_F(CompileProgramWithKernelPathEnvVarFixture, TensixNonExistentKernel) { + const string &kernel_file = "tests/tt_metal/tt_metal/test_kernels/dataflow/non_existent_kernel.cpp"; + this->create_kernel(kernel_file); + EXPECT_THROW(detail::CompileProgram(this->device_, this->program_), std::exception); +} diff --git a/tests/tt_metal/tt_metal/unit_tests/basic/test_noc.cpp b/tests/tt_metal/tt_metal/api/test_noc.cpp similarity index 78% rename from tests/tt_metal/tt_metal/unit_tests/basic/test_noc.cpp rename to tests/tt_metal/tt_metal/api/test_noc.cpp index 278e5289e28..65640d96a39 100644 --- a/tests/tt_metal/tt_metal/unit_tests/basic/test_noc.cpp +++ b/tests/tt_metal/tt_metal/api/test_noc.cpp @@ -4,17 +4,10 @@ #include -#include -#include -#include - -#include "basic_fixture.hpp" #include "device_fixture.hpp" #include "tt_metal/detail/tt_metal.hpp" -#include "tt_metal/host_api.hpp" +#include "host_api.hpp" #include "tt_metal/test_utils/env_vars.hpp" -#include "tt_metal/test_utils/print_helpers.hpp" -#include "tt_metal/test_utils/stimulus.hpp" using namespace 
tt; using namespace tt::test_utils; @@ -68,11 +61,48 @@ void read_translation_table (Device* device, CoreCoord logical_node, std::vector #endif } -} // namespace unit_tests::basic::device +} // namespace unit_tests::basic::test_noc +TEST(NOC, TensixSingleDeviceHarvestingPrints) { + auto arch = tt::get_arch_from_string(get_umd_arch_name()); + tt::tt_metal::Device* device; + const unsigned int device_id = 0; + device = tt::tt_metal::CreateDevice(device_id); + CoreCoord unharvested_logical_grid_size; + switch (arch) { + case tt::ARCH::GRAYSKULL: unharvested_logical_grid_size = CoreCoord(12, 10); break; + case tt::ARCH::WORMHOLE_B0: unharvested_logical_grid_size = CoreCoord(8, 10); break; + case tt::ARCH::BLACKHOLE: unharvested_logical_grid_size = CoreCoord(14, 10); break; + default: + TT_THROW("Unsupported arch {}", get_umd_arch_name()); + } + auto logical_grid_size = device->logical_grid_size(); + if (logical_grid_size == unharvested_logical_grid_size) { + tt::log_info("Harvesting Disabled in SW"); + } else { + tt::log_info("Harvesting Enabled in SW"); + tt::log_info("Number of Harvested Rows={}", unharvested_logical_grid_size.y - logical_grid_size.y); + } + tt::log_info("Logical -- Noc Coordinates Mapping"); + tt::log_info("[Logical <-> NOC0] Coordinates"); + for (int r = 0; r < logical_grid_size.y; r++) { + string output_row = ""; + for (int c = 0; c < logical_grid_size.x; c++) { + const CoreCoord logical_coord(c, r); + const auto noc_coord = device->worker_core_from_logical_core(logical_coord); + output_row += "{L[x" + std::to_string(c); + output_row += "-y" + std::to_string(r); + output_row += "]:N[x" + std::to_string(noc_coord.x); + output_row += "-y" + std::to_string(noc_coord.y); + output_row += "]}, "; + } + tt::log_info("{}", output_row); + } + ASSERT_TRUE(tt::tt_metal::CloseDevice(device)); +} -TEST_F(BasicFixture, VerifyNocNodeIDs) { +TEST(NOC, TensixVerifyNocNodeIDs) { auto arch = tt::get_arch_from_string(get_umd_arch_name()); tt::tt_metal::Device* 
device; const unsigned int device_id = 0; @@ -95,7 +125,7 @@ TEST_F(BasicFixture, VerifyNocNodeIDs) { } ASSERT_TRUE(tt::tt_metal::CloseDevice(device)); } -TEST_F(BasicFixture, VerifyNocIdentityTranslationTable) { +TEST(NOC, TensixVerifyNocIdentityTranslationTable) { auto arch = tt::get_arch_from_string(get_umd_arch_name()); if (arch == tt::ARCH::BLACKHOLE) { GTEST_SKIP(); @@ -133,14 +163,12 @@ TEST_F(BasicFixture, VerifyNocIdentityTranslationTable) { // Tests that kernel can write to and read from a stream register address // This is meant to exercise noc_inline_dw_write API -TEST_F(DeviceFixture, DirectedStreamRegWriteRead) { +TEST_F(DeviceFixture, TensixDirectedStreamRegWriteRead) { CoreCoord start_core{0, 0}; const uint32_t stream_id = 0; const uint32_t stream_reg = 4; for (tt_metal::Device *device : this->devices_) { - std::set storage_only_cores = device->storage_only_cores(); - tt_metal::Program program = tt_metal::CreateProgram(); CoreCoord logical_grid_size = device->compute_with_storage_grid_size(); CoreCoord end_core{logical_grid_size.x - 1, logical_grid_size.y - 1}; diff --git a/tests/tt_metal/tt_metal/unit_tests/basic/runtime_args.cpp b/tests/tt_metal/tt_metal/api/test_runtime_args.cpp similarity index 97% rename from tests/tt_metal/tt_metal/unit_tests/basic/runtime_args.cpp rename to tests/tt_metal/tt_metal/api/test_runtime_args.cpp index 520d04986d2..31d83f4783f 100644 --- a/tests/tt_metal/tt_metal/unit_tests/basic/runtime_args.cpp +++ b/tests/tt_metal/tt_metal/api/test_runtime_args.cpp @@ -4,16 +4,10 @@ #include -#include -#include -#include - #include "device_fixture.hpp" - +#include "kernels/kernel.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/host_api.hpp" -#include "tt_metal/impl/kernels/kernel.hpp" - using namespace tt; using namespace tt::tt_metal; @@ -168,7 +162,7 @@ bool verify_results( } // Write unique and common runtime args to device and readback to verify written correctly. 
-TEST_F(DeviceFixture, LegallyModifyRTArgsDataMovement) { +TEST_F(DeviceFixture, TensixLegallyModifyRTArgsDataMovement) { for (unsigned int id = 0; id < num_devices_; id++) { // First run the program with the initial runtime args CoreRange first_core_range(CoreCoord(0, 0), CoreCoord(1, 1)); @@ -214,7 +208,7 @@ TEST_F(DeviceFixture, LegallyModifyRTArgsDataMovement) { } } -TEST_F(DeviceFixture, LegallyModifyRTArgsCompute) { +TEST_F(DeviceFixture, TensixLegallyModifyRTArgsCompute) { for (unsigned int id = 0; id < num_devices_; id++) { // First run the program with the initial runtime args CoreRange first_core_range(CoreCoord(0, 0), CoreCoord(1, 1)); @@ -244,7 +238,7 @@ TEST_F(DeviceFixture, LegallyModifyRTArgsCompute) { } // Don't cover all cores of kernel with SetRuntimeArgs. Verify that correct offset used to access common runtime args. -TEST_F(DeviceFixture, SetRuntimeArgsSubsetOfCoresCompute) { +TEST_F(DeviceFixture, TensixSetRuntimeArgsSubsetOfCoresCompute) { for (unsigned int id = 0; id < num_devices_; id++) { // First run the program with the initial runtime args CoreRange first_core_range(CoreCoord(0, 0), CoreCoord(1, 1)); @@ -272,7 +266,7 @@ TEST_F(DeviceFixture, SetRuntimeArgsSubsetOfCoresCompute) { } // Different unique runtime args per core. Not overly special, but verify that it works. -TEST_F(DeviceFixture, SetRuntimeArgsUniqueValuesCompute) { +TEST_F(DeviceFixture, TensixSetRuntimeArgsUniqueValuesCompute) { for (unsigned int id = 0; id < num_devices_; id++) { // First run the program with the initial runtime args CoreRange first_core_range(CoreCoord(0, 0), CoreCoord(1, 1)); @@ -305,7 +299,7 @@ TEST_F(DeviceFixture, SetRuntimeArgsUniqueValuesCompute) { // Some cores have more unique runtime args than others. Unused in kernel, but API supports it, so verify it works and that // common runtime args are appropriately offset by amount from core(s) with most unique runtime args. 
-TEST_F(DeviceFixture, SetRuntimeArgsVaryingLengthPerCore) { +TEST_F(DeviceFixture, TensixSetRuntimeArgsVaryingLengthPerCore) { for (unsigned int id = 0; id < num_devices_; id++) { // First run the program with the initial runtime args @@ -356,7 +350,7 @@ TEST_F(DeviceFixture, SetRuntimeArgsVaryingLengthPerCore) { } // Too many unique and common runtime args, overflows allowed space and throws expected exception from both unique/common APIs. -TEST_F(DeviceFixture, IllegalTooManyRuntimeArgs) { +TEST_F(DeviceFixture, TensixIllegalTooManyRuntimeArgs) { for (unsigned int id = 0; id < num_devices_; id++) { CoreRange first_core_range(CoreCoord(1, 1), CoreCoord(2, 2)); CoreRangeSet core_range_set(first_core_range); @@ -376,7 +370,7 @@ TEST_F(DeviceFixture, IllegalTooManyRuntimeArgs) { } } -TEST_F(DeviceFixture, IllegallyModifyRTArgs) { +TEST_F(DeviceFixture, TensixIllegallyModifyRTArgs) { for (unsigned int id = 0; id < num_devices_; id++) { // First run the program with the initial runtime args CoreRange first_core_range(CoreCoord(0, 0), CoreCoord(1, 1)); @@ -408,7 +402,6 @@ TEST_F(DeviceFixture, IllegallyModifyRTArgs) { SetCommonRuntimeArgs(program, 0, common_runtime_args); std::vector illegal_common_runtime_args = {0, 1, 2, 3, 4, 5}; EXPECT_ANY_THROW(SetCommonRuntimeArgs(program, 0, illegal_common_runtime_args)); - } } diff --git a/tests/tt_metal/tt_metal/unit_tests/basic/initialize_semaphores.cpp b/tests/tt_metal/tt_metal/api/test_semaphores.cpp similarity index 94% rename from tests/tt_metal/tt_metal/unit_tests/basic/initialize_semaphores.cpp rename to tests/tt_metal/tt_metal/api/test_semaphores.cpp index 9be219332a0..7cb446a3648 100644 --- a/tests/tt_metal/tt_metal/unit_tests/basic/initialize_semaphores.cpp +++ b/tests/tt_metal/tt_metal/api/test_semaphores.cpp @@ -4,10 +4,6 @@ #include -#include -#include -#include - #include "device_fixture.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/detail/util.hpp" @@ -22,13 +18,13 @@ void 
initialize_program(tt_metal::Device *device, tt_metal::Program &program, co uint32_t single_tile_size = tt_metal::detail::TileSize(tt::DataFormat::Float16_b); uint32_t num_tiles = 2048; - uint32_t src0_cb_index = 0; + uint32_t src0_cb_index = tt::CBIndex::c_0; uint32_t num_input_tiles = 8; tt_metal::CircularBufferConfig cb_src0_config = tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size, {{src0_cb_index, tt::DataFormat::Float16_b}}) .set_page_size(src0_cb_index, single_tile_size); auto cb_src0 = tt_metal::CreateCircularBuffer(program, core_range, cb_src0_config); - uint32_t ouput_cb_index = 16; // output operands start at index 16 + uint32_t ouput_cb_index = tt::CBIndex::c_16; uint32_t num_output_tiles = 1; tt_metal::CircularBufferConfig cb_output_config = tt_metal::CircularBufferConfig(num_output_tiles * single_tile_size, {{ouput_cb_index, tt::DataFormat::Float16_b}}) .set_page_size(ouput_cb_index, single_tile_size); @@ -102,7 +98,7 @@ void try_creating_more_than_max_num_semaphores( } // namespace unit_tests::initialize_semaphores -TEST_F(DeviceFixture, InitializeLegalSemaphores) { +TEST_F(DeviceFixture, TensixInitializeLegalSemaphores) { for (unsigned int id = 0; id < num_devices_; id++) { tt_metal::Program program = tt_metal::CreateProgram(); CoreRange core_range({0, 0}, {1, 1}); @@ -111,7 +107,7 @@ TEST_F(DeviceFixture, InitializeLegalSemaphores) { } } -TEST_F(DeviceFixture, InitializeIllegalSemaphores) { +TEST_F(DeviceFixture, TensixInitializeIllegalSemaphores) { for (unsigned int id = 0; id < num_devices_; id++) { tt_metal::Program program = tt_metal::CreateProgram(); CoreRange core_range({0, 0}, {1, 1}); @@ -121,7 +117,7 @@ TEST_F(DeviceFixture, InitializeIllegalSemaphores) { } } -TEST_F(DeviceFixture, CreateMultipleSemaphoresOnSameCore) { +TEST_F(DeviceFixture, TensixCreateMultipleSemaphoresOnSameCore) { tt_metal::Program program = tt_metal::CreateProgram(); CoreCoord core0(0,0); diff --git 
a/tests/tt_metal/tt_metal/unit_tests/buffer/test_sharded_l1.cpp b/tests/tt_metal/tt_metal/api/test_sharded_l1_buffer.cpp similarity index 97% rename from tests/tt_metal/tt_metal/unit_tests/buffer/test_sharded_l1.cpp rename to tests/tt_metal/tt_metal/api/test_sharded_l1_buffer.cpp index 9b59692df8f..27c68e515de 100644 --- a/tests/tt_metal/tt_metal/unit_tests/buffer/test_sharded_l1.cpp +++ b/tests/tt_metal/tt_metal/api/test_sharded_l1_buffer.cpp @@ -4,16 +4,10 @@ #include "device_fixture.hpp" #include "gtest/gtest.h" -#include "tt_metal/common/logger.hpp" -#include "tt_metal/common/math.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/host_api.hpp" -#include "tt_metal/test_utils/comparison.hpp" -#include "tt_metal/test_utils/df/df.hpp" -#include "tt_metal/test_utils/print_helpers.hpp" -#include "tt_metal/test_utils/stimulus.hpp" #include "tt_metal/common/constants.hpp" -#include +#include "tt_metal/test_utils/stimulus.hpp" using namespace tt::tt_metal; diff --git a/tests/tt_metal/tt_metal/unit_tests/buffer/test_simple_dram_buffer.cpp b/tests/tt_metal/tt_metal/api/test_simple_dram_buffer.cpp similarity index 95% rename from tests/tt_metal/tt_metal/unit_tests/buffer/test_simple_dram_buffer.cpp rename to tests/tt_metal/tt_metal/api/test_simple_dram_buffer.cpp index 27eadb2448e..32c5e8e9255 100644 --- a/tests/tt_metal/tt_metal/unit_tests/buffer/test_simple_dram_buffer.cpp +++ b/tests/tt_metal/tt_metal/api/test_simple_dram_buffer.cpp @@ -4,14 +4,10 @@ #include "device_fixture.hpp" #include "gtest/gtest.h" -#include "test_buffer_utils.hpp" -#include "tt_metal/host_api.hpp" -#include "tt_metal/test_utils/comparison.hpp" -#include "tt_metal/test_utils/df/df.hpp" -#include "tt_metal/test_utils/print_helpers.hpp" +#include "buffer_test_utils.hpp" +#include "host_api.hpp" #include "tt_metal/test_utils/stimulus.hpp" - using tt::tt_metal::Device; using namespace tt::test_utils; using namespace tt::test::buffer::detail; diff --git 
a/tests/tt_metal/tt_metal/unit_tests/buffer/test_simple_l1_buffer.cpp b/tests/tt_metal/tt_metal/api/test_simple_l1_buffer.cpp similarity index 95% rename from tests/tt_metal/tt_metal/unit_tests/buffer/test_simple_l1_buffer.cpp rename to tests/tt_metal/tt_metal/api/test_simple_l1_buffer.cpp index 80146a83635..4c3cfbf3a11 100644 --- a/tests/tt_metal/tt_metal/unit_tests/buffer/test_simple_l1_buffer.cpp +++ b/tests/tt_metal/tt_metal/api/test_simple_l1_buffer.cpp @@ -4,12 +4,9 @@ #include "device_fixture.hpp" #include "gtest/gtest.h" -#include "test_buffer_utils.hpp" -#include "tt_metal/host_api.hpp" +#include "buffer_test_utils.hpp" +#include "host_api.hpp" #include "tt_metal/detail/tt_metal.hpp" -#include "tt_metal/test_utils/comparison.hpp" -#include "tt_metal/test_utils/df/df.hpp" -#include "tt_metal/test_utils/print_helpers.hpp" #include "tt_metal/test_utils/stimulus.hpp" @@ -160,7 +157,7 @@ TEST_F(DeviceFixture, TestSimpleL1BufferWriteOnlyHi) { } } -TEST_F(DeviceFixture, TestSimpleL1ReadWriteTileLo) { +TEST_F(DeviceFixture, TensixTestSimpleL1ReadWriteTileLo) { for (unsigned int id = 0; id < num_devices_; id++) { size_t lo_address = 768 * 1024; ASSERT_TRUE(SimpleTiledL1WriteCBRead( @@ -172,7 +169,7 @@ TEST_F(DeviceFixture, TestSimpleL1ReadWriteTileLo) { } } -TEST_F(DeviceFixture, TestSimpleL1ReadWriteTileHi) { +TEST_F(DeviceFixture, TensixTestSimpleL1ReadWriteTileHi) { for (unsigned int id = 0; id < num_devices_; id++) { size_t hi_address = this->devices_.at(id)->l1_size_per_core() - (24 * 1024); ASSERT_TRUE(SimpleTiledL1WriteCBRead( @@ -184,7 +181,7 @@ TEST_F(DeviceFixture, TestSimpleL1ReadWriteTileHi) { } } -TEST_F(DeviceFixture, TestSimpleL1ReadWritex2y2TileLo) { +TEST_F(DeviceFixture, TensixTestSimpleL1ReadWritex2y2TileLo) { for (unsigned int id = 0; id < num_devices_; id++) { size_t lo_address = 768 * 1024; ASSERT_TRUE(SimpleTiledL1WriteCBRead( @@ -196,7 +193,7 @@ TEST_F(DeviceFixture, TestSimpleL1ReadWritex2y2TileLo) { } } -TEST_F(DeviceFixture, 
TestSimpleL1ReadWritex2y2TileHi) { +TEST_F(DeviceFixture, TensixTestSimpleL1ReadWritex2y2TileHi) { for (unsigned int id = 0; id < num_devices_; id++) { size_t hi_address = this->devices_.at(id)->l1_size_per_core() - (24 * 1024); ASSERT_TRUE(SimpleTiledL1WriteCBRead( @@ -208,7 +205,7 @@ TEST_F(DeviceFixture, TestSimpleL1ReadWritex2y2TileHi) { } } -TEST_F(DeviceFixture, TestBufferL1ReadWriteTileLo) { +TEST_F(DeviceFixture, TensixTestBufferL1ReadWriteTileLo) { for (unsigned int id = 0; id < num_devices_; id++) { size_t lo_address = 768 * 1024; ASSERT_TRUE(SimpleTiledL1WriteCBRead( @@ -220,7 +217,7 @@ TEST_F(DeviceFixture, TestBufferL1ReadWriteTileLo) { } } -TEST_F(DeviceFixture, TestBufferL1ReadWriteTileHi) { +TEST_F(DeviceFixture, TensixTestBufferL1ReadWriteTileHi) { for (unsigned int id = 0; id < num_devices_; id++) { size_t hi_address = this->devices_.at(id)->l1_size_per_core() - (24 * 1024); ASSERT_TRUE(SimpleTiledL1WriteCBRead( diff --git a/tests/tt_metal/tt_metal/unit_tests/basic/test_soc_descriptor.cpp b/tests/tt_metal/tt_metal/api/test_soc_descriptor.cpp similarity index 95% rename from tests/tt_metal/tt_metal/unit_tests/basic/test_soc_descriptor.cpp rename to tests/tt_metal/tt_metal/api/test_soc_descriptor.cpp index 1fb0e630fd9..657f6996b23 100644 --- a/tests/tt_metal/tt_metal/unit_tests/basic/test_soc_descriptor.cpp +++ b/tests/tt_metal/tt_metal/api/test_soc_descriptor.cpp @@ -8,13 +8,10 @@ #include #include -#include "basic_fixture.hpp" #include "device_fixture.hpp" #include "tt_metal/detail/tt_metal.hpp" -#include "tt_metal/host_api.hpp" +#include "host_api.hpp" #include "tt_metal/test_utils/env_vars.hpp" -#include "tt_metal/test_utils/print_helpers.hpp" -#include "tt_metal/test_utils/stimulus.hpp" using namespace tt; using namespace tt::test_utils; @@ -43,7 +40,7 @@ namespace unit_tests::basic::soc_desc { // This test ensures that no logical core maps to a harvested row -TEST_F(BasicFixture, ValidateLogicalToPhysicalCoreCoordHostMapping) { +TEST(SOC, 
TensixValidateLogicalToPhysicalCoreCoordHostMapping) { size_t num_devices = tt_metal::GetNumAvailableDevices(); ASSERT_TRUE(num_devices > 0); tt::ARCH arch = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); @@ -67,7 +64,7 @@ TEST_F(BasicFixture, ValidateLogicalToPhysicalCoreCoordHostMapping) { } } -TEST_F(DeviceFixture, ValidateMetalSocDescriptors) { +TEST_F(DeviceFixture, TensixValidateMetalSocDescriptors) { for (chip_id_t device_id = 0; device_id < this->num_devices_; device_id++) { const metal_SocDescriptor &soc_desc = tt::Cluster::instance().get_soc_desc(device_id); diff --git a/tests/tt_metal/tt_metal/unit_tests/host_apis/test_tilize_untilize.cpp b/tests/tt_metal/tt_metal/api/test_tilize_untilize.cpp similarity index 84% rename from tests/tt_metal/tt_metal/unit_tests/host_apis/test_tilize_untilize.cpp rename to tests/tt_metal/tt_metal/api/test_tilize_untilize.cpp index 0c2379b4e19..040b69cc151 100644 --- a/tests/tt_metal/tt_metal/unit_tests/host_apis/test_tilize_untilize.cpp +++ b/tests/tt_metal/tt_metal/api/test_tilize_untilize.cpp @@ -4,7 +4,6 @@ #include #include -#include "tests/tt_metal/tt_metal/unit_tests/common/basic_fixture.hpp" #include "tt_metal/common/tilize_untilize.hpp" template @@ -35,7 +34,7 @@ void tilize_untilize_helper(uint max_num_batches, uint max_num_row_tiles, uint m } // The following run the tilize/untilize APIs and their inverses -TEST_F(BasicFixture, TestTilizeAndThenUntilizeBfloat16) { +TEST(Host, TestTilizeAndThenUntilizeBfloat16) { uint max_num_batches = 8; uint max_num_row_tiles = 8; uint max_num_col_tiles = 8; @@ -45,12 +44,12 @@ TEST_F(BasicFixture, TestTilizeAndThenUntilizeBfloat16) { tilize_untilize_helper(max_num_batches, max_num_row_tiles, max_num_col_tiles, TILE_HEIGHT, TILE_WIDTH); } -TEST_F(BasicFixture, TestTilizeThrowErrorForNonBfloat16DataType) { +TEST(Host, TestTilizeThrowErrorForNonBfloat16DataType) { std::vector vec(1024, 0); EXPECT_ANY_THROW(tilize(vec, 32, 32)); } -TEST_F(BasicFixture, 
TestTilizeThrowErrorForInvalidTileMandN) { +TEST(Host, TestTilizeThrowErrorForInvalidTileMandN) { // m and n are not divisible by tile size std::vector vec(16, 0); EXPECT_ANY_THROW(tilize(vec, 4, 4)); // m and n not divisible by 32 @@ -59,19 +58,19 @@ TEST_F(BasicFixture, TestTilizeThrowErrorForInvalidTileMandN) { EXPECT_ANY_THROW(tilize(vec, 0, 0)); } -TEST_F(BasicFixture, TestTilizeThrowErrorForInvalidVectorShape) { +TEST(Host, TestTilizeThrowErrorForInvalidVectorShape) { std::vector vec(16, 0); // Size not divisible by 1024 EXPECT_ANY_THROW(tilize(vec, 32, 32)); // m and n not divisible by 32 vec = {}; // Cannot have a zero vector either EXPECT_ANY_THROW(tilize(vec, 32, 32)); // m and n not divisible by 32 } -TEST_F(BasicFixture, TestUntilizeThrowErrorForNonBfloat16DataType) { +TEST(Host, TestUntilizeThrowErrorForNonBfloat16DataType) { std::vector vec(1024, 0); EXPECT_ANY_THROW(untilize(vec, 32, 32)); } -TEST_F(BasicFixture, TestUntilizeThrowErrorForInvalidTileMandN) { +TEST(Host, TestUntilizeThrowErrorForInvalidTileMandN) { // m and n are not divisible by tile side lengths std::vector vec(16, 0); EXPECT_ANY_THROW(untilize(vec, 4, 4)); @@ -80,14 +79,14 @@ TEST_F(BasicFixture, TestUntilizeThrowErrorForInvalidTileMandN) { EXPECT_ANY_THROW(untilize(vec, 0, 0)); } -TEST_F(BasicFixture, TestUntilizeThrowErrorForInvalidVectorShape) { +TEST(Host, TestUntilizeThrowErrorForInvalidVectorShape) { std::vector vec(16, 0); // Size not divisible by 1024 EXPECT_ANY_THROW(untilize(vec, 32, 32)); // m and n not divisible by 32 vec = {}; // Cannot have a zero vector either EXPECT_ANY_THROW(untilize(vec, 32, 32)); // m and n not divisible by 32 } -TEST_F(BasicFixture, TestUntilizeAndThenTilizeBfloat16) { +TEST(Host, TestUntilizeAndThenTilizeBfloat16) { uint max_num_batches = 8; uint max_num_row_tiles = 8; uint max_num_col_tiles = 8; diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_worker_config_buffer.cpp 
b/tests/tt_metal/tt_metal/api/test_worker_config_buffer.cpp similarity index 87% rename from tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_worker_config_buffer.cpp rename to tests/tt_metal/tt_metal/api/test_worker_config_buffer.cpp index e0c4083eced..3c3a31cf314 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_worker_config_buffer.cpp +++ b/tests/tt_metal/tt_metal/api/test_worker_config_buffer.cpp @@ -1,18 +1,16 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. // // SPDX-License-Identifier: Apache-2.0 -#include - #include "gtest/gtest.h" #include "tt_metal/impl/dispatch/worker_config_buffer.hpp" using std::vector; using namespace tt::tt_metal; -namespace working_config_buffer_tests { +namespace worker_config_buffer_tests { -TEST(WorkingConfigBuffer, MarkCompletelyFull) { +TEST(WorkerConfigBuffer, MarkCompletelyFull) { WorkerConfigBufferMgr mgr; mgr.init_add_buffer(1024, 1024); mgr.init_add_buffer(2, 1024); @@ -56,4 +54,4 @@ TEST(WorkerConfigBuffer, SmallSize) { } } -} // namespace working_config_buffer_tests +} // namespace worker_config_buffer_tests diff --git a/tests/tt_metal/tt_metal/common/command_queue_fixture.hpp b/tests/tt_metal/tt_metal/common/command_queue_fixture.hpp new file mode 100644 index 00000000000..a3a18fdc229 --- /dev/null +++ b/tests/tt_metal/tt_metal/common/command_queue_fixture.hpp @@ -0,0 +1,161 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "gtest/gtest.h" +#include "dispatch_fixture.hpp" +#include "hostdevcommon/common_values.hpp" +#include "impl/device/device.hpp" +#include "tt_cluster_descriptor_types.h" +#include "tt_metal/host_api.hpp" +#include "tt_metal/detail/tt_metal.hpp" +#include "tt_metal/test_utils/env_vars.hpp" +#include "tt_metal/impl/kernels/kernel.hpp" +#include "tt_metal/common/tt_backend_api_types.hpp" +#include "tt_metal/llrt/rtoptions.hpp" + +class CommandQueueFixture : public DispatchFixture { + protected: + tt::tt_metal::Device *device_; + void SetUp() override { + this->validate_dispatch_mode(); + this->arch_ = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); + this->create_device(); + } + + void TearDown() override { + if (!this->IsSlowDispatch()) { + tt::tt_metal::CloseDevice(this->device_); + } + } + + void validate_dispatch_mode() { + this->slow_dispatch_ = false; + auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); + if (slow_dispatch) { + tt::log_info( + tt::LogTest, "This suite can only be run with fast dispatch or TT_METAL_SLOW_DISPATCH_MODE unset"); + this->slow_dispatch_ = true; + GTEST_SKIP(); + } + } + + void create_device(const size_t trace_region_size = DEFAULT_TRACE_REGION_SIZE) { + const chip_id_t device_id = 0; + const auto &dispatch_core_type = tt::llrt::OptionsG.get_dispatch_core_type(); + this->device_ = + tt::tt_metal::CreateDevice(device_id, 1, DEFAULT_L1_SMALL_SIZE, trace_region_size, dispatch_core_type); + } +}; + +class CommandQueueEventFixture : public CommandQueueFixture {}; + +class CommandQueueBufferFixture : public CommandQueueFixture {}; + +class CommandQueueProgramFixture : public CommandQueueFixture {}; + +class CommandQueueTraceFixture : public CommandQueueFixture { + protected: + void SetUp() override { + this->validate_dispatch_mode(); + this->arch_ = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); + } + + void CreateDevice(const 
size_t trace_region_size) { + this->create_device(trace_region_size); + } +}; + +class CommandQueueSingleCardFixture : virtual public DispatchFixture { + protected: + void SetUp() override { + this->validate_dispatch_mode(); + this->arch_ = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); + this->create_devices(); + } + + void TearDown() override { tt::tt_metal::detail::CloseDevices(reserved_devices_); } + + void validate_dispatch_mode() { + this->slow_dispatch_ = false; + auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); + if (slow_dispatch) { + tt::log_info( + tt::LogTest, "This suite can only be run with fast dispatch or TT_METAL_SLOW_DISPATCH_MODE unset"); + this->slow_dispatch_ = false; + GTEST_SKIP(); + } + } + + void create_devices(const std::size_t trace_region_size = DEFAULT_TRACE_REGION_SIZE) { + const auto &dispatch_core_type = tt::llrt::OptionsG.get_dispatch_core_type(); + const chip_id_t mmio_device_id = 0; + this->reserved_devices_ = tt::tt_metal::detail::CreateDevices( + {mmio_device_id}, 1, DEFAULT_L1_SMALL_SIZE, trace_region_size, dispatch_core_type); + auto enable_remote_chip = getenv("TT_METAL_ENABLE_REMOTE_CHIP"); + if (enable_remote_chip) { + for (const auto &[id, device] : this->reserved_devices_) { + this->devices_.push_back(device); + } + } else { + this->devices_.push_back(this->reserved_devices_.at(mmio_device_id)); + } + } + + std::vector devices_; + std::map reserved_devices_; +}; + +class CommandQueueSingleCardBufferFixture : public CommandQueueSingleCardFixture {}; + +class CommandQueueSingleCardTraceFixture : virtual public CommandQueueSingleCardFixture { + protected: + void SetUp() override { + this->validate_dispatch_mode(); + this->arch_ = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); + this->create_devices(90000000); + } +}; + +class CommandQueueSingleCardProgramFixture : virtual public CommandQueueSingleCardFixture {}; + +class CommandQueueMultiDeviceFixture : public DispatchFixture { + 
protected: + void SetUp() override { + this->slow_dispatch_ = false; + auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); + if (slow_dispatch) { + tt::log_info(tt::LogTest, "This suite can only be run with fast dispatch or TT_METAL_SLOW_DISPATCH_MODE unset"); + this->slow_dispatch_ = true; + GTEST_SKIP(); + } + + arch_ = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); + + num_devices_ = tt::tt_metal::GetNumAvailableDevices(); + if (num_devices_ < 2 ) { + GTEST_SKIP(); + } + + std::vector chip_ids; + for (unsigned int id = 0; id < num_devices_; id++) { + chip_ids.push_back(id); + } + + const auto &dispatch_core_type = tt::llrt::OptionsG.get_dispatch_core_type(); + reserved_devices_ = tt::tt_metal::detail::CreateDevices(chip_ids, 1, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, dispatch_core_type); + for (const auto &[id, device] : reserved_devices_) { + devices_.push_back(device); + } + } + + void TearDown() override { tt::tt_metal::detail::CloseDevices(reserved_devices_); } + + std::vector devices_; + std::map reserved_devices_; + size_t num_devices_; +}; + +class CommandQueueMultiDeviceProgramFixture : public CommandQueueMultiDeviceFixture {}; diff --git a/tests/tt_metal/tt_metal/common/device_fixture.hpp b/tests/tt_metal/tt_metal/common/device_fixture.hpp new file mode 100644 index 00000000000..c0b086f07c8 --- /dev/null +++ b/tests/tt_metal/tt_metal/common/device_fixture.hpp @@ -0,0 +1,102 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include + +#include "dispatch_fixture.hpp" +#include "tt_metal/host_api.hpp" +#include "tt_metal/detail/tt_metal.hpp" +#include "tt_metal/test_utils/env_vars.hpp" +#include "tt_metal/impl/device/device_pool.hpp" + +class DeviceFixture : public DispatchFixture { + protected: + void SetUp() override { + this->validate_dispatch_mode(); + this->arch_ = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); + + // Some CI machines have lots of cards, running all tests on all cards is slow + // Coverage for multidevices is decent if we just confirm 2 work + this->num_devices_ = tt::tt_metal::GetNumAvailableDevices(); + if (arch_ == tt::ARCH::GRAYSKULL && num_devices_ > 2) { + this->num_devices_ = 2; + } + + std::vector ids; + for (unsigned int id = 0; id < num_devices_; id++) { + ids.push_back(id); + } + this->create_devices(ids); + } + + void validate_dispatch_mode() { + this->slow_dispatch_ = true; + auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); + if (!slow_dispatch) { + tt::log_info( + tt::LogTest, "This suite can only be run with fast dispatch or TT_METAL_SLOW_DISPATCH_MODE set"); + this->slow_dispatch_ = false; + GTEST_SKIP(); + } + } + + void create_devices(const std::vector& device_ids) { + const auto &dispatch_core_type = tt::llrt::OptionsG.get_dispatch_core_type(); + tt::DevicePool::initialize(device_ids, 1, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, dispatch_core_type); + this->devices_ = tt::DevicePool::instance().get_all_active_devices(); + this->num_devices_ = this->devices_.size(); + } + + size_t num_devices_; +}; + +class DeviceSingleCardFixture : public DispatchFixture { + protected: + void SetUp() override { + this->validate_dispatch_mode(); + this->arch_ = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); + this->create_devices(); + } + + void TearDown() override { tt::tt_metal::detail::CloseDevices(reserved_devices_); } + + void 
validate_dispatch_mode() { + this->slow_dispatch_ = true; + auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); + if (!slow_dispatch) { + tt::log_info( + tt::LogTest, "This suite can only be run with fast dispatch or TT_METAL_SLOW_DISPATCH_MODE set"); + this->slow_dispatch_ = false; + GTEST_SKIP(); + } + } + + void create_devices() { + const chip_id_t mmio_device_id = 0; + this->reserved_devices_ = tt::tt_metal::detail::CreateDevices({mmio_device_id}); + this->device_ = this->reserved_devices_.at(mmio_device_id); + this->devices_ = tt::DevicePool::instance().get_all_active_devices(); + this->num_devices_ = this->reserved_devices_.size(); + } + + tt::tt_metal::Device *device_; + std::map reserved_devices_; + size_t num_devices_; +}; + +class DeviceSingleCardBufferFixture : public DeviceSingleCardFixture {}; + +class BlackholeSingleCardFixture : public DeviceSingleCardFixture { + protected: + void SetUp() override { + this->validate_dispatch_mode(); + this->arch_ = tt::get_arch_from_string(tt::test_utils::get_env_arch_name()); + if (this->arch_ != tt::ARCH::BLACKHOLE) { + GTEST_SKIP(); + } + this->create_devices(); + } +}; diff --git a/tests/tt_metal/tt_metal/unit_tests_common/common/common_fixture.hpp b/tests/tt_metal/tt_metal/common/dispatch_fixture.hpp similarity index 88% rename from tests/tt_metal/tt_metal/unit_tests_common/common/common_fixture.hpp rename to tests/tt_metal/tt_metal/common/dispatch_fixture.hpp index 1b1b4d6104f..546311661f6 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/common/common_fixture.hpp +++ b/tests/tt_metal/tt_metal/common/dispatch_fixture.hpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
// // SPDX-License-Identifier: Apache-2.0 @@ -14,16 +14,24 @@ #include "tt_metal/impl/device/device_pool.hpp" // A dispatch-agnostic test fixture -class CommonFixture: public ::testing::Test { -public: +class DispatchFixture : public ::testing::Test { + public: // A function to run a program, according to which dispatch mode is set. - void RunProgram(tt::tt_metal::Device* device, tt::tt_metal::Program& program) { + void RunProgram(tt::tt_metal::Device* device, tt::tt_metal::Program& program, const bool skip_finish = false) { const uint64_t program_id = program.get_id(); if (this->slow_dispatch_) { tt::tt_metal::detail::LaunchProgram(device, program); } else { tt::tt_metal::CommandQueue& cq = device->command_queue(); tt::tt_metal::EnqueueProgram(cq, program, false); + if (!skip_finish) { + tt::tt_metal::Finish(cq); + } + } + } + void FinishCommands(tt::tt_metal::Device* device) { + if (!this->IsSlowDispatch()) { + tt::tt_metal::CommandQueue& cq = device->command_queue(); tt::tt_metal::Finish(cq); } } @@ -51,26 +59,12 @@ class CommonFixture: public ::testing::Test { tt::ARCH arch_; std::vector devices_; bool slow_dispatch_; - bool has_remote_devices_; void SetUp() override { - // Skip for slow dispatch for now - auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); - if (slow_dispatch) { - tt::log_info(tt::LogTest, "Running test using Slow Dispatch"); - slow_dispatch_ = true; - } else { - tt::log_info(tt::LogTest, "Running test using Fast Dispatch"); - slow_dispatch_ = false; - } - + this->DetectDispatchMode(); // Set up all available devices this->arch_ = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); auto num_devices = tt::tt_metal::GetNumAvailableDevices(); - auto num_pci_devices = tt::tt_metal::GetNumPCIeDevices(); - // An extra flag for if we have remote devices, as some tests are disabled for fast - // dispatch + remote devices. 
- this->has_remote_devices_ = num_devices > num_pci_devices; std::vector ids; for (unsigned int id = 0; id < num_devices; id++) { if (SkipTest(id)) @@ -118,4 +112,15 @@ class CommonFixture: public ::testing::Test { run_function(); log_info(tt::LogTest, "Finished running test on device {}.", device->id()); } + + void DetectDispatchMode() { + auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); + if (slow_dispatch) { + tt::log_info(tt::LogTest, "Running test using Slow Dispatch"); + this->slow_dispatch_ = true; + } else { + tt::log_info(tt::LogTest, "Running test using Fast Dispatch"); + this->slow_dispatch_ = false; + } + } }; diff --git a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/matmul_utils.hpp b/tests/tt_metal/tt_metal/common/matmul_test_utils.hpp similarity index 97% rename from tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/matmul_utils.hpp rename to tests/tt_metal/tt_metal/common/matmul_test_utils.hpp index 4eecec45c61..f1cc9436ccb 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/matmul_utils.hpp +++ b/tests/tt_metal/tt_metal/common/matmul_test_utils.hpp @@ -4,14 +4,9 @@ #pragma once -#include -#include -#include - -#include "tt_metal/host_api.hpp" +#include "host_api.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "common/bfloat16.hpp" -#include "tt_metal/test_utils/deprecated/tensor.hpp" #include "tt_metal/common/test_tiles.hpp" #include "hostdevcommon/common_values.hpp" #include "tt_metal/impl/dispatch/command_queue.hpp" diff --git a/tests/tt_metal/tt_metal/common/multi_device_fixture.hpp b/tests/tt_metal/tt_metal/common/multi_device_fixture.hpp new file mode 100644 index 00000000000..7f0b4c7a17d --- /dev/null +++ b/tests/tt_metal/tt_metal/common/multi_device_fixture.hpp @@ -0,0 +1,50 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include + +#include "host_api.hpp" +#include "dispatch_fixture.hpp" +#include "tt_cluster_descriptor_types.h" +#include "tt_metal/test_utils/env_vars.hpp" +#include "tt_metal/impl/device/device_pool.hpp" + +class MultiDeviceFixture : public DispatchFixture { + protected: + void SetUp() override { + this->arch_ = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); + } +}; + +class N300DeviceFixture : public MultiDeviceFixture { + protected: + void SetUp() override { + this->slow_dispatch_ = true; + auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); + if (!slow_dispatch) { + tt::log_info(tt::LogTest, "This suite can only be run with TT_METAL_SLOW_DISPATCH_MODE set"); + this->slow_dispatch_ = false; + GTEST_SKIP(); + } + + MultiDeviceFixture::SetUp(); + + const size_t num_devices = tt::tt_metal::GetNumAvailableDevices(); + const size_t num_pci_devices = tt::tt_metal::GetNumPCIeDevices(); + if (this->arch_ == tt::ARCH::WORMHOLE_B0 && num_devices == 2 && num_pci_devices == 1) { + std::vector ids; + for (chip_id_t id = 0; id < num_devices; id++) { + ids.push_back(id); + } + + const auto &dispatch_core_type = tt::llrt::OptionsG.get_dispatch_core_type(); + tt::DevicePool::initialize(ids, 1, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, dispatch_core_type); + this->devices_ = tt::DevicePool::instance().get_all_active_devices(); + } else { + GTEST_SKIP(); + } + } +}; diff --git a/tests/tt_metal/tt_metal/unit_tests_common/CMakeLists.txt b/tests/tt_metal/tt_metal/debug_tools/CMakeLists.txt similarity index 50% rename from tests/tt_metal/tt_metal/unit_tests_common/CMakeLists.txt rename to tests/tt_metal/tt_metal/debug_tools/CMakeLists.txt index 9f4257918f4..9a445e323d4 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/CMakeLists.txt +++ b/tests/tt_metal/tt_metal/debug_tools/CMakeLists.txt @@ -1,14 +1,4 @@ -set(UNIT_TESTS_COMMON_SRC - 
${CMAKE_CURRENT_SOURCE_DIR}/basic/test_device_init.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/common/test_bit_utils.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/common/test_dispatch.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/compute/test_flatten.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/compute/matmul/test_matmul_large_block.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/compute/matmul/test_matmul_multi_core_multi_dram_in0_mcast_in1_mcast.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/compute/matmul/test_matmul_multi_core_multi_dram_inX_mcast.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/compute/matmul/test_matmul_multi_core_X_dram.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/compute/matmul/test_matmul_single_core.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/compute/matmul/test_matmul_X_tile.cpp +set(UNIT_TESTS_DEBUG_TOOLS_SRC ${CMAKE_CURRENT_SOURCE_DIR}/dprint/test_eth_cores.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dprint/test_invalid_print_core.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dprint/test_mute_device.cpp @@ -17,36 +7,35 @@ set(UNIT_TESTS_COMMON_SRC ${CMAKE_CURRENT_SOURCE_DIR}/dprint/test_print_before_finish.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dprint/test_print_hanging.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dprint/test_print_tensix_dest.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dprint/test_raise_wait.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dprint/test_print_tiles.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dram/test_dram_to_l1_multicast.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dram/test_dram.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dprint/test_raise_wait.cpp ${CMAKE_CURRENT_SOURCE_DIR}/watcher/test_assert.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/watcher/test_noc_sanitize.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/watcher/test_link_training.cpp ${CMAKE_CURRENT_SOURCE_DIR}/watcher/test_noc_sanitize_delays.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/watcher/test_noc_sanitize.cpp ${CMAKE_CURRENT_SOURCE_DIR}/watcher/test_pause.cpp ${CMAKE_CURRENT_SOURCE_DIR}/watcher/test_ringbuf.cpp ${CMAKE_CURRENT_SOURCE_DIR}/watcher/test_waypoint.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/watcher/test_link_training.cpp -) -add_library(unit_tests_common_o STATIC 
${UNIT_TESTS_COMMON_SRC}) -TT_ENABLE_UNITY_BUILD(unit_tests_common_o) -target_link_libraries( - unit_tests_common_o - PUBLIC - gtest - gtest_main - magic_enum - fmt::fmt-header-only - span ) + +add_executable(unit_tests_debug_tools ${UNIT_TESTS_DEBUG_TOOLS_SRC}) +TT_ENABLE_UNITY_BUILD(unit_tests_debug_tools) + +target_link_libraries(unit_tests_debug_tools PUBLIC test_metal_common_libs) target_include_directories( - unit_tests_common_o - PUBLIC - $ + unit_tests_debug_tools + PRIVATE ${PROJECT_SOURCE_DIR} ${PROJECT_SOURCE_DIR}/tt_metal ${PROJECT_SOURCE_DIR}/tt_metal/common ${PROJECT_SOURCE_DIR}/tests + ${PROJECT_SOURCE_DIR}/tests/tt_metal/tt_metal/common + ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/common ) +set_target_properties( + unit_tests_debug_tools + PROPERTIES + RUNTIME_OUTPUT_DIRECTORY + ${PROJECT_BINARY_DIR}/test/tt_metal +) diff --git a/tests/tt_metal/tt_metal/unit_tests_common/common/watcher_fixture.hpp b/tests/tt_metal/tt_metal/debug_tools/debug_tools_fixture.hpp similarity index 53% rename from tests/tt_metal/tt_metal/unit_tests_common/common/watcher_fixture.hpp rename to tests/tt_metal/tt_metal/debug_tools/debug_tools_fixture.hpp index 9d74f94942d..f8189d9c98e 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/common/watcher_fixture.hpp +++ b/tests/tt_metal/tt_metal/debug_tools/debug_tools_fixture.hpp @@ -4,14 +4,110 @@ #pragma once -#include -#include -#include "common_fixture.hpp" -#include "impl/debug/watcher_server.hpp" -#include "llrt/rtoptions.hpp" - -// A version of CommonFixture with watcher enabled -class WatcherFixture: public CommonFixture { +#include +#include "debug/watcher_server.hpp" +#include "dispatch_fixture.hpp" +#include "tt_metal/tt_metal/common/dispatch_fixture.hpp" + +class DebugToolsFixture : public DispatchFixture { + protected: + bool watcher_previous_enabled; + + void TearDown() override { + DispatchFixture::TearDown(); + tt::llrt::OptionsG.set_watcher_enabled(watcher_previous_enabled); + } + + template + 
void RunTestOnDevice(const std::function& run_function, Device* device) { + auto run_function_no_args = [=]() { run_function(static_cast(this), device); }; + DispatchFixture::RunTestOnDevice(run_function_no_args, device); + } +}; + +// A version of DispatchFixture with DPrint enabled on all cores. +class DPrintFixture : public DebugToolsFixture { +public: + inline static const string dprint_file_name = "gtest_dprint_log.txt"; + + // A function to run a program, according to which dispatch mode is set. + void RunProgram(Device* device, Program& program) { + // Only difference is that we need to wait for the print server to catch + // up after running a test. + DebugToolsFixture::RunProgram(device, program); + tt::DprintServerAwait(); + } + +protected: + // Running with dprint + watcher enabled can make the code size blow up, so let's force watcher + // disabled for DPRINT tests. + void SetUp() override { + // The core range (physical) needs to be set >= the set of all cores + // used by all tests using this fixture, so set dprint enabled for + // all cores and all devices + tt::llrt::OptionsG.set_feature_enabled(tt::llrt::RunTimeDebugFeatureDprint, true); + tt::llrt::OptionsG.set_feature_all_cores( + tt::llrt::RunTimeDebugFeatureDprint, CoreType::WORKER, tt::llrt::RunTimeDebugClassWorker); + tt::llrt::OptionsG.set_feature_all_cores( + tt::llrt::RunTimeDebugFeatureDprint, CoreType::ETH, tt::llrt::RunTimeDebugClassWorker); + tt::llrt::OptionsG.set_feature_all_chips(tt::llrt::RunTimeDebugFeatureDprint, true); + // Send output to a file so the test can check after program is run. 
+ tt::llrt::OptionsG.set_feature_file_name(tt::llrt::RunTimeDebugFeatureDprint, dprint_file_name); + tt::llrt::OptionsG.set_test_mode_enabled(true); + watcher_previous_enabled = tt::llrt::OptionsG.get_watcher_enabled(); + tt::llrt::OptionsG.set_watcher_enabled(false); + + ExtraSetUp(); + + // Parent class initializes devices and any necessary flags + DebugToolsFixture::SetUp(); + } + + void TearDown() override { + // Parent class tears down devices + DebugToolsFixture::TearDown(); + + // Remove the DPrint output file after the test is finished. + std::remove(dprint_file_name.c_str()); + + // Reset DPrint settings + tt::llrt::OptionsG.set_feature_cores(tt::llrt::RunTimeDebugFeatureDprint, {}); + tt::llrt::OptionsG.set_feature_enabled(tt::llrt::RunTimeDebugFeatureDprint, false); + tt::llrt::OptionsG.set_feature_all_cores( + tt::llrt::RunTimeDebugFeatureDprint, CoreType::WORKER, tt::llrt::RunTimeDebugClassNoneSpecified); + tt::llrt::OptionsG.set_feature_all_cores( + tt::llrt::RunTimeDebugFeatureDprint, CoreType::ETH, tt::llrt::RunTimeDebugClassNoneSpecified); + tt::llrt::OptionsG.set_feature_all_chips(tt::llrt::RunTimeDebugFeatureDprint, false); + tt::llrt::OptionsG.set_feature_file_name(tt::llrt::RunTimeDebugFeatureDprint, ""); + tt::llrt::OptionsG.set_test_mode_enabled(false); + } + + void RunTestOnDevice( + const std::function& run_function, + Device* device + ) { + DebugToolsFixture::RunTestOnDevice(run_function, device); + tt::DPrintServerClearLogFile(); + tt::DPrintServerClearSignals(); + } + + // Override this function in child classes for additional setup commands between DPRINT setup + // and device creation. + virtual void ExtraSetUp() {} +}; + +// For usage by tests that need the dprint server devices disabled. 
+class DPrintDisableDevicesFixture : public DPrintFixture { +protected: + void ExtraSetUp() override { + // For this test, mute each devices using the environment variable + tt::llrt::OptionsG.set_feature_all_chips(tt::llrt::RunTimeDebugFeatureDprint, false); + tt::llrt::OptionsG.set_feature_chip_ids(tt::llrt::RunTimeDebugFeatureDprint, {}); + } +}; + +// A version of DispatchFixture with watcher enabled +class WatcherFixture : public DebugToolsFixture { public: inline static const string log_file_name = "generated/watcher/watcher.log"; inline static const int interval_ms = 250; @@ -20,7 +116,7 @@ class WatcherFixture: public CommonFixture { void RunProgram(Device* device, Program& program, bool wait_for_dump = false) { // Only difference is that we need to wait for the print server to catch // up after running a test. - CommonFixture::RunProgram(device, program); + DebugToolsFixture::RunProgram(device, program); // Wait for watcher to run a full dump before finishing, need to wait for dump count to // increase because we'll likely check in the middle of a dump. 
@@ -31,7 +127,6 @@ class WatcherFixture: public CommonFixture { } protected: - bool watcher_previous_enabled; int watcher_previous_interval; bool watcher_previous_dump_all; bool watcher_previous_append; @@ -57,15 +152,14 @@ class WatcherFixture: public CommonFixture { tt::watcher_clear_log(); // Parent class initializes devices and any necessary flags - CommonFixture::SetUp(); + DebugToolsFixture::SetUp(); } void TearDown() override { // Parent class tears down devices - CommonFixture::TearDown(); + DebugToolsFixture::TearDown(); // Reset watcher settings to their previous values - tt::llrt::OptionsG.set_watcher_enabled(watcher_previous_enabled); tt::llrt::OptionsG.set_watcher_interval(watcher_previous_interval); tt::llrt::OptionsG.set_watcher_dump_all(watcher_previous_dump_all); tt::llrt::OptionsG.set_watcher_append(watcher_previous_append); @@ -79,10 +173,7 @@ class WatcherFixture: public CommonFixture { const std::function& run_function, Device* device ) { - auto run_function_no_args = [=]() { - run_function(this, device); - }; - CommonFixture::RunTestOnDevice(run_function_no_args, device); + DebugToolsFixture::RunTestOnDevice(run_function, device); // Wait for a final watcher poll and then clear the log. 
std::this_thread::sleep_for(std::chrono::milliseconds(interval_ms)); tt::watcher_clear_log(); diff --git a/tests/tt_metal/tt_metal/unit_tests_common/common/test_utils.hpp b/tests/tt_metal/tt_metal/debug_tools/debug_tools_test_utils.hpp similarity index 75% rename from tests/tt_metal/tt_metal/unit_tests_common/common/test_utils.hpp rename to tests/tt_metal/tt_metal/debug_tools/debug_tools_test_utils.hpp index e7074237636..64359886fd8 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/common/test_utils.hpp +++ b/tests/tt_metal/tt_metal/debug_tools/debug_tools_test_utils.hpp @@ -3,61 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 #pragma once -#include -#include -#include "impl/kernels/kernel.hpp" -inline std::pair, std::vector> create_runtime_args( - const uint32_t num_unique_rt_args, - const uint32_t num_common_rt_args, - const uint32_t unique_base, - const uint32_t common_base) { - TT_FATAL( - num_unique_rt_args + num_common_rt_args <= tt::tt_metal::max_runtime_args, - "Number of unique runtime args and common runtime args exceeds the maximum limit of {} runtime args", - tt::tt_metal::max_runtime_args); - - std::vector common_rt_args; - for (uint32_t i = 0; i < num_common_rt_args; i++) { - common_rt_args.push_back(common_base + i); - } - - std::vector unique_rt_args; - for (uint32_t i = 0; i < num_unique_rt_args; i++) { - unique_rt_args.push_back(unique_base + i); - } - - return std::make_pair(unique_rt_args, common_rt_args); -} - -// Create randomly sized pair of unique and common runtime args vectors, with careful not to exceed max between the two. -// Optionally force the max size for one of the vectors. -inline std::pair, std::vector> create_runtime_args( - const bool force_max_size = false, const uint32_t unique_base = 0, const uint32_t common_base = 100) { - uint32_t num_rt_args_unique = rand() % (tt::tt_metal::max_runtime_args + 1); - uint32_t num_rt_args_common = - num_rt_args_unique < tt::tt_metal::max_runtime_args ? 
rand() % (tt::tt_metal::max_runtime_args - num_rt_args_unique + 1) : 0; - - if (force_max_size) { - if (rand() % 2) { - num_rt_args_unique = tt::tt_metal::max_runtime_args; - num_rt_args_common = 0; - } else { - num_rt_args_common = tt::tt_metal::max_runtime_args; - num_rt_args_unique = 0; - } - } - - log_trace( - tt::LogTest, - "{} - num_rt_args_unique: {} num_rt_args_common: {} force_max_size: {}", - __FUNCTION__, - num_rt_args_unique, - num_rt_args_common, - force_max_size); - - return create_runtime_args(num_rt_args_unique, num_rt_args_common, unique_base, common_base); -} +#include "host_api.hpp" // Helper function to open a file as an fstream, and check that it was opened properly. inline bool OpenFile(string &file_name, std::fstream &file_stream, std::ios_base::openmode mode) { diff --git a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_eth_cores.cpp b/tests/tt_metal/tt_metal/debug_tools/dprint/test_eth_cores.cpp similarity index 95% rename from tests/tt_metal/tt_metal/unit_tests_common/dprint/test_eth_cores.cpp rename to tests/tt_metal/tt_metal/debug_tools/dprint/test_eth_cores.cpp index 38ece0f5ca0..e97d6eef743 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_eth_cores.cpp +++ b/tests/tt_metal/tt_metal/debug_tools/dprint/test_eth_cores.cpp @@ -2,9 +2,9 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "dprint_fixture.hpp" +#include "debug_tools_fixture.hpp" #include "gtest/gtest.h" -#include "test_utils.hpp" +#include "debug_tools_test_utils.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/host_api.hpp" @@ -81,7 +81,7 @@ static void RunTest(DPrintFixture* fixture, Device* device, bool active) { } } -TEST_F(DPrintFixture, TestPrintEthCores) { +TEST_F(DPrintFixture, ActiveEthTestPrint) { for (Device* device : this->devices_) { // Skip if no ethernet cores on this device if (device->get_active_ethernet_cores(true).size() == 0) { @@ -96,7 +96,7 @@ TEST_F(DPrintFixture, TestPrintEthCores) { ); } } 
-TEST_F(DPrintFixture, TestPrintIEthCores) { +TEST_F(DPrintFixture, IdleEthTestPrint) { if (!this->IsSlowDispatch()) { log_info(tt::LogTest, "FD-on-idle-eth not supported."); GTEST_SKIP(); diff --git a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_invalid_print_core.cpp b/tests/tt_metal/tt_metal/debug_tools/dprint/test_invalid_print_core.cpp similarity index 65% rename from tests/tt_metal/tt_metal/unit_tests_common/dprint/test_invalid_print_core.cpp rename to tests/tt_metal/tt_metal/debug_tools/dprint/test_invalid_print_core.cpp index 11b89c90dfe..47ba5193765 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_invalid_print_core.cpp +++ b/tests/tt_metal/tt_metal/debug_tools/dprint/test_invalid_print_core.cpp @@ -2,9 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 #include "gtest/gtest.h" +#include "debug_tools_fixture.hpp" #include "tt_metal/host_api.hpp" -#include "tt_metal/test_utils/env_vars.hpp" -#include "tt_metal/impl/dispatch/command_queue.hpp" #include "tt_metal/llrt/rtoptions.hpp" ////////////////////////////////////////////////////////////////////////////////////////// @@ -12,22 +11,17 @@ ////////////////////////////////////////////////////////////////////////////////////////// using namespace tt::tt_metal; -TEST(DPrintErrorChecking, TestPrintInvalidCore) { +TEST_F(DPrintFixture, TensixTestPrintInvalidCore) { // Set DPRINT enabled on a mix of invalid and valid cores. Previously this would hang during // device setup, but not the print server should simply ignore the invalid cores. 
std::map> dprint_cores; dprint_cores[CoreType::WORKER] = {{0, 0}, {1, 1}, {100, 100}}; tt::llrt::OptionsG.set_feature_cores(tt::llrt::RunTimeDebugFeatureDprint, dprint_cores); - tt::llrt::OptionsG.set_feature_enabled(tt::llrt::RunTimeDebugFeatureDprint, true); - - const int device_id = 0; - Device* device = nullptr; - const auto &dispatch_core_type = tt::llrt::OptionsG.get_dispatch_core_type(); - device = tt::tt_metal::CreateDevice(device_id, tt::llrt::OptionsG.get_num_hw_cqs(), DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, dispatch_core_type); // We expect that even though illegal worker cores were requested, device setup did not hang. // So just make sure that device setup worked and then close the device. - EXPECT_TRUE(device != nullptr); + for (Device* device : this->devices_) { + EXPECT_TRUE(device != nullptr); + } tt::llrt::OptionsG.set_feature_enabled(tt::llrt::RunTimeDebugFeatureDprint, false); - tt::tt_metal::CloseDevice(device); } diff --git a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_mute_device.cpp b/tests/tt_metal/tt_metal/debug_tools/dprint/test_mute_device.cpp similarity index 93% rename from tests/tt_metal/tt_metal/unit_tests_common/dprint/test_mute_device.cpp rename to tests/tt_metal/tt_metal/debug_tools/dprint/test_mute_device.cpp index 8440e242ac6..7a8b4aa2822 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_mute_device.cpp +++ b/tests/tt_metal/tt_metal/debug_tools/dprint/test_mute_device.cpp @@ -2,9 +2,9 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "dprint_fixture.hpp" +#include "debug_tools_fixture.hpp" #include "common/bfloat16.hpp" -#include "test_utils.hpp" +#include "debug_tools_test_utils.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/host_api.hpp" @@ -48,7 +48,7 @@ static void RunTest(DPrintFixture* fixture, Device* device) { Program program = Program(); // Create a CB for testing TSLICE, dimensions are 32x32 bfloat16s - constexpr uint32_t src0_cb_index = CB::c_in0; + constexpr 
uint32_t src0_cb_index = CBIndex::c_0; constexpr uint32_t buffer_size = 32*32*sizeof(bfloat16); CircularBufferConfig cb_src0_config = CircularBufferConfig( buffer_size, @@ -77,7 +77,7 @@ static void RunTest(DPrintFixture* fixture, Device* device) { } } -TEST_F(DPrintFixtureDisableDevices, TestPrintMuteDevice) { +TEST_F(DPrintDisableDevicesFixture, TensixTestPrintMuteDevice) { for (Device* device : this->devices_) { this->RunTestOnDevice(CMAKE_UNIQUE_NAMESPACE::RunTest, device); } diff --git a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_mute_print_server.cpp b/tests/tt_metal/tt_metal/debug_tools/dprint/test_mute_print_server.cpp similarity index 94% rename from tests/tt_metal/tt_metal/unit_tests_common/dprint/test_mute_print_server.cpp rename to tests/tt_metal/tt_metal/debug_tools/dprint/test_mute_print_server.cpp index 3798288e27c..8d158464401 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_mute_print_server.cpp +++ b/tests/tt_metal/tt_metal/debug_tools/dprint/test_mute_print_server.cpp @@ -1,10 +1,10 @@ // SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
// // SPDX-License-Identifier: Apache-2.0 -#include "dprint_fixture.hpp" +#include "debug_tools_fixture.hpp" #include "gtest/gtest.h" #include "impl/debug/dprint_server.hpp" -#include "test_utils.hpp" +#include "debug_tools_test_utils.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/host_api.hpp" @@ -66,7 +66,7 @@ static void RunTest(DPrintFixture* fixture, Device* device) { } } -TEST_F(DPrintFixture, TestPrintMuting) { +TEST_F(DPrintFixture, TensixTestPrintMuting) { for (Device* device : this->devices_) { this->RunTestOnDevice(CMAKE_UNIQUE_NAMESPACE::RunTest, device); } diff --git a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_all_harts.cpp b/tests/tt_metal/tt_metal/debug_tools/dprint/test_print_all_harts.cpp similarity index 91% rename from tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_all_harts.cpp rename to tests/tt_metal/tt_metal/debug_tools/dprint/test_print_all_harts.cpp index 42d7382b5bb..0b0a0026d4c 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_all_harts.cpp +++ b/tests/tt_metal/tt_metal/debug_tools/dprint/test_print_all_harts.cpp @@ -2,10 +2,10 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "dprint_fixture.hpp" +#include "debug_tools_fixture.hpp" #include "common/bfloat16.hpp" #include "gtest/gtest.h" -#include "test_utils.hpp" +#include "debug_tools_test_utils.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/host_api.hpp" @@ -42,7 +42,7 @@ HEX/OCT/DEC: 0.245117188 0.249023438 0.255859375 0.263671875 0.98046875 0.99609375 1.0234375 1.0546875 0.365234375 0.373046875 0.380859375 0.388671875 1.4609375 1.4921875 1.5234375 1.5546875 -Tried printing CB::c_in1: Unsupported data format (Bfp2_b) +Tried printing CBIndex::c_1: Unsupported data format (Bfp2_b) Test Debug Print: Unpack Basic Types: 101-1.61800337@0.122558594 @@ -67,7 +67,7 @@ HEX/OCT/DEC: 0.245117188 0.249023438 0.255859375 0.263671875 0.98046875 0.99609375 1.0234375 1.0546875 0.365234375 0.373046875 
0.380859375 0.388671875 1.4609375 1.4921875 1.5234375 1.5546875 -Tried printing CB::c_in1: Unsupported data format (Bfp2_b) +Tried printing CBIndex::c_1: Unsupported data format (Bfp2_b) Test Debug Print: Math Basic Types: 101-1.61800337@0.122558594 @@ -110,7 +110,7 @@ HEX/OCT/DEC: 0.245117188 0.249023438 0.255859375 0.263671875 0.98046875 0.99609375 1.0234375 1.0546875 0.365234375 0.373046875 0.380859375 0.388671875 1.4609375 1.4921875 1.5234375 1.5546875 -Tried printing CB::c_in1: Unsupported data format (Bfp2_b) +Tried printing CBIndex::c_1: Unsupported data format (Bfp2_b) Test Debug Print: Data1 Basic Types: 101-1.61800337@0.122558594 @@ -135,7 +135,7 @@ HEX/OCT/DEC: 0.245117188 0.249023438 0.255859375 0.263671875 0.98046875 0.99609375 1.0234375 1.0546875 0.365234375 0.373046875 0.380859375 0.388671875 1.4609375 1.4921875 1.5234375 1.5546875 -Tried printing CB::c_in1: Unsupported data format (Bfp2_b))"; +Tried printing CBIndex::c_1: Unsupported data format (Bfp2_b))"; static void RunTest(DPrintFixture* fixture, Device* device) { // Set up program and command queue @@ -146,15 +146,15 @@ static void RunTest(DPrintFixture* fixture, Device* device) { constexpr uint32_t buffer_size = 32*32*sizeof(bfloat16); CircularBufferConfig cb_src0_config = CircularBufferConfig( buffer_size, - {{CB::c_in0, tt::DataFormat::Float16_b}} - ).set_page_size(CB::c_in0, buffer_size); + {{CBIndex::c_0, tt::DataFormat::Float16_b}} + ).set_page_size(CBIndex::c_0, buffer_size); CBHandle cb_src0 = tt_metal::CreateCircularBuffer(program, core, cb_src0_config); // A CB with an unsupported data format CircularBufferConfig cb_src1_config = CircularBufferConfig( buffer_size, - {{CB::c_in1, tt::DataFormat::Bfp2_b}} - ).set_page_size(CB::c_in1, buffer_size); + {{CBIndex::c_1, tt::DataFormat::Bfp2_b}} + ).set_page_size(CBIndex::c_1, buffer_size); CBHandle cb_src1 = tt_metal::CreateCircularBuffer(program, core, cb_src1_config); // Three different kernels to mirror typical usage and some previously 
@@ -192,7 +192,7 @@ static void RunTest(DPrintFixture* fixture, Device* device) { } } -TEST_F(DPrintFixture, TestPrintFromAllHarts) { +TEST_F(DPrintFixture, TensixTestPrintFromAllHarts) { for (Device* device : this->devices_) { this->RunTestOnDevice(CMAKE_UNIQUE_NAMESPACE::RunTest, device); } diff --git a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_before_finish.cpp b/tests/tt_metal/tt_metal/debug_tools/dprint/test_print_before_finish.cpp similarity index 95% rename from tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_before_finish.cpp rename to tests/tt_metal/tt_metal/debug_tools/dprint/test_print_before_finish.cpp index 0370b51f3f2..9e7a775b9c5 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_before_finish.cpp +++ b/tests/tt_metal/tt_metal/debug_tools/dprint/test_print_before_finish.cpp @@ -1,8 +1,8 @@ // SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. // // SPDX-License-Identifier: Apache-2.0 -#include "dprint_fixture.hpp" -#include "test_utils.hpp" +#include "debug_tools_fixture.hpp" +#include "debug_tools_test_utils.hpp" ////////////////////////////////////////////////////////////////////////////////////////// // A test for checking that the finish command can wait for the last dprint. @@ -58,7 +58,7 @@ static void RunTest(DPrintFixture* fixture, Device* device) { ); } -TEST_F(DPrintFixture, TestPrintFinish) { +TEST_F(DPrintFixture, TensixTestPrintFinish) { auto devices = this->devices_; // Run only on the first device, as this tests disconnects devices and this can cause // issues on multi-device setups. 
diff --git a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_hanging.cpp b/tests/tt_metal/tt_metal/debug_tools/dprint/test_print_hanging.cpp similarity index 95% rename from tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_hanging.cpp rename to tests/tt_metal/tt_metal/debug_tools/dprint/test_print_hanging.cpp index a707ffff86c..00a0252e7a8 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_hanging.cpp +++ b/tests/tt_metal/tt_metal/debug_tools/dprint/test_print_hanging.cpp @@ -2,10 +2,10 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "dprint_fixture.hpp" +#include "debug_tools_fixture.hpp" #include "common/bfloat16.hpp" #include "gtest/gtest.h" -#include "test_utils.hpp" +#include "debug_tools_test_utils.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/host_api.hpp" @@ -56,7 +56,7 @@ try { } } -TEST_F(DPrintFixture, TestPrintHanging) { +TEST_F(DPrintFixture, TensixTestPrintHanging) { // Skip this test for slow dipatch for now. Due to how llrt currently sits below device, it's // tricky to check print server status from the finish loop for slow dispatch. Once issue #4363 // is resolved, we should add a check for print server handing in slow dispatch as well. 
diff --git a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_tensix_dest.cpp b/tests/tt_metal/tt_metal/debug_tools/dprint/test_print_tensix_dest.cpp similarity index 97% rename from tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_tensix_dest.cpp rename to tests/tt_metal/tt_metal/debug_tools/dprint/test_print_tensix_dest.cpp index 1f73a7bc736..7d8ec61dd30 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_tensix_dest.cpp +++ b/tests/tt_metal/tt_metal/debug_tools/dprint/test_print_tensix_dest.cpp @@ -3,14 +3,12 @@ // SPDX-License-Identifier: Apache-2.0 #include "common/bfloat16.hpp" -#include "dprint_fixture.hpp" +#include "debug_tools_fixture.hpp" #include "gtest/gtest.h" -#include "test_utils.hpp" +#include "debug_tools_test_utils.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/host_api.hpp" -#include "tt_metal/test_utils/comparison.hpp" #include "tt_metal/test_utils/df/df.hpp" -#include "tt_metal/test_utils/print_helpers.hpp" #include "tt_metal/test_utils/stimulus.hpp" ////////////////////////////////////////////////////////////////////////////////////////// // A test for checking dprint @@ -230,7 +228,7 @@ static bool reader_datacopy_writer( return input_data == output_data; } -TEST_F(DPrintFixture, TestDestPrintFloat16b) { +TEST_F(DPrintFixture, TensixTestDestPrintFloat16b) { // Setup test configuration DestPrintTestConfig test_config = { .num_tiles = 2, @@ -246,7 +244,7 @@ TEST_F(DPrintFixture, TestDestPrintFloat16b) { this->devices_[0]); } -TEST_F(DPrintFixture, TestDestPrintFloat32) { +TEST_F(DPrintFixture, TensixTestDestPrintFloat32) { // Setup test configuration DestPrintTestConfig test_config = { .num_tiles = 2, @@ -266,7 +264,7 @@ TEST_F(DPrintFixture, TestDestPrintFloat32) { this->devices_[0]); } -TEST_F(DPrintFixture, TestDestPrintFloat32RemapAndSwizzle) { +TEST_F(DPrintFixture, TensixTestDestPrintFloat32RemapAndSwizzle) { // Setup test configuration DestPrintTestConfig test_config = { 
.num_tiles = 3, diff --git a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_tiles.cpp b/tests/tt_metal/tt_metal/debug_tools/dprint/test_print_tiles.cpp similarity index 97% rename from tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_tiles.cpp rename to tests/tt_metal/tt_metal/debug_tools/dprint/test_print_tiles.cpp index 6a0210a9ef6..33b85833157 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_print_tiles.cpp +++ b/tests/tt_metal/tt_metal/debug_tools/dprint/test_print_tiles.cpp @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "dprint_fixture.hpp" -#include "test_utils.hpp" +#include "debug_tools_fixture.hpp" +#include "debug_tools_test_utils.hpp" #include "common/bfloat8.hpp" #include "common/bfloat4.hpp" @@ -111,8 +111,8 @@ static void RunTest(DPrintFixture* fixture, Device* device, tt::DataFormat data_ // Create an input CB with the right data format uint32_t tile_size = detail::TileSize(data_format); - CircularBufferConfig cb_src0_config = CircularBufferConfig(tile_size, {{CB::c_in0, data_format}}) - .set_page_size(CB::c_in0, tile_size); + CircularBufferConfig cb_src0_config = CircularBufferConfig(tile_size, {{CBIndex::c_0, data_format}}) + .set_page_size(CBIndex::c_0, tile_size); CBHandle cb_src0 = tt_metal::CreateCircularBuffer(program, core, cb_src0_config); // Dram buffer to send data to, device will read it out of here to print diff --git a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_raise_wait.cpp b/tests/tt_metal/tt_metal/debug_tools/dprint/test_raise_wait.cpp similarity index 97% rename from tests/tt_metal/tt_metal/unit_tests_common/dprint/test_raise_wait.cpp rename to tests/tt_metal/tt_metal/debug_tools/dprint/test_raise_wait.cpp index 0786c960813..05ae9069dec 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/dprint/test_raise_wait.cpp +++ b/tests/tt_metal/tt_metal/debug_tools/dprint/test_raise_wait.cpp @@ -2,9 +2,9 @@ // // SPDX-License-Identifier: Apache-2.0 -#include 
"dprint_fixture.hpp" +#include "debug_tools_fixture.hpp" #include "gtest/gtest.h" -#include "test_utils.hpp" +#include "debug_tools_test_utils.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/host_api.hpp" @@ -279,7 +279,7 @@ static void RunTest(DPrintFixture* fixture, Device* device) { } } -TEST_F(DPrintFixture, TestPrintRaiseWait) { +TEST_F(DPrintFixture, TensixTestPrintRaiseWait) { for (Device* device : this->devices_) { this->RunTestOnDevice(CMAKE_UNIQUE_NAMESPACE::RunTest, device); } diff --git a/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_assert.cpp b/tests/tt_metal/tt_metal/debug_tools/watcher/test_assert.cpp similarity index 91% rename from tests/tt_metal/tt_metal/unit_tests_common/watcher/test_assert.cpp rename to tests/tt_metal/tt_metal/debug_tools/watcher/test_assert.cpp index 8f5ca5efc46..25fd8be5c26 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_assert.cpp +++ b/tests/tt_metal/tt_metal/debug_tools/watcher/test_assert.cpp @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "watcher_fixture.hpp" -#include "test_utils.hpp" +#include "debug_tools_fixture.hpp" +#include "debug_tools_test_utils.hpp" ////////////////////////////////////////////////////////////////////////////////////////// // A test for checking watcher asserts. 
@@ -11,6 +11,7 @@ using namespace tt; using namespace tt::tt_metal; +namespace CMAKE_UNIQUE_NAMESPACE { static void RunTest(WatcherFixture *fixture, Device *device, riscv_id_t riscv_type) { // Set up program Program program = Program(); @@ -176,8 +177,10 @@ static void RunTest(WatcherFixture *fixture, Device *device, riscv_id_t riscv_ty log_info(LogTest, "Reported error: {}", exception); EXPECT_TRUE(expected == get_watcher_exception_message()); } +} -TEST_F(WatcherFixture, TestWatcherAssertBrisc) { +TEST_F(WatcherFixture, TensixTestWatcherAssertBrisc) { + using namespace CMAKE_UNIQUE_NAMESPACE; if (this->slow_dispatch_) GTEST_SKIP(); @@ -188,7 +191,8 @@ TEST_F(WatcherFixture, TestWatcherAssertBrisc) { ); } -TEST_F(WatcherFixture, TestWatcherAssertNCrisc) { +TEST_F(WatcherFixture, TensixTestWatcherAssertNCrisc) { + using namespace CMAKE_UNIQUE_NAMESPACE; if (this->slow_dispatch_) GTEST_SKIP(); this->RunTestOnDevice( @@ -197,7 +201,8 @@ TEST_F(WatcherFixture, TestWatcherAssertNCrisc) { ); } -TEST_F(WatcherFixture, TestWatcherAssertTrisc0) { +TEST_F(WatcherFixture, TensixTestWatcherAssertTrisc0) { + using namespace CMAKE_UNIQUE_NAMESPACE; if (this->slow_dispatch_) GTEST_SKIP(); this->RunTestOnDevice( @@ -206,7 +211,8 @@ TEST_F(WatcherFixture, TestWatcherAssertTrisc0) { ); } -TEST_F(WatcherFixture, TestWatcherAssertTrisc1) { +TEST_F(WatcherFixture, TensixTestWatcherAssertTrisc1) { + using namespace CMAKE_UNIQUE_NAMESPACE; if (this->slow_dispatch_) GTEST_SKIP(); this->RunTestOnDevice( @@ -215,7 +221,8 @@ TEST_F(WatcherFixture, TestWatcherAssertTrisc1) { ); } -TEST_F(WatcherFixture, TestWatcherAssertTrisc2) { +TEST_F(WatcherFixture, TensixTestWatcherAssertTrisc2) { + using namespace CMAKE_UNIQUE_NAMESPACE; if (this->slow_dispatch_) GTEST_SKIP(); this->RunTestOnDevice( @@ -224,7 +231,8 @@ TEST_F(WatcherFixture, TestWatcherAssertTrisc2) { ); } -TEST_F(WatcherFixture, TestWatcherAssertErisc) { +TEST_F(WatcherFixture, ActiveEthTestWatcherAssertErisc) { + using namespace 
CMAKE_UNIQUE_NAMESPACE; if (this->slow_dispatch_) GTEST_SKIP(); this->RunTestOnDevice( @@ -233,7 +241,8 @@ TEST_F(WatcherFixture, TestWatcherAssertErisc) { ); } -TEST_F(WatcherFixture, TestWatcherAssertIErisc) { +TEST_F(WatcherFixture, IdleEthTestWatcherAssertIErisc) { + using namespace CMAKE_UNIQUE_NAMESPACE; if (!this->IsSlowDispatch()) { log_info(tt::LogTest, "FD-on-idle-eth not supported."); GTEST_SKIP(); diff --git a/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_link_training.cpp b/tests/tt_metal/tt_metal/debug_tools/watcher/test_link_training.cpp similarity index 93% rename from tests/tt_metal/tt_metal/unit_tests_common/watcher/test_link_training.cpp rename to tests/tt_metal/tt_metal/debug_tools/watcher/test_link_training.cpp index dd23509745b..043bc8682fa 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_link_training.cpp +++ b/tests/tt_metal/tt_metal/debug_tools/watcher/test_link_training.cpp @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "watcher_fixture.hpp" -#include "test_utils.hpp" +#include "debug_tools_fixture.hpp" +#include "debug_tools_test_utils.hpp" ////////////////////////////////////////////////////////////////////////////////////////// // A test for checking watcher polling the eth link training counter. @@ -15,7 +15,7 @@ using namespace tt::tt_metal; static void RunTest(WatcherFixture* fixture, Device* device) { } -TEST_F(WatcherFixture, TestWatcherEthLinkCheck) { +TEST_F(WatcherFixture, ActiveEthTestWatcherEthLinkCheck) { // Eth link retraining only supported on WH for now, this test is also dispatch-agnostic so just pick one. 
if (this->slow_dispatch_ || this->arch_ != tt::ARCH::WORMHOLE_B0 || this->devices_.size() == 1) { log_info(LogTest, "Test only runs on fast dispatch + multi-chip WH, skipping..."); diff --git a/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_noc_sanitize.cpp b/tests/tt_metal/tt_metal/debug_tools/watcher/test_noc_sanitize.cpp similarity index 96% rename from tests/tt_metal/tt_metal/unit_tests_common/watcher/test_noc_sanitize.cpp rename to tests/tt_metal/tt_metal/debug_tools/watcher/test_noc_sanitize.cpp index 416ffece9bd..8f656da7fd6 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_noc_sanitize.cpp +++ b/tests/tt_metal/tt_metal/debug_tools/watcher/test_noc_sanitize.cpp @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "watcher_fixture.hpp" -#include "test_utils.hpp" +#include "debug_tools_fixture.hpp" +#include "debug_tools_test_utils.hpp" #include "llrt/llrt.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/host_api.hpp" @@ -225,7 +225,7 @@ void CheckHostSanitization(Device *device) { } } -TEST_F(WatcherFixture, TestWatcherSanitize) { +TEST_F(WatcherFixture, TensixTestWatcherSanitize) { // Skip this test for slow dipatch for now. Due to how llrt currently sits below device, it's // tricky to check watcher server status from the finish loop for slow dispatch. Once issue #4363 // is resolved, we should add a check for print server handing in slow dispatch as well. 
@@ -244,7 +244,7 @@ TEST_F(WatcherFixture, TestWatcherSanitize) { ); } -TEST_F(WatcherFixture, TestWatcherSanitizeAlignmentL1) { +TEST_F(WatcherFixture, TensixTestWatcherSanitizeAlignmentL1) { if (this->slow_dispatch_) GTEST_SKIP(); this->RunTestOnDevice( @@ -256,7 +256,7 @@ TEST_F(WatcherFixture, TestWatcherSanitizeAlignmentL1) { ); } -TEST_F(WatcherFixture, TestWatcherSanitizeAlignmentDRAM) { +TEST_F(WatcherFixture, TensixTestWatcherSanitizeAlignmentDRAM) { if (this->slow_dispatch_) GTEST_SKIP(); this->RunTestOnDevice( @@ -268,7 +268,7 @@ TEST_F(WatcherFixture, TestWatcherSanitizeAlignmentDRAM) { ); } -TEST_F(WatcherFixture, TestWatcherSanitizeAlignmentDRAMNCrisc) { +TEST_F(WatcherFixture, TensixTestWatcherSanitizeAlignmentDRAMNCrisc) { if (this->slow_dispatch_) GTEST_SKIP(); this->RunTestOnDevice( @@ -280,13 +280,13 @@ TEST_F(WatcherFixture, TestWatcherSanitizeAlignmentDRAMNCrisc) { ); } -TEST_F(WatcherFixture, TestWatcherSanitizeEth) { +TEST_F(WatcherFixture, ActiveEthTestWatcherSanitizeEth) { if (this->slow_dispatch_) GTEST_SKIP(); this->RunTestOnDevice(RunTestEth, this->devices_[0]); } -TEST_F(WatcherFixture, TestWatcherSanitizeIEth) { +TEST_F(WatcherFixture, IdleEthTestWatcherSanitizeIEth) { if (!this->IsSlowDispatch()) { log_info(tt::LogTest, "FD-on-idle-eth not supported."); GTEST_SKIP(); diff --git a/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_noc_sanitize_delays.cpp b/tests/tt_metal/tt_metal/debug_tools/watcher/test_noc_sanitize_delays.cpp similarity index 96% rename from tests/tt_metal/tt_metal/unit_tests_common/watcher/test_noc_sanitize_delays.cpp rename to tests/tt_metal/tt_metal/debug_tools/watcher/test_noc_sanitize_delays.cpp index 600872d58ac..3ddd22c58cc 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_noc_sanitize_delays.cpp +++ b/tests/tt_metal/tt_metal/debug_tools/watcher/test_noc_sanitize_delays.cpp @@ -3,8 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 #include "llrt/rtoptions.hpp" -#include "watcher_fixture.hpp" 
-#include "test_utils.hpp" +#include "debug_tools_fixture.hpp" +#include "debug_tools_test_utils.hpp" #include "llrt/llrt.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/host_api.hpp" @@ -56,18 +56,18 @@ void RunDelayTestOnCore(WatcherDelayFixture* fixture, Device* device, CoreCoord auto dram_src1_noc_xy = src1_dram_buffer->noc_coordinates(); auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); - uint32_t src0_cb_index = 0; + uint32_t src0_cb_index = tt::CBIndex::c_0; uint32_t num_input_tiles = 2; tt_metal::CircularBufferConfig cb_src0_config = tt_metal::CircularBufferConfig(num_input_tiles * SINGLE_TILE_SIZE, {{src0_cb_index, tt::DataFormat::Float16_b}}) .set_page_size(src0_cb_index, SINGLE_TILE_SIZE); auto cb_src0 = tt_metal::CreateCircularBuffer(program, core, cb_src0_config); - uint32_t src1_cb_index = 1; + uint32_t src1_cb_index = tt::CBIndex::c_1; tt_metal::CircularBufferConfig cb_src1_config = tt_metal::CircularBufferConfig(num_input_tiles * SINGLE_TILE_SIZE, {{src1_cb_index, tt::DataFormat::Float16_b}}) .set_page_size(src1_cb_index, SINGLE_TILE_SIZE); auto cb_src1 = tt_metal::CreateCircularBuffer(program, core, cb_src1_config); - uint32_t ouput_cb_index = 16; // output operands start at index 16 + uint32_t ouput_cb_index = tt::CBIndex::c_16; uint32_t num_output_tiles = 2; tt_metal::CircularBufferConfig cb_output_config = tt_metal::CircularBufferConfig(num_output_tiles * SINGLE_TILE_SIZE, {{ouput_cb_index, tt::DataFormat::Float16_b}}) .set_page_size(ouput_cb_index, SINGLE_TILE_SIZE); @@ -154,7 +154,7 @@ void RunDelayTestOnCore(WatcherDelayFixture* fixture, Device* device, CoreCoord EXPECT_TRUE((read_vec[0] >> 24) == 0x3); } -TEST_F(WatcherDelayFixture, TestWatcherSanitizeInsertDelays) { +TEST_F(WatcherDelayFixture, TensixTestWatcherSanitizeInsertDelays) { if (this->slow_dispatch_) GTEST_SKIP(); diff --git a/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_pause.cpp b/tests/tt_metal/tt_metal/debug_tools/watcher/test_pause.cpp 
similarity index 97% rename from tests/tt_metal/tt_metal/unit_tests_common/watcher/test_pause.cpp rename to tests/tt_metal/tt_metal/debug_tools/watcher/test_pause.cpp index f358a30ebad..fb70bc91700 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_pause.cpp +++ b/tests/tt_metal/tt_metal/debug_tools/watcher/test_pause.cpp @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "watcher_fixture.hpp" -#include "test_utils.hpp" +#include "debug_tools_fixture.hpp" +#include "debug_tools_test_utils.hpp" ////////////////////////////////////////////////////////////////////////////////////////// // A test for checking watcher pause feature. @@ -134,7 +134,7 @@ static void RunTest(WatcherFixture* fixture, Device* device) { } } -TEST_F(WatcherFixture, TestWatcherPause) { +TEST_F(WatcherFixture, TensixTestWatcherPause) { for (Device* device : this->devices_) { this->RunTestOnDevice(CMAKE_UNIQUE_NAMESPACE::RunTest, device); } diff --git a/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_ringbuf.cpp b/tests/tt_metal/tt_metal/debug_tools/watcher/test_ringbuf.cpp similarity index 88% rename from tests/tt_metal/tt_metal/unit_tests_common/watcher/test_ringbuf.cpp rename to tests/tt_metal/tt_metal/debug_tools/watcher/test_ringbuf.cpp index cc2727ef71d..97ed9adef75 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_ringbuf.cpp +++ b/tests/tt_metal/tt_metal/debug_tools/watcher/test_ringbuf.cpp @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "watcher_fixture.hpp" -#include "test_utils.hpp" +#include "debug_tools_fixture.hpp" +#include "debug_tools_test_utils.hpp" ////////////////////////////////////////////////////////////////////////////////////////// // A test for checking debug ring buffer feature. 
@@ -20,6 +20,7 @@ std::vector expected = { "]" }; +namespace CMAKE_UNIQUE_NAMESPACE { static void RunTest(WatcherFixture *fixture, Device *device, riscv_id_t riscv_type) { // Set up program Program program = Program(); @@ -141,8 +142,10 @@ static void RunTest(WatcherFixture *fixture, Device *device, riscv_id_t riscv_ty ) ); } +} -TEST_F(WatcherFixture, TestWatcherRingBufferBrisc) { +TEST_F(WatcherFixture, TensixTestWatcherRingBufferBrisc) { + using namespace CMAKE_UNIQUE_NAMESPACE; for (Device* device : this->devices_) { this->RunTestOnDevice( [](WatcherFixture *fixture, Device *device){RunTest(fixture, device, DebugBrisc);}, @@ -150,7 +153,8 @@ TEST_F(WatcherFixture, TestWatcherRingBufferBrisc) { ); } } -TEST_F(WatcherFixture, TestWatcherRingBufferNCrisc) { +TEST_F(WatcherFixture, TensixTestWatcherRingBufferNCrisc) { + using namespace CMAKE_UNIQUE_NAMESPACE; for (Device* device : this->devices_) { this->RunTestOnDevice( [](WatcherFixture *fixture, Device *device){RunTest(fixture, device, DebugNCrisc);}, @@ -158,7 +162,8 @@ TEST_F(WatcherFixture, TestWatcherRingBufferNCrisc) { ); } } -TEST_F(WatcherFixture, TestWatcherRingBufferTrisc0) { +TEST_F(WatcherFixture, TensixTestWatcherRingBufferTrisc0) { + using namespace CMAKE_UNIQUE_NAMESPACE; for (Device* device : this->devices_) { this->RunTestOnDevice( [](WatcherFixture *fixture, Device *device){RunTest(fixture, device, DebugTrisc0);}, @@ -166,7 +171,8 @@ TEST_F(WatcherFixture, TestWatcherRingBufferTrisc0) { ); } } -TEST_F(WatcherFixture, TestWatcherRingBufferTrisc1) { +TEST_F(WatcherFixture, TensixTestWatcherRingBufferTrisc1) { + using namespace CMAKE_UNIQUE_NAMESPACE; for (Device* device : this->devices_) { this->RunTestOnDevice( [](WatcherFixture *fixture, Device *device){RunTest(fixture, device, DebugTrisc1);}, @@ -174,7 +180,8 @@ TEST_F(WatcherFixture, TestWatcherRingBufferTrisc1) { ); } } -TEST_F(WatcherFixture, TestWatcherRingBufferTrisc2) { +TEST_F(WatcherFixture, TensixTestWatcherRingBufferTrisc2) { + using 
namespace CMAKE_UNIQUE_NAMESPACE; for (Device* device : this->devices_) { this->RunTestOnDevice( [](WatcherFixture *fixture, Device *device){RunTest(fixture, device, DebugTrisc2);}, @@ -182,7 +189,8 @@ TEST_F(WatcherFixture, TestWatcherRingBufferTrisc2) { ); } } -TEST_F(WatcherFixture, TestWatcherRingBufferErisc) { +TEST_F(WatcherFixture, ActiveEthTestWatcherRingBufferErisc) { + using namespace CMAKE_UNIQUE_NAMESPACE; for (Device* device : this->devices_) { this->RunTestOnDevice( [](WatcherFixture *fixture, Device *device){RunTest(fixture, device, DebugErisc);}, @@ -190,7 +198,8 @@ TEST_F(WatcherFixture, TestWatcherRingBufferErisc) { ); } } -TEST_F(WatcherFixture, TestWatcherRingBufferIErisc) { +TEST_F(WatcherFixture, IdleEthTestWatcherRingBufferIErisc) { + using namespace CMAKE_UNIQUE_NAMESPACE; if (!this->IsSlowDispatch()) { log_info(tt::LogTest, "FD-on-idle-eth not supported."); GTEST_SKIP(); diff --git a/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_waypoint.cpp b/tests/tt_metal/tt_metal/debug_tools/watcher/test_waypoint.cpp similarity index 99% rename from tests/tt_metal/tt_metal/unit_tests_common/watcher/test_waypoint.cpp rename to tests/tt_metal/tt_metal/debug_tools/watcher/test_waypoint.cpp index 8da13273c27..60a1ffc1dcd 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_waypoint.cpp +++ b/tests/tt_metal/tt_metal/debug_tools/watcher/test_waypoint.cpp @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "watcher_fixture.hpp" -#include "test_utils.hpp" +#include "debug_tools_fixture.hpp" +#include "debug_tools_test_utils.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/host_api.hpp" diff --git a/tests/tt_metal/tt_metal/device/CMakeLists.txt b/tests/tt_metal/tt_metal/device/CMakeLists.txt new file mode 100644 index 00000000000..d1b29149f67 --- /dev/null +++ b/tests/tt_metal/tt_metal/device/CMakeLists.txt @@ -0,0 +1,29 @@ +set(UNIT_TESTS_DEVICE_SRC + ${CMAKE_CURRENT_SOURCE_DIR}/test_device_cluster_api.cpp 
+ ${CMAKE_CURRENT_SOURCE_DIR}/test_device_init_and_teardown.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_device_pool.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_device.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_galaxy_cluster_api.cpp +) + +add_executable(unit_tests_device ${UNIT_TESTS_DEVICE_SRC}) +TT_ENABLE_UNITY_BUILD(unit_tests_device) + +target_link_libraries(unit_tests_device PUBLIC test_metal_common_libs) +target_include_directories( + unit_tests_device + PRIVATE + ${PROJECT_SOURCE_DIR} + ${PROJECT_SOURCE_DIR}/tt_metal + ${PROJECT_SOURCE_DIR}/tt_metal/common + ${PROJECT_SOURCE_DIR}/tests + ${PROJECT_SOURCE_DIR}/tests/tt_metal/tt_metal/common + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/common +) +set_target_properties( + unit_tests_device + PROPERTIES + RUNTIME_OUTPUT_DIRECTORY + ${PROJECT_BINARY_DIR}/test/tt_metal +) diff --git a/tests/tt_metal/tt_metal/device/galaxy_fixture.hpp b/tests/tt_metal/tt_metal/device/galaxy_fixture.hpp new file mode 100644 index 00000000000..bca695fa95e --- /dev/null +++ b/tests/tt_metal/tt_metal/device/galaxy_fixture.hpp @@ -0,0 +1,100 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include + +#include "host_api.hpp" +#include "tt_metal/detail/tt_metal.hpp" +#include "tt_metal/impl/device/device_pool.hpp" +#include "multi_device_fixture.hpp" + +class GalaxyFixture : public MultiDeviceFixture { + protected: + void SkipTestSuiteIfNotGalaxyMotherboard() + { + const size_t num_devices = tt::tt_metal::GetNumAvailableDevices(); + if (!(this->arch_ == tt::ARCH::WORMHOLE_B0 && num_devices >= 32)) + { + GTEST_SKIP(); + } + } + + void InitializeDevices() + { + const size_t num_devices = tt::tt_metal::GetNumAvailableDevices(); + std::vector ids; + for (uint32_t id = 0; id < num_devices; id++) + { + ids.push_back(id); + } + this->device_ids_to_devices_ = tt::tt_metal::detail::CreateDevices(ids); + this->devices_ = tt::DevicePool::instance().get_all_active_devices(); + } + + void SetUp() override + { + MultiDeviceFixture::SetUp(); + this->DetectDispatchMode(); + this->SkipTestSuiteIfNotGalaxyMotherboard(); + this->InitializeDevices(); + } + + void TearDown() override + { + tt::tt_metal::detail::CloseDevices(this->device_ids_to_devices_); + this->device_ids_to_devices_.clear(); + this->devices_.clear(); + } + + private: + std::map device_ids_to_devices_; +}; + +class TGFixture : public GalaxyFixture +{ + protected: + void SkipTestSuiteIfNotTG() + { + this->SkipTestSuiteIfNotGalaxyMotherboard(); + const size_t num_devices = tt::tt_metal::GetNumAvailableDevices(); + const size_t num_pcie_devices = tt::tt_metal::GetNumPCIeDevices(); + if (!(num_devices == 32 && num_pcie_devices == 4)) + { + GTEST_SKIP(); + } + } + + void SetUp() override + { + MultiDeviceFixture::SetUp(); + this->DetectDispatchMode(); + this->SkipTestSuiteIfNotTG(); + this->InitializeDevices(); + } +}; + +class TGGFixture : public GalaxyFixture +{ + protected: + void SkipTestSuiteIfNotTGG() + { + this->SkipTestSuiteIfNotGalaxyMotherboard(); + const size_t num_devices = tt::tt_metal::GetNumAvailableDevices(); + const size_t 
num_pcie_devices = tt::tt_metal::GetNumPCIeDevices(); + if (!(num_devices == 64 && num_pcie_devices == 8)) + { + GTEST_SKIP(); + } + } + + void SetUp() override + { + MultiDeviceFixture::SetUp(); + this->DetectDispatchMode(); + this->SkipTestSuiteIfNotTGG(); + this->InitializeDevices(); + } +}; diff --git a/tests/tt_metal/tt_metal/unit_tests/basic/device.cpp b/tests/tt_metal/tt_metal/device/test_device.cpp similarity index 85% rename from tests/tt_metal/tt_metal/unit_tests/basic/device.cpp rename to tests/tt_metal/tt_metal/device/test_device.cpp index 4dc272cfb24..1137a2edeb3 100644 --- a/tests/tt_metal/tt_metal/unit_tests/basic/device.cpp +++ b/tests/tt_metal/tt_metal/device/test_device.cpp @@ -4,16 +4,9 @@ #include -#include -#include -#include - -#include "tests/tt_metal/tt_metal/unit_tests/common/basic_fixture.hpp" -#include "tests/tt_metal/tt_metal/unit_tests/common/device_fixture.hpp" +#include "device_fixture.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/host_api.hpp" -#include "tt_metal/test_utils/env_vars.hpp" -#include "tt_metal/test_utils/print_helpers.hpp" #include "tt_metal/test_utils/stimulus.hpp" using namespace tt; @@ -81,46 +74,6 @@ bool dram_ping( } } // namespace unit_tests::basic::device -TEST_F(BasicFixture, SingleDeviceHarvestingPrints) { - auto arch = tt::get_arch_from_string(get_umd_arch_name()); - tt::tt_metal::Device* device; - const unsigned int device_id = 0; - device = tt::tt_metal::CreateDevice(device_id); - CoreCoord unharvested_logical_grid_size; - switch (arch) { - case tt::ARCH::GRAYSKULL: unharvested_logical_grid_size = CoreCoord(12, 10); break; - case tt::ARCH::WORMHOLE_B0: unharvested_logical_grid_size = CoreCoord(8, 10); break; - case tt::ARCH::BLACKHOLE: unharvested_logical_grid_size = CoreCoord(14, 10); break; - default: - TT_THROW("Unsupported arch {}", get_umd_arch_name()); - } - auto logical_grid_size = device->logical_grid_size(); - if (logical_grid_size == unharvested_logical_grid_size) { - 
tt::log_info("Harvesting Disabled in SW"); - } else { - tt::log_info("Harvesting Enabled in SW"); - tt::log_info("Number of Harvested Rows={}", unharvested_logical_grid_size.y - logical_grid_size.y); - } - - tt::log_info("Logical -- Noc Coordinates Mapping"); - tt::log_info("[Logical <-> NOC0] Coordinates"); - for (int r = 0; r < logical_grid_size.y; r++) { - string output_row = ""; - for (int c = 0; c < logical_grid_size.x; c++) { - const CoreCoord logical_coord(c, r); - const auto noc_coord = device->worker_core_from_logical_core(logical_coord); - output_row += "{L[x" + std::to_string(c); - output_row += "-y" + std::to_string(r); - output_row += "]:N[x" + std::to_string(noc_coord.x); - output_row += "-y" + std::to_string(noc_coord.y); - output_row += "]}, "; - } - tt::log_info("{}", output_row); - } - ASSERT_TRUE(tt::tt_metal::CloseDevice(device)); -} - - TEST_F(DeviceFixture, PingAllLegalDramChannels) { for (unsigned int id = 0; id < num_devices_; id++) { { @@ -163,7 +116,7 @@ TEST_F(DeviceFixture, PingIllegalDramChannels) { } } -TEST_F(DeviceFixture, PingAllLegalL1Cores) { +TEST_F(DeviceFixture, TensixPingAllLegalL1Cores) { for (unsigned int id = 0; id < num_devices_; id++) { { size_t start_byte_address = devices_.at(id)->get_base_allocator_addr(HalMemType::L1); @@ -198,7 +151,7 @@ TEST_F(DeviceFixture, PingAllLegalL1Cores) { } } -TEST_F(DeviceFixture, PingIllegalL1Cores) { +TEST_F(DeviceFixture, TensixPingIllegalL1Cores) { for (unsigned int id = 0; id < num_devices_; id++) { auto grid_size = devices_.at(id)->logical_grid_size(); grid_size.x++; @@ -215,7 +168,7 @@ TEST_F(DeviceFixture, PingIllegalL1Cores) { // 2. Launch a kernel to read and increment the value in each bank // 3. 
Host validates that the value from step 1 has been incremented // Purpose of this test is to ensure that L1 reader/writer APIs do not target harvested cores -TEST_F(DeviceFixture, ValidateKernelDoesNotTargetHarvestedCores) { +TEST_F(DeviceFixture, TensixValidateKernelDoesNotTargetHarvestedCores) { for (unsigned int id = 0; id < num_devices_; id++) { uint32_t num_l1_banks = this->devices_.at(id)->num_banks(BufferType::L1); std::vector host_input(1); @@ -280,7 +233,7 @@ TEST_F(DeviceFixture, TestDeviceToHostMemChannelAssignment) { } // Test to ensure writing from 16B aligned L1 address to 16B aligned PCIe address works -TEST_F(DeviceFixture, TestL1ToPCIeAt16BAlignedAddress) { +TEST_F(DeviceFixture, TensixTestL1ToPCIeAt16BAlignedAddress) { tt_metal::Program program = tt_metal::CreateProgram(); Device *device = this->devices_.at(0); EXPECT_TRUE(device->is_mmio_capable()); diff --git a/tests/tt_metal/tt_metal/unit_tests/ethernet/device_cluster_api.cpp b/tests/tt_metal/tt_metal/device/test_device_cluster_api.cpp similarity index 93% rename from tests/tt_metal/tt_metal/unit_tests/ethernet/device_cluster_api.cpp rename to tests/tt_metal/tt_metal/device/test_device_cluster_api.cpp index 312b65a63d9..b846ead88e3 100644 --- a/tests/tt_metal/tt_metal/unit_tests/ethernet/device_cluster_api.cpp +++ b/tests/tt_metal/tt_metal/device/test_device_cluster_api.cpp @@ -5,15 +5,10 @@ #include #include -#include -#include -#include "n300_device_fixture.hpp" +#include "multi_device_fixture.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/host_api.hpp" -#include "tt_metal/test_utils/env_vars.hpp" -#include "tt_metal/test_utils/print_helpers.hpp" -#include "tt_metal/test_utils/stimulus.hpp" using namespace tt; using namespace tt::test_utils; @@ -23,7 +18,7 @@ namespace unit_tests::multichip::cluster { // Run this on Nebula X2 only, validate etherent core apis are correct // Known connectivity: chip 0 (x=9, y=6) <--> chip 1 (x=9, y=0) // chip 0 (x=1, y=6) <--> chip 1 (x=1, 
y=0) -TEST_F(N300DeviceFixture, ValidateEthernetConnectivity) { +TEST_F(N300DeviceFixture, EthValidateEthernetConnectivity) { const auto& device_0 = this->devices_.at(0); const auto& device_1 = this->devices_.at(1); @@ -79,13 +74,13 @@ TEST_F(N300DeviceFixture, ValidateEthernetConnectivity) { ASSERT_TRUE(chip_1_eth_noc_coords_returned == chip_1_eth_noc_coords_expected); } -TEST_F(N300DeviceFixture, InvalidLogicalEthernetCore) { +TEST_F(N300DeviceFixture, EthInvalidLogicalEthernetCore) { const auto& device_0 = this->devices_.at(0); EXPECT_ANY_THROW(device_0->ethernet_core_from_logical_core(CoreCoord(1, 0))); EXPECT_ANY_THROW(device_0->ethernet_core_from_logical_core(CoreCoord(0, 16))); } -TEST_F(N300DeviceFixture, ValidateAllEthernetCoreMapping) { +TEST_F(N300DeviceFixture, EthValidateAllEthernetCoreMapping) { static std::map expected_mapping_logical_to_physical = { {CoreCoord(0, 0), CoreCoord(9, 0)}, {CoreCoord(0, 1), CoreCoord(1, 0)}, @@ -112,7 +107,7 @@ TEST_F(N300DeviceFixture, ValidateAllEthernetCoreMapping) { } } -TEST_F(N300DeviceFixture, ValidatePhysicalCoreConversion) { +TEST_F(N300DeviceFixture, EthValidatePhysicalCoreConversion) { static std::map expected_mapping_logical_to_physical = { {CoreCoord(0, 0), CoreCoord(9, 0)}, {CoreCoord(0, 1), CoreCoord(1, 0)}, @@ -141,7 +136,7 @@ TEST_F(N300DeviceFixture, ValidatePhysicalCoreConversion) { EXPECT_ANY_THROW(device_0->physical_core_from_logical_core(CoreCoord(0, 0), CoreType::PCIE)); } -TEST_F(N300DeviceFixture, ValidateEthernetSockets) { +TEST_F(N300DeviceFixture, ActiveEthValidateEthernetSockets) { const auto& device_0 = this->devices_.at(0); const auto& device_1 = this->devices_.at(1); diff --git a/tests/tt_metal/tt_metal/unit_tests_common/basic/test_device_init.cpp b/tests/tt_metal/tt_metal/device/test_device_init_and_teardown.cpp similarity index 93% rename from tests/tt_metal/tt_metal/unit_tests_common/basic/test_device_init.cpp rename to tests/tt_metal/tt_metal/device/test_device_init_and_teardown.cpp 
index f4dfae4d653..44974e34b10 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/basic/test_device_init.cpp +++ b/tests/tt_metal/tt_metal/device/test_device_init_and_teardown.cpp @@ -4,16 +4,10 @@ #include -#include -#include -#include - #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/host_api.hpp" #include "tt_metal/impl/dispatch/command_queue.hpp" #include "tt_metal/test_utils/env_vars.hpp" -#include "tt_metal/test_utils/print_helpers.hpp" -#include "tt_metal/test_utils/stimulus.hpp" #include "tt_metal/impl/device/device.hpp" #include "tt_metal/impl/device/device_pool.hpp" @@ -61,7 +55,6 @@ bool load_all_blank_kernels(tt_metal::Device *device) { CreateKernel(program, "tt_metal/kernels/compute/blank.cpp", all_cores, ComputeConfig{}); unit_tests_common::basic::test_device_init::launch_program(device, program); - // tt_metal::detail::LaunchProgram(device, program); return pass; } } // namespace unit_tests_common::basic::test_device_init @@ -92,7 +85,7 @@ TEST_P(DeviceParamFixture, DeviceInitializeAndTeardown) { } } -TEST_P(DeviceParamFixture, DeviceLoadBlankKernels) { +TEST_P(DeviceParamFixture, TensixDeviceLoadBlankKernels) { unsigned int num_devices = GetParam(); unsigned int num_pci_devices = tt::tt_metal::GetNumPCIeDevices(); if ((arch == tt::ARCH::GRAYSKULL && num_devices > 1) || (num_devices > num_pci_devices)) { diff --git a/tests/tt_metal/tt_metal/device/test_device_pool.cpp b/tests/tt_metal/tt_metal/device/test_device_pool.cpp new file mode 100644 index 00000000000..b1b5cc94822 --- /dev/null +++ b/tests/tt_metal/tt_metal/device/test_device_pool.cpp @@ -0,0 +1,131 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "host_api.hpp" +#include "impl/device/device_pool.hpp" + +using namespace tt; + +TEST(DevicePool, DevicePoolOpenClose) { + std::vector device_ids{0}; + int num_hw_cqs = 1; + int l1_small_size = 1024; + const auto& dispatch_core_type = llrt::OptionsG.get_dispatch_core_type(); + DevicePool::initialize(device_ids, num_hw_cqs, l1_small_size, DEFAULT_TRACE_REGION_SIZE, dispatch_core_type); + auto devices = DevicePool::instance().get_all_active_devices(); + for (const auto& dev : devices) { + ASSERT_TRUE((int)(dev->get_l1_small_size()) == l1_small_size); + ASSERT_TRUE((int)(dev->num_hw_cqs()) == num_hw_cqs); + ASSERT_TRUE(dev->is_initialized()); + } + + // Close then get devices again + for (const auto& dev : devices) { + dev->close(); + } + devices = DevicePool::instance().get_all_active_devices(); + for (const auto& dev : devices) { + ASSERT_TRUE((int)(dev->get_l1_small_size()) == l1_small_size); + ASSERT_TRUE((int)(dev->num_hw_cqs()) == num_hw_cqs); + ASSERT_TRUE(dev->is_initialized()); + } + for (const auto& dev : devices) { + dev->close(); + } +} + +TEST(DevicePool, DevicePoolReconfigDevices) { + std::vector device_ids{0}; + int num_hw_cqs = 1; + int l1_small_size = 1024; + const auto& dispatch_core_type = llrt::OptionsG.get_dispatch_core_type(); + DevicePool::initialize(device_ids, num_hw_cqs, l1_small_size, DEFAULT_TRACE_REGION_SIZE, dispatch_core_type); + auto devices = DevicePool::instance().get_all_active_devices(); + for (const auto& dev : devices) { + ASSERT_TRUE((int)(dev->get_l1_small_size()) == l1_small_size); + ASSERT_TRUE((int)(dev->num_hw_cqs()) == num_hw_cqs); + ASSERT_TRUE(dev->is_initialized()); + } + + // Close then get devices with different configs + for (const auto& dev : devices) { + dev->close(); + } + l1_small_size = 2048; + DevicePool::initialize(device_ids, num_hw_cqs, l1_small_size, DEFAULT_TRACE_REGION_SIZE, dispatch_core_type); + devices = 
DevicePool::instance().get_all_active_devices(); + for (const auto& dev : devices) { + ASSERT_TRUE((int)(dev->get_l1_small_size()) == l1_small_size); + ASSERT_TRUE(dev->is_initialized()); + } + for (const auto& dev : devices) { + dev->close(); + } +} + +TEST(DevicePool, DevicePoolAddDevices) { + if (tt_metal::GetNumAvailableDevices() != 8) { + GTEST_SKIP(); + } + std::vector device_ids{0}; + int num_hw_cqs = 1; + int l1_small_size = 1024; + const auto& dispatch_core_type = llrt::OptionsG.get_dispatch_core_type(); + DevicePool::initialize(device_ids, num_hw_cqs, l1_small_size, DEFAULT_TRACE_REGION_SIZE, dispatch_core_type); + auto devices = DevicePool::instance().get_all_active_devices(); + for (const auto& dev : devices) { + ASSERT_TRUE((int)(dev->get_l1_small_size()) == l1_small_size); + ASSERT_TRUE((int)(dev->num_hw_cqs()) == num_hw_cqs); + ASSERT_TRUE(dev->is_initialized()); + } + + // Close then get more devices + for (const auto& dev : devices) { + dev->close(); + } + device_ids = {0, 1, 2, 3}; + DevicePool::initialize(device_ids, num_hw_cqs, l1_small_size, DEFAULT_TRACE_REGION_SIZE, dispatch_core_type); + devices = DevicePool::instance().get_all_active_devices(); + ASSERT_TRUE(devices.size() >= 4); + for (const auto& dev : devices) { + ASSERT_TRUE((int)(dev->get_l1_small_size()) == l1_small_size); + ASSERT_TRUE((int)(dev->num_hw_cqs()) == num_hw_cqs); + ASSERT_TRUE(dev->is_initialized()); + } + for (const auto& dev : devices) { + dev->close(); + } +} + +TEST(DevicePool, DevicePoolReduceDevices) { + if (tt_metal::GetNumAvailableDevices() != 8) { + GTEST_SKIP(); + } + std::vector device_ids{0, 1, 2, 3}; + int num_hw_cqs = 1; + int l1_small_size = 1024; + const auto& dispatch_core_type = llrt::OptionsG.get_dispatch_core_type(); + DevicePool::initialize(device_ids, num_hw_cqs, l1_small_size, DEFAULT_TRACE_REGION_SIZE, dispatch_core_type); + const auto devices = DevicePool::instance().get_all_active_devices(); + for (const auto& dev : devices) { + 
ASSERT_TRUE((int)(dev->get_l1_small_size()) == l1_small_size); + ASSERT_TRUE((int)(dev->num_hw_cqs()) == num_hw_cqs); + ASSERT_TRUE(dev->is_initialized()); + } + + // Close then get less devices + for (const auto& dev : devices) { + dev->close(); + } + device_ids = {0}; + DevicePool::initialize(device_ids, num_hw_cqs, l1_small_size, DEFAULT_TRACE_REGION_SIZE, dispatch_core_type); + auto dev = DevicePool::instance().get_active_device(0); + ASSERT_TRUE(dev->id() == 0); + ASSERT_TRUE((int)(dev->get_l1_small_size()) == l1_small_size); + ASSERT_TRUE((int)(dev->num_hw_cqs()) == num_hw_cqs); + ASSERT_TRUE(dev->is_initialized()); + DevicePool::instance().close_device(0); +} diff --git a/tests/tt_metal/tt_metal/unit_tests/ethernet/galaxy_cluster_api.cpp b/tests/tt_metal/tt_metal/device/test_galaxy_cluster_api.cpp similarity index 97% rename from tests/tt_metal/tt_metal/unit_tests/ethernet/galaxy_cluster_api.cpp rename to tests/tt_metal/tt_metal/device/test_galaxy_cluster_api.cpp index 447758371e8..27a551cce0f 100644 --- a/tests/tt_metal/tt_metal/unit_tests/ethernet/galaxy_cluster_api.cpp +++ b/tests/tt_metal/tt_metal/device/test_galaxy_cluster_api.cpp @@ -4,11 +4,7 @@ #include -#include -#include -#include - -#include "device_fixture.hpp" +#include "galaxy_fixture.hpp" #include "tt_metal/llrt/tt_cluster.hpp" #include "tt_metal/host_api.hpp" @@ -51,7 +47,7 @@ std::unordered_set get_ethernet_connected_device_ids(const chip_id_t // shelves and 4 links between adjacent Galaxy chips that are on the same // shelf, and currently tt::Cluster does not expose a way of determining // which shelf a particular Galaxy chip is on. 
-TEST_F(TGFixture, ValidateNumLinksBetweenAdjacentGalaxyChips) { +TEST_F(TGFixture, ActiveEthValidateNumLinksBetweenAdjacentGalaxyChips) { for (Device* device : this->devices_) { const chip_id_t device_id = device->id(); @@ -85,7 +81,7 @@ TEST_F(TGFixture, ValidateNumLinksBetweenAdjacentGalaxyChips) { // Validate that each MMIO chip links to two separate Galaxy chips, // and that each Galaxy chip links to at most one MMIO chip -TEST_F(GalaxyFixture, ValidateLinksBetweenMMIOAndGalaxyChips) { +TEST_F(GalaxyFixture, ActiveEthValidateLinksBetweenMMIOAndGalaxyChips) { for (Device* device : this->devices_) { const chip_id_t device_id = device->id(); diff --git a/tests/tt_metal/tt_metal/dispatch/CMakeLists.txt b/tests/tt_metal/tt_metal/dispatch/CMakeLists.txt new file mode 100644 index 00000000000..8ab4924c4f8 --- /dev/null +++ b/tests/tt_metal/tt_metal/dispatch/CMakeLists.txt @@ -0,0 +1,33 @@ +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/dispatch_buffer) +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/dispatch_event) +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/dispatch_program) +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/dispatch_trace) + +add_executable( + unit_tests_dispatch + $ + $ + $ + $ +) +TT_ENABLE_UNITY_BUILD(unit_tests_dispatch) + +target_link_libraries(unit_tests_dispatch PUBLIC test_metal_common_libs) +target_include_directories( + unit_tests_dispatch + PRIVATE + ${UMD_HOME} + ${PROJECT_SOURCE_DIR} + ${PROJECT_SOURCE_DIR}/tt_metal + ${PROJECT_SOURCE_DIR}/tt_metal/common + ${PROJECT_SOURCE_DIR}/tests + ${PROJECT_SOURCE_DIR}/tests/tt_metal/tt_metal/common + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/common +) +set_target_properties( + unit_tests_dispatch + PROPERTIES + RUNTIME_OUTPUT_DIRECTORY + ${PROJECT_BINARY_DIR}/test/tt_metal +) diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/CMakeLists.txt b/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/CMakeLists.txt new file mode 100644 index 00000000000..710e490c74a --- /dev/null +++ 
b/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/CMakeLists.txt @@ -0,0 +1,34 @@ +set(UNIT_TESTS_DISPATCH_BUFFER_SRC + ${CMAKE_CURRENT_SOURCE_DIR}/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_sub_device.cpp +) + +add_library(unit_tests_dispatch_buffer_o STATIC ${UNIT_TESTS_DISPATCH_BUFFER_SRC}) + +target_link_libraries(unit_tests_dispatch_buffer_o PRIVATE test_metal_common_libs) + +target_include_directories( + unit_tests_dispatch_buffer_o + PRIVATE + ${PROJECT_SOURCE_DIR} + ${PROJECT_SOURCE_DIR}/tt_metal + ${PROJECT_SOURCE_DIR}/tests + ${PROJECT_SOURCE_DIR}/tests/tt_metal/tt_metal/common + ${PROJECT_SOURCE_DIR}/tests/tt_metal/tt_metal/dispatch + ${PROJECT_SOURCE_DIR}/tests/tt_metal/tt_metal/dispatch/common + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/common +) + +add_executable(unit_tests_dispatch_buffer $) + +target_link_libraries(unit_tests_dispatch_buffer PRIVATE test_metal_common_libs) + +set_target_properties( + unit_tests_dispatch_buffer + PROPERTIES + RUNTIME_OUTPUT_DIRECTORY + ${PROJECT_BINARY_DIR}/test/tt_metal +) + +TT_ENABLE_UNITY_BUILD(unit_tests_dispatch_buffer) diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp similarity index 63% rename from tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp rename to tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp index 7cab67ff6d6..a453ca0074e 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp @@ -5,12 +5,11 @@ #include #include "command_queue_fixture.hpp" -#include "command_queue_test_utils.hpp" 
+#include "multi_command_queue_fixture.hpp" +#include "dispatch_test_utils.hpp" #include "gtest/gtest.h" #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/host_api.hpp" -#include "tt_metal/test_utils/env_vars.hpp" -#include "tt_metal/test_utils/print_helpers.hpp" #include "tt_metal/impl/device/device.hpp" using std::vector; @@ -323,12 +322,91 @@ bool stress_test_EnqueueWriteBuffer_and_EnqueueReadBuffer_wrap( return pass; } +bool test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(Device* device, vector>& cqs, const TestBufferConfig& config) { + bool pass = true; + for (const bool use_void_star_api: {true, false}) { + + size_t buf_size = config.num_pages * config.page_size; + std::vector> buffers; + std::vector> srcs; + for (uint i = 0; i < cqs.size(); i++) { + buffers.push_back(Buffer::create(device, buf_size, config.page_size, config.buftype)); + srcs.push_back(generate_arange_vector(buffers[i]->size())); + if (use_void_star_api) { + EnqueueWriteBuffer(cqs[i], *buffers[i], srcs[i].data(), false); + } else { + EnqueueWriteBuffer(cqs[i], *buffers[i], srcs[i], false); + } + } + + for (uint i = 0; i < cqs.size(); i++) { + std::vector result; + if (use_void_star_api) { + result.resize(buf_size / sizeof(uint32_t)); + EnqueueReadBuffer(cqs[i], *buffers[i], result.data(), true); + } else { + EnqueueReadBuffer(cqs[i], *buffers[i], result, true); + } + bool local_pass = (srcs[i] == result); + pass &= local_pass; + } + } + + return pass; +} + } // end namespace local_test_functions namespace basic_tests { namespace dram_tests { -TEST_F(CommandQueueSingleCardFixture, WriteOneTileToDramBank0) { +TEST_F(CommandQueueBufferFixture, DISABLED_TestAsyncBufferRW) { + // Test Async Enqueue Read and Write + Get Addr + Buffer Allocation and Deallocation + auto &command_queue = this->device_->command_queue(); + auto current_mode = CommandQueue::default_mode(); + command_queue.set_mode(CommandQueue::CommandQueueMode::ASYNC); + Program program; + for (int j = 0; j < 10; j++) 
{ + // Asynchronously initialize a buffer on device + uint32_t first_buf_value = j + 1; + uint32_t second_buf_value = j + 2; + uint32_t first_buf_size = 4096; + uint32_t second_buf_size = 2048; + // Asynchronously allocate buffer on device + std::shared_ptr buffer = + Buffer::create(this->device_, first_buf_size, first_buf_size, BufferType::DRAM); + std::shared_ptr allocated_buffer_address = std::make_shared(); + EnqueueGetBufferAddr(this->device_->command_queue(), allocated_buffer_address.get(), buffer.get(), true); + // Ensure returned addr is correct + EXPECT_EQ((*allocated_buffer_address), buffer->address()); + + std::shared_ptr> vec = + std::make_shared>(first_buf_size / 4, first_buf_value); + std::vector readback_vec = {}; + // Write first vector to existing on device buffer. + EnqueueWriteBuffer(this->device_->command_queue(), buffer, vec, false); + // Reallocate the vector in the main thread after asynchronously pushing it (ensure that worker still has access + // to this data) + vec = std::make_shared>(second_buf_size / 4, second_buf_value); + // Simulate what tt-eager does: Share buffer ownership with program + AssignGlobalBufferToProgram(buffer, program); + // Reallocate buffer (this is safe, since the program also owns the existing buffer, which will not be + // deallocated) + buffer = Buffer::create(this->device_, second_buf_size, second_buf_size, BufferType::DRAM); + // Write second vector to second buffer + EnqueueWriteBuffer(this->device_->command_queue(), buffer, vec, false); + // Have main thread give up ownership immediately after writing + vec.reset(); + // Read both buffer and ensure data is correct + EnqueueReadBuffer(this->device_->command_queue(), buffer, readback_vec, true); + for (int i = 0; i < readback_vec.size(); i++) { + EXPECT_EQ(readback_vec[i], second_buf_value); + } + } + command_queue.set_mode(current_mode); +} + +TEST_F(CommandQueueSingleCardBufferFixture, WriteOneTileToDramBank0) { TestBufferConfig config = {.num_pages = 1, 
.page_size = 2048, .buftype = BufferType::DRAM}; for (Device *device : devices_) { tt::log_info("Running On Device {}", device->id()); @@ -336,7 +414,7 @@ TEST_F(CommandQueueSingleCardFixture, WriteOneTileToDramBank0) { } } -TEST_F(CommandQueueSingleCardFixture, WriteOneTileToAllDramBanks) { +TEST_F(CommandQueueSingleCardBufferFixture, WriteOneTileToAllDramBanks) { for (Device *device : devices_) { TestBufferConfig config = { .num_pages = uint32_t(device->num_banks(BufferType::DRAM)), .page_size = 2048, .buftype = BufferType::DRAM}; @@ -345,7 +423,7 @@ TEST_F(CommandQueueSingleCardFixture, WriteOneTileToAllDramBanks) { } } -TEST_F(CommandQueueSingleCardFixture, WriteOneTileAcrossAllDramBanksTwiceRoundRobin) { +TEST_F(CommandQueueSingleCardBufferFixture, WriteOneTileAcrossAllDramBanksTwiceRoundRobin) { constexpr uint32_t num_round_robins = 2; for (Device *device : devices_) { TestBufferConfig config = { @@ -356,7 +434,7 @@ TEST_F(CommandQueueSingleCardFixture, WriteOneTileAcrossAllDramBanksTwiceRoundRo } } -TEST_F(CommandQueueSingleCardFixture, Sending131072Pages) { +TEST_F(CommandQueueSingleCardBufferFixture, Sending131072Pages) { for (Device *device : devices_) { TestBufferConfig config = {.num_pages = 131072, .page_size = 128, .buftype = BufferType::DRAM}; tt::log_info("Running On Device {}", device->id()); @@ -364,7 +442,7 @@ TEST_F(CommandQueueSingleCardFixture, Sending131072Pages) { } } -TEST_F(CommandQueueSingleCardFixture, TestPageLargerThanAndUnalignedToTransferPage) { +TEST_F(CommandQueueSingleCardBufferFixture, TestPageLargerThanAndUnalignedToTransferPage) { constexpr uint32_t num_round_robins = 2; for (Device *device : devices_) { TestBufferConfig config = { @@ -376,7 +454,7 @@ TEST_F(CommandQueueSingleCardFixture, TestPageLargerThanAndUnalignedToTransferPa } } -TEST_F(CommandQueueSingleCardFixture, TestPageLargerThanMaxPrefetchCommandSize) { +TEST_F(CommandQueueSingleCardBufferFixture, TestPageLargerThanMaxPrefetchCommandSize) { constexpr uint32_t 
num_round_robins = 1; for (Device *device : devices_) { CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(device->id()); @@ -390,7 +468,7 @@ TEST_F(CommandQueueSingleCardFixture, TestPageLargerThanMaxPrefetchCommandSize) } } -TEST_F(CommandQueueSingleCardFixture, TestUnalignedPageLargerThanMaxPrefetchCommandSize) { +TEST_F(CommandQueueSingleCardBufferFixture, TestUnalignedPageLargerThanMaxPrefetchCommandSize) { constexpr uint32_t num_round_robins = 1; for (Device *device : devices_) { CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(device->id()); @@ -405,7 +483,7 @@ TEST_F(CommandQueueSingleCardFixture, TestUnalignedPageLargerThanMaxPrefetchComm } } -TEST_F(CommandQueueSingleCardFixture, TestNon32BAlignedPageSizeForDram) { +TEST_F(CommandQueueSingleCardBufferFixture, TestNon32BAlignedPageSizeForDram) { TestBufferConfig config = {.num_pages = 1250, .page_size = 200, .buftype = BufferType::DRAM}; for (Device *device : devices_) { @@ -413,7 +491,7 @@ TEST_F(CommandQueueSingleCardFixture, TestNon32BAlignedPageSizeForDram) { } } -TEST_F(CommandQueueSingleCardFixture, TestNon32BAlignedPageSizeForDram2) { +TEST_F(CommandQueueSingleCardBufferFixture, TestNon32BAlignedPageSizeForDram2) { // From stable diffusion read buffer TestBufferConfig config = {.num_pages = 8 * 1024, .page_size = 80, .buftype = BufferType::DRAM}; @@ -422,16 +500,18 @@ TEST_F(CommandQueueSingleCardFixture, TestNon32BAlignedPageSizeForDram2) { } } -TEST_F(CommandQueueFixture, TestPageSizeTooLarge) { +TEST_F(CommandQueueSingleCardBufferFixture, TestPageSizeTooLarge) { // Should throw a host error due to the page size not fitting in the consumer CB TestBufferConfig config = {.num_pages = 1024, .page_size = 250880 * 2, .buftype = BufferType::DRAM}; - EXPECT_ANY_THROW((local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer( - this->device_, this->device_->command_queue(), config))); + for (Device *device : devices_) { + 
EXPECT_ANY_THROW((local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer( + device, device->command_queue(), config))); + } } // Requires enqueue write buffer -TEST_F(CommandQueueSingleCardFixture, TestWrapHostHugepageOnEnqueueReadBuffer) { +TEST_F(CommandQueueSingleCardBufferFixture, TestWrapHostHugepageOnEnqueueReadBuffer) { for (Device *device : this->devices_) { tt::log_info("Running On Device {}", device->id()); uint32_t page_size = 2048; @@ -449,7 +529,7 @@ TEST_F(CommandQueueSingleCardFixture, TestWrapHostHugepageOnEnqueueReadBuffer) { } } -TEST_F(CommandQueueSingleCardFixture, TestIssueMultipleReadWriteCommandsForOneBuffer) { +TEST_F(CommandQueueSingleCardBufferFixture, TestIssueMultipleReadWriteCommandsForOneBuffer) { for (Device *device : this->devices_) { tt::log_info("Running On Device {}", device->id()); uint32_t page_size = 2048; @@ -464,7 +544,7 @@ TEST_F(CommandQueueSingleCardFixture, TestIssueMultipleReadWriteCommandsForOneBu } // Test that command queue wraps when buffer available space in completion region is less than a page -TEST_F(CommandQueueSingleCardFixture, TestWrapCompletionQOnInsufficientSpace) { +TEST_F(CommandQueueSingleCardBufferFixture, TestWrapCompletionQOnInsufficientSpace) { uint32_t large_page_size = 8192; // page size for first and third read uint32_t small_page_size = 2048; // page size for second read @@ -503,7 +583,7 @@ TEST_F(CommandQueueSingleCardFixture, TestWrapCompletionQOnInsufficientSpace) { // Test that command queue wraps when buffer read needs to be split into multiple enqueue_read_buffer commands and // available space in completion region is less than a page -TEST_F(CommandQueueSingleCardFixture, TestWrapCompletionQOnInsufficientSpace2) { +TEST_F(CommandQueueSingleCardBufferFixture, TestWrapCompletionQOnInsufficientSpace2) { // Using default 75-25 issue and completion queue split for (Device *device : devices_) { tt::log_info("Running On Device {}", device->id()); @@ -536,18 +616,211 @@ 
TEST_F(CommandQueueSingleCardFixture, TestWrapCompletionQOnInsufficientSpace2) { // TODO: add test for wrapping with non aligned page sizes +TEST_F(MultiCommandQueueMultiDeviceBufferFixture, WriteOneTileToDramBank0) { + TestBufferConfig config = {.num_pages = 1, .page_size = 2048, .buftype = BufferType::DRAM}; + for (Device *device : devices_) { + tt::log_info("Running On Device {}", device->id()); + CommandQueue& a = device->command_queue(0); + CommandQueue& b = device->command_queue(1); + vector> cqs = {a, b}; + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(device, cqs, config)); + } + +} + +TEST_F(MultiCommandQueueMultiDeviceBufferFixture, WriteOneTileToAllDramBanks) { + for (Device *device : devices_) { + tt::log_info("Running On Device {}", device->id()); + TestBufferConfig config = { + .num_pages = uint32_t(device->num_banks(BufferType::DRAM)), + .page_size = 2048, + .buftype = BufferType::DRAM}; + + CommandQueue& a = device->command_queue(0); + CommandQueue& b = device->command_queue(1); + vector> cqs = {a, b}; + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(device, cqs, config)); + } +} + +TEST_F(MultiCommandQueueMultiDeviceBufferFixture, WriteOneTileAcrossAllDramBanksTwiceRoundRobin) { + constexpr uint32_t num_round_robins = 2; + for (Device *device : devices_) { + tt::log_info("Running On Device {}", device->id()); + TestBufferConfig config = { + .num_pages = num_round_robins * (device->num_banks(BufferType::DRAM)), + .page_size = 2048, + .buftype = BufferType::DRAM}; + + CommandQueue& a = device->command_queue(0); + CommandQueue& b = device->command_queue(1); + vector> cqs = {a, b}; + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(device, cqs, config)); + } +} + +TEST_F(MultiCommandQueueMultiDeviceBufferFixture, Sending131072Pages) { + // Was a failing case where we used to accidentally program cb num pages to be total 
+ // pages instead of cb num pages. + TestBufferConfig config = { + .num_pages = 131072, + .page_size = 128, + .buftype = BufferType::DRAM}; + for (Device *device : devices_) { + tt::log_info("Running On Device {}", device->id()); + CommandQueue& a = device->command_queue(0); + CommandQueue& b = device->command_queue(1); + vector> cqs = {a, b}; + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(device, cqs, config)); + } +} + +TEST_F(MultiCommandQueueMultiDeviceBufferFixture, TestNon32BAlignedPageSizeForDram) { + for (Device *device : devices_) { + tt::log_info("Running On Device {}", device->id()); + TestBufferConfig config = {.num_pages = 1250, .page_size = 200, .buftype = BufferType::DRAM}; + + CommandQueue& a = device->command_queue(0); + CommandQueue& b = device->command_queue(1); + vector> cqs = {a, b}; + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(device, cqs, config)); + } +} + +TEST_F(MultiCommandQueueMultiDeviceBufferFixture, TestNon32BAlignedPageSizeForDram2) { + for (Device *device : devices_) { + tt::log_info("Running On Device {}", device->id()); + // From stable diffusion read buffer + TestBufferConfig config = {.num_pages = 8 * 1024, .page_size = 80, .buftype = BufferType::DRAM}; + + CommandQueue& a = device->command_queue(0); + CommandQueue& b = device->command_queue(1); + vector> cqs = {a, b}; + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(device, cqs, config)); + } +} + +TEST_F(MultiCommandQueueMultiDeviceBufferFixture, TestIssueMultipleReadWriteCommandsForOneBuffer) { + for (Device *device : devices_) { + tt::log_info("Running On Device {}", device->id()); + uint32_t page_size = 2048; + uint32_t command_queue_size = device->sysmem_manager().get_cq_size(); + uint32_t num_pages = command_queue_size / page_size; + + TestBufferConfig config = {.num_pages = num_pages, .page_size = page_size, .buftype = 
BufferType::DRAM}; + + CommandQueue& a = device->command_queue(0); + CommandQueue& b = device->command_queue(1); + vector> cqs = {a, b}; + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(device, cqs, config)); + } +} + +TEST_F(MultiCommandQueueSingleDeviceBufferFixture, WriteOneTileToDramBank0) { + TestBufferConfig config = {.num_pages = 1, .page_size = 2048, .buftype = BufferType::DRAM}; + CommandQueue& a = this->device_->command_queue(0); + CommandQueue& b = this->device_->command_queue(1); + vector> cqs = {a, b}; + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config)); +} + +TEST_F(MultiCommandQueueSingleDeviceBufferFixture, WriteOneTileToAllDramBanks) { + TestBufferConfig config = { + .num_pages = uint32_t(this->device_->num_banks(BufferType::DRAM)), + .page_size = 2048, + .buftype = BufferType::DRAM}; + + CommandQueue& a = this->device_->command_queue(0); + CommandQueue& b = this->device_->command_queue(1); + vector> cqs = {a, b}; + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config)); +} + +TEST_F(MultiCommandQueueSingleDeviceBufferFixture, WriteOneTileAcrossAllDramBanksTwiceRoundRobin) { + constexpr uint32_t num_round_robins = 2; + TestBufferConfig config = { + .num_pages = num_round_robins * (this->device_->num_banks(BufferType::DRAM)), + .page_size = 2048, + .buftype = BufferType::DRAM}; + + CommandQueue& a = this->device_->command_queue(0); + CommandQueue& b = this->device_->command_queue(1); + vector> cqs = {a, b}; + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config)); +} + +TEST_F(MultiCommandQueueSingleDeviceBufferFixture, Sending131072Pages) { + // Was a failing case where we used to accidentally program cb num pages to be total + // pages instead of cb num pages. 
+ TestBufferConfig config = { + .num_pages = 131072, + .page_size = 128, + .buftype = BufferType::DRAM}; + + CommandQueue& a = this->device_->command_queue(0); + CommandQueue& b = this->device_->command_queue(1); + vector> cqs = {a, b}; + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config)); +} + +TEST_F(MultiCommandQueueSingleDeviceBufferFixture, TestNon32BAlignedPageSizeForDram) { + TestBufferConfig config = {.num_pages = 1250, .page_size = 200, .buftype = BufferType::DRAM}; + + CommandQueue& a = this->device_->command_queue(0); + CommandQueue& b = this->device_->command_queue(1); + vector> cqs = {a, b}; + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config)); +} + +TEST_F(MultiCommandQueueSingleDeviceBufferFixture, TestNon32BAlignedPageSizeForDram2) { + // From stable diffusion read buffer + TestBufferConfig config = {.num_pages = 8 * 1024, .page_size = 80, .buftype = BufferType::DRAM}; + + CommandQueue& a = this->device_->command_queue(0); + CommandQueue& b = this->device_->command_queue(1); + vector> cqs = {a, b}; + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config)); +} + +TEST_F(MultiCommandQueueSingleDeviceBufferFixture, TestPageSizeTooLarge) { + if (this->arch_ == tt::ARCH::WORMHOLE_B0) { + GTEST_SKIP(); // This test hangs on wormhole b0 + } + // Should throw a host error due to the page size not fitting in the consumer CB + TestBufferConfig config = {.num_pages = 1024, .page_size = 250880 * 2, .buftype = BufferType::DRAM}; + + CommandQueue& a = this->device_->command_queue(0); + CommandQueue& b = this->device_->command_queue(1); + vector> cqs = {a, b}; + EXPECT_ANY_THROW(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config)); +} + +TEST_F(MultiCommandQueueSingleDeviceBufferFixture, 
TestIssueMultipleReadWriteCommandsForOneBuffer) { + uint32_t page_size = 2048; + uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(this->device_->id()); + uint32_t command_queue_size = tt::Cluster::instance().get_host_channel_size(this->device_->id(), channel); + uint32_t num_pages = command_queue_size / page_size; + + TestBufferConfig config = {.num_pages = num_pages, .page_size = page_size, .buftype = BufferType::DRAM}; + + CommandQueue& a = this->device_->command_queue(0); + CommandQueue& b = this->device_->command_queue(1); + vector> cqs = {a, b}; + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config)); +} + } // end namespace dram_tests namespace l1_tests { -TEST_F(CommandQueueSingleCardFixture, WriteOneTileToL1Bank0) { +TEST_F(CommandQueueSingleCardBufferFixture, WriteOneTileToL1Bank0) { TestBufferConfig config = {.num_pages = 1, .page_size = 2048, .buftype = BufferType::L1}; for (Device *device : devices_) { local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer(device, device->command_queue(), config); } } -TEST_F(CommandQueueSingleCardFixture, WriteOneTileToAllL1Banks) { +TEST_F(CommandQueueSingleCardBufferFixture, WriteOneTileToAllL1Banks) { for (Device *device : devices_) { auto compute_with_storage_grid = device->compute_with_storage_grid_size(); TestBufferConfig config = { @@ -559,7 +832,7 @@ TEST_F(CommandQueueSingleCardFixture, WriteOneTileToAllL1Banks) { } } -TEST_F(CommandQueueSingleCardFixture, WriteOneTileToAllL1BanksTwiceRoundRobin) { +TEST_F(CommandQueueSingleCardBufferFixture, WriteOneTileToAllL1BanksTwiceRoundRobin) { for (Device *device : devices_) { auto compute_with_storage_grid = device->compute_with_storage_grid_size(); TestBufferConfig config = { @@ -571,7 +844,7 @@ TEST_F(CommandQueueSingleCardFixture, WriteOneTileToAllL1BanksTwiceRoundRobin) { } } -TEST_F(CommandQueueSingleCardFixture, TestNon32BAlignedPageSizeForL1) { 
+TEST_F(CommandQueueSingleCardBufferFixture, TestNon32BAlignedPageSizeForL1) { TestBufferConfig config = {.num_pages = 1250, .page_size = 200, .buftype = BufferType::L1}; for (Device *device : devices_) { @@ -582,7 +855,7 @@ TEST_F(CommandQueueSingleCardFixture, TestNon32BAlignedPageSizeForL1) { } } -TEST_F(CommandQueueSingleCardFixture, TestBackToBackNon32BAlignedPageSize) { +TEST_F(CommandQueueSingleCardBufferFixture, TestBackToBackNon32BAlignedPageSize) { constexpr BufferType buff_type = BufferType::L1; for (Device *device : devices_) { @@ -606,7 +879,7 @@ TEST_F(CommandQueueSingleCardFixture, TestBackToBackNon32BAlignedPageSize) { } // This case was failing for FD v1.3 design -TEST_F(CommandQueueSingleCardFixture, TestLargeBuffer4096BPageSize) { +TEST_F(CommandQueueSingleCardBufferFixture, TestLargeBuffer4096BPageSize) { constexpr BufferType buff_type = BufferType::L1; for (Device *device : devices_) { @@ -616,9 +889,107 @@ TEST_F(CommandQueueSingleCardFixture, TestLargeBuffer4096BPageSize) { } } +TEST_F(MultiCommandQueueSingleDeviceBufferFixture, WriteOneTileToL1Bank0) { + TestBufferConfig config = {.num_pages = 1, .page_size = 2048, .buftype = BufferType::L1}; + CommandQueue& a = this->device_->command_queue(0); + CommandQueue& b = this->device_->command_queue(1); + vector> cqs = {a, b}; + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config)); +} + +TEST_F(MultiCommandQueueSingleDeviceBufferFixture, WriteOneTileToAllL1Banks) { + auto compute_with_storage_grid = this->device_->compute_with_storage_grid_size(); + TestBufferConfig config = { + .num_pages = uint32_t(compute_with_storage_grid.x * compute_with_storage_grid.y), + .page_size = 2048, + .buftype = BufferType::L1}; + + CommandQueue& a = this->device_->command_queue(0); + CommandQueue& b = this->device_->command_queue(1); + vector> cqs = {a, b}; + 
EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config)); +} + +TEST_F(MultiCommandQueueSingleDeviceBufferFixture, WriteOneTileToAllL1BanksTwiceRoundRobin) { + auto compute_with_storage_grid = this->device_->compute_with_storage_grid_size(); + TestBufferConfig config = { + .num_pages = 2 * uint32_t(compute_with_storage_grid.x * compute_with_storage_grid.y), + .page_size = 2048, + .buftype = BufferType::L1}; + + CommandQueue& a = this->device_->command_queue(0); + CommandQueue& b = this->device_->command_queue(1); + vector> cqs = {a, b}; + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config)); +} + +TEST_F(MultiCommandQueueSingleDeviceBufferFixture, TestNon32BAlignedPageSizeForL1) { + TestBufferConfig config = {.num_pages = 1250, .page_size = 200, .buftype = BufferType::L1}; + + CommandQueue& a = this->device_->command_queue(0); + CommandQueue& b = this->device_->command_queue(1); + vector> cqs = {a, b}; + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config)); +} + +TEST_F(MultiCommandQueueMultiDeviceBufferFixture, WriteOneTileToL1Bank0) { + for (Device *device : devices_) { + tt::log_info("Running On Device {}", device->id()); + TestBufferConfig config = {.num_pages = 1, .page_size = 2048, .buftype = BufferType::L1}; + CommandQueue& a = device->command_queue(0); + CommandQueue& b = device->command_queue(1); + vector> cqs = {a, b}; + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(device, cqs, config)); + } +} + +TEST_F(MultiCommandQueueMultiDeviceBufferFixture, WriteOneTileToAllL1Banks) { + for (Device *device : devices_) { + tt::log_info("Running On Device {}", device->id()); + auto compute_with_storage_grid = device->compute_with_storage_grid_size(); + TestBufferConfig config = { + .num_pages = 
uint32_t(compute_with_storage_grid.x * compute_with_storage_grid.y), + .page_size = 2048, + .buftype = BufferType::L1}; + + CommandQueue& a = device->command_queue(0); + CommandQueue& b = device->command_queue(1); + vector> cqs = {a, b}; + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(device, cqs, config)); + } +} + +TEST_F(MultiCommandQueueMultiDeviceBufferFixture, WriteOneTileToAllL1BanksTwiceRoundRobin) { + for (Device *device : devices_) { + tt::log_info("Running On Device {}", device->id()); + auto compute_with_storage_grid = device->compute_with_storage_grid_size(); + TestBufferConfig config = { + .num_pages = 2 * uint32_t(compute_with_storage_grid.x * compute_with_storage_grid.y), + .page_size = 2048, + .buftype = BufferType::L1}; + + CommandQueue& a = device->command_queue(0); + CommandQueue& b = device->command_queue(1); + vector> cqs = {a, b}; + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(device, cqs, config)); + } +} + +TEST_F(MultiCommandQueueMultiDeviceBufferFixture, TestNon32BAlignedPageSizeForL1) { + for (Device *device : devices_) { + tt::log_info("Running On Device {}", device->id()); + TestBufferConfig config = {.num_pages = 1250, .page_size = 200, .buftype = BufferType::L1}; + + CommandQueue& a = device->command_queue(0); + CommandQueue& b = device->command_queue(1); + vector> cqs = {a, b}; + EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(device, cqs, config)); + } +} + } // end namespace l1_tests -TEST_F(CommandQueueSingleCardFixture, TestNonblockingReads) { +TEST_F(CommandQueueSingleCardBufferFixture, TestNonblockingReads) { constexpr BufferType buff_type = BufferType::L1; for (auto device : devices_) { @@ -648,7 +1019,7 @@ namespace stress_tests { // TODO: Add stress test that vary page size -TEST_F(CommandQueueSingleCardFixture, WritesToRandomBufferTypeAndThenReadsBlocking) { 
+TEST_F(CommandQueueSingleCardBufferFixture, WritesToRandomBufferTypeAndThenReadsBlocking) { BufferStressTestConfig config = { .seed = 0, .num_pages_total = 50000, .page_size = 2048, .max_num_pages_per_buffer = 16}; @@ -659,7 +1030,7 @@ TEST_F(CommandQueueSingleCardFixture, WritesToRandomBufferTypeAndThenReadsBlocki } } -TEST_F(CommandQueueSingleCardFixture, WritesToRandomBufferTypeAndThenReadsNonblocking) { +TEST_F(CommandQueueSingleCardBufferFixture, WritesToRandomBufferTypeAndThenReadsNonblocking) { BufferStressTestConfig config = { .seed = 0, .num_pages_total = 50000, .page_size = 2048, .max_num_pages_per_buffer = 16}; @@ -672,7 +1043,7 @@ TEST_F(CommandQueueSingleCardFixture, WritesToRandomBufferTypeAndThenReadsNonblo } // TODO: Split this into separate tests -TEST_F(CommandQueueSingleCardFixture, ShardedBufferL1ReadWrites) { +TEST_F(CommandQueueSingleCardBufferFixture, ShardedBufferL1ReadWrites) { std::map>> test_params; for (Device *device : devices_) { @@ -726,7 +1097,7 @@ TEST_F(CommandQueueSingleCardFixture, ShardedBufferL1ReadWrites) { } } -TEST_F(CommandQueueSingleCardFixture, ShardedBufferDRAMReadWrites) { +TEST_F(CommandQueueSingleCardBufferFixture, ShardedBufferDRAMReadWrites) { for (Device *device : devices_) { for (const std::array cores : {std::array{1, 1}, @@ -784,7 +1155,7 @@ TEST_F(CommandQueueSingleCardFixture, ShardedBufferDRAMReadWrites) { } } -TEST_F(CommandQueueSingleCardFixture, ShardedBufferLargeL1ReadWrites) { +TEST_F(CommandQueueSingleCardBufferFixture, ShardedBufferLargeL1ReadWrites) { for (Device *device : devices_) { for (const std::array cores : {std::array{1, 1}, @@ -826,7 +1197,7 @@ TEST_F(CommandQueueSingleCardFixture, ShardedBufferLargeL1ReadWrites) { } } -TEST_F(CommandQueueSingleCardFixture, ShardedBufferLargeDRAMReadWrites) { +TEST_F(CommandQueueSingleCardBufferFixture, ShardedBufferLargeDRAMReadWrites) { for (Device *device : devices_) { for (const std::array cores : {std::array{1, 1}, @@ -880,7 +1251,7 @@ 
TEST_F(CommandQueueSingleCardFixture, ShardedBufferLargeDRAMReadWrites) { } } -TEST_F(CommandQueueFixture, StressWrapTest) { +TEST_F(CommandQueueSingleCardBufferFixture, StressWrapTest) { const char *arch = getenv("ARCH_NAME"); if (strcasecmp(arch, "wormhole_b0") == 0) { tt::log_info("cannot run this test on WH B0"); @@ -890,8 +1261,10 @@ TEST_F(CommandQueueFixture, StressWrapTest) { BufferStressTestConfig config = { .page_size = 4096, .max_num_pages_per_buffer = 2000, .num_iterations = 10000, .num_unique_vectors = 20}; - EXPECT_TRUE(local_test_functions::stress_test_EnqueueWriteBuffer_and_EnqueueReadBuffer_wrap( - this->device_, this->device_->command_queue(), config)); + for (Device *device : devices_) { + EXPECT_TRUE(local_test_functions::stress_test_EnqueueWriteBuffer_and_EnqueueReadBuffer_wrap( + device, device->command_queue(), config)); + } } } // end namespace stress_tests diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_sub_device.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_sub_device.cpp new file mode 100644 index 00000000000..59a45d5e490 --- /dev/null +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_sub_device.cpp @@ -0,0 +1,108 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "tt_metal/common/core_coord.hpp" +#include "tt_metal/impl/buffers/global_semaphore.hpp" +#include "tt_metal/impl/device/device.hpp" +#include "tt_metal/impl/event/event.hpp" +#include "tt_metal/impl/sub_device/sub_device.hpp" +#include "tt_metal/test_utils/stimulus.hpp" +#include "command_queue_fixture.hpp" + +TEST_F(CommandQueueSingleCardFixture, TensixTestSubDeviceAllocations) { + uint32_t local_l1_size = 3200; + SubDevice sub_device_1(std::array{CoreRangeSet(CoreRange({0, 0}, {2, 2}))}); + SubDevice sub_device_2(std::array{CoreRangeSet(std::vector{CoreRange({3, 3}, {3, 3}), CoreRange({4, 4}, {4, 4})})}); + CoreRangeSet sharded_cores_1 = CoreRange({0, 0}, {2, 2}); + CoreRangeSet sharded_cores_2 = CoreRange({4, 4}, {4, 4}); + + auto sharded_cores_1_vec = corerange_to_cores(sharded_cores_1, std::nullopt, true); + auto sharded_cores_2_vec = corerange_to_cores(sharded_cores_2, std::nullopt, true); + + ShardSpecBuffer shard_spec_buffer_1 = ShardSpecBuffer(sharded_cores_1, {1, 1}, ShardOrientation::ROW_MAJOR, false, {1, 1}, {sharded_cores_1.num_cores(), 1}); + uint32_t page_size_1 = 32; + ShardedBufferConfig shard_config_1 = {nullptr, sharded_cores_1.num_cores() * page_size_1, page_size_1, BufferType::L1, TensorMemoryLayout::HEIGHT_SHARDED, shard_spec_buffer_1}; + auto input_1 = tt::test_utils::generate_uniform_random_vector(0, 100, shard_config_1.size / sizeof(uint32_t)); + + ShardSpecBuffer shard_spec_buffer_2 = ShardSpecBuffer(sharded_cores_2, {1, 1}, ShardOrientation::ROW_MAJOR, false, {1, 1}, {sharded_cores_2.num_cores(), 1}); + uint32_t page_size_2 = 64; + ShardedBufferConfig shard_config_2 = {nullptr, sharded_cores_2.num_cores() * page_size_2, page_size_2, BufferType::L1, TensorMemoryLayout::HEIGHT_SHARDED, shard_spec_buffer_2}; + auto input_2 = tt::test_utils::generate_uniform_random_vector(0, 100, shard_config_2.size / 
sizeof(uint32_t)); + + uint32_t page_size_3 = 1024; + InterleavedBufferConfig interleaved_config = {nullptr, page_size_3, page_size_3, BufferType::L1, TensorMemoryLayout::INTERLEAVED}; + auto input_3 = tt::test_utils::generate_uniform_random_vector(0, 100, interleaved_config.size / sizeof(uint32_t)); + + for (Device *device : devices_) { + auto sub_device_manager_1 = device->create_sub_device_manager({sub_device_1}, local_l1_size); + auto sub_device_manager_2 = device->create_sub_device_manager({sub_device_1, sub_device_2}, local_l1_size); + DeviceAddr l1_unreserved_base = device->get_base_allocator_addr(HalMemType::L1); + DeviceAddr max_addr = l1_unreserved_base + local_l1_size; + + shard_config_1.device = device; + shard_config_2.device = device; + interleaved_config.device = device; + + std::vector physical_cores_1; + physical_cores_1.reserve(sharded_cores_1_vec.size()); + for (const auto& core : sharded_cores_1_vec) { + physical_cores_1.push_back(device->worker_core_from_logical_core(core)); + } + + std::vector physical_cores_2; + physical_cores_2.reserve(sharded_cores_2_vec.size()); + for (const auto& core : sharded_cores_2_vec) { + physical_cores_2.push_back(device->worker_core_from_logical_core(core)); + } + + device->load_sub_device_manager(sub_device_manager_1); + + auto buffer_1 = CreateBuffer(shard_config_1, SubDeviceId{0}); + EXPECT_EQ(buffer_1->address(), max_addr - buffer_1->aligned_page_size()); + EnqueueWriteBuffer(device->command_queue(), buffer_1, input_1, false); + std::vector output_1; + EnqueueReadBuffer(device->command_queue(), buffer_1, output_1, true); + EXPECT_EQ(input_1, output_1); + auto input_1_it = input_1.begin(); + for (const auto& physical_core : physical_cores_1) { + auto readback = tt::llrt::read_hex_vec_from_core( + device->id(), physical_core, buffer_1->address(), page_size_1); + EXPECT_TRUE(std::equal(input_1_it, input_1_it + page_size_1 / sizeof(uint32_t), readback.begin())); + input_1_it += page_size_1 / sizeof(uint32_t); + } 
+ + auto buffer_2 = CreateBuffer(interleaved_config); + EXPECT_THROW(CreateBuffer(shard_config_1, SubDeviceId{1}), std::exception); + EXPECT_THROW(device->clear_loaded_sub_device_manager(), std::exception); + EXPECT_THROW(device->load_sub_device_manager(sub_device_manager_2), std::exception); + DeallocateBuffer(*buffer_1); + device->clear_loaded_sub_device_manager(); + device->load_sub_device_manager(sub_device_manager_2); + + auto buffer_3 = CreateBuffer(shard_config_2, SubDeviceId{1}); + EXPECT_EQ(buffer_3->address(), max_addr - buffer_3->aligned_page_size()); + EnqueueWriteBuffer(device->command_queue(), buffer_3, input_2, false); + std::vector output_2; + EnqueueReadBuffer(device->command_queue(), buffer_3, output_2, true); + EXPECT_EQ(input_2, output_2); + auto input_2_it = input_2.begin(); + for (const auto& physical_core : physical_cores_2) { + auto readback = tt::llrt::read_hex_vec_from_core( + device->id(), physical_core, buffer_3->address(), page_size_2); + EXPECT_TRUE(std::equal(input_2_it, input_2_it + page_size_2 / sizeof(uint32_t), readback.begin())); + input_2_it += page_size_2 / sizeof(uint32_t); + } + + auto buffer_4 = CreateBuffer(shard_config_1, SubDeviceId{0}); + EXPECT_EQ(buffer_4->address(), max_addr - buffer_4->aligned_page_size()); + EXPECT_THROW(CreateBuffer(interleaved_config, SubDeviceId{0}), std::exception); + } +} diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_event/CMakeLists.txt b/tests/tt_metal/tt_metal/dispatch/dispatch_event/CMakeLists.txt new file mode 100644 index 00000000000..0db070a4ba0 --- /dev/null +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_event/CMakeLists.txt @@ -0,0 +1,34 @@ +set(UNIT_TESTS_DISPATCH_EVENT_SRC + ${CMAKE_CURRENT_SOURCE_DIR}/test_EnqueueWaitForEvent.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_events.cpp +) + +add_library(unit_tests_dispatch_event_o STATIC ${UNIT_TESTS_DISPATCH_EVENT_SRC}) + +target_link_libraries(unit_tests_dispatch_event_o PRIVATE test_metal_common_libs) + 
+target_include_directories( + unit_tests_dispatch_event_o + PRIVATE + ${PROJECT_SOURCE_DIR} + ${PROJECT_SOURCE_DIR}/tt_metal + ${PROJECT_SOURCE_DIR}/tests + ${PROJECT_SOURCE_DIR}/tests/tt_metal/tt_metal/common + ${PROJECT_SOURCE_DIR}/tests/tt_metal/tt_metal/dispatch + ${PROJECT_SOURCE_DIR}/tests/tt_metal/tt_metal/dispatch/common + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/common +) + +add_executable(unit_tests_dispatch_event $) + +target_link_libraries(unit_tests_dispatch_event PRIVATE test_metal_common_libs) + +set_target_properties( + unit_tests_dispatch_event + PROPERTIES + RUNTIME_OUTPUT_DIRECTORY + ${PROJECT_BINARY_DIR}/test/tt_metal +) + +TT_ENABLE_UNITY_BUILD(unit_tests_dispatch_event) diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueWaitForEvent.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_event/test_EnqueueWaitForEvent.cpp similarity index 96% rename from tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueWaitForEvent.cpp rename to tests/tt_metal/tt_metal/dispatch/dispatch_event/test_EnqueueWaitForEvent.cpp index 1c08c86fa15..52df4497afa 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueWaitForEvent.cpp +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_event/test_EnqueueWaitForEvent.cpp @@ -4,12 +4,11 @@ #include -#include "command_queue_fixture.hpp" +#include "multi_command_queue_fixture.hpp" #include "tt_metal/common/logger.hpp" #include "gtest/gtest.h" #include "tt_metal/host_api.hpp" -#include "tt_metal/test_utils/env_vars.hpp" -#include "command_queue_test_utils.hpp" +#include "dispatch_test_utils.hpp" #include "tt_metal/impl/event/event.hpp" #include "tt_metal/impl/device/device.hpp" @@ -23,14 +22,12 @@ void FinishAllCqs(vector>& cqs) { Finish(cqs[i]); } } - - } namespace basic_tests { // Simplest test to record Event per CQ and wait from host, and verify 
populated Event struct is correct (many events, wrap issue queue) -TEST_F(MultiCommandQueueMultiDeviceFixture, TestEventsEventSynchronizeSanity) { +TEST_F(MultiCommandQueueMultiDeviceEventFixture, TestEventsEventSynchronizeSanity) { for (Device *device : devices_) { tt::log_info("Running On Device {}", device->id()); vector> cqs = {device->command_queue(0), device->command_queue(1)}; @@ -70,7 +67,7 @@ TEST_F(MultiCommandQueueMultiDeviceFixture, TestEventsEventSynchronizeSanity) { } // Simplest test to record Event per CQ and wait from host, and verify populated Event struct is correct (many events, wrap issue queue) -TEST_F(MultiCommandQueueSingleDeviceFixture, TestEventsEventSynchronizeSanity) { +TEST_F(MultiCommandQueueSingleDeviceEventFixture, TestEventsEventSynchronizeSanity) { vector> cqs = {this->device_->command_queue(0), this->device_->command_queue(1)}; vector cmds_issued_per_cq = {0, 0}; @@ -108,7 +105,7 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestEventsEventSynchronizeSanity) { } // Simplest test to record and wait-for-events on same CQ. -TEST_F(MultiCommandQueueSingleDeviceFixture, TestEventsEnqueueWaitForEventSanity) { +TEST_F(MultiCommandQueueSingleDeviceEventFixture, TestEventsEnqueueWaitForEventSanity) { vector> cqs = {this->device_->command_queue(0), this->device_->command_queue(1)}; vector events_issued_per_cq = {0, 0}; size_t num_events = 10; @@ -136,7 +133,7 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestEventsEnqueueWaitForEventSanity // Record event on one CQ, wait-for-that-event on another CQ. Then do the flip. Occasionally insert // syncs from Host per CQ, and verify completion queues per CQ are correct. 
-TEST_F(MultiCommandQueueSingleDeviceFixture, TestEventsEnqueueWaitForEventCrossCQs) { +TEST_F(MultiCommandQueueSingleDeviceEventFixture, TestEventsEnqueueWaitForEventCrossCQs) { vector> cqs = {this->device_->command_queue(0), this->device_->command_queue(1)}; vector cmds_issued_per_cq = {0, 0}; const size_t num_events_per_cq = 10; @@ -200,7 +197,7 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestEventsEnqueueWaitForEventCrossC // Simple 2CQ test to mix reads, writes, record-event, wait-for-event in a basic way. It's simple because // the write, record-event, wait-event, read-event are all on the same CQ, but cover both CQ's. -TEST_F(MultiCommandQueueSingleDeviceFixture, TestEventsReadWriteWithWaitForEventSameCQ) { +TEST_F(MultiCommandQueueSingleDeviceEventFixture, TestEventsReadWriteWithWaitForEventSameCQ) { TestBufferConfig config = {.num_pages = 1, .page_size = 256, .buftype = BufferType::DRAM}; vector> cqs = {this->device_->command_queue(0), this->device_->command_queue(1)}; vector cmds_issued_per_cq = {0, 0}; @@ -247,7 +244,7 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestEventsReadWriteWithWaitForEvent // More interesting test where Blocking ReadBuffer, Non-Blocking WriteBuffer are on alternate CQs, // ordered via events. Do many loops, occasionally increasing size of buffers (page size, num pages). // Ensure read back data is correct, data is different for each write. -TEST_F(MultiCommandQueueSingleDeviceFixture, TestEventsReadWriteWithWaitForEventCrossCQs) { +TEST_F(MultiCommandQueueSingleDeviceEventFixture, TestEventsReadWriteWithWaitForEventCrossCQs) { if (tt::Cluster::instance().arch() == tt::ARCH::GRAYSKULL) { GTEST_SKIP() << "Skipping for GS due to readback mismatch under debug Github issue #6281 "; } @@ -307,7 +304,7 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestEventsReadWriteWithWaitForEvent // 2 CQs with single Buffer, and a loop where each iteration has non-blocking Write to Buffer via CQ0 and non-blocking Read // to Bufffer via CQ1. 
Ping-Pongs between Writes and Reads to same buffer. Use events to synchronze read after write and // write after read before checking correct data read at the end after all cmds finished on device. -TEST_F(MultiCommandQueueSingleDeviceFixture, TestEventsReadWriteWithWaitForEventCrossCQsPingPong) { +TEST_F(MultiCommandQueueSingleDeviceEventFixture, TestEventsReadWriteWithWaitForEventCrossCQsPingPong) { if (tt::Cluster::instance().arch() == tt::ARCH::GRAYSKULL) { GTEST_SKIP() << "Skipping for GS due to readback mismatch under debug Github issue #6281 "; } diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_events.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_event/test_events.cpp similarity index 95% rename from tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_events.cpp rename to tests/tt_metal/tt_metal/dispatch/dispatch_event/test_events.cpp index 023462a6cd2..6f26024a085 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_events.cpp +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_event/test_events.cpp @@ -3,9 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 #include "command_queue_fixture.hpp" -#include "command_queue_test_utils.hpp" #include "gtest/gtest.h" -#include "tt_metal/common/bfloat16.hpp" #include "tt_metal/host_api.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "impl/debug/watcher_server.hpp" @@ -15,15 +13,15 @@ using std::vector; using namespace tt::tt_metal; +constexpr uint32_t completion_queue_event_offset = sizeof(CQDispatchCmd); +constexpr uint32_t completion_queue_page_size = dispatch_constants::TRANSFER_PAGE_SIZE; + enum class DataMovementMode: uint8_t { WRITE = 0, READ = 1 }; -constexpr uint32_t completion_queue_event_offset = sizeof(CQDispatchCmd); -constexpr uint32_t completion_queue_page_size = dispatch_constants::TRANSFER_PAGE_SIZE; - -TEST_F(CommandQueueFixture, TestEventsDataMovementWrittenToCompletionQueueInOrder) { +TEST_F(CommandQueueEventFixture, 
TestEventsDataMovementWrittenToCompletionQueueInOrder) { size_t num_buffers = 100; uint32_t page_size = 2048; vector page(page_size / sizeof(uint32_t)); @@ -75,7 +73,7 @@ TEST_F(CommandQueueFixture, TestEventsDataMovementWrittenToCompletionQueueInOrde } // Basic test, record events, check that Event struct was updated. Enough commands to trigger issue queue wrap. -TEST_F(CommandQueueFixture, TestEventsEnqueueRecordEventIssueQueueWrap) { +TEST_F(CommandQueueEventFixture, TestEventsEnqueueRecordEventIssueQueueWrap) { const size_t num_events = 100000; // Enough to wrap issue queue. 768MB and cmds are 22KB each, so 35k cmds. uint32_t cmds_issued_per_cq = 0; @@ -96,7 +94,7 @@ TEST_F(CommandQueueFixture, TestEventsEnqueueRecordEventIssueQueueWrap) { } // Test where Host synchronously waits for event to be completed. -TEST_F(CommandQueueFixture, TestEventsEnqueueRecordEventAndSynchronize) { +TEST_F(CommandQueueEventFixture, TestEventsEnqueueRecordEventAndSynchronize) { const size_t num_events = 100; const size_t num_events_between_sync = 10; @@ -128,7 +126,7 @@ TEST_F(CommandQueueFixture, TestEventsEnqueueRecordEventAndSynchronize) { // Negative test. Host syncing on a future event that isn't actually issued. // Ensure that expected hang is seen, which indicates event sync feature is working properly. -TEST_F(CommandQueueFixture, TestEventsEnqueueRecordEventAndSynchronizeHang) { +TEST_F(CommandQueueEventFixture, TestEventsEnqueueRecordEventAndSynchronizeHang) { tt::llrt::OptionsG.set_test_mode_enabled(true); // Required for finish hang breakout. auto future_event = std::make_shared(); @@ -155,7 +153,7 @@ TEST_F(CommandQueueFixture, TestEventsEnqueueRecordEventAndSynchronizeHang) { // Negative test. Device sync. Single CQ here syncing on a future event that isn't actually issued. // Ensure that expected hang is seen, which indicates event sync feature is working properly. 
-TEST_F(CommandQueueFixture, TestEventsQueueWaitForEventHang) { +TEST_F(CommandQueueEventFixture, TestEventsQueueWaitForEventHang) { // Skip this test until #7216 is implemented. GTEST_SKIP(); tt::llrt::OptionsG.set_test_mode_enabled(true); // Required for finish hang breakout. @@ -183,7 +181,7 @@ TEST_F(CommandQueueFixture, TestEventsQueueWaitForEventHang) { } // Device sync. Single CQ here, less interesting than 2CQ but still useful. Ensure no hangs. -TEST_F(CommandQueueFixture, TestEventsQueueWaitForEventBasic) { +TEST_F(CommandQueueEventFixture, TestEventsQueueWaitForEventBasic) { const size_t num_events = 50; const size_t num_events_between_sync = 5; @@ -214,7 +212,7 @@ TEST_F(CommandQueueFixture, TestEventsQueueWaitForEventBasic) { } // Device sync. Single CQ here, less interesting than 2CQ but still useful. Ensure no hangs. -TEST_F(CommandQueueFixture, TestEventsEventsQueryBasic) { +TEST_F(CommandQueueEventFixture, TestEventsEventsQueryBasic) { const size_t num_events = 50; const size_t num_events_between_query = 5; @@ -260,7 +258,7 @@ TEST_F(CommandQueueFixture, TestEventsEventsQueryBasic) { // Mix of WritesBuffers, RecordEvent, WaitForEvent, EventSynchronize with some checking. 
-TEST_F(CommandQueueFixture, TestEventsMixedWriteBufferRecordWaitSynchronize) { +TEST_F(CommandQueueEventFixture, TestEventsMixedWriteBufferRecordWaitSynchronize) { const size_t num_buffers = 2; const uint32_t page_size = 2048; vector page(page_size / sizeof(uint32_t)); diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_program/CMakeLists.txt b/tests/tt_metal/tt_metal/dispatch/dispatch_program/CMakeLists.txt new file mode 100644 index 00000000000..261109184f8 --- /dev/null +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_program/CMakeLists.txt @@ -0,0 +1,36 @@ +set(UNIT_TESTS_DISPATCH_PROGRAM_SRC + ${CMAKE_CURRENT_SOURCE_DIR}/test_dispatch_stress.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_dispatch.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_EnqueueProgram.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_sub_device.cpp +) + +add_library(unit_tests_dispatch_program_o STATIC ${UNIT_TESTS_DISPATCH_PROGRAM_SRC}) + +target_link_libraries(unit_tests_dispatch_program_o PRIVATE test_metal_common_libs) + +target_include_directories( + unit_tests_dispatch_program_o + PRIVATE + ${PROJECT_SOURCE_DIR} + ${PROJECT_SOURCE_DIR}/tt_metal + ${PROJECT_SOURCE_DIR}/tests + ${PROJECT_SOURCE_DIR}/tests/tt_metal/tt_metal/common + ${PROJECT_SOURCE_DIR}/tests/tt_metal/tt_metal/dispatch + ${PROJECT_SOURCE_DIR}/tests/tt_metal/tt_metal/dispatch/common + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/common +) + +add_executable(unit_tests_dispatch_program $) + +target_link_libraries(unit_tests_dispatch_program PRIVATE test_metal_common_libs) + +set_target_properties( + unit_tests_dispatch_program + PROPERTIES + RUNTIME_OUTPUT_DIRECTORY + ${PROJECT_BINARY_DIR}/test/tt_metal +) + +TT_ENABLE_UNITY_BUILD(unit_tests_dispatch_program) diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_program/program_with_kernel_created_from_string_fixture.hpp b/tests/tt_metal/tt_metal/dispatch/dispatch_program/program_with_kernel_created_from_string_fixture.hpp new file mode 100644 index 00000000000..b15f3cc8deb --- 
/dev/null +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_program/program_with_kernel_created_from_string_fixture.hpp @@ -0,0 +1,27 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include "dispatch_fixture.hpp" + +class ProgramWithKernelCreatedFromStringFixture : public DispatchFixture { + protected: + void SetUp() override { + DispatchFixture::SetUp(); + for (Device *device : this->devices_) + { + const chip_id_t device_id = device->id(); + this->device_ids_to_devices_[device_id] = device; + } + } + + void TearDown() override { + detail::CloseDevices(this->device_ids_to_devices_); + } + + private: + std::map device_ids_to_devices_; +}; diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueProgram.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_EnqueueProgram.cpp similarity index 80% rename from tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueProgram.cpp rename to tests/tt_metal/tt_metal/dispatch/dispatch_program/test_EnqueueProgram.cpp index 90a7b9221a3..6e3127e8471 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueProgram.cpp +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_EnqueueProgram.cpp @@ -4,18 +4,17 @@ #include #include -#include #include "command_queue_fixture.hpp" -#include "command_queue_test_utils.hpp" +#include "multi_command_queue_fixture.hpp" +#include "random_program_fixture.hpp" +#include "dispatch_test_utils.hpp" #include "gtest/gtest.h" #include "impl/buffers/buffer.hpp" #include "impl/device/device.hpp" #include "impl/kernels/kernel_types.hpp" -#include "tt_metal/common/bfloat16.hpp" #include "tt_metal/host_api.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/impl/kernels/kernel.hpp" -#include "tests/tt_metal/tt_metal/unit_tests_common/common/test_utils.hpp" #include "tt_soc_descriptor.h" using std::vector; @@ -41,7 +40,6 @@ 
struct DummyProgramMultiCBConfig { uint32_t num_sems; }; - namespace local_test_functions { void initialize_dummy_kernels(Program& program, const CoreRangeSet& cr_set) { @@ -114,6 +112,41 @@ bool cb_config_successful(Device* device, Program &program, const DummyProgramMu return pass; } +bool test_dummy_EnqueueProgram_with_runtime_args(Device* device, const CoreCoord& eth_core_coord) { + Program program; + bool pass = true; + auto eth_noc_xy = device->ethernet_core_from_logical_core(eth_core_coord); + + constexpr uint32_t num_runtime_args0 = 9; + constexpr uint32_t rta_base0 = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; + std::map dummy_defines0 = {{"DATA_MOVEMENT", "1"}, + {"NUM_RUNTIME_ARGS", std::to_string(num_runtime_args0)}, + {"RESULTS_ADDR", std::to_string(rta_base0)}}; + auto dummy_kernel0 = CreateKernel( + program, + "tests/tt_metal/tt_metal/test_kernels/misc/runtime_args_kernel.cpp", + eth_core_coord, + tt::tt_metal::EthernetConfig{.noc = tt::tt_metal::NOC::NOC_0, .defines = dummy_defines0}); + + vector dummy_kernel0_args = {0, 1, 2, 3, 4, 5, 6, 7, 8}; + tt::tt_metal::SetRuntimeArgs(program, dummy_kernel0, eth_core_coord, dummy_kernel0_args); + + tt::tt_metal::detail::CompileProgram(device, program); + auto& cq = device->command_queue(); + EnqueueProgram(cq, program, false); + Finish(cq); + + vector dummy_kernel0_args_readback = tt::llrt::read_hex_vec_from_core( + device->id(), + eth_noc_xy, + eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE, + dummy_kernel0_args.size() * sizeof(uint32_t)); + + pass &= (dummy_kernel0_args == dummy_kernel0_args_readback); + + return pass; +} + bool test_dummy_EnqueueProgram_with_cbs(Device* device, CommandQueue& cq, DummyProgramMultiCBConfig& program_config) { Program program; @@ -651,7 +684,7 @@ namespace basic_tests { namespace compiler_workaround_hardware_bug_tests { -TEST_F(CommandQueueSingleCardFixture, TestArbiterDoesNotHang) { +TEST_F(CommandQueueSingleCardProgramFixture, TensixTestArbiterDoesNotHang) { for 
(Device *device : devices_) { Program program; @@ -669,7 +702,7 @@ TEST_F(CommandQueueSingleCardFixture, TestArbiterDoesNotHang) { } namespace single_core_tests { -TEST_F(CommandQueueSingleCardFixture, TestSingleCbConfigCorrectlySentSingleCore) { +TEST_F(CommandQueueSingleCardProgramFixture, TensixTestSingleCbConfigCorrectlySentSingleCore) { CoreRange cr({0, 0}, {0, 0}); CoreRangeSet cr_set({cr}); @@ -682,7 +715,7 @@ TEST_F(CommandQueueSingleCardFixture, TestSingleCbConfigCorrectlySentSingleCore) } } -TEST_F(CommandQueueSingleCardFixture, TestMultiCbSeqConfigCorrectlySentSingleCore) { +TEST_F(CommandQueueSingleCardProgramFixture, TensixTestMultiCbSeqConfigCorrectlySentSingleCore) { CoreRange cr({0, 0}, {0, 0}); CoreRangeSet cr_set({cr}); @@ -699,7 +732,7 @@ TEST_F(CommandQueueSingleCardFixture, TestMultiCbSeqConfigCorrectlySentSingleCor } } -TEST_F(CommandQueueSingleCardFixture, TestMultiCbRandomConfigCorrectlySentSingleCore) { +TEST_F(CommandQueueSingleCardProgramFixture, TensixTestMultiCbRandomConfigCorrectlySentSingleCore) { CoreRange cr({0, 0}, {0, 0}); CoreRangeSet cr_set({cr}); @@ -716,7 +749,7 @@ TEST_F(CommandQueueSingleCardFixture, TestMultiCbRandomConfigCorrectlySentSingle } } -TEST_F(CommandQueueSingleCardFixture, TestMultiCBSharedAddressSpaceSentSingleCore) { +TEST_F(CommandQueueSingleCardProgramFixture, TensixTestMultiCBSharedAddressSpaceSentSingleCore) { CoreRange cr({0, 0}, {0, 0}); CoreRangeSet cr_set({cr}); @@ -770,7 +803,7 @@ TEST_F(CommandQueueSingleCardFixture, TestMultiCBSharedAddressSpaceSentSingleCor } } -TEST_F(CommandQueueSingleCardFixture, TestSingleCbConfigCorrectlyUpdateSizeSentSingleCore) { +TEST_F(CommandQueueSingleCardProgramFixture, TensixTestSingleCbConfigCorrectlyUpdateSizeSentSingleCore) { CoreRange cr({0, 0}, {0, 0}); CoreRangeSet cr_set({cr}); @@ -783,7 +816,7 @@ TEST_F(CommandQueueSingleCardFixture, TestSingleCbConfigCorrectlyUpdateSizeSentS } } -TEST_F(CommandQueueSingleCardFixture, 
TestSingleSemaphoreConfigCorrectlySentSingleCore) { +TEST_F(CommandQueueSingleCardProgramFixture, TensixTestSingleSemaphoreConfigCorrectlySentSingleCore) { CoreRange cr({0, 0}, {0, 0}); CoreRangeSet cr_set({cr}); @@ -794,7 +827,7 @@ TEST_F(CommandQueueSingleCardFixture, TestSingleSemaphoreConfigCorrectlySentSing } } -TEST_F(CommandQueueSingleCardFixture, TestAutoInsertedBlankBriscKernelInDeviceDispatchMode) { +TEST_F(CommandQueueSingleCardProgramFixture, TensixTestAutoInsertedBlankBriscKernelInDeviceDispatchMode) { for (Device *device : devices_) { Program program; @@ -812,7 +845,7 @@ TEST_F(CommandQueueSingleCardFixture, TestAutoInsertedBlankBriscKernelInDeviceDi } // Sanity test for setting and verifying common and unique runtime args to a single core, the simplest case. -TEST_F(CommandQueueSingleCardFixture, IncrementRuntimeArgsSanitySingleCoreCompute) { +TEST_F(CommandQueueSingleCardProgramFixture, TensixIncrementRuntimeArgsSanitySingleCoreCompute) { CoreRange cr0({0, 0}, {0, 0}); CoreRangeSet cr_set({cr0}); DummyProgramConfig dummy_program_config = {.cr_set = cr_set}; @@ -821,8 +854,16 @@ TEST_F(CommandQueueSingleCardFixture, IncrementRuntimeArgsSanitySingleCoreComput } } +TEST_F(CommandQueueSingleCardProgramFixture, ActiveEthEnqueueDummyProgram) { + for (const auto& device : devices_) { + for (const auto& eth_core : device->get_active_ethernet_cores(true)) { + ASSERT_TRUE(local_test_functions::test_dummy_EnqueueProgram_with_runtime_args(device, eth_core)); + } + } +} + // Sanity test for setting and verifying common and unique runtime args to single cores via ERISC. Some arch may return 0 active eth cores, that's okay. 
-TEST_F(CommandQueueSingleCardFixture, IncrementRuntimeArgsSanitySingleCoreDataMovementErisc) { +TEST_F(CommandQueueSingleCardProgramFixture, ActiveEthIncrementRuntimeArgsSanitySingleCoreDataMovementErisc) { for (Device *device : devices_) { for (const auto ð_core : device->get_active_ethernet_cores(true)) { CoreRange cr0(eth_core); @@ -836,7 +877,7 @@ TEST_F(CommandQueueSingleCardFixture, IncrementRuntimeArgsSanitySingleCoreDataMo // Sanity test for setting and verifying common and unique runtime args to single cores via ERISC(IDLE). Some arch may return 0 active eth cores, that's okay. // FIXME - Re-enable when FD-on-idle-eth is supported -TEST_F(CommandQueueSingleCardFixture, DISABLED_IncrementRuntimeArgsSanitySingleCoreDataMovementEriscIdle) { +TEST_F(CommandQueueSingleCardProgramFixture, DISABLED_ActiveEthIncrementRuntimeArgsSanitySingleCoreDataMovementEriscIdle) { for (Device *device : devices_) { for (const auto ð_core : device->get_active_ethernet_cores(true)) { CoreRange cr0(eth_core); @@ -850,7 +891,7 @@ TEST_F(CommandQueueSingleCardFixture, DISABLED_IncrementRuntimeArgsSanitySingleC // Sanity test for setting and verifying common and unique runtime args to single cores via inactive ERISC cores. Some arch may return 0 active eth cores, that's okay. 
// FIXME - Re-enable when FD-on-idle-eth is supported -TEST_F(CommandQueueSingleCardFixture, DISABLED_IncrementRuntimeArgsSanitySingleCoreDataMovementEriscInactive) { +TEST_F(CommandQueueSingleCardProgramFixture, DISABLED_IdleEthIncrementRuntimeArgsSanitySingleCoreDataMovementEriscInactive) { for (Device *device : devices_) { for (const auto ð_core : device->get_inactive_ethernet_cores()) { CoreRange cr0(eth_core); @@ -862,7 +903,7 @@ TEST_F(CommandQueueSingleCardFixture, DISABLED_IncrementRuntimeArgsSanitySingleC } } -TEST_F(CommandQueueSingleCardFixture, TestRuntimeArgsCorrectlySentSingleCore) { +TEST_F(CommandQueueSingleCardProgramFixture, TensixTestRuntimeArgsCorrectlySentSingleCore) { CoreRange cr({0, 0}, {0, 0}); CoreRangeSet cr_set({cr}); @@ -876,7 +917,7 @@ TEST_F(CommandQueueSingleCardFixture, TestRuntimeArgsCorrectlySentSingleCore) { } // end namespace single_core_tests namespace multicore_tests { -TEST_F(CommandQueueSingleCardFixture, TestAllCbConfigsCorrectlySentMultiCore) { +TEST_F(CommandQueueSingleCardProgramFixture, TensixTestAllCbConfigsCorrectlySentMultiCore) { CBConfig cb_config = {.num_pages = 1, .page_size = 2048, .data_format = tt::DataFormat::Float16_b}; std::vector cb_config_vector(NUM_CIRCULAR_BUFFERS, cb_config); @@ -896,7 +937,7 @@ TEST_F(CommandQueueSingleCardFixture, TestAllCbConfigsCorrectlySentMultiCore) { } } -TEST_F(CommandQueueSingleCardFixture, TestAllCbConfigsCorrectlySentUpdateSizeMultiCore) { +TEST_F(CommandQueueSingleCardProgramFixture, TensixTestAllCbConfigsCorrectlySentUpdateSizeMultiCore) { CBConfig cb_config = {.num_pages = 1, .page_size = 2048, .data_format = tt::DataFormat::Float16_b}; std::vector cb_config_vector(NUM_CIRCULAR_BUFFERS, cb_config); @@ -916,7 +957,7 @@ TEST_F(CommandQueueSingleCardFixture, TestAllCbConfigsCorrectlySentUpdateSizeMul } } -TEST_F(CommandQueueSingleCardFixture, TestMultiCbConfigsCorrectlySentUpdateSizeMultiCore) { +TEST_F(CommandQueueSingleCardProgramFixture, 
TensixTestMultiCbConfigsCorrectlySentUpdateSizeMultiCore) { CBConfig cb_config_0 = {.cb_id = 0, .num_pages = 1, .page_size = 2048, .data_format = tt::DataFormat::Float16_b}; CBConfig cb_config_1 = {.cb_id = 1, .num_pages = 2, .page_size = 4096, .data_format = tt::DataFormat::Float16_b}; CBConfig cb_config_2 = {.cb_id = 2, .num_pages = 2, .page_size = 2048, .data_format = tt::DataFormat::Float16_b}; @@ -937,7 +978,7 @@ TEST_F(CommandQueueSingleCardFixture, TestMultiCbConfigsCorrectlySentUpdateSizeM } } -TEST_F(CommandQueueSingleCardFixture, TestAllCbConfigsCorrectlySentMultipleCoreRanges) { +TEST_F(CommandQueueSingleCardProgramFixture, TensixTestAllCbConfigsCorrectlySentMultipleCoreRanges) { CBConfig cb_config = {.num_pages = 1, .page_size = 2048, .data_format = tt::DataFormat::Float16_b}; std::vector cb_config_vector(NUM_CIRCULAR_BUFFERS, cb_config); @@ -958,7 +999,7 @@ TEST_F(CommandQueueSingleCardFixture, TestAllCbConfigsCorrectlySentMultipleCoreR } } -TEST_F(CommandQueueSingleCardFixture, TestAllCbConfigsCorrectlySentUpdateSizeMultipleCoreRanges) { +TEST_F(CommandQueueSingleCardProgramFixture, TensixTestAllCbConfigsCorrectlySentUpdateSizeMultipleCoreRanges) { CBConfig cb_config = {.num_pages = 1, .page_size = 2048, .data_format = tt::DataFormat::Float16_b}; std::vector cb_config_vector(NUM_CIRCULAR_BUFFERS, cb_config); @@ -979,7 +1020,7 @@ TEST_F(CommandQueueSingleCardFixture, TestAllCbConfigsCorrectlySentUpdateSizeMul } } -TEST_F(CommandQueueSingleCardFixture, TestMultiCbConfigsCorrectlySentUpdateSizeMultipleCoreRanges) { +TEST_F(CommandQueueSingleCardProgramFixture, TensixTestMultiCbConfigsCorrectlySentUpdateSizeMultipleCoreRanges) { CBConfig cb_config_0 = {.cb_id = 0, .num_pages = 1, .page_size = 2048, .data_format = tt::DataFormat::Float16_b}; CBConfig cb_config_1 = {.cb_id = 1, .num_pages = 2, .page_size = 4096, .data_format = tt::DataFormat::Float16_b}; CBConfig cb_config_2 = {.cb_id = 2, .num_pages = 2, .page_size = 2048, .data_format = 
tt::DataFormat::Float16_b}; @@ -1001,7 +1042,7 @@ TEST_F(CommandQueueSingleCardFixture, TestMultiCbConfigsCorrectlySentUpdateSizeM } } -TEST_F(CommandQueueSingleCardFixture, TestAllSemConfigsCorrectlySentMultiCore) { +TEST_F(CommandQueueSingleCardProgramFixture, TensixTestAllSemConfigsCorrectlySentMultiCore) { for (Device *device : devices_) { CoreCoord worker_grid_size = device->compute_with_storage_grid_size(); @@ -1014,7 +1055,7 @@ TEST_F(CommandQueueSingleCardFixture, TestAllSemConfigsCorrectlySentMultiCore) { } } -TEST_F(CommandQueueSingleCardFixture, TestAllSemaphoreConfigsCorrectlySentMultipleCoreRanges) { +TEST_F(CommandQueueSingleCardProgramFixture, TensixTestAllSemaphoreConfigsCorrectlySentMultipleCoreRanges) { for (Device *device : devices_) { CoreRange first_cr({0, 0}, {1, 1}); @@ -1054,7 +1095,7 @@ TEST_F(CommandQueueSingleCardFixture, TestAllSemaphoreConfigsCorrectlySentMultip } } -TEST_F(CommandQueueSingleCardFixture, TestAllRuntimeArgsCorrectlySentMultiCore) { +TEST_F(CommandQueueSingleCardProgramFixture, TensixTestAllRuntimeArgsCorrectlySentMultiCore) { for (Device *device : devices_) { CoreCoord worker_grid_size = device->compute_with_storage_grid_size(); @@ -1066,7 +1107,7 @@ TEST_F(CommandQueueSingleCardFixture, TestAllRuntimeArgsCorrectlySentMultiCore) } } -TEST_F(CommandQueueSingleCardFixture, TestAllRuntimeArgsCorrectlySentMultiCore_255_PerKernel) { +TEST_F(CommandQueueSingleCardProgramFixture, TensixTestAllRuntimeArgsCorrectlySentMultiCore_255_PerKernel) { for (Device *device : devices_) { CoreCoord worker_grid_size = device->compute_with_storage_grid_size(); @@ -1078,7 +1119,7 @@ TEST_F(CommandQueueSingleCardFixture, TestAllRuntimeArgsCorrectlySentMultiCore_2 } } -TEST_F(CommandQueueSingleCardFixture, TestSendRuntimeArgsMultiCoreRange) { +TEST_F(CommandQueueSingleCardProgramFixture, TensixTestSendRuntimeArgsMultiCoreRange) { for (Device* device : devices_) { CoreCoord worker_grid_size = device->compute_with_storage_grid_size(); @@ -1092,7 
+1133,7 @@ TEST_F(CommandQueueSingleCardFixture, TestSendRuntimeArgsMultiCoreRange) { } } -TEST_F(CommandQueueSingleCardFixture, TestSendRuntimeArgsMultiNonOverlappingCoreRange) { +TEST_F(CommandQueueSingleCardProgramFixture, TensixTestSendRuntimeArgsMultiNonOverlappingCoreRange) { // Core ranges get merged in kernel groups, this one does not for (Device* device : devices_) { CoreCoord worker_grid_size = device->compute_with_storage_grid_size(); @@ -1107,7 +1148,7 @@ TEST_F(CommandQueueSingleCardFixture, TestSendRuntimeArgsMultiNonOverlappingCore } } -TEST_F(CommandQueueSingleCardFixture, TestUpdateRuntimeArgsMultiCoreRange) { +TEST_F(CommandQueueSingleCardProgramFixture, TensixTestUpdateRuntimeArgsMultiCoreRange) { for (Device* device : devices_) { CoreCoord worker_grid_size = device->compute_with_storage_grid_size(); @@ -1122,7 +1163,7 @@ TEST_F(CommandQueueSingleCardFixture, TestUpdateRuntimeArgsMultiCoreRange) { } // Sanity test for setting and verifying common and unique runtime args to multiple cores. -TEST_F(CommandQueueSingleCardFixture, IncrementRuntimeArgsSanityMultiCoreCompute) { +TEST_F(CommandQueueSingleCardProgramFixture, TensixIncrementRuntimeArgsSanityMultiCoreCompute) { CoreRange cr0({1, 1}, {2, 2}); CoreRange cr1({3, 3}, {4, 4}); CoreRangeSet cr_set(std::vector{cr0, cr1}); @@ -1133,7 +1174,7 @@ TEST_F(CommandQueueSingleCardFixture, IncrementRuntimeArgsSanityMultiCoreCompute } // Max number of 255 unique RT args. -TEST_F(CommandQueueSingleCardFixture, IncrementRuntimeArgsSanityMultiCoreCompute_255_UniqueArgs) { +TEST_F(CommandQueueSingleCardProgramFixture, TensixIncrementRuntimeArgsSanityMultiCoreCompute_255_UniqueArgs) { CoreRange cr0({1, 1}, {2, 2}); CoreRange cr1({3, 3}, {4, 4}); CoreRangeSet cr_set(std::vector{cr0, cr1}); @@ -1144,7 +1185,7 @@ TEST_F(CommandQueueSingleCardFixture, IncrementRuntimeArgsSanityMultiCoreCompute } // Max number of 255 common RT args. 
-TEST_F(CommandQueueSingleCardFixture, IncrementRuntimeArgsSanityMultiCoreCompute_255_CommonArgs) { +TEST_F(CommandQueueSingleCardProgramFixture, TensixIncrementRuntimeArgsSanityMultiCoreCompute_255_CommonArgs) { CoreRange cr0({1, 1}, {2, 2}); CoreRange cr1({3, 3}, {4, 4}); CoreRangeSet cr_set(std::vector{cr0, cr1}); @@ -1155,7 +1196,7 @@ TEST_F(CommandQueueSingleCardFixture, IncrementRuntimeArgsSanityMultiCoreCompute } // Sanity test for setting and verifying common and unique runtime args to multiple cores via BRISC. -TEST_F(CommandQueueSingleCardFixture, IncrementRuntimeArgsSanityMultiCoreDataMovementBrisc) { +TEST_F(CommandQueueSingleCardProgramFixture, TensixIncrementRuntimeArgsSanityMultiCoreDataMovementBrisc) { CoreRange cr0({1, 1}, {2, 2}); CoreRange cr1({3, 3}, {4, 4}); CoreRangeSet cr_set(std::vector{cr0, cr1}); @@ -1166,7 +1207,7 @@ TEST_F(CommandQueueSingleCardFixture, IncrementRuntimeArgsSanityMultiCoreDataMov } // Sanity test for setting and verifying common and unique runtime args to multiple cores via NCRISC. 
-TEST_F(CommandQueueSingleCardFixture, IncrementRuntimeArgsSanityMultiCoreDataMovementNcrisc) { +TEST_F(CommandQueueSingleCardProgramFixture, TensixIncrementRuntimeArgsSanityMultiCoreDataMovementNcrisc) { CoreRange cr0({1, 1}, {2, 2}); CoreRange cr1({3, 3}, {4, 4}); CoreRangeSet cr_set(std::vector{cr0, cr1}); @@ -1182,8 +1223,198 @@ TEST_F(CommandQueueSingleCardFixture, IncrementRuntimeArgsSanityMultiCoreDataMov namespace stress_tests { +TEST_F(MultiCommandQueueSingleDeviceProgramFixture, TensixTestRandomizedProgram) { + uint32_t NUM_PROGRAMS = 100; + uint32_t MAX_LOOP = 100; + uint32_t page_size = 1024; + + if (this->arch_ == tt::ARCH::BLACKHOLE) { + GTEST_SKIP(); // Running on second CQ is hanging on CI + } + + // Make random + auto random_seed = 0; // (unsigned int)time(NULL); + uint32_t seed = tt::parse_env("TT_METAL_SEED", random_seed); + log_info(tt::LogTest, "Using Test Seed: {}", seed); + srand(seed); + + CoreCoord worker_grid_size = this->device_->compute_with_storage_grid_size(); + CoreRange cr({0, 0}, {worker_grid_size.x - 1, worker_grid_size.y - 1}); + CoreRangeSet cr_set({cr}); + + log_info(tt::LogTest, "Starting compile of {} programs now.", NUM_PROGRAMS); + + vector programs; + for (uint32_t i = 0; i < NUM_PROGRAMS; i++) { + programs.push_back(Program()); + Program& program = programs.back(); + + std::map data_movement_defines = {{"DATA_MOVEMENT", "1"}}; + std::map compute_defines = {{"COMPUTE", "1"}}; + + // brisc + uint32_t BRISC_OUTER_LOOP, BRISC_MIDDLE_LOOP, BRISC_INNER_LOOP, NUM_CBS, NUM_SEMS; + bool USE_MAX_RT_ARGS; + + if (i == 0) { + // Ensures that we get at least one compilation with the max amount to + // ensure it compiles and runs + BRISC_OUTER_LOOP = MAX_LOOP; + BRISC_MIDDLE_LOOP = MAX_LOOP; + BRISC_INNER_LOOP = MAX_LOOP; + NUM_CBS = NUM_CIRCULAR_BUFFERS; + NUM_SEMS = NUM_SEMAPHORES; + USE_MAX_RT_ARGS = true; + } else { + BRISC_OUTER_LOOP = rand() % (MAX_LOOP) + 1; + BRISC_MIDDLE_LOOP = rand() % (MAX_LOOP) + 1; + BRISC_INNER_LOOP = 
rand() % (MAX_LOOP) + 1; + NUM_CBS = rand() % (NUM_CIRCULAR_BUFFERS) + 1; + NUM_SEMS = rand() % (NUM_SEMAPHORES) + 1; + USE_MAX_RT_ARGS = false; + } + + log_debug(tt::LogTest, "Compiling program {}/{} w/ BRISC_OUTER_LOOP: {} BRISC_MIDDLE_LOOP: {} BRISC_INNER_LOOP: {} NUM_CBS: {} NUM_SEMS: {} USE_MAX_RT_ARGS: {}", + i+1, NUM_PROGRAMS, BRISC_OUTER_LOOP, BRISC_MIDDLE_LOOP, BRISC_INNER_LOOP, NUM_CBS, NUM_SEMS, USE_MAX_RT_ARGS); + + for (uint32_t j = 0; j < NUM_CBS; j++) { + CircularBufferConfig cb_config = CircularBufferConfig(page_size * (j + 1), {{j, tt::DataFormat::Float16_b}}).set_page_size(j, page_size * (j + 1)); + auto cb = CreateCircularBuffer(program, cr_set, cb_config); + } + + for (uint32_t j = 0; j < NUM_SEMS; j++) { + CreateSemaphore(program, cr_set, j + 1); + } + + auto [brisc_unique_rtargs, brisc_common_rtargs] = create_runtime_args(USE_MAX_RT_ARGS); + uint32_t num_brisc_unique_rtargs = brisc_unique_rtargs.size(); + uint32_t num_brisc_common_rtargs = brisc_common_rtargs.size(); + vector brisc_compile_args = {BRISC_OUTER_LOOP, BRISC_MIDDLE_LOOP, BRISC_INNER_LOOP, NUM_CBS, NUM_SEMS, num_brisc_unique_rtargs, num_brisc_common_rtargs, page_size}; + + // ncrisc + uint32_t NCRISC_OUTER_LOOP, NCRISC_MIDDLE_LOOP, NCRISC_INNER_LOOP; + if (i == 0) { + NCRISC_OUTER_LOOP = MAX_LOOP; + NCRISC_MIDDLE_LOOP = MAX_LOOP; + NCRISC_INNER_LOOP = MAX_LOOP; + } else { + NCRISC_OUTER_LOOP = rand() % (MAX_LOOP) + 1; + NCRISC_MIDDLE_LOOP = rand() % (MAX_LOOP) + 1; + NCRISC_INNER_LOOP = rand() % (MAX_LOOP) + 1; + } + + auto [ncrisc_unique_rtargs, ncrisc_common_rtargs] = create_runtime_args(USE_MAX_RT_ARGS); + uint32_t num_ncrisc_unique_rtargs = ncrisc_unique_rtargs.size(); + uint32_t num_ncrisc_common_rtargs = ncrisc_common_rtargs.size(); + vector ncrisc_compile_args = {NCRISC_OUTER_LOOP, NCRISC_MIDDLE_LOOP, NCRISC_INNER_LOOP, NUM_CBS, NUM_SEMS, num_ncrisc_unique_rtargs, num_ncrisc_common_rtargs, page_size}; + + // trisc + uint32_t TRISC_OUTER_LOOP, TRISC_MIDDLE_LOOP, 
TRISC_INNER_LOOP; + if (i == 0) { + TRISC_OUTER_LOOP = MAX_LOOP; + TRISC_MIDDLE_LOOP = MAX_LOOP; + TRISC_INNER_LOOP = MAX_LOOP; + } else { + TRISC_OUTER_LOOP = rand() % (MAX_LOOP) + 1; + TRISC_MIDDLE_LOOP = rand() % (MAX_LOOP) + 1; + TRISC_INNER_LOOP = rand() % (MAX_LOOP) + 1; + } + + auto [trisc_unique_rtargs, trisc_common_rtargs] = create_runtime_args(USE_MAX_RT_ARGS); + uint32_t num_trisc_unique_rtargs = trisc_unique_rtargs.size(); + uint32_t num_trisc_common_rtargs = trisc_common_rtargs.size(); + vector trisc_compile_args = {TRISC_OUTER_LOOP, TRISC_MIDDLE_LOOP, TRISC_INNER_LOOP, NUM_CBS, NUM_SEMS, num_trisc_unique_rtargs, num_trisc_common_rtargs, page_size}; + + bool at_least_one_kernel = false; + if (i == 0 or ((rand() % 2) == 0)) { + auto dummy_brisc_kernel = CreateKernel( + program, "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp", cr_set, DataMovementConfig{ + .processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default, .compile_args = brisc_compile_args, .defines = data_movement_defines}); + SetRuntimeArgs(program, dummy_brisc_kernel, cr_set, brisc_unique_rtargs); + SetCommonRuntimeArgs(program, dummy_brisc_kernel, brisc_common_rtargs); + at_least_one_kernel = true; + } + + if (i == 0 or ((rand() % 2) == 0)) { + auto dummy_ncrisc_kernel = CreateKernel( + program, "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp", cr_set, DataMovementConfig{ + .processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default, .compile_args = ncrisc_compile_args, .defines = data_movement_defines}); + SetRuntimeArgs(program, dummy_ncrisc_kernel, cr_set, ncrisc_unique_rtargs); + SetCommonRuntimeArgs(program, dummy_ncrisc_kernel, ncrisc_common_rtargs); + at_least_one_kernel = true; + } + + if (i == 0 or ((rand() % 2) == 0)) { + auto dummy_trisc_kernel = CreateKernel( + program, "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp", 
cr_set, ComputeConfig{ + .math_approx_mode = false, + .compile_args = trisc_compile_args, + .defines = compute_defines + }); + SetRuntimeArgs(program, dummy_trisc_kernel, cr_set, trisc_unique_rtargs); + SetCommonRuntimeArgs(program, dummy_trisc_kernel, trisc_common_rtargs); + at_least_one_kernel = true; + } + + if (not at_least_one_kernel) { + uint32_t random_risc = rand() % 3 + 1; + if (random_risc == 1) { + auto dummy_brisc_kernel = CreateKernel( + program, "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp", cr_set, DataMovementConfig{ + .processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default, .compile_args = brisc_compile_args, .defines = data_movement_defines}); + SetRuntimeArgs(program, dummy_brisc_kernel, cr_set, brisc_unique_rtargs); + SetCommonRuntimeArgs(program, dummy_brisc_kernel, brisc_common_rtargs); + } else if (random_risc == 2) { + auto dummy_ncrisc_kernel = CreateKernel( + program, "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp", cr_set, DataMovementConfig{ + .processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default, .compile_args = ncrisc_compile_args, .defines = data_movement_defines}); + SetRuntimeArgs(program, dummy_ncrisc_kernel, cr_set, ncrisc_unique_rtargs); + SetCommonRuntimeArgs(program, dummy_ncrisc_kernel, ncrisc_common_rtargs); + } else if (random_risc == 3) { + auto dummy_trisc_kernel = CreateKernel( + program, "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp", cr_set, ComputeConfig{ + .math_approx_mode = false, + .compile_args = trisc_compile_args, + .defines = compute_defines + }); + SetRuntimeArgs(program, dummy_trisc_kernel, cr_set, trisc_unique_rtargs); + SetCommonRuntimeArgs(program, dummy_trisc_kernel, trisc_common_rtargs); + } else { + TT_THROW("Invalid"); + } + } + + tt::tt_metal::detail::CompileProgram(this->device_, program); + } + + for (uint8_t cq_id = 0; cq_id < 
this->device_->num_hw_cqs(); ++cq_id) { + log_info(tt::LogTest, "Running {} programs on cq {} for cache warmup.", programs.size(), (uint32_t)cq_id); + // This loop caches program and runs + for (Program& program: programs) { + EnqueueProgram(this->device_->command_queue(cq_id), program, false); + } + + // This loops assumes already cached + uint32_t NUM_ITERATIONS = 500; // TODO(agrebenisan): Bump this to 5000, saw hangs for very large number of iterations, need to come back to that + + log_info(tt::LogTest, "Running {} programs on cq {} for {} iterations now.", programs.size(), (uint32_t)cq_id, NUM_ITERATIONS); + for (uint32_t i = 0; i < NUM_ITERATIONS; i++) { + auto rng = std::default_random_engine {}; + std::shuffle(std::begin(programs), std::end(programs), rng); + if (i % 10 == 0) { + log_debug(tt::LogTest, "Enqueueing {} programs on cq {} for iter: {}/{} now.", programs.size(), (uint32_t)cq_id, i+1, NUM_ITERATIONS); + } + for (Program& program: programs) { + EnqueueProgram(this->device_->command_queue(cq_id), program, false); + } + } + + log_info(tt::LogTest, "Calling Finish."); + Finish(this->device_->command_queue(cq_id)); + } +} -TEST_F(CommandQueueSingleCardFixture, DISABLED_TestFillDispatchCoreBuffer) { +TEST_F(CommandQueueSingleCardProgramFixture, DISABLED_TensixTestFillDispatchCoreBuffer) { uint32_t NUM_ITER = 100000; for (Device *device : devices_) { CoreCoord worker_grid_size = device->compute_with_storage_grid_size(); @@ -1197,7 +1428,7 @@ TEST_F(CommandQueueSingleCardFixture, DISABLED_TestFillDispatchCoreBuffer) { } } -TEST_F(CommandQueueFixture, TestRandomizedProgram) { +TEST_F(CommandQueueProgramFixture, TensixTestRandomizedProgram) { uint32_t NUM_PROGRAMS = 100; uint32_t MAX_LOOP = 100; uint32_t page_size = 1024; @@ -1386,7 +1617,7 @@ TEST_F(CommandQueueFixture, TestRandomizedProgram) { Finish(this->device_->command_queue()); } -TEST_F(RandomProgramFixture, TestSimpleProgramsOnTensix) { +TEST_F(RandomProgramFixture, TensixTestSimplePrograms) { 
for (uint32_t i = 0; i < NUM_PROGRAMS; i++) { if (i % 10 == 0) { log_info(tt::LogTest, "Creating Program {}", i); @@ -1399,7 +1630,7 @@ TEST_F(RandomProgramFixture, TestSimpleProgramsOnTensix) { Finish(device_->command_queue()); } -TEST_F(RandomProgramFixture, TestSimpleProgramsOnEth) { +TEST_F(RandomProgramFixture, ActiveEthTestSimplePrograms) { if (!does_device_have_active_eth_cores(device_)) { GTEST_SKIP() << "Skipping test because device " << device_->id() << " does not have any active ethernet cores"; } @@ -1416,7 +1647,7 @@ TEST_F(RandomProgramFixture, TestSimpleProgramsOnEth) { Finish(device_->command_queue()); } -TEST_F(RandomProgramFixture, TestSimpleProgramsOnTensixAndEth) { +TEST_F(RandomProgramFixture, TensixActiveEthTestSimplePrograms) { if (!does_device_have_active_eth_cores(device_)) { GTEST_SKIP() << "Skipping test because device " << device_->id() << " does not have any active ethernet cores"; } @@ -1442,7 +1673,7 @@ TEST_F(RandomProgramFixture, TestSimpleProgramsOnTensixAndEth) { Finish(device_->command_queue()); } -TEST_F(RandomProgramFixture, TestProgramsOnTensix) { +TEST_F(RandomProgramFixture, TensixTestPrograms) { for (uint32_t i = 0; i < NUM_PROGRAMS; i++) { if (i % 10 == 0) { log_info(tt::LogTest, "Creating Program {}", i); @@ -1455,7 +1686,7 @@ TEST_F(RandomProgramFixture, TestProgramsOnTensix) { Finish(device_->command_queue()); } -TEST_F(RandomProgramFixture, TestProgramsOnEth) { +TEST_F(RandomProgramFixture, ActiveEthTestPrograms) { if (!does_device_have_active_eth_cores(device_)) { GTEST_SKIP() << "Skipping test because device " << device_->id() << " does not have any active ethernet cores"; } @@ -1477,7 +1708,7 @@ TEST_F(RandomProgramFixture, TestProgramsOnEth) { Finish(device_->command_queue()); } -TEST_F(RandomProgramFixture, TestProgramsOnTensixAndEth) { +TEST_F(RandomProgramFixture, TensixActiveEthTestPrograms) { if (!does_device_have_active_eth_cores(device_)) { GTEST_SKIP() << "Skipping test because device " << device_->id() << 
" does not have any active ethernet cores"; } @@ -1511,7 +1742,7 @@ TEST_F(RandomProgramFixture, TestProgramsOnTensixAndEth) { Finish(device_->command_queue()); } -TEST_F(RandomProgramFixture, TestAlternatingLargeAndSmallProgramsOnTensix) { +TEST_F(RandomProgramFixture, TensixTestAlternatingLargeAndSmallPrograms) { for (uint32_t i = 0; i < NUM_PROGRAMS; i++) { if (i % 10 == 0) { log_info(tt::LogTest, "Creating Program {}", i); @@ -1532,7 +1763,7 @@ TEST_F(RandomProgramFixture, TestAlternatingLargeAndSmallProgramsOnTensix) { Finish(device_->command_queue()); } -TEST_F(RandomProgramFixture, TestLargeProgramFollowedBySmallProgramsOnTensix) { +TEST_F(RandomProgramFixture, TensixTestLargeProgramFollowedBySmallPrograms) { for (uint32_t i = 0; i < NUM_PROGRAMS; i++) { if (i % 10 == 0) { log_info(tt::LogTest, "Creating Program {}", i); @@ -1553,7 +1784,7 @@ TEST_F(RandomProgramFixture, TestLargeProgramFollowedBySmallProgramsOnTensix) { Finish(device_->command_queue()); } -TEST_F(RandomProgramFixture, TestLargeProgramInBetweenFiveSmallProgramsOnTensix) { +TEST_F(RandomProgramFixture, TensixTestLargeProgramInBetweenFiveSmallPrograms) { for (uint32_t i = 0; i < NUM_PROGRAMS; i++) { if (i % 10 == 0) { log_info(tt::LogTest, "Creating Program {}", i); diff --git a/tests/tt_metal/tt_metal/unit_tests_common/common/test_dispatch.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_dispatch.cpp similarity index 95% rename from tests/tt_metal/tt_metal/unit_tests_common/common/test_dispatch.cpp rename to tests/tt_metal/tt_metal/dispatch/dispatch_program/test_dispatch.cpp index d8e3a4fefe1..1cd8a9a0d2d 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/common/test_dispatch.cpp +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_dispatch.cpp @@ -4,13 +4,13 @@ // This file contains dispatch tests that are (generally) dispatch mode agnostic -#include "tests/tt_metal/tt_metal/unit_tests_common/common/common_fixture.hpp" +#include "dispatch_fixture.hpp" using 
std::vector; // Test sync w/ semaphores betweeen eth/tensix cores // Test will hang in the kernel if the sync doesn't work properly -static void test_sems_across_core_types(CommonFixture *fixture, +static void test_sems_across_core_types(DispatchFixture *fixture, vector& devices, bool active_eth) { // just something unique... @@ -89,7 +89,7 @@ static void test_sems_across_core_types(CommonFixture *fixture, } } -TEST_F(CommonFixture, TestEthBlank) { +TEST_F(DispatchFixture, EthTestBlank) { Device *device = devices_[0]; Program program = CreateProgram(); @@ -113,7 +113,7 @@ TEST_F(CommonFixture, TestEthBlank) { } } -TEST_F(CommonFixture, TestTensixInitLocalMemory) { +TEST_F(DispatchFixture, TensixTestInitLocalMemory) { // This test will hang/assert if there is a failure @@ -136,7 +136,7 @@ TEST_F(CommonFixture, TestTensixInitLocalMemory) { this->RunProgram(device, program); } -TEST_F(CommonFixture, TestEthInitLocalMemory) { +TEST_F(DispatchFixture, EthTestInitLocalMemory) { // This test will hang/assert if there is a failure @@ -167,11 +167,11 @@ TEST_F(CommonFixture, TestEthInitLocalMemory) { } } -TEST_F(CommonFixture, TestSemaphoresTensixActiveEth) { +TEST_F(DispatchFixture, TensixActiveEthTestSemaphores) { test_sems_across_core_types(this, this->devices_, true); } -TEST_F(CommonFixture, TestSemaphoresTensixIdleEth) { +TEST_F(DispatchFixture, TensixIdleEthTestSemaphores) { if (not this->slow_dispatch_) { GTEST_SKIP(); } @@ -181,7 +181,7 @@ TEST_F(CommonFixture, TestSemaphoresTensixIdleEth) { // This test was written to cover issue #12738 (CBs for workers showing up on // active eth cores) -TEST_F(CommonFixture, TestCBsAcrossWorkerEth) { +TEST_F(DispatchFixture, TensixActiveEthTestCBsAcrossDifferentCoreTypes) { uint32_t intermediate_cb = 24; uint32_t out_cb = 16; diff --git a/tests/tt_metal/tt_metal/test_create_kernel_from_string.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_dispatch_program_with_kernel_created_from_string.cpp similarity index 76% 
rename from tests/tt_metal/tt_metal/test_create_kernel_from_string.cpp rename to tests/tt_metal/tt_metal/dispatch/dispatch_program/test_dispatch_program_with_kernel_created_from_string.cpp index fcf79a112f0..1322f2f4331 100644 --- a/tests/tt_metal/tt_metal/test_create_kernel_from_string.cpp +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_dispatch_program_with_kernel_created_from_string.cpp @@ -4,7 +4,7 @@ #include -#include "core_coord.hpp" +#include "common/core_coord.hpp" #include "detail/tt_metal.hpp" #include "host_api.hpp" #include "impl/device/device.hpp" @@ -12,32 +12,9 @@ #include "impl/kernels/kernel_types.hpp" #include "impl/program/program.hpp" #include "tt_cluster_descriptor_types.h" +#include "program_with_kernel_created_from_string_fixture.hpp" -#include "tests/tt_metal/tt_metal/unit_tests_common/common/common_fixture.hpp" - -using namespace tt; -using namespace tt::tt_metal; - -class ProgramWithKernelCreatedFromStringFixture : public CommonFixture { - protected: - void SetUp() override { - CommonFixture::SetUp(); - for (Device *device : this->devices_) - { - const chip_id_t device_id = device->id(); - this->device_ids_to_devices_[device_id] = device; - } - } - - void TearDown() override { - detail::CloseDevices(this->device_ids_to_devices_); - } - - private: - std::map device_ids_to_devices_; -}; - -TEST_F(ProgramWithKernelCreatedFromStringFixture, DataMovementKernel) { +TEST_F(ProgramWithKernelCreatedFromStringFixture, TensixDataMovementKernel) { const CoreRange cores({0, 0}, {1, 1}); const string &kernel_src_code = R"( #include "debug/dprint.h" @@ -62,7 +39,7 @@ TEST_F(ProgramWithKernelCreatedFromStringFixture, DataMovementKernel) { }; } -TEST_F(ProgramWithKernelCreatedFromStringFixture, ComputeKernel) { +TEST_F(ProgramWithKernelCreatedFromStringFixture, TensixComputeKernel) { const CoreRange cores({0, 0}, {1, 1}); const string &kernel_src_code = R"( #include "debug/dprint.h" @@ -94,7 +71,7 @@ 
TEST_F(ProgramWithKernelCreatedFromStringFixture, ComputeKernel) { }; } -TEST_F(ProgramWithKernelCreatedFromStringFixture, EthernetKernel) { +TEST_F(ProgramWithKernelCreatedFromStringFixture, ActiveEthEthernetKernel) { const string &kernel_src_code = R"( #include "debug/dprint.h" #include "dataflow_api.h" diff --git a/tests/tt_metal/tt_metal/unit_tests_frequent/tests/run_many_times.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_dispatch_stress.cpp similarity index 80% rename from tests/tt_metal/tt_metal/unit_tests_frequent/tests/run_many_times.cpp rename to tests/tt_metal/tt_metal/dispatch/dispatch_program/test_dispatch_stress.cpp index 75116172d4d..f6247518800 100644 --- a/tests/tt_metal/tt_metal/unit_tests_frequent/tests/run_many_times.cpp +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_dispatch_stress.cpp @@ -2,9 +2,10 @@ // // SPDX-License-Identifier: Apache-2.0 +#include "common/logger.hpp" #include "gtest/gtest.h" #include "tt_metal/detail/tt_metal.hpp" -#include "tt_metal/host_api.hpp" +#include "host_api.hpp" #include "tt_metal/impl/device/device.hpp" using std::vector; @@ -14,34 +15,29 @@ using namespace tt::tt_metal; void RunTest(Device *device) { // Set up program Program program = Program(); + CoreRange core_range({0, 0}, {5, 5}); - std::set core_ranges; - //CoreCoord grid_size = device->logical_grid_size(); - CoreCoord grid_size = {5, 5}; - for (uint32_t y = 0; y < grid_size.y; y++) { - for (uint32_t x = 0; x < grid_size.x; x++) { - CoreCoord core(x, y); - core_ranges.insert(CoreRange(core, core)); - } - } + auto l1_unreserved_base = device->get_base_allocator_addr(tt_metal::HalMemType::L1); // Kernels on brisc + ncrisc that just add two numbers - KernelHandle brisc_kid = tt_metal::CreateKernel( + KernelHandle brisc_kid = CreateKernel( program, "tests/tt_metal/tt_metal/test_kernels/misc/add_two_ints.cpp", - CoreRangeSet(core_ranges), + core_range, tt_metal::DataMovementConfig { .processor = DataMovementProcessor::RISCV_0, - 
.noc = NOC::RISCV_0_default + .noc = NOC::RISCV_0_default, + .compile_args = {l1_unreserved_base} } ); - KernelHandle ncrisc_kid = tt_metal::CreateKernel( + KernelHandle ncrisc_kid = CreateKernel( program, "tests/tt_metal/tt_metal/test_kernels/misc/add_two_ints.cpp", - CoreRangeSet(core_ranges), + core_range, tt_metal::DataMovementConfig { .processor = DataMovementProcessor::RISCV_1, - .noc = NOC::RISCV_1_default + .noc = NOC::RISCV_1_default, + .compile_args = {l1_unreserved_base + 4} } ); @@ -52,8 +48,8 @@ void RunTest(Device *device) { auto get_second_arg = [](Device *device, CoreCoord &core, uint32_t multiplier) { return (uint32_t) core.y * 100 * multiplier; }; - for (auto &core_range : core_ranges) { - CoreCoord core = core_range.start_coord; + + for (CoreCoord core : core_range) { std::vector brisc_rt_args = { get_first_arg(device, core, 1), get_second_arg(device, core, 1) @@ -78,16 +74,14 @@ void RunTest(Device *device) { } // Check results - uint32_t l1_unreserved_base = device->get_base_allocator_addr(HalMemType::L1); - for (auto &core_range : core_ranges) { - CoreCoord core = core_range.start_coord; + for (CoreCoord core : core_range) { std::vector brisc_result; tt_metal::detail::ReadFromDeviceL1( device, core, l1_unreserved_base, sizeof(uint32_t), brisc_result ); std::vector ncrisc_result; tt_metal::detail::ReadFromDeviceL1( - device, core, l1_unreserved_base, sizeof(uint32_t), ncrisc_result + device, core, l1_unreserved_base + 4, sizeof(uint32_t), ncrisc_result ); uint32_t expected_result = get_first_arg(device, core, 1) + get_second_arg(device, core, 1); if (expected_result != brisc_result[0]) @@ -114,13 +108,13 @@ void RunTest(Device *device) { } } -TEST(Common, AllCoresRunManyTimes) { +TEST(DispatchStress, TensixRunManyTimes) { auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); // Skip fast dispatch until it's supported for remote device. 
if (!slow_dispatch) GTEST_SKIP(); // Run 500 times to make sure that things work - for (int idx = 0; idx < 500; idx++) { + for (int idx = 0; idx < 400; idx++) { log_info(LogTest, "Running iteration #{}", idx); // Need to open/close the device each time in order to reproduce original issue. auto num_devices = tt::tt_metal::GetNumAvailableDevices(); @@ -136,11 +130,11 @@ TEST(Common, AllCoresRunManyTimes) { // Run the test on each device for (Device *device : devices_) { + log_info(LogTest, "Running on device {}", device->id()); RunTest(device); } // Close all devices tt::tt_metal::detail::CloseDevices(reserved_devices_); } - } diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_sub_device.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_sub_device.cpp new file mode 100644 index 00000000000..f569ffd05c0 --- /dev/null +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_sub_device.cpp @@ -0,0 +1,127 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "tt_metal/common/core_coord.hpp" +#include "tt_metal/impl/buffers/global_semaphore.hpp" +#include "tt_metal/impl/device/device.hpp" +#include "tt_metal/impl/event/event.hpp" +#include "tt_metal/impl/sub_device/sub_device.hpp" +#include "tt_metal/test_utils/stimulus.hpp" +#include "command_queue_fixture.hpp" +#include "sub_device_test_utils.hpp" +#include "dispatch_test_utils.hpp" + +TEST_F(CommandQueueSingleCardFixture, TensixTestSubDeviceSynchronization) { + uint32_t local_l1_size = 3200; + SubDevice sub_device_1(std::array{CoreRangeSet(CoreRange({0, 0}, {2, 2}))}); + SubDevice sub_device_2(std::array{CoreRangeSet(std::vector{CoreRange({3, 3}, {3, 3}), CoreRange({4, 4}, {4, 4})})}); + CoreRangeSet sharded_cores_1 = CoreRange({0, 0}, {2, 2}); + + auto sharded_cores_1_vec = corerange_to_cores(sharded_cores_1, std::nullopt, true); + + ShardSpecBuffer 
shard_spec_buffer_1 = ShardSpecBuffer(sharded_cores_1, {1, 1}, ShardOrientation::ROW_MAJOR, false, {1, 1}, {sharded_cores_1.num_cores(), 1}); + uint32_t page_size_1 = 32; + ShardedBufferConfig shard_config_1 = {nullptr, sharded_cores_1.num_cores() * page_size_1, page_size_1, BufferType::L1, TensorMemoryLayout::HEIGHT_SHARDED, shard_spec_buffer_1}; + auto input_1 = tt::test_utils::generate_uniform_random_vector(0, 100, shard_config_1.size / sizeof(uint32_t)); + + std::array sub_device_ids_to_block = {SubDeviceId{0}}; + for (Device *device : devices_) { + auto sub_device_manager = device->create_sub_device_manager({sub_device_1, sub_device_2}, local_l1_size); + + shard_config_1.device = device; + + std::vector physical_cores_1; + physical_cores_1.reserve(sharded_cores_1_vec.size()); + for (const auto& core : sharded_cores_1_vec) { + physical_cores_1.push_back(device->worker_core_from_logical_core(core)); + } + + device->load_sub_device_manager(sub_device_manager); + + auto [program, syncer_core, global_semaphore] = create_single_sync_program(device, sub_device_2); + EnqueueProgram(device->command_queue(), program, false); + + auto buffer_1 = CreateBuffer(shard_config_1, sub_device_ids_to_block[0]); + + // Test blocking synchronize doesn't stall + Synchronize(device, 0, sub_device_ids_to_block); + + // Test blocking write buffer doesn't stall + EnqueueWriteBuffer(device->command_queue(), buffer_1, input_1, true, sub_device_ids_to_block); + + // Test record event won't cause a stall + auto event = std::make_shared(); + EnqueueRecordEvent(device->command_queue(), event, sub_device_ids_to_block); + Synchronize(device, 0, sub_device_ids_to_block); + + // Test blocking read buffer doesn't stall + std::vector output_1; + EnqueueReadBuffer(device->command_queue(), buffer_1, output_1, true, sub_device_ids_to_block); + EXPECT_EQ(input_1, output_1); + auto input_1_it = input_1.begin(); + for (const auto& physical_core : physical_cores_1) { + auto readback = 
tt::llrt::read_hex_vec_from_core( + device->id(), physical_core, buffer_1->address(), page_size_1); + EXPECT_TRUE(std::equal(input_1_it, input_1_it + page_size_1 / sizeof(uint32_t), readback.begin())); + input_1_it += page_size_1 / sizeof(uint32_t); + } + auto sem_addr = global_semaphore->address(); + auto physical_syncer_core = device->worker_core_from_logical_core(syncer_core); + tt::llrt::write_hex_vec_to_core(device->id(), physical_syncer_core, std::vector{1}, sem_addr); + + // Full synchronization + Synchronize(device); + } +} + +TEST_F(CommandQueueSingleCardFixture, TensixTestSubDeviceBasicPrograms) { + SubDevice sub_device_1(std::array{CoreRangeSet(CoreRange({0, 0}, {2, 2}))}); + SubDevice sub_device_2(std::array{CoreRangeSet(std::vector{CoreRange({3, 3}, {3, 3}), CoreRange({4, 4}, {4, 4})})}); + uint32_t num_iters = 5; + for (Device *device : devices_) { + auto sub_device_manager = device->create_sub_device_manager({sub_device_1, sub_device_2}, 3200); + device->load_sub_device_manager(sub_device_manager); + + auto [waiter_program, syncer_program, incrementer_program, global_sem] = create_basic_sync_program(device, sub_device_1, sub_device_2); + + for (uint32_t i = 0; i < num_iters; i++) { + EnqueueProgram(device->command_queue(), waiter_program, false); + // Test blocking on one sub-device + EnqueueProgram(device->command_queue(), syncer_program, true); + EnqueueProgram(device->command_queue(), incrementer_program, false); + } + Synchronize(device); + } +} + +TEST_F(CommandQueueSingleCardFixture, TensixActiveEthTestSubDeviceBasicEthPrograms) { + SubDevice sub_device_1(std::array{CoreRangeSet(CoreRange({0, 0}, {2, 2}))}); + uint32_t num_iters = 5; + for (Device *device : devices_) { + if (!does_device_have_active_eth_cores(device)) { + GTEST_SKIP() << "Skipping test because device " << device->id() << " does not have any active ethernet cores"; + } + auto eth_core = *device->get_active_ethernet_cores(true).begin(); + SubDevice 
sub_device_2(std::array{CoreRangeSet(std::vector{CoreRange({3, 3}, {3, 3}), CoreRange({4, 4}, {4, 4})}), CoreRangeSet(CoreRange(eth_core, eth_core))}); + auto sub_device_manager = device->create_sub_device_manager({sub_device_1, sub_device_2}, 3200); + device->load_sub_device_manager(sub_device_manager); + + auto [waiter_program, syncer_program, incrementer_program, global_sem] = create_basic_eth_sync_program(device, sub_device_1, sub_device_2); + + for (uint32_t i = 0; i < num_iters; i++) { + EnqueueProgram(device->command_queue(), waiter_program, false); + // Test blocking on one sub-device + EnqueueProgram(device->command_queue(), syncer_program, true); + EnqueueProgram(device->command_queue(), incrementer_program, false); + } + Synchronize(device); + } +} diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_test_utils.hpp b/tests/tt_metal/tt_metal/dispatch/dispatch_test_utils.hpp new file mode 100644 index 00000000000..b0065f0b6b0 --- /dev/null +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_test_utils.hpp @@ -0,0 +1,93 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include +#include "host_api.hpp" +#include "impl/kernels/kernel.hpp" + +struct TestBufferConfig { + uint32_t num_pages; + uint32_t page_size; + BufferType buftype; +}; + +inline std::vector generate_arange_vector(uint32_t size_bytes, uint32_t start = 0) { + TT_FATAL(size_bytes % sizeof(uint32_t) == 0, "Error"); + std::vector src(size_bytes / sizeof(uint32_t), 0); + + for (uint32_t i = 0; i < src.size(); i++) { + src.at(i) = start + i; + } + return src; +} + +inline std::pair, std::vector> EnqueueWriteBuffer_prior_to_wrap(tt::tt_metal::Device* device, tt::tt_metal::CommandQueue& cq, const TestBufferConfig& config) { + // This function just enqueues a buffer (which should be large in the config) + // write as a precursor to testing the wrap mechanism + size_t buf_size = config.num_pages * config.page_size; + auto buffer = Buffer::create(device, buf_size, config.page_size, config.buftype); + + std::vector src = create_random_vector_of_bfloat16( + buf_size, 100, std::chrono::system_clock::now().time_since_epoch().count()); + + EnqueueWriteBuffer(cq, *buffer, src, false); + return std::make_pair(std::move(buffer), src); +} + +inline bool does_device_have_active_eth_cores(const Device *device) { + return !(device->get_active_ethernet_cores(true).empty()); +} + +inline std::pair, std::vector> create_runtime_args( + const uint32_t num_unique_rt_args, + const uint32_t num_common_rt_args, + const uint32_t unique_base, + const uint32_t common_base) { + TT_FATAL( + num_unique_rt_args + num_common_rt_args <= tt::tt_metal::max_runtime_args, + "Number of unique runtime args and common runtime args exceeds the maximum limit of {} runtime args", + tt::tt_metal::max_runtime_args); + + std::vector common_rt_args; + for (uint32_t i = 0; i < num_common_rt_args; i++) { + common_rt_args.push_back(common_base + i); + } + + std::vector unique_rt_args; + for (uint32_t i = 0; i < num_unique_rt_args; i++) { + 
unique_rt_args.push_back(unique_base + i); + } + + return std::make_pair(unique_rt_args, common_rt_args); +} + +// Create randomly sized pair of unique and common runtime args vectors, with careful not to exceed max between the two. +// Optionally force the max size for one of the vectors. +inline std::pair, std::vector> create_runtime_args( + const bool force_max_size = false, const uint32_t unique_base = 0, const uint32_t common_base = 100) { + uint32_t num_rt_args_unique = rand() % (tt::tt_metal::max_runtime_args + 1); + uint32_t num_rt_args_common = + num_rt_args_unique < tt::tt_metal::max_runtime_args ? rand() % (tt::tt_metal::max_runtime_args - num_rt_args_unique + 1) : 0; + + if (force_max_size) { + if (rand() % 2) { + num_rt_args_unique = tt::tt_metal::max_runtime_args; + num_rt_args_common = 0; + } else { + num_rt_args_common = tt::tt_metal::max_runtime_args; + num_rt_args_unique = 0; + } + } + + log_trace( + tt::LogTest, + "{} - num_rt_args_unique: {} num_rt_args_common: {} force_max_size: {}", + __FUNCTION__, + num_rt_args_unique, + num_rt_args_common, + force_max_size); + + return create_runtime_args(num_rt_args_unique, num_rt_args_common, unique_base, common_base); +} diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_trace/CMakeLists.txt b/tests/tt_metal/tt_metal/dispatch/dispatch_trace/CMakeLists.txt new file mode 100644 index 00000000000..f7092ac68e5 --- /dev/null +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_trace/CMakeLists.txt @@ -0,0 +1,34 @@ +set(UNIT_TESTS_DISPATCH_TRACE_SRC + ${CMAKE_CURRENT_SOURCE_DIR}/test_EnqueueTrace.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_sub_device.cpp +) + +add_library(unit_tests_dispatch_trace_o STATIC ${UNIT_TESTS_DISPATCH_TRACE_SRC}) + +target_link_libraries(unit_tests_dispatch_trace_o PRIVATE test_metal_common_libs) + +target_include_directories( + unit_tests_dispatch_trace_o + PRIVATE + ${PROJECT_SOURCE_DIR} + ${PROJECT_SOURCE_DIR}/tt_metal + ${PROJECT_SOURCE_DIR}/tests + 
${PROJECT_SOURCE_DIR}/tests/tt_metal/tt_metal/common + ${PROJECT_SOURCE_DIR}/tests/tt_metal/tt_metal/dispatch + ${PROJECT_SOURCE_DIR}/tests/tt_metal/tt_metal/dispatch/common + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/common +) + +add_executable(unit_tests_dispatch_trace $) + +target_link_libraries(unit_tests_dispatch_trace PRIVATE test_metal_common_libs) + +set_target_properties( + unit_tests_dispatch_trace + PROPERTIES + RUNTIME_OUTPUT_DIRECTORY + ${PROJECT_BINARY_DIR}/test/tt_metal +) + +TT_ENABLE_UNITY_BUILD(unit_tests_dispatch_trace) diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueTrace.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_trace/test_EnqueueTrace.cpp similarity index 80% rename from tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueTrace.cpp rename to tests/tt_metal/tt_metal/dispatch/dispatch_trace/test_EnqueueTrace.cpp index c5e46f1fd3b..8414bc83ba5 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueTrace.cpp +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_trace/test_EnqueueTrace.cpp @@ -3,12 +3,11 @@ // SPDX-License-Identifier: Apache-2.0 #include -#include -#include #include -#include "command_queue_fixture.hpp" -#include "command_queue_test_utils.hpp" +#include "multi_command_queue_fixture.hpp" +#include "random_program_fixture.hpp" +#include "dispatch_test_utils.hpp" #include "detail/tt_metal.hpp" #include "tt_metal/common/env_lib.hpp" #include "gtest/gtest.h" @@ -52,8 +51,8 @@ Program create_simple_unary_program(Buffer& input, Buffer& output) { .compile_args = {1, 1}, .defines = {{"SFPU_OP_EXP_INCLUDE", "1"}, {"SFPU_OP_CHAIN_0", "exp_tile_init(); exp_tile(0);"}}}); - CircularBufferConfig input_cb_config = CircularBufferConfig(2048, {{0, tt::DataFormat::Float16_b}}) - .set_page_size(0, 2048); + CircularBufferConfig input_cb_config = CircularBufferConfig(2048, {{tt::CBIndex::c_0, tt::DataFormat::Float16_b}}) + 
.set_page_size(tt::CBIndex::c_0, 2048); CoreRange core_range({0, 0}); CreateCircularBuffer(program, core_range, input_cb_config); @@ -77,8 +76,8 @@ Program create_simple_unary_program(Buffer& input, Buffer& output) { SetRuntimeArgs(device, detail::GetKernel(program, writer_kernel), worker, writer_runtime_args); SetRuntimeArgs(device, detail::GetKernel(program, reader_kernel), worker, reader_runtime_args); - CircularBufferConfig output_cb_config = CircularBufferConfig(2048, {{16, tt::DataFormat::Float16_b}}) - .set_page_size(16, 2048); + CircularBufferConfig output_cb_config = CircularBufferConfig(2048, {{tt::CBIndex::c_16, tt::DataFormat::Float16_b}}) + .set_page_size(tt::CBIndex::c_16, 2048); CreateCircularBuffer(program, core_range, output_cb_config); return program; @@ -92,8 +91,176 @@ constexpr bool kBlocking = true; constexpr bool kNonBlocking = false; vector blocking_flags = {kBlocking, kNonBlocking}; -TEST_F(SingleDeviceTraceFixture, InstantiateTraceSanity) { - Setup(2048); +TEST_F(MultiCommandQueueSingleDeviceTraceFixture, TensixEnqueueOneProgramTrace) { + CreateDevice(2048); + auto input = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); + auto output = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); + + CommandQueue& command_queue = this->device_->command_queue(0); + CommandQueue& data_movement_queue = this->device_->command_queue(1); + + Program simple_program = create_simple_unary_program(*input, *output); + vector input_data(input->size() / sizeof(uint32_t), 0); + for (uint32_t i = 0; i < input_data.size(); i++) { + input_data[i] = i; + } + + // Eager mode + vector eager_output_data; + eager_output_data.resize(input_data.size()); + + EnqueueWriteBuffer(data_movement_queue, *input, input_data.data(), true); + EnqueueProgram(command_queue, simple_program, true); + EnqueueReadBuffer(data_movement_queue, output, eager_output_data.data(), true); + + // Trace mode + vector trace_output_data; + 
trace_output_data.resize(input_data.size()); + + EnqueueWriteBuffer(data_movement_queue, *input, input_data.data(), true); + + uint32_t tid = BeginTraceCapture(this->device_, command_queue.id()); + EnqueueProgram(command_queue, simple_program, false); + EndTraceCapture(this->device_, command_queue.id(), tid); + + EnqueueTrace(command_queue, tid, true); + EnqueueReadBuffer(data_movement_queue, *output, trace_output_data.data(), true); + EXPECT_TRUE(eager_output_data == trace_output_data); + + // Done + Finish(command_queue); + ReleaseTrace(this->device_, tid); +} + +TEST_F(MultiCommandQueueSingleDeviceTraceFixture, TensixEnqueueOneProgramTraceLoops) { + CreateDevice(4096); + auto input = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); + auto output = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); + + CommandQueue& command_queue = this->device_->command_queue(0); + CommandQueue& data_movement_queue = this->device_->command_queue(1); + + Program simple_program = create_simple_unary_program(*input, *output); + vector input_data(input->size() / sizeof(uint32_t), 0); + for (uint32_t i = 0; i < input_data.size(); i++) { + input_data[i] = i; + } + + // Trace mode output + uint32_t num_loops = 10; + vector> trace_outputs; + + for (auto i = 0; i < num_loops; i++) { + trace_outputs.push_back({}); + trace_outputs[i].resize(input_data.size()); + } + + // Compile + EnqueueProgram(command_queue, simple_program, true); + + // Trace mode execution + uint32_t trace_id = 0; + bool trace_captured = false; + for (auto i = 0; i < num_loops; i++) { + EnqueueWriteBuffer(data_movement_queue, *input, input_data.data(), true); + + if (not trace_captured) { + trace_id = BeginTraceCapture(this->device_, command_queue.id()); + EnqueueProgram(command_queue, simple_program, false); + EndTraceCapture(this->device_, command_queue.id(), trace_id); + trace_captured = true; + } + + EnqueueTrace(command_queue, trace_id, false); + EnqueueReadBuffer(data_movement_queue, 
*output, trace_outputs[i].data(), true); + + // Expect same output across all loops + EXPECT_TRUE(trace_outputs[i] == trace_outputs[0]); + } + + // Done + Finish(command_queue); + ReleaseTrace(this->device_, trace_id); +} + +TEST_F(MultiCommandQueueSingleDeviceTraceFixture, TensixEnqueueOneProgramTraceBenchmark) { + CreateDevice(6144); + auto input = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); + auto output = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); + + constexpr bool kBlocking = true; + constexpr bool kNonBlocking = false; + vector blocking_flags = {kBlocking, kNonBlocking}; + + // Single Q for data and commands + // Keep this queue in passthrough mode for now + CommandQueue& command_queue = this->device_->command_queue(0); + + auto simple_program = create_simple_unary_program(*input, *output); + vector input_data(input->size() / sizeof(uint32_t), 0); + for (uint32_t i = 0; i < input_data.size(); i++) { + input_data[i] = i; + } + + // Trace mode output + uint32_t num_loops = 10; + vector> trace_outputs; + + for (auto i = 0; i < num_loops; i++) { + trace_outputs.push_back({}); + trace_outputs[i].resize(input_data.size()); + } + + // Eager mode + vector expected_output_data; + vector eager_output_data; + expected_output_data.resize(input_data.size()); + eager_output_data.resize(input_data.size()); + + // Warm up and use the eager blocking run as the expected output + EnqueueWriteBuffer(command_queue, *input, input_data.data(), kBlocking); + EnqueueProgram(command_queue, simple_program, kBlocking); + EnqueueReadBuffer(command_queue, *output, expected_output_data.data(), kBlocking); + Finish(command_queue); + + for (bool blocking : blocking_flags) { + std::string mode = blocking ? 
"Eager-B" : "Eager-NB"; + for (auto i = 0; i < num_loops; i++) { + tt::ScopedTimer timer(mode + " loop " + std::to_string(i)); + EnqueueWriteBuffer(command_queue, *input, input_data.data(), blocking); + EnqueueProgram(command_queue, simple_program, blocking); + EnqueueReadBuffer(command_queue, *output, eager_output_data.data(), blocking); + } + if (not blocking) { + // (Optional) wait for the last non-blocking command to finish + Finish(command_queue); + } + EXPECT_TRUE(eager_output_data == expected_output_data); + } + + // Capture trace on a trace queue + uint32_t tid = BeginTraceCapture(this->device_, command_queue.id()); + EnqueueProgram(command_queue, simple_program, false); + EndTraceCapture(this->device_, command_queue.id(), tid); + + // Trace mode execution + for (auto i = 0; i < num_loops; i++) { + tt::ScopedTimer timer("Trace loop " + std::to_string(i)); + EnqueueWriteBuffer(command_queue, *input, input_data.data(), kNonBlocking); + EnqueueTrace(command_queue, tid, kNonBlocking); + EnqueueReadBuffer(command_queue, *output, trace_outputs[i].data(), kNonBlocking); + } + Finish(command_queue); + + // Expect same output across all loops + for (auto i = 0; i < num_loops; i++) { + EXPECT_TRUE(trace_outputs[i] == trace_outputs[0]); + } + ReleaseTrace(this->device_, tid); +} + +TEST_F(CommandQueueTraceFixture, TensixInstantiateTraceSanity) { + CreateDevice(2048); CommandQueue& command_queue = this->device_->command_queue(); auto input = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); @@ -124,8 +291,8 @@ TEST_F(SingleDeviceTraceFixture, InstantiateTraceSanity) { ReleaseTrace(this->device_, tid); } -TEST_F(SingleDeviceTraceFixture, EnqueueProgramTraceCapture) { - Setup(2048); +TEST_F(CommandQueueTraceFixture, TensixEnqueueProgramTraceCapture) { + CreateDevice(2048); auto input = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); auto output = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); @@ -167,8 +334,8 @@ 
TEST_F(SingleDeviceTraceFixture, EnqueueProgramTraceCapture) { ReleaseTrace(this->device_, tid); } -TEST_F(SingleDeviceTraceFixture, EnqueueProgramDeviceCapture) { - Setup(2048); +TEST_F(CommandQueueTraceFixture, TensixEnqueueProgramDeviceCapture) { + CreateDevice(2048); auto input = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); auto output = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); @@ -218,8 +385,8 @@ TEST_F(SingleDeviceTraceFixture, EnqueueProgramDeviceCapture) { ReleaseTrace(this->device_, tid); } -TEST_F(SingleDeviceTraceFixture, EnqueueTwoProgramTrace) { - Setup(6144); +TEST_F(CommandQueueTraceFixture, TensixEnqueueTwoProgramTrace) { + CreateDevice(6144); // Get command queue from device for this test, since its running in async mode CommandQueue& command_queue = this->device_->command_queue(); @@ -294,8 +461,8 @@ TEST_F(SingleDeviceTraceFixture, EnqueueTwoProgramTrace) { } } -TEST_F(SingleDeviceTraceFixture, EnqueueMultiProgramTraceBenchmark) { - Setup(6144); +TEST_F(CommandQueueTraceFixture, TensixEnqueueMultiProgramTraceBenchmark) { + CreateDevice(6144); CommandQueue& command_queue = this->device_->command_queue(); auto input = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_trace/test_sub_device.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_trace/test_sub_device.cpp new file mode 100644 index 00000000000..bce5b241610 --- /dev/null +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_trace/test_sub_device.cpp @@ -0,0 +1,269 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "tt_metal/common/core_coord.hpp" +#include "tt_metal/impl/buffers/global_semaphore.hpp" +#include "tt_metal/impl/device/device.hpp" +#include "tt_metal/impl/event/event.hpp" +#include "tt_metal/impl/sub_device/sub_device.hpp" +#include "command_queue_fixture.hpp" +#include "dispatch_test_utils.hpp" +#include "sub_device_test_utils.hpp" + +TEST_F(CommandQueueSingleCardTraceFixture, TensixTestSubDeviceTraceBasicPrograms) { + SubDevice sub_device_1(std::array{CoreRangeSet(CoreRange({0, 0}, {2, 2}))}); + SubDevice sub_device_2(std::array{CoreRangeSet(std::vector{CoreRange({3, 3}, {3, 3}), CoreRange({4, 4}, {4, 4})})}); + uint32_t num_iters = 5; + for (Device *device : devices_) { + auto sub_device_manager = device->create_sub_device_manager({sub_device_1, sub_device_2}, 3200); + device->load_sub_device_manager(sub_device_manager); + + auto [waiter_program, syncer_program, incrementer_program, global_sem] = create_basic_sync_program(device, sub_device_1, sub_device_2); + + // Compile the programs + EnqueueProgram(device->command_queue(), waiter_program, false); + // Test blocking on one sub-device + EnqueueProgram(device->command_queue(), syncer_program, true); + EnqueueProgram(device->command_queue(), incrementer_program, false); + Synchronize(device); + + // Capture the trace + auto tid_1 = BeginTraceCapture(device, device->command_queue().id()); + EnqueueProgram(device->command_queue(), waiter_program, false); + EnqueueProgram(device->command_queue(), syncer_program, false); + EnqueueProgram(device->command_queue(), incrementer_program, false); + EndTraceCapture(device, device->command_queue().id(), tid_1); + + auto tid_2 = BeginTraceCapture(device, device->command_queue().id()); + EnqueueProgram(device->command_queue(), syncer_program, false); + EnqueueProgram(device->command_queue(), incrementer_program, false); + 
EndTraceCapture(device, device->command_queue().id(), tid_2); + + for (uint32_t i = 0; i < num_iters; i++) { + // Regular program execution + EnqueueProgram(device->command_queue(), waiter_program, false); + // Test blocking on one sub-device + EnqueueProgram(device->command_queue(), syncer_program, true); + EnqueueProgram(device->command_queue(), incrementer_program, false); + + // Full trace execution + ReplayTrace(device, device->command_queue().id(), tid_1, false); + + // Partial trace execution + EnqueueProgram(device->command_queue(), waiter_program, false); + ReplayTrace(device, device->command_queue().id(), tid_2, false); + } + Synchronize(device); + } +} + +TEST_F(CommandQueueSingleCardTraceFixture, TensixActiveEthTestSubDeviceTraceBasicEthPrograms) { + SubDevice sub_device_1(std::array{CoreRangeSet(CoreRange({0, 0}, {2, 2}))}); + uint32_t num_iters = 5; + for (Device *device : devices_) { + if (!does_device_have_active_eth_cores(device)) { + GTEST_SKIP() << "Skipping test because device " << device->id() << " does not have any active ethernet cores"; + } + auto eth_core = *device->get_active_ethernet_cores(true).begin(); + SubDevice sub_device_2(std::array{CoreRangeSet(std::vector{CoreRange({3, 3}, {3, 3}), CoreRange({4, 4}, {4, 4})}), CoreRangeSet(CoreRange(eth_core, eth_core))}); + auto sub_device_manager = device->create_sub_device_manager({sub_device_1, sub_device_2}, 3200); + device->load_sub_device_manager(sub_device_manager); + + auto [waiter_program, syncer_program, incrementer_program, global_sem] = create_basic_eth_sync_program(device, sub_device_1, sub_device_2); + + // Compile the programs + EnqueueProgram(device->command_queue(), waiter_program, false); + // Test blocking on one sub-device + EnqueueProgram(device->command_queue(), syncer_program, true); + EnqueueProgram(device->command_queue(), incrementer_program, false); + Synchronize(device); + + // Capture the trace + auto tid_1 = BeginTraceCapture(device, device->command_queue().id()); + 
EnqueueProgram(device->command_queue(), waiter_program, false); + EnqueueProgram(device->command_queue(), syncer_program, false); + EnqueueProgram(device->command_queue(), incrementer_program, false); + EndTraceCapture(device, device->command_queue().id(), tid_1); + + auto tid_2 = BeginTraceCapture(device, device->command_queue().id()); + EnqueueProgram(device->command_queue(), syncer_program, false); + EnqueueProgram(device->command_queue(), incrementer_program, false); + EndTraceCapture(device, device->command_queue().id(), tid_2); + + for (uint32_t i = 0; i < num_iters; i++) { + // Regular program execution + EnqueueProgram(device->command_queue(), waiter_program, false); + // Test blocking on one sub-device + EnqueueProgram(device->command_queue(), syncer_program, true); + EnqueueProgram(device->command_queue(), incrementer_program, false); + + // Full trace execution + ReplayTrace(device, device->command_queue().id(), tid_1, false); + + // Partial trace execution + EnqueueProgram(device->command_queue(), waiter_program, false); + ReplayTrace(device, device->command_queue().id(), tid_2, false); + } + Synchronize(device); + } +} + +TEST_F(CommandQueueSingleCardTraceFixture, TensixActiveEthTestSubDeviceTraceProgramsReconfigureSubDevices) { + SubDevice sub_device_1(std::array{CoreRangeSet(CoreRange({0, 0}, {2, 2}))}); + SubDevice sub_device_2(std::array{CoreRangeSet(std::array{CoreRange({3, 3}, {3, 3}), CoreRange({4, 4}, {4, 4})})}); + SubDevice sub_device_3(std::array{CoreRangeSet(std::array{CoreRange({2, 4}, {3, 4}), CoreRange({5, 1}, {6, 3})})}); + uint32_t num_iters = 5; + for (Device *device : devices_) { + if (!does_device_have_active_eth_cores(device)) { + GTEST_SKIP() << "Skipping test because device " << device->id() << " does not have any active ethernet cores"; + } + auto eth_core = *device->get_active_ethernet_cores(true).begin(); + SubDevice sub_device_4(std::array{CoreRangeSet(std::array{CoreRange({2, 1}, {2, 2}), CoreRange({1, 5}, {5, 5})}), 
CoreRangeSet(CoreRange(eth_core, eth_core))}); + + auto sub_device_manager_1 = device->create_sub_device_manager({sub_device_1, sub_device_2}, 3200); + auto sub_device_manager_2 = device->create_sub_device_manager({sub_device_3, sub_device_4}, 3200); + + device->load_sub_device_manager(sub_device_manager_1); + + auto [waiter_program_1, syncer_program_1, incrementer_program_1, global_sem_1] = create_basic_sync_program(device, sub_device_1, sub_device_2); + + // Compile the programs + EnqueueProgram(device->command_queue(), waiter_program_1, false); + EnqueueProgram(device->command_queue(), syncer_program_1, false); + EnqueueProgram(device->command_queue(), incrementer_program_1, false); + Synchronize(device); + + // Capture the trace + auto tid_1 = BeginTraceCapture(device, device->command_queue().id()); + EnqueueProgram(device->command_queue(), waiter_program_1, false); + EnqueueProgram(device->command_queue(), syncer_program_1, false); + EnqueueProgram(device->command_queue(), incrementer_program_1, false); + EndTraceCapture(device, device->command_queue().id(), tid_1); + + auto tid_2 = BeginTraceCapture(device, device->command_queue().id()); + EnqueueProgram(device->command_queue(), syncer_program_1, false); + EnqueueProgram(device->command_queue(), incrementer_program_1, false); + EndTraceCapture(device, device->command_queue().id(), tid_2); + + device->load_sub_device_manager(sub_device_manager_2); + + auto [waiter_program_2, syncer_program_2, incrementer_program_2, global_sem_2] = create_basic_eth_sync_program(device, sub_device_3, sub_device_4); + + // Compile the programs + EnqueueProgram(device->command_queue(), waiter_program_2, false); + EnqueueProgram(device->command_queue(), syncer_program_2, false); + EnqueueProgram(device->command_queue(), incrementer_program_2, false); + Synchronize(device); + + // Capture the trace + auto tid_3 = BeginTraceCapture(device, device->command_queue().id()); + EnqueueProgram(device->command_queue(), waiter_program_2, 
false); + EnqueueProgram(device->command_queue(), syncer_program_2, false); + EnqueueProgram(device->command_queue(), incrementer_program_2, false); + EndTraceCapture(device, device->command_queue().id(), tid_3); + + auto tid_4 = BeginTraceCapture(device, device->command_queue().id()); + EnqueueProgram(device->command_queue(), syncer_program_2, false); + EnqueueProgram(device->command_queue(), incrementer_program_2, false); + EndTraceCapture(device, device->command_queue().id(), tid_4); + + for (uint32_t i = 0; i < num_iters; i++) { + device->load_sub_device_manager(sub_device_manager_1); + // Regular program execution + EnqueueProgram(device->command_queue(), waiter_program_1, false); + // Test blocking on one sub-device + EnqueueProgram(device->command_queue(), syncer_program_1, false); + EnqueueProgram(device->command_queue(), incrementer_program_1, false); + + // Full trace execution + ReplayTrace(device, device->command_queue().id(), tid_1, false); + + // Partial trace execution + EnqueueProgram(device->command_queue(), waiter_program_1, false); + ReplayTrace(device, device->command_queue().id(), tid_2, false); + + device->load_sub_device_manager(sub_device_manager_2); + // Regular program execution + EnqueueProgram(device->command_queue(), waiter_program_2, false); + // Test blocking on one sub-device + EnqueueProgram(device->command_queue(), syncer_program_2, false); + EnqueueProgram(device->command_queue(), incrementer_program_2, false); + + // Full trace execution + ReplayTrace(device, device->command_queue().id(), tid_3, false); + + // Partial trace execution + EnqueueProgram(device->command_queue(), waiter_program_2, false); + ReplayTrace(device, device->command_queue().id(), tid_4, false); + } + Synchronize(device); + } +} + +TEST_F(CommandQueueSingleCardTraceFixture, TensixTestSubDeviceIllegalOperations) { + SubDevice sub_device_1(std::array{CoreRangeSet(CoreRange({0, 0}, {2, 2}))}); + SubDevice 
sub_device_2(std::array{CoreRangeSet(std::vector{CoreRange({3, 3}, {3, 3}), CoreRange({4, 4}, {4, 4})})}); + + // Assert no idle eth cores specified + EXPECT_THROW(SubDevice sub_device_3(std::array{CoreRangeSet(CoreRange({3, 3}, {3, 3})), CoreRangeSet(CoreRange({4, 4}, {4, 4})), CoreRangeSet(CoreRange({5, 5}, {5, 5}))}), std::exception); + for (Device *device : devices_) { + auto sub_device_manager_1 = device->create_sub_device_manager({sub_device_1, sub_device_2}, 3200); + auto sub_device_manager_2 = device->create_sub_device_manager({sub_device_2, sub_device_1}, 3200); + device->load_sub_device_manager(sub_device_manager_1); + + auto [waiter_program_1, syncer_program_1, incrementer_program_1, global_sem_1] = create_basic_sync_program(device, sub_device_1, sub_device_2); + + // Compile the programs + EnqueueProgram(device->command_queue(), waiter_program_1, false); + // Test blocking on one sub-device + EnqueueProgram(device->command_queue(), syncer_program_1, false); + EnqueueProgram(device->command_queue(), incrementer_program_1, false); + Synchronize(device); + + // Capture the trace + auto tid_1 = BeginTraceCapture(device, device->command_queue().id()); + // Can not load a sub-device manager while tracing + EXPECT_THROW(device->load_sub_device_manager(sub_device_manager_2), std::exception); + EnqueueProgram(device->command_queue(), waiter_program_1, false); + EnqueueProgram(device->command_queue(), syncer_program_1, false); + EnqueueProgram(device->command_queue(), incrementer_program_1, false); + EndTraceCapture(device, device->command_queue().id(), tid_1); + + device->load_sub_device_manager(sub_device_manager_2); + auto [waiter_program_2, syncer_program_2, incrementer_program_2, global_sem_2] = create_basic_sync_program(device, sub_device_2, sub_device_1); + + EnqueueProgram(device->command_queue(), waiter_program_2, false); + EnqueueProgram(device->command_queue(), syncer_program_2, false); + EnqueueProgram(device->command_queue(), incrementer_program_2, 
false); + Synchronize(device); + + auto tid_2 = BeginTraceCapture(device, device->command_queue().id()); + EnqueueProgram(device->command_queue(), waiter_program_2, false); + EnqueueProgram(device->command_queue(), syncer_program_2, false); + EnqueueProgram(device->command_queue(), incrementer_program_2, false); + EndTraceCapture(device, device->command_queue().id(), tid_2); + + // Regular program execution + // Can not run a program on a different sub-device manager + EXPECT_THROW(EnqueueProgram(device->command_queue(), waiter_program_1, false), std::exception); + + // Full trace execution + ReplayTrace(device, device->command_queue().id(), tid_2, false); + + // Can not replay a trace on a different sub-device manager + EXPECT_THROW(ReplayTrace(device, device->command_queue().id(), tid_1, false), std::exception); + + Synchronize(device); + + device->remove_sub_device_manager(sub_device_manager_1); + EXPECT_THROW(device->load_sub_device_manager(sub_device_manager_1), std::exception); + } +} diff --git a/tests/tt_metal/tt_metal/dispatch/multi_command_queue_fixture.hpp b/tests/tt_metal/tt_metal/dispatch/multi_command_queue_fixture.hpp new file mode 100644 index 00000000000..d5d6326cc79 --- /dev/null +++ b/tests/tt_metal/tt_metal/dispatch/multi_command_queue_fixture.hpp @@ -0,0 +1,150 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "gtest/gtest.h" +#include "dispatch_fixture.hpp" +#include "hostdevcommon/common_values.hpp" +#include "impl/device/device.hpp" +#include "llrt/hal.hpp" +#include "tt_cluster_descriptor_types.h" +#include "tt_metal/host_api.hpp" +#include "tt_metal/detail/tt_metal.hpp" +#include "tt_metal/test_utils/env_vars.hpp" +#include "tt_metal/impl/kernels/kernel.hpp" +#include "tt_metal/common/tt_backend_api_types.hpp" +#include "tt_metal/llrt/rtoptions.hpp" + +class MultiCommandQueueSingleDeviceFixture : public DispatchFixture { + protected: + void SetUp() override { + this->validate_dispatch_mode(); + + this->num_cqs_ = tt::llrt::OptionsG.get_num_hw_cqs(); + if (this->num_cqs_ != 2) { + tt::log_info(tt::LogTest, "This suite must be run with TT_METAL_GTEST_NUM_HW_CQS=2"); + GTEST_SKIP(); + } + + this->arch_ = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); + + const chip_id_t device_id = 0; + const DispatchCoreType dispatch_core_type = this->get_dispatch_core_type(); + this->create_device(device_id, DEFAULT_TRACE_REGION_SIZE, dispatch_core_type); + } + + void TearDown() override { + if (this->device_ != nullptr) { + tt::tt_metal::CloseDevice(this->device_); + } + } + + void validate_dispatch_mode() { + this->slow_dispatch_ = false; + auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); + if (slow_dispatch) { + tt::log_info(tt::LogTest, "This suite can only be run with fast dispatch or TT_METAL_SLOW_DISPATCH_MODE unset"); + this->slow_dispatch_ = true; + GTEST_SKIP(); + } + } + + DispatchCoreType get_dispatch_core_type() { + DispatchCoreType dispatch_core_type = DispatchCoreType::WORKER; + if (this->arch_ == tt::ARCH::WORMHOLE_B0 and tt::tt_metal::GetNumAvailableDevices() != 1) { + if (!tt::tt_metal::IsGalaxyCluster()) { + tt::log_warning( + tt::LogTest, "Ethernet Dispatch not being explicitly used. 
Set this configuration in SetUp()"); + dispatch_core_type = DispatchCoreType::ETH; + } + } + return dispatch_core_type; + } + + void create_device( + const chip_id_t device_id, + const size_t trace_region_size = DEFAULT_TRACE_REGION_SIZE, + const DispatchCoreType dispatch_core_type = DispatchCoreType::WORKER) { + this->device_ = tt::tt_metal::CreateDevice( + device_id, this->num_cqs_, DEFAULT_L1_SMALL_SIZE, trace_region_size, dispatch_core_type); + } + + tt::tt_metal::Device *device_ = nullptr; + tt::ARCH arch_; + uint8_t num_cqs_; +}; + +class MultiCommandQueueSingleDeviceEventFixture : public MultiCommandQueueSingleDeviceFixture {}; + +class MultiCommandQueueSingleDeviceBufferFixture : public MultiCommandQueueSingleDeviceFixture {}; + +class MultiCommandQueueSingleDeviceProgramFixture : public MultiCommandQueueSingleDeviceFixture {}; + +class MultiCommandQueueSingleDeviceTraceFixture : public MultiCommandQueueSingleDeviceFixture { + protected: + void SetUp() override { + this->validate_dispatch_mode(); + + this->num_cqs_ = tt::llrt::OptionsG.get_num_hw_cqs(); + if (this->num_cqs_ != 2) { + tt::log_info(tt::LogTest, "This suite must be run with TT_METAL_GTEST_NUM_HW_CQS=2"); + GTEST_SKIP(); + } + + this->arch_ = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); + } + + void CreateDevice(const size_t trace_region_size) { + const chip_id_t device_id = 0; + const DispatchCoreType dispatch_core_type = this->get_dispatch_core_type(); + this->create_device(device_id, trace_region_size, dispatch_core_type); + } + + DispatchCoreType dispatch_core_type_; +}; + +class MultiCommandQueueMultiDeviceFixture : public DispatchFixture { + protected: + void SetUp() override { + this->slow_dispatch_ = false; + auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); + if (slow_dispatch) { + tt::log_info(tt::LogTest, "This suite can only be run with fast dispatch or TT_METAL_SLOW_DISPATCH_MODE unset"); + this->slow_dispatch_ = true; + GTEST_SKIP(); + } + + auto 
num_cqs = tt::llrt::OptionsG.get_num_hw_cqs(); + if (num_cqs != 2) { + tt::log_info(tt::LogTest, "This suite must be run with TT_METAL_GTEST_NUM_HW_CQS=2"); + GTEST_SKIP(); + } + + const tt::ARCH arch = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); + + DispatchCoreType dispatch_core_type = DispatchCoreType::WORKER; + if (arch == tt::ARCH::WORMHOLE_B0 and tt::tt_metal::GetNumAvailableDevices() != 1) { + if (!tt::tt_metal::IsGalaxyCluster()) { + tt::log_warning(tt::LogTest, "Ethernet Dispatch not being explicitly used. Set this configuration in Setup()"); + dispatch_core_type = DispatchCoreType::ETH; + } + } + + const chip_id_t mmio_device_id = 0; + reserved_devices_ = tt::tt_metal::detail::CreateDevices({mmio_device_id}, num_cqs, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, dispatch_core_type); + for (const auto &[id, device] : reserved_devices_) { + devices_.push_back(device); + } + } + + void TearDown() override { tt::tt_metal::detail::CloseDevices(reserved_devices_); } + + std::vector devices_; + std::map reserved_devices_; +}; + +class MultiCommandQueueMultiDeviceBufferFixture : public MultiCommandQueueMultiDeviceFixture {}; + +class MultiCommandQueueMultiDeviceEventFixture : public MultiCommandQueueMultiDeviceFixture {}; diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/common/command_queue_fixture.hpp b/tests/tt_metal/tt_metal/dispatch/random_program_fixture.hpp similarity index 72% rename from tests/tt_metal/tt_metal/unit_tests_fast_dispatch/common/command_queue_fixture.hpp rename to tests/tt_metal/tt_metal/dispatch/random_program_fixture.hpp index b5efa2e0729..02900995280 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/common/command_queue_fixture.hpp +++ b/tests/tt_metal/tt_metal/dispatch/random_program_fixture.hpp @@ -1,178 +1,19 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
// // SPDX-License-Identifier: Apache-2.0 #pragma once -#include -#include -#include -#include -#include "common/core_coord.hpp" -#include "common/env_lib.hpp" -#include "gtest/gtest.h" -#include "hostdevcommon/common_values.hpp" -#include "impl/buffers/circular_buffer_types.hpp" +#include "command_queue_fixture.hpp" #include "impl/device/device.hpp" -#include "impl/kernels/data_types.hpp" -#include "impl/kernels/kernel_types.hpp" -#include "impl/dispatch/command_queue.hpp" #include "llrt/hal.hpp" -#include "tt_cluster_descriptor_types.h" #include "tt_metal/host_api.hpp" #include "tt_metal/detail/tt_metal.hpp" -#include "tt_metal/test_utils/env_vars.hpp" #include "tt_metal/impl/kernels/kernel.hpp" #include "tt_metal/common/tt_backend_api_types.hpp" -#include "tt_metal/llrt/rtoptions.hpp" -#include "tt_metal/tt_metal/unit_tests_common/common/test_utils.hpp" -#include "tt_soc_descriptor.h" +#include "dispatch_test_utils.hpp" -class CommandQueueFixture : public ::testing::Test { - protected: - tt::ARCH arch_; - tt::tt_metal::Device* device_; - void SetUp() override { - auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); - if (slow_dispatch) { - tt::log_info(tt::LogTest, "This suite can only be run with fast dispatch or TT_METAL_SLOW_DISPATCH_MODE unset"); - GTEST_SKIP(); - } - this->arch_ = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); - - const int device_id = 0; - - const auto &dispatch_core_type = tt::llrt::OptionsG.get_dispatch_core_type(); - this->device_ = tt::tt_metal::CreateDevice(device_id, 1, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, dispatch_core_type); - } - - void TearDown() override { - if (!getenv("TT_METAL_SLOW_DISPATCH_MODE")){ - tt::tt_metal::CloseDevice(this->device_); - } - } -}; - - -class CommandQueueMultiDeviceFixture : public ::testing::Test { - protected: - void SetUp() override { - auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); - if (slow_dispatch) { - TT_THROW("This suite can only be run with 
fast dispatch or TT_METAL_SLOW_DISPATCH_MODE unset"); - GTEST_SKIP(); - } - arch_ = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); - - num_devices_ = tt::tt_metal::GetNumAvailableDevices(); - if (num_devices_ < 2 ) { - GTEST_SKIP(); - } - std::vector chip_ids; - for (unsigned int id = 0; id < num_devices_; id++) { - chip_ids.push_back(id); - } - - const auto &dispatch_core_type = tt::llrt::OptionsG.get_dispatch_core_type(); - reserved_devices_ = tt::tt_metal::detail::CreateDevices(chip_ids, 1, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, dispatch_core_type); - for (const auto &[id, device] : reserved_devices_) { - devices_.push_back(device); - } - } - - void TearDown() override { tt::tt_metal::detail::CloseDevices(reserved_devices_); } - - std::vector devices_; - std::map reserved_devices_; - tt::ARCH arch_; - size_t num_devices_; -}; - -class CommandQueueSingleCardFixture : public ::testing::Test { - protected: - void SetUp() override { - this->validate_dispatch_mode(); - this->arch_ = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); - this->create_devices(); - } - - void TearDown() override { tt::tt_metal::detail::CloseDevices(reserved_devices_); } - - void validate_dispatch_mode() { - auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); - if (slow_dispatch) { - TT_THROW("This suite can only be run with fast dispatch or TT_METAL_SLOW_DISPATCH_MODE unset"); - GTEST_SKIP(); - } - } - - void create_devices(const std::size_t trace_region_size = DEFAULT_TRACE_REGION_SIZE) { - const auto &dispatch_core_type = tt::llrt::OptionsG.get_dispatch_core_type(); - const chip_id_t mmio_device_id = 0; - this->reserved_devices_ = tt::tt_metal::detail::CreateDevices( - {mmio_device_id}, 1, DEFAULT_L1_SMALL_SIZE, trace_region_size, dispatch_core_type); - auto enable_remote_chip = getenv("TT_METAL_ENABLE_REMOTE_CHIP"); - if (enable_remote_chip) { - for (const auto &[id, device] : this->reserved_devices_) { - this->devices_.push_back(device); 
- } - } else { - this->devices_.push_back(this->reserved_devices_.at(mmio_device_id)); - } - - this->num_devices_ = this->reserved_devices_.size(); - } - - std::vector devices_; - std::map reserved_devices_; - tt::ARCH arch_; - size_t num_devices_; -}; - -class CommandQueueSingleCardTraceFixture : virtual public CommandQueueSingleCardFixture { - protected: - void SetUp() override { - this->validate_dispatch_mode(); - this->arch_ = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); - this->create_devices(90000000); - } -}; - -class SingleDeviceTraceFixture: public ::testing::Test { -protected: - tt::tt_metal::Device* device_; - tt::ARCH arch_; - - void Setup(const size_t buffer_size, const uint8_t num_hw_cqs = 1) { - auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); - if (slow_dispatch) { - tt::log_info(tt::LogTest, "This suite can only be run with fast dispatch or TT_METAL_SLOW_DISPATCH_MODE unset"); - GTEST_SKIP(); - } - if (num_hw_cqs > 1) { - // Running multi-CQ test. User must set this explicitly. 
- auto num_cqs = getenv("TT_METAL_GTEST_NUM_HW_CQS"); - if (num_cqs == nullptr or strcmp(num_cqs, "2")) { - TT_THROW("This suite must be run with TT_METAL_GTEST_NUM_HW_CQS=2"); - GTEST_SKIP(); - } - } - this->arch_ = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); - const int device_id = 0; - const auto &dispatch_core_type = tt::llrt::OptionsG.get_dispatch_core_type(); - const chip_id_t mmio_device_id = 0; - this->device_ = tt::tt_metal::detail::CreateDevices({mmio_device_id}, 1, DEFAULT_L1_SMALL_SIZE, buffer_size, dispatch_core_type).at(mmio_device_id); - } - - void TearDown() override { - if (!getenv("TT_METAL_SLOW_DISPATCH_MODE")) { - tt::tt_metal::CloseDevice(this->device_); - } - } - -}; - -class RandomProgramFixture : virtual public CommandQueueSingleCardFixture { +class RandomProgramFixture : virtual public CommandQueueSingleCardProgramFixture { protected: static const uint32_t MIN_KERNEL_SIZE_BYTES = 20; static const uint32_t MAX_KERNEL_SIZE_BYTES = 4096; @@ -225,7 +66,7 @@ class RandomProgramFixture : virtual public CommandQueueSingleCardFixture { Device *device_; void SetUp() override { - CommandQueueSingleCardFixture::SetUp(); + CommandQueueSingleCardProgramFixture::SetUp(); this->device_ = this->devices_[0]; this->initialize_seed(); } @@ -510,7 +351,7 @@ class RandomProgramFixture : virtual public CommandQueueSingleCardFixture { } }; -class RandomProgramTraceFixture : public RandomProgramFixture, public CommandQueueSingleCardTraceFixture { +class RandomProgramTraceFixture : virtual public RandomProgramFixture, virtual public CommandQueueSingleCardTraceFixture { protected: static const uint32_t NUM_TRACE_ITERATIONS = 50; Program programs[NUM_PROGRAMS]; diff --git a/tests/tt_metal/tt_metal/dispatch/sub_device_test_utils.hpp b/tests/tt_metal/tt_metal/dispatch/sub_device_test_utils.hpp new file mode 100644 index 00000000000..fd02ab47296 --- /dev/null +++ b/tests/tt_metal/tt_metal/dispatch/sub_device_test_utils.hpp @@ -0,0 +1,121 @@ +// 
SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "host_api.hpp" + +inline std::tuple> create_single_sync_program( + Device* device, SubDevice sub_device) { + auto syncer_coord = sub_device.cores(HalProgrammableCoreType::TENSIX).ranges().at(0).start_coord; + auto syncer_core = CoreRangeSet(CoreRange(syncer_coord, syncer_coord)); + auto global_sem = CreateGlobalSemaphore(device, sub_device.cores(HalProgrammableCoreType::TENSIX), INVALID); + + Program syncer_program = CreateProgram(); + auto syncer_kernel = CreateKernel( + syncer_program, + "tests/tt_metal/tt_metal/test_kernels/misc/sub_device/syncer.cpp", + syncer_core, + DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); + std::array syncer_rt_args = {global_sem->address()}; + SetRuntimeArgs(syncer_program, syncer_kernel, syncer_core, syncer_rt_args); + return {std::move(syncer_program), std::move(syncer_coord), std::move(global_sem)}; +} + +inline std::tuple> create_basic_sync_program( + Device* device, const SubDevice& sub_device_1, const SubDevice& sub_device_2) { + auto waiter_coord = sub_device_2.cores(HalProgrammableCoreType::TENSIX).ranges().at(0).start_coord; + auto waiter_core = CoreRangeSet(CoreRange(waiter_coord, waiter_coord)); + auto waiter_core_physical = device->worker_core_from_logical_core(waiter_coord); + auto incrementer_cores = sub_device_1.cores(HalProgrammableCoreType::TENSIX); + auto syncer_coord = incrementer_cores.ranges().back().end_coord; + auto syncer_core = CoreRangeSet(CoreRange(syncer_coord, syncer_coord)); + auto syncer_core_physical = device->worker_core_from_logical_core(syncer_coord); + auto all_cores = waiter_core.merge(incrementer_cores).merge(syncer_core); + auto global_sem = CreateGlobalSemaphore(device, all_cores, INVALID); + + Program waiter_program = CreateProgram(); + auto waiter_kernel = CreateKernel( + waiter_program, + 
"tests/tt_metal/tt_metal/test_kernels/misc/sub_device/persistent_waiter.cpp", + waiter_core, + DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); + std::array waiter_rt_args = { + global_sem->address(), incrementer_cores.num_cores(), syncer_core_physical.x, syncer_core_physical.y}; + SetRuntimeArgs(waiter_program, waiter_kernel, waiter_core, waiter_rt_args); + + Program syncer_program = CreateProgram(); + auto syncer_kernel = CreateKernel( + syncer_program, + "tests/tt_metal/tt_metal/test_kernels/misc/sub_device/syncer.cpp", + syncer_core, + DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); + std::array syncer_rt_args = {global_sem->address()}; + SetRuntimeArgs(syncer_program, syncer_kernel, syncer_core, syncer_rt_args); + + Program incrementer_program = CreateProgram(); + auto incrementer_kernel = CreateKernel( + incrementer_program, + "tests/tt_metal/tt_metal/test_kernels/misc/sub_device/incrementer.cpp", + incrementer_cores, + DataMovementConfig{.processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default}); + std::array incrementer_rt_args = { + global_sem->address(), waiter_core_physical.x, waiter_core_physical.y}; + SetRuntimeArgs(incrementer_program, incrementer_kernel, incrementer_cores, incrementer_rt_args); + return { + std::move(waiter_program), std::move(syncer_program), std::move(incrementer_program), std::move(global_sem)}; +} + +inline std::tuple> create_basic_eth_sync_program( + Device* device, const SubDevice& sub_device_1, const SubDevice& sub_device_2) { + auto waiter_coord = sub_device_2.cores(HalProgrammableCoreType::ACTIVE_ETH).ranges().at(0).start_coord; + auto waiter_core = CoreRangeSet(CoreRange(waiter_coord, waiter_coord)); + auto waiter_core_physical = device->ethernet_core_from_logical_core(waiter_coord); + auto tensix_waiter_coord = sub_device_2.cores(HalProgrammableCoreType::TENSIX).ranges().at(0).start_coord; + auto tensix_waiter_core 
= CoreRangeSet(CoreRange(tensix_waiter_coord, tensix_waiter_coord)); + auto tensix_waiter_core_physical = device->worker_core_from_logical_core(tensix_waiter_coord); + auto incrementer_cores = sub_device_1.cores(HalProgrammableCoreType::TENSIX); + auto syncer_coord = incrementer_cores.ranges().back().end_coord; + auto syncer_core = CoreRangeSet(CoreRange(syncer_coord, syncer_coord)); + auto syncer_core_physical = device->worker_core_from_logical_core(syncer_coord); + auto all_cores = tensix_waiter_core.merge(incrementer_cores).merge(syncer_core); + auto global_sem = CreateGlobalSemaphore(device, all_cores, INVALID); + + Program waiter_program = CreateProgram(); + auto waiter_kernel = CreateKernel( + waiter_program, + "tests/tt_metal/tt_metal/test_kernels/misc/sub_device/persistent_remote_waiter.cpp", + waiter_core, + EthernetConfig{.noc = NOC::RISCV_0_default, .processor = DataMovementProcessor::RISCV_0}); + std::array waiter_rt_args = { + global_sem->address(), + incrementer_cores.num_cores(), + syncer_core_physical.x, + syncer_core_physical.y, + tensix_waiter_core_physical.x, + tensix_waiter_core_physical.y, + eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE}; + SetRuntimeArgs(waiter_program, waiter_kernel, waiter_core, waiter_rt_args); + + Program syncer_program = CreateProgram(); + auto syncer_kernel = CreateKernel( + syncer_program, + "tests/tt_metal/tt_metal/test_kernels/misc/sub_device/syncer.cpp", + syncer_core, + DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); + std::array syncer_rt_args = {global_sem->address()}; + SetRuntimeArgs(syncer_program, syncer_kernel, syncer_core, syncer_rt_args); + + Program incrementer_program = CreateProgram(); + auto incrementer_kernel = CreateKernel( + incrementer_program, + "tests/tt_metal/tt_metal/test_kernels/misc/sub_device/incrementer.cpp", + incrementer_cores, + DataMovementConfig{.processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default}); + std::array 
incrementer_rt_args = { + global_sem->address(), tensix_waiter_core_physical.x, tensix_waiter_core_physical.y}; + SetRuntimeArgs(incrementer_program, incrementer_kernel, incrementer_cores, incrementer_rt_args); + return { + std::move(waiter_program), std::move(syncer_program), std::move(incrementer_program), std::move(global_sem)}; +} diff --git a/tests/tt_metal/tt_metal/eth/CMakeLists.txt b/tests/tt_metal/tt_metal/eth/CMakeLists.txt new file mode 100644 index 00000000000..633c597f9f5 --- /dev/null +++ b/tests/tt_metal/tt_metal/eth/CMakeLists.txt @@ -0,0 +1,28 @@ +set(UNIT_TESTS_ETH_SRC + ${CMAKE_CURRENT_SOURCE_DIR}/test_basic_eth.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_buffer_movement_kernels.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_erisc_app_direct_send.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_ring_gather_kernels.cpp +) + +add_executable(unit_tests_eth ${UNIT_TESTS_ETH_SRC}) +TT_ENABLE_UNITY_BUILD(unit_tests_eth) + +target_link_libraries(unit_tests_eth PUBLIC test_metal_common_libs) +target_include_directories( + unit_tests_eth + PRIVATE + ${PROJECT_SOURCE_DIR} + ${PROJECT_SOURCE_DIR}/tt_metal + ${PROJECT_SOURCE_DIR}/tt_metal/common + ${PROJECT_SOURCE_DIR}/tests + ${PROJECT_SOURCE_DIR}/tests/tt_metal/tt_metal/common + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/common +) +set_target_properties( + unit_tests_eth + PROPERTIES + RUNTIME_OUTPUT_DIRECTORY + ${PROJECT_BINARY_DIR}/test/tt_metal +) diff --git a/tests/tt_metal/tt_metal/eth/test_basic_eth.cpp b/tests/tt_metal/tt_metal/eth/test_basic_eth.cpp new file mode 100644 index 00000000000..8a871c3dd87 --- /dev/null +++ b/tests/tt_metal/tt_metal/eth/test_basic_eth.cpp @@ -0,0 +1,454 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "command_queue_fixture.hpp" +#include "device_fixture.hpp" +#include "dispatch_fixture.hpp" +#include "multi_device_fixture.hpp" +#include "tt_metal/detail/tt_metal.hpp" +#include "host_api.hpp" +#include "tt_metal/impl/kernels/kernel.hpp" +#include "tt_metal/test_utils/stimulus.hpp" + +using namespace tt; +using namespace tt::test_utils; +// using namespace tt::test_utils::df; + +namespace { +namespace CMAKE_UNIQUE_NAMESPACE { +constexpr std::int32_t WORD_SIZE = 16; // 16 bytes per eth send packet +constexpr std::int32_t MAX_NUM_WORDS = + (eth_l1_mem::address_map::MAX_L1_LOADING_SIZE - eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE) / WORD_SIZE; +} +} + +namespace unit_tests::erisc::kernels { + + + +/* + * ███╗░░██╗░█████╗░░█████╗░ + * ████╗░██║██╔══██╗██╔══██╗ + * ██╔██╗██║██║░░██║██║░░╚═╝ + * ██║╚████║██║░░██║██║░░██╗ + * ██║░╚███║╚█████╔╝╚█████╔╝ + * ╚═╝░░╚══╝░╚════╝░░╚════╝░ + */ + +bool reader_kernel_no_send( + DispatchFixture* fixture, + tt_metal::Device* device, + const size_t& byte_size, + const size_t& eth_l1_byte_address, + const CoreCoord& eth_reader_core, + const tt_metal::EthernetConfig ðernet_config = tt_metal::EthernetConfig{.noc = tt_metal::NOC::NOC_0}) { + bool pass = true; + //////////////////////////////////////////////////////////////////////////// + // Application Setup + //////////////////////////////////////////////////////////////////////////// + tt_metal::Program program = tt_metal::Program(); + + tt::tt_metal::InterleavedBufferConfig dram_config{ + .device=device, + .size = byte_size, + .page_size = byte_size, + .buffer_type = tt::tt_metal::BufferType::DRAM + }; + + auto input_dram_buffer = CreateBuffer(dram_config); + uint32_t dram_byte_address = input_dram_buffer->address(); + auto dram_noc_xy = input_dram_buffer->noc_coordinates(); + auto eth_noc_xy = device->ethernet_core_from_logical_core(eth_reader_core); + log_debug( + tt::LogTest, + "Device {}: reading {} bytes 
from dram {} addr {} to ethernet core {} addr {}", + device->id(), + byte_size, + dram_noc_xy.str(), + dram_byte_address, + eth_reader_core.str(), + eth_l1_byte_address); + + auto eth_reader_kernel = tt_metal::CreateKernel( + program, + "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/direct_reader_dram_to_l1.cpp", + eth_reader_core, + ethernet_config); + + //////////////////////////////////////////////////////////////////////////// + // Compile and Execute Application + //////////////////////////////////////////////////////////////////////////// + + auto inputs = generate_uniform_random_vector(0, 100, byte_size / sizeof(uint32_t)); + fixture->WriteBuffer(device, input_dram_buffer, inputs); + + // Clear expected value at ethernet L1 address + std::vector all_zeros(inputs.size(), 0); + llrt::write_hex_vec_to_core(device->id(), eth_noc_xy, all_zeros, eth_l1_byte_address); + + tt_metal::SetRuntimeArgs( + program, + eth_reader_kernel, + eth_reader_core, + { + (uint32_t)dram_byte_address, + (uint32_t)dram_noc_xy.x, + (uint32_t)dram_noc_xy.y, + (uint32_t)byte_size, + (uint32_t)eth_l1_byte_address, + }); + + fixture->RunProgram(device, program); + + auto readback_vec = llrt::read_hex_vec_from_core(device->id(), eth_noc_xy, eth_l1_byte_address, byte_size); + pass &= (readback_vec == inputs); + if (not pass) { + std::cout << "Mismatch at Core: " << eth_noc_xy.str() << std::endl; + } + return pass; +} + +bool writer_kernel_no_receive( + DispatchFixture* fixture, + tt_metal::Device* device, + const size_t& byte_size, + const size_t& eth_l1_byte_address, + const CoreCoord& eth_writer_core, + const tt_metal::EthernetConfig ðernet_config = tt_metal::EthernetConfig{.noc = tt_metal::NOC::NOC_0}) { + bool pass = true; + //////////////////////////////////////////////////////////////////////////// + // Application Setup + //////////////////////////////////////////////////////////////////////////// + tt_metal::Program program = tt_metal::Program(); + + 
tt::tt_metal::InterleavedBufferConfig dram_config{ + .device=device, + .size = byte_size, + .page_size = byte_size, + .buffer_type = tt::tt_metal::BufferType::DRAM + }; + + auto output_dram_buffer = CreateBuffer(dram_config); + uint32_t dram_byte_address = output_dram_buffer->address(); + auto dram_noc_xy = output_dram_buffer->noc_coordinates(); + auto eth_noc_xy = device->ethernet_core_from_logical_core(eth_writer_core); + log_debug( + tt::LogTest, + "Device {}: writing {} bytes from ethernet core {} addr {} to dram {} addr {}", + device->id(), + byte_size, + eth_writer_core.str(), + eth_l1_byte_address, + dram_noc_xy.str(), + dram_byte_address); + + auto eth_writer_kernel = tt_metal::CreateKernel( + program, + "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/direct_writer_l1_to_dram.cpp", + eth_writer_core, + ethernet_config); + + //////////////////////////////////////////////////////////////////////////// + // Compile and Execute Application + //////////////////////////////////////////////////////////////////////////// + + auto inputs = generate_uniform_random_vector(0, 100, byte_size / sizeof(uint32_t)); + llrt::write_hex_vec_to_core(device->id(), eth_noc_xy, inputs, eth_l1_byte_address); + + // Clear expected value at ethernet L1 address + std::vector all_zeros(inputs.size(), 0); + fixture->WriteBuffer(device, output_dram_buffer, all_zeros); + + tt_metal::SetRuntimeArgs( + program, + eth_writer_kernel, + eth_writer_core, + { + (uint32_t)dram_byte_address, + (uint32_t)dram_noc_xy.x, + (uint32_t)dram_noc_xy.y, + (uint32_t)byte_size, + (uint32_t)eth_l1_byte_address, + }); + + fixture->RunProgram(device, program); + + auto readback_vec = llrt::read_hex_vec_from_core(device->id(), dram_noc_xy, dram_byte_address, byte_size); + pass &= (readback_vec == inputs); + if (not pass) { + std::cout << "Mismatch at Core: " << dram_noc_xy.str() << std::endl; + } + return pass; +} + +bool noc_reader_and_writer_kernels( + tt_metal::Device *device, + const uint32_t 
byte_size, + const uint32_t eth_dst_l1_address, + const uint32_t eth_src_l1_address, + const CoreCoord &logical_eth_core, + const tt_metal::EthernetConfig &reader_eth_config, + const tt_metal::EthernetConfig &writer_eth_config) { + bool pass = true; + + tt_metal::Program program = tt_metal::Program(); + + tt_metal::InterleavedBufferConfig dram_config{ + .device=device, + .size = byte_size, + .page_size = byte_size, + .buffer_type = tt_metal::BufferType::DRAM + }; + + auto reader_dram_buffer = CreateBuffer(dram_config); + auto writer_dram_buffer = CreateBuffer(dram_config); + + auto reader_dram_noc_xy = reader_dram_buffer->noc_coordinates(); + auto writer_dram_noc_xy = writer_dram_buffer->noc_coordinates(); + + log_debug( + tt::LogTest, + "Device {}: reading {} bytes from dram {} addr {} to ethernet core {} addr {}", + device->id(), + byte_size, + reader_dram_noc_xy.str(), + reader_dram_buffer->address(), + logical_eth_core.str(), + eth_dst_l1_address); + log_debug( + tt::LogTest, + "Device {}: writing {} bytes from ethernet core {} addr {} to dram {} addr {}", + device->id(), + byte_size, + logical_eth_core.str(), + eth_src_l1_address, + writer_dram_noc_xy.str(), + writer_dram_buffer->address()); + + auto eth_noc_xy = device->ethernet_core_from_logical_core(logical_eth_core); + + auto eth_reader_kernel = tt_metal::CreateKernel( + program, + "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/direct_reader_dram_to_l1.cpp", + logical_eth_core, + reader_eth_config); + + tt_metal::SetRuntimeArgs( + program, + eth_reader_kernel, + logical_eth_core, + { + (uint32_t)reader_dram_buffer->address(), + (uint32_t)reader_dram_noc_xy.x, + (uint32_t)reader_dram_noc_xy.y, + (uint32_t)byte_size, + (uint32_t)eth_dst_l1_address, + }); + + auto eth_writer_kernel = tt_metal::CreateKernel( + program, + "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/direct_writer_l1_to_dram.cpp", + logical_eth_core, + writer_eth_config); + + tt_metal::SetRuntimeArgs( + 
program, + eth_writer_kernel, + logical_eth_core, + { + (uint32_t)writer_dram_buffer->address(), + (uint32_t)writer_dram_noc_xy.x, + (uint32_t)writer_dram_noc_xy.y, + (uint32_t)byte_size, + (uint32_t)eth_src_l1_address, + }); + + auto reader_inputs = generate_uniform_random_vector(0, 100, byte_size / sizeof(uint32_t)); + tt_metal::detail::WriteToBuffer(reader_dram_buffer, reader_inputs); + + auto writer_inputs = generate_uniform_random_vector(0, 100, byte_size / sizeof(uint32_t)); + llrt::write_hex_vec_to_core(device->id(), eth_noc_xy, writer_inputs, eth_src_l1_address); + + // Clear expected values at output locations + std::vector all_zeros(byte_size / sizeof(uint32_t), 0); + llrt::write_hex_vec_to_core(device->id(), eth_noc_xy, all_zeros, eth_dst_l1_address); + tt_metal::detail::WriteToBuffer(writer_dram_buffer, all_zeros); + + tt_metal::detail::LaunchProgram(device, program); + + auto eth_readback_vec = llrt::read_hex_vec_from_core(device->id(), eth_noc_xy, eth_dst_l1_address, byte_size); + pass &= (eth_readback_vec == reader_inputs); + if (not pass) { + log_info(tt::LogTest, "Mismatch at eth core: {}, eth kernel read incorrect values from DRAM", logical_eth_core.str()); + } + std::vector dram_readback_vec; + tt_metal::detail::ReadFromBuffer(writer_dram_buffer, dram_readback_vec); + pass &= (dram_readback_vec == writer_inputs); + if (not pass) { + log_info(tt::LogTest, "Mismatch at eth core: {}, eth kernel wrote incorrect values to DRAM", logical_eth_core.str()); + } + + return pass; +} + +} // namespace unit_tests::erisc::kernels + +TEST_F(CommandQueueSingleCardProgramFixture, ActiveEthKernelsNocReadNoSend) { + using namespace CMAKE_UNIQUE_NAMESPACE; + const size_t src_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; + + for (const auto& device : devices_) { + for (const auto& eth_core : device->get_active_ethernet_cores(true)) { + ASSERT_TRUE(unit_tests::erisc::kernels::reader_kernel_no_send( + static_cast(this), device, WORD_SIZE, 
src_eth_l1_byte_address, eth_core)); + ASSERT_TRUE(unit_tests::erisc::kernels::reader_kernel_no_send( + static_cast(this), device, WORD_SIZE * 1024, src_eth_l1_byte_address, eth_core)); + ASSERT_TRUE(unit_tests::erisc::kernels::reader_kernel_no_send( + static_cast(this), device, WORD_SIZE * 2048, src_eth_l1_byte_address, eth_core)); + } + } +} + +TEST_F(CommandQueueSingleCardProgramFixture, ActiveEthKernelsNocWriteNoReceive) { + using namespace CMAKE_UNIQUE_NAMESPACE; + const size_t src_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; + + for (const auto& device : devices_) { + for (const auto& eth_core : device->get_active_ethernet_cores(true)) { + ASSERT_TRUE(unit_tests::erisc::kernels::writer_kernel_no_receive( + static_cast(this), device, WORD_SIZE, src_eth_l1_byte_address, eth_core)); + ASSERT_TRUE(unit_tests::erisc::kernels::writer_kernel_no_receive( + static_cast(this), device, WORD_SIZE * 1024, src_eth_l1_byte_address, eth_core)); + ASSERT_TRUE(unit_tests::erisc::kernels::writer_kernel_no_receive( + static_cast(this), device, WORD_SIZE * 2048, src_eth_l1_byte_address, eth_core)); + } + } +} + +TEST_F(N300DeviceFixture, ActiveEthKernelsNocReadNoSend) { + using namespace CMAKE_UNIQUE_NAMESPACE; + GTEST_SKIP(); + const auto& device_0 = devices_.at(0); + const auto& device_1 = devices_.at(1); + + const size_t src_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; + + for (const auto& eth_core : device_0->get_active_ethernet_cores(true)) { + ASSERT_TRUE(unit_tests::erisc::kernels::reader_kernel_no_send( + static_cast(this), device_0, WORD_SIZE, src_eth_l1_byte_address, eth_core)); + ASSERT_TRUE(unit_tests::erisc::kernels::reader_kernel_no_send( + static_cast(this), device_0, WORD_SIZE * 1024, src_eth_l1_byte_address, eth_core)); + ASSERT_TRUE(unit_tests::erisc::kernels::reader_kernel_no_send( + static_cast(this), device_0, WORD_SIZE * 2048, src_eth_l1_byte_address, eth_core)); + } + + for (const auto& eth_core : 
device_1->get_active_ethernet_cores(true)) { + ASSERT_TRUE(unit_tests::erisc::kernels::reader_kernel_no_send( + static_cast(this), device_1, WORD_SIZE, src_eth_l1_byte_address, eth_core)); + ASSERT_TRUE(unit_tests::erisc::kernels::reader_kernel_no_send( + static_cast(this), device_1, WORD_SIZE * 1024, src_eth_l1_byte_address, eth_core)); + ASSERT_TRUE(unit_tests::erisc::kernels::reader_kernel_no_send( + static_cast(this), device_1, WORD_SIZE * 2048, src_eth_l1_byte_address, eth_core)); + } +} + +TEST_F(N300DeviceFixture, ActiveEthKernelsNocWriteNoReceive) { + using namespace CMAKE_UNIQUE_NAMESPACE; + GTEST_SKIP(); + const auto& device_0 = devices_.at(0); + const auto& device_1 = devices_.at(1); + + const size_t src_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; + + for (const auto& eth_core : device_0->get_active_ethernet_cores(true)) { + ASSERT_TRUE(unit_tests::erisc::kernels::writer_kernel_no_receive( + static_cast(this), device_0, WORD_SIZE, src_eth_l1_byte_address, eth_core)); + ASSERT_TRUE(unit_tests::erisc::kernels::writer_kernel_no_receive( + static_cast(this), device_0, WORD_SIZE * 1024, src_eth_l1_byte_address, eth_core)); + ASSERT_TRUE(unit_tests::erisc::kernels::writer_kernel_no_receive( + static_cast(this), device_0, WORD_SIZE * 2048, src_eth_l1_byte_address, eth_core)); + } + + for (const auto& eth_core : device_1->get_active_ethernet_cores(true)) { + ASSERT_TRUE(unit_tests::erisc::kernels::writer_kernel_no_receive( + static_cast(this), device_1, WORD_SIZE, src_eth_l1_byte_address, eth_core)); + ASSERT_TRUE(unit_tests::erisc::kernels::writer_kernel_no_receive( + static_cast(this), device_1, WORD_SIZE * 1024, src_eth_l1_byte_address, eth_core)); + ASSERT_TRUE(unit_tests::erisc::kernels::writer_kernel_no_receive( + static_cast(this), device_1, WORD_SIZE * 2048, src_eth_l1_byte_address, eth_core)); + } +} + +/* + * + * ███████╗████████╗██╗░░██╗ + * ██╔════╝╚══██╔══╝██║░░██║ + * █████╗░░░░░██║░░░███████║ + * 
██╔══╝░░░░░██║░░░██╔══██║ + * ███████╗░░░██║░░░██║░░██║ + * ╚══════╝░░░╚═╝░░░╚═╝░░╚═╝ + */ + + + + + +// TODO #14640: Run this on WH when i$ flush issue is addressed +TEST_F(BlackholeSingleCardFixture, IdleEthKernelOnIdleErisc0) { + using namespace CMAKE_UNIQUE_NAMESPACE; + uint32_t eth_l1_address = hal.get_dev_addr(HalProgrammableCoreType::IDLE_ETH, HalL1MemAddrType::UNRESERVED); + tt_metal::EthernetConfig noc0_ethernet_config{.eth_mode = Eth::IDLE, .noc = tt_metal::NOC::NOC_0, .processor = tt_metal::DataMovementProcessor::RISCV_0}; + tt_metal::EthernetConfig noc1_ethernet_config{.eth_mode = Eth::IDLE, .noc = tt_metal::NOC::NOC_1, .processor = tt_metal::DataMovementProcessor::RISCV_0}; + + for (const auto& eth_core : device_->get_inactive_ethernet_cores()) { + ASSERT_TRUE(unit_tests::erisc::kernels::reader_kernel_no_send( + static_cast(this), device_, WORD_SIZE * 2048, eth_l1_address, eth_core, noc0_ethernet_config)); + ASSERT_TRUE(unit_tests::erisc::kernels::reader_kernel_no_send( + static_cast(this), device_, WORD_SIZE * 2048, eth_l1_address, eth_core, noc1_ethernet_config)); + ASSERT_TRUE(unit_tests::erisc::kernels::writer_kernel_no_receive( + static_cast(this), device_, WORD_SIZE * 2048, eth_l1_address, eth_core, noc0_ethernet_config)); + ASSERT_TRUE(unit_tests::erisc::kernels::writer_kernel_no_receive( + static_cast(this), device_, WORD_SIZE * 2048, eth_l1_address, eth_core, noc1_ethernet_config)); + } +} + +TEST_F(BlackholeSingleCardFixture, IdleEthKernelOnIdleErisc1) { + using namespace CMAKE_UNIQUE_NAMESPACE; + uint32_t eth_l1_address = hal.get_dev_addr(HalProgrammableCoreType::IDLE_ETH, HalL1MemAddrType::UNRESERVED); + tt_metal::EthernetConfig noc0_ethernet_config{.eth_mode = Eth::IDLE, .noc = tt_metal::NOC::NOC_0, .processor = tt_metal::DataMovementProcessor::RISCV_1}; + tt_metal::EthernetConfig noc1_ethernet_config{.eth_mode = Eth::IDLE, .noc = tt_metal::NOC::NOC_1, .processor = tt_metal::DataMovementProcessor::RISCV_1}; + + for (const auto& eth_core : 
device_->get_inactive_ethernet_cores()) { + ASSERT_TRUE(unit_tests::erisc::kernels::reader_kernel_no_send( + static_cast(this), device_, WORD_SIZE * 2048, eth_l1_address, eth_core, noc0_ethernet_config)); + ASSERT_TRUE(unit_tests::erisc::kernels::reader_kernel_no_send( + static_cast(this), device_, WORD_SIZE * 2048, eth_l1_address, eth_core, noc1_ethernet_config)); + ASSERT_TRUE(unit_tests::erisc::kernels::writer_kernel_no_receive( + static_cast(this), device_, WORD_SIZE * 2048, eth_l1_address, eth_core, noc0_ethernet_config)); + ASSERT_TRUE(unit_tests::erisc::kernels::writer_kernel_no_receive( + static_cast(this), device_, WORD_SIZE * 2048, eth_l1_address, eth_core, noc1_ethernet_config)); + } +} + +TEST_F(BlackholeSingleCardFixture, IdleEthKernelOnBothIdleEriscs) { + using namespace CMAKE_UNIQUE_NAMESPACE; + uint32_t read_write_size_bytes = WORD_SIZE * 2048; + uint32_t reader_dst_address = hal.get_dev_addr(HalProgrammableCoreType::IDLE_ETH, HalL1MemAddrType::UNRESERVED); + uint32_t writer_src_address = reader_dst_address + read_write_size_bytes; + tt_metal::EthernetConfig erisc0_ethernet_config{.eth_mode = Eth::IDLE, .noc = tt_metal::NOC::NOC_0, .processor = tt_metal::DataMovementProcessor::RISCV_0}; + tt_metal::EthernetConfig erisc1_ethernet_config{.eth_mode = Eth::IDLE, .noc = tt_metal::NOC::NOC_0, .processor = tt_metal::DataMovementProcessor::RISCV_1}; + + for (const auto& eth_core : device_->get_inactive_ethernet_cores()) { + ASSERT_TRUE(unit_tests::erisc::kernels::noc_reader_and_writer_kernels( + device_, read_write_size_bytes, reader_dst_address, writer_src_address, eth_core, erisc0_ethernet_config, erisc1_ethernet_config + )); + erisc0_ethernet_config.noc = tt_metal::NOC::NOC_1; + erisc1_ethernet_config.noc = tt_metal::NOC::NOC_1; + ASSERT_TRUE(unit_tests::erisc::kernels::noc_reader_and_writer_kernels( + device_, read_write_size_bytes, reader_dst_address, writer_src_address, eth_core, erisc0_ethernet_config, erisc1_ethernet_config + )); + } +} diff --git 
a/tests/tt_metal/tt_metal/unit_tests/ethernet/buffer_movement_kernels.cpp b/tests/tt_metal/tt_metal/eth/test_buffer_movement_kernels.cpp similarity index 62% rename from tests/tt_metal/tt_metal/unit_tests/ethernet/buffer_movement_kernels.cpp rename to tests/tt_metal/tt_metal/eth/test_buffer_movement_kernels.cpp index dd9c95aab8f..746912b90aa 100644 --- a/tests/tt_metal/tt_metal/unit_tests/ethernet/buffer_movement_kernels.cpp +++ b/tests/tt_metal/tt_metal/eth/test_buffer_movement_kernels.cpp @@ -3,26 +3,23 @@ // SPDX-License-Identifier: Apache-2.0 #include - -#include -#include -#include +#include #include "device_fixture.hpp" -#include "n300_device_fixture.hpp" +#include "command_queue_fixture.hpp" +#include "dispatch_fixture.hpp" +#include "multi_device_fixture.hpp" #include "tt_metal/common/math.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/host_api.hpp" #include "tt_metal/impl/kernels/kernel.hpp" -#include "tt_metal/test_utils/comparison.hpp" -#include "tt_metal/test_utils/df/df.hpp" -#include "tt_metal/test_utils/print_helpers.hpp" #include "tt_metal/test_utils/stimulus.hpp" using namespace tt; using namespace tt::test_utils; -using namespace tt::test_utils::df; +namespace { +namespace CMAKE_UNIQUE_NAMESPACE { constexpr std::int32_t MAX_BUFFER_SIZE = (eth_l1_mem::address_map::MAX_L1_LOADING_SIZE - eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE); @@ -30,14 +27,17 @@ struct BankedConfig { size_t num_pages = 1; size_t size_bytes = 1 * 2 * 32 * 32; size_t page_size_bytes = 2 * 32 * 32; - BufferType input_buffer_type = BufferType::L1; - BufferType output_buffer_type = BufferType::L1; + tt_metal::BufferType input_buffer_type = tt_metal::BufferType::L1; + tt_metal::BufferType output_buffer_type = tt_metal::BufferType::L1; tt::DataFormat l1_data_format = tt::DataFormat::Float16_b; }; +} // namespace CMAKE_UNIQUE_NAMESPACE +} namespace unit_tests::erisc::kernels { bool chip_to_chip_dram_buffer_transfer( + DispatchFixture* fixture, tt_metal::Device* 
sender_device, tt_metal::Device* receiver_device, const CoreCoord& eth_sender_core, @@ -86,7 +86,7 @@ bool chip_to_chip_dram_buffer_transfer( // Generate inputs auto inputs = generate_uniform_random_vector(0, 100, byte_size / sizeof(uint32_t)); - tt_metal::detail::WriteToBuffer(input_dram_buffer, inputs); + fixture->WriteBuffer(sender_device, input_dram_buffer, inputs); const uint32_t MAX_BUFFER = (eth_l1_mem::address_map::MAX_L1_LOADING_SIZE - eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE); @@ -95,7 +95,7 @@ bool chip_to_chip_dram_buffer_transfer( // Clear expected value at ethernet L1 address std::vector all_zeros(inputs.size(), 0); - tt_metal::detail::WriteToBuffer(output_dram_buffer, all_zeros); + fixture->WriteBuffer(receiver_device, output_dram_buffer, all_zeros); //////////////////////////////////////////////////////////////////////////// // Sender Device @@ -148,14 +148,26 @@ bool chip_to_chip_dram_buffer_transfer( //////////////////////////////////////////////////////////////////////////// // Execute Programs //////////////////////////////////////////////////////////////////////////// + std::thread t1; + std::thread t2; + if (fixture->IsSlowDispatch()) { + t1 = std::thread([&]() { fixture->RunProgram(sender_device, sender_program); }); + t2 = std::thread([&]() { fixture->RunProgram(receiver_device, receiver_program); }); + } else { + fixture->RunProgram(sender_device, sender_program, true); + fixture->RunProgram(receiver_device, receiver_program, true); + } - std::thread th1 = std::thread([&] { tt_metal::detail::LaunchProgram(sender_device, sender_program); }); - std::thread th2 = std::thread([&] { tt_metal::detail::LaunchProgram(receiver_device, receiver_program); }); + fixture->FinishCommands(sender_device); + fixture->FinishCommands(receiver_device); + + if (fixture->IsSlowDispatch()) { + t1.join(); + t2.join(); + } - th1.join(); - th2.join(); std::vector dest_dram_data; - tt_metal::detail::ReadFromBuffer(output_dram_buffer, dest_dram_data); + 
fixture->ReadBuffer(receiver_device, output_dram_buffer, dest_dram_data); pass &= (dest_dram_data == inputs); if (not pass) { std::cout << "Mismatch at Core: " << output_dram_noc_xy.str() << std::endl; @@ -165,15 +177,15 @@ bool chip_to_chip_dram_buffer_transfer( } bool chip_to_chip_interleaved_buffer_transfer( + DispatchFixture* fixture, tt_metal::Device* sender_device, tt_metal::Device* receiver_device, const CoreCoord& eth_sender_core, const CoreCoord& eth_receiver_core, - const BankedConfig& cfg, + const CMAKE_UNIQUE_NAMESPACE::BankedConfig& cfg, const uint32_t& max_transfer_size) { bool pass = true; - const uint32_t input0_cb_index = 0; const uint32_t output_cb_index = 16; @@ -206,7 +218,7 @@ bool chip_to_chip_interleaved_buffer_transfer( auto input_buffer = CreateBuffer(sender_config); bool input_is_dram = cfg.input_buffer_type == BufferType::DRAM; - tt_metal::detail::WriteToBuffer(input_buffer, input_packed); + fixture->WriteBuffer(sender_device, input_buffer, input_packed); const uint32_t max_buffer = round_down(max_transfer_size, cfg.page_size_bytes); uint32_t pages_per_loop = max_buffer / cfg.page_size_bytes; @@ -242,6 +254,7 @@ bool chip_to_chip_interleaved_buffer_transfer( std::vector all_zeros(cfg.size_bytes / sizeof(uint32_t), 0); tt_metal::detail::WriteToBuffer(output_buffer, all_zeros); + fixture->WriteBuffer(receiver_device, output_buffer, all_zeros); auto eth_receiver_kernel = tt_metal::CreateKernel( receiver_program, @@ -266,21 +279,34 @@ bool chip_to_chip_interleaved_buffer_transfer( //////////////////////////////////////////////////////////////////////////// // Execute Programs //////////////////////////////////////////////////////////////////////////// + std::thread t1; + std::thread t2; + if (fixture->IsSlowDispatch()) { + t1 = std::thread([&]() { fixture->RunProgram(sender_device, sender_program); }); + t2 = std::thread([&]() { fixture->RunProgram(receiver_device, receiver_program); }); + } else { + fixture->RunProgram(sender_device, 
sender_program, true); + fixture->RunProgram(receiver_device, receiver_program, true); + } + + fixture->FinishCommands(sender_device); + fixture->FinishCommands(receiver_device); - std::thread th1 = std::thread([&] { tt_metal::detail::LaunchProgram(sender_device, sender_program); }); - std::thread th2 = std::thread([&] { tt_metal::detail::LaunchProgram(receiver_device, receiver_program); }); + if (fixture->IsSlowDispatch()) { + t1.join(); + t2.join(); + } - th1.join(); - th2.join(); std::vector dest_buffer_data; tt_metal::detail::ReadFromBuffer(output_buffer, dest_buffer_data); + fixture->ReadBuffer(receiver_device, output_buffer, dest_buffer_data); pass &= input_packed == dest_buffer_data; return pass; } } // namespace unit_tests::erisc::kernels -TEST_F(N300DeviceFixture, EthKernelsSendDramBufferChip0ToChip1) { +TEST_F(N300DeviceFixture, ActiveEthKernelsSendDramBufferChip0ToChip1) { const auto& sender_device = devices_.at(0); const auto& receiver_device = devices_.at(1); @@ -288,17 +314,17 @@ TEST_F(N300DeviceFixture, EthKernelsSendDramBufferChip0ToChip1) { CoreCoord receiver_eth_core = std::get<1>(sender_device->get_connected_ethernet_core(sender_eth_core)); ASSERT_TRUE(unit_tests::erisc::kernels::chip_to_chip_dram_buffer_transfer( - sender_device, receiver_device, sender_eth_core, receiver_eth_core, 16)); + static_cast(this), sender_device, receiver_device, sender_eth_core, receiver_eth_core, 16)); ASSERT_TRUE(unit_tests::erisc::kernels::chip_to_chip_dram_buffer_transfer( - sender_device, receiver_device, sender_eth_core, receiver_eth_core, 1024)); + static_cast(this), sender_device, receiver_device, sender_eth_core, receiver_eth_core, 1024)); ASSERT_TRUE(unit_tests::erisc::kernels::chip_to_chip_dram_buffer_transfer( - sender_device, receiver_device, sender_eth_core, receiver_eth_core, 16 * 1024)); + static_cast(this), sender_device, receiver_device, sender_eth_core, receiver_eth_core, 16 * 1024)); 
ASSERT_TRUE(unit_tests::erisc::kernels::chip_to_chip_dram_buffer_transfer( - sender_device, receiver_device, sender_eth_core, receiver_eth_core, 1000 * 1024)); + static_cast(this), sender_device, receiver_device, sender_eth_core, receiver_eth_core, 1000 * 1024)); } } -TEST_F(N300DeviceFixture, EthKernelsSendDramBufferChip1ToChip0) { +TEST_F(N300DeviceFixture, ActiveEthKernelsSendDramBufferChip1ToChip0) { const auto& sender_device = devices_.at(1); const auto& receiver_device = devices_.at(0); @@ -306,17 +332,18 @@ TEST_F(N300DeviceFixture, EthKernelsSendDramBufferChip1ToChip0) { CoreCoord receiver_eth_core = std::get<1>(sender_device->get_connected_ethernet_core(sender_eth_core)); ASSERT_TRUE(unit_tests::erisc::kernels::chip_to_chip_dram_buffer_transfer( - sender_device, receiver_device, sender_eth_core, receiver_eth_core, 16)); + static_cast(this), sender_device, receiver_device, sender_eth_core, receiver_eth_core, 16)); ASSERT_TRUE(unit_tests::erisc::kernels::chip_to_chip_dram_buffer_transfer( - sender_device, receiver_device, sender_eth_core, receiver_eth_core, 1024)); + static_cast(this), sender_device, receiver_device, sender_eth_core, receiver_eth_core, 1024)); ASSERT_TRUE(unit_tests::erisc::kernels::chip_to_chip_dram_buffer_transfer( - sender_device, receiver_device, sender_eth_core, receiver_eth_core, 16 * 1024)); + static_cast(this), sender_device, receiver_device, sender_eth_core, receiver_eth_core, 16 * 1024)); ASSERT_TRUE(unit_tests::erisc::kernels::chip_to_chip_dram_buffer_transfer( - sender_device, receiver_device, sender_eth_core, receiver_eth_core, 1000 * 1024)); + static_cast(this), sender_device, receiver_device, sender_eth_core, receiver_eth_core, 1000 * 1024)); } } -TEST_F(N300DeviceFixture, EthKernelsSendInterleavedBufferChip0ToChip1) { +TEST_F(N300DeviceFixture, ActiveEthKernelsSendInterleavedBufferChip0ToChip1) { + using namespace CMAKE_UNIQUE_NAMESPACE; GTEST_SKIP(); const auto& sender_device = devices_.at(0); const auto& receiver_device = 
devices_.at(1); @@ -333,6 +360,7 @@ TEST_F(N300DeviceFixture, EthKernelsSendInterleavedBufferChip0ToChip1) { receiver_eth_core.str()); BankedConfig test_config; ASSERT_TRUE(unit_tests::erisc::kernels::chip_to_chip_interleaved_buffer_transfer( + static_cast(this), sender_device, receiver_device, sender_eth_core, @@ -342,6 +370,7 @@ TEST_F(N300DeviceFixture, EthKernelsSendInterleavedBufferChip0ToChip1) { test_config = BankedConfig{.num_pages = 200, .size_bytes = 200 * 2 * 32 * 32, .page_size_bytes = 2 * 32 * 32}; ASSERT_TRUE(unit_tests::erisc::kernels::chip_to_chip_interleaved_buffer_transfer( + static_cast(this), sender_device, receiver_device, sender_eth_core, @@ -349,7 +378,7 @@ TEST_F(N300DeviceFixture, EthKernelsSendInterleavedBufferChip0ToChip1) { test_config, test_config.page_size_bytes)); ASSERT_TRUE(unit_tests::erisc::kernels::chip_to_chip_interleaved_buffer_transfer( - sender_device, receiver_device, sender_eth_core, receiver_eth_core, test_config, MAX_BUFFER_SIZE)); + static_cast(this), sender_device, receiver_device, sender_eth_core, receiver_eth_core, test_config, MAX_BUFFER_SIZE)); test_config = BankedConfig{ .num_pages = 200, .size_bytes = 200 * 2 * 32 * 32, @@ -357,6 +386,7 @@ TEST_F(N300DeviceFixture, EthKernelsSendInterleavedBufferChip0ToChip1) { .input_buffer_type = BufferType::DRAM, .output_buffer_type = BufferType::DRAM}; ASSERT_TRUE(unit_tests::erisc::kernels::chip_to_chip_interleaved_buffer_transfer( + static_cast(this), sender_device, receiver_device, sender_eth_core, @@ -364,11 +394,12 @@ TEST_F(N300DeviceFixture, EthKernelsSendInterleavedBufferChip0ToChip1) { test_config, test_config.page_size_bytes)); ASSERT_TRUE(unit_tests::erisc::kernels::chip_to_chip_interleaved_buffer_transfer( - sender_device, receiver_device, sender_eth_core, receiver_eth_core, test_config, MAX_BUFFER_SIZE)); + static_cast(this), sender_device, receiver_device, sender_eth_core, receiver_eth_core, test_config, MAX_BUFFER_SIZE)); } } -TEST_F(DeviceFixture, 
EthKernelsSendInterleavedBufferAllConnectedChips) { +TEST_F(DeviceFixture, ActiveEthKernelsSendInterleavedBufferAllConnectedChips) { + using namespace CMAKE_UNIQUE_NAMESPACE; for (const auto& sender_device : devices_) { for (const auto& receiver_device : devices_) { if (sender_device->id() == receiver_device->id()) { @@ -395,6 +426,97 @@ TEST_F(DeviceFixture, EthKernelsSendInterleavedBufferAllConnectedChips) { .output_buffer_type = BufferType::DRAM}; ASSERT_TRUE(unit_tests::erisc::kernels::chip_to_chip_interleaved_buffer_transfer( + static_cast(this), + sender_device, + receiver_device, + sender_eth_core, + receiver_eth_core, + test_config, + test_config.page_size_bytes)); + ASSERT_TRUE(unit_tests::erisc::kernels::chip_to_chip_interleaved_buffer_transfer( + static_cast(this), sender_device, receiver_device, sender_eth_core, receiver_eth_core, test_config, MAX_BUFFER_SIZE)); + test_config = BankedConfig{ + .num_pages = 200, + .size_bytes = 200 * 2 * 32 * 32, + .page_size_bytes = 2 * 32 * 32, + .input_buffer_type = BufferType::DRAM, + .output_buffer_type = BufferType::L1}; + ASSERT_TRUE(unit_tests::erisc::kernels::chip_to_chip_interleaved_buffer_transfer( + static_cast(this), + sender_device, + receiver_device, + sender_eth_core, + receiver_eth_core, + test_config, + test_config.page_size_bytes)); + ASSERT_TRUE(unit_tests::erisc::kernels::chip_to_chip_interleaved_buffer_transfer( + static_cast(this), sender_device, receiver_device, sender_eth_core, receiver_eth_core, test_config, MAX_BUFFER_SIZE)); + } + } + } +} + +TEST_F(CommandQueueMultiDeviceProgramFixture, ActiveEthKernelsSendDramBufferAllConnectedChips) { + for (const auto& sender_device : devices_) { + for (const auto& receiver_device : devices_) { + if (sender_device->id() >= receiver_device->id()) { + continue; + } + for (const auto& sender_eth_core : sender_device->get_active_ethernet_cores(true)) { + auto [device_id, receiver_eth_core] = sender_device->get_connected_ethernet_core(sender_eth_core); + if 
(receiver_device->id() != device_id) { + continue; + } + log_info( + tt::LogTest, + "Sending dram buffer from device {} to device {}, using eth core {} and {}", + sender_device->id(), + receiver_device->id(), + sender_eth_core.str(), + receiver_eth_core.str()); + + ASSERT_TRUE(unit_tests::erisc::kernels::chip_to_chip_dram_buffer_transfer( + static_cast(this), sender_device, receiver_device, sender_eth_core, receiver_eth_core, 16)); + ASSERT_TRUE(unit_tests::erisc::kernels::chip_to_chip_dram_buffer_transfer( + static_cast(this), sender_device, receiver_device, sender_eth_core, receiver_eth_core, 1024)); + ASSERT_TRUE(unit_tests::erisc::kernels::chip_to_chip_dram_buffer_transfer( + static_cast(this), sender_device, receiver_device, sender_eth_core, receiver_eth_core, 16 * 1024)); + ASSERT_TRUE(unit_tests::erisc::kernels::chip_to_chip_dram_buffer_transfer( + static_cast(this), sender_device, receiver_device, sender_eth_core, receiver_eth_core, 1000 * 1024)); + } + } + } +} + +TEST_F(CommandQueueMultiDeviceProgramFixture, ActiveEthKernelsSendInterleavedBufferAllConnectedChips) { + using namespace CMAKE_UNIQUE_NAMESPACE; + for (const auto& sender_device : devices_) { + for (const auto& receiver_device : devices_) { + if (sender_device->id() >= receiver_device->id()) { + continue; + } + for (const auto& sender_eth_core : sender_device->get_active_ethernet_cores(true)) { + auto [device_id, receiver_eth_core] = sender_device->get_connected_ethernet_core(sender_eth_core); + if (receiver_device->id() != device_id) { + continue; + } + + log_info( + tt::LogTest, + "Sending interleaved buffer from device {} to device {}, using eth core {} and {}", + sender_device->id(), + receiver_device->id(), + sender_eth_core.str(), + receiver_eth_core.str()); + BankedConfig test_config = BankedConfig{ + .num_pages = 200, + .size_bytes = 200 * 2 * 32 * 32, + .page_size_bytes = 2 * 32 * 32, + .input_buffer_type = BufferType::L1, + .output_buffer_type = BufferType::DRAM}; + + 
ASSERT_TRUE(unit_tests::erisc::kernels::chip_to_chip_interleaved_buffer_transfer( + static_cast(this), sender_device, receiver_device, sender_eth_core, @@ -402,7 +524,7 @@ TEST_F(DeviceFixture, EthKernelsSendInterleavedBufferAllConnectedChips) { test_config, test_config.page_size_bytes)); ASSERT_TRUE(unit_tests::erisc::kernels::chip_to_chip_interleaved_buffer_transfer( - sender_device, receiver_device, sender_eth_core, receiver_eth_core, test_config, MAX_BUFFER_SIZE)); + static_cast(this),sender_device, receiver_device, sender_eth_core, receiver_eth_core, test_config, MAX_BUFFER_SIZE)); test_config = BankedConfig{ .num_pages = 200, .size_bytes = 200 * 2 * 32 * 32, @@ -410,6 +532,7 @@ TEST_F(DeviceFixture, EthKernelsSendInterleavedBufferAllConnectedChips) { .input_buffer_type = BufferType::DRAM, .output_buffer_type = BufferType::L1}; ASSERT_TRUE(unit_tests::erisc::kernels::chip_to_chip_interleaved_buffer_transfer( + static_cast(this), sender_device, receiver_device, sender_eth_core, @@ -417,7 +540,7 @@ TEST_F(DeviceFixture, EthKernelsSendInterleavedBufferAllConnectedChips) { test_config, test_config.page_size_bytes)); ASSERT_TRUE(unit_tests::erisc::kernels::chip_to_chip_interleaved_buffer_transfer( - sender_device, receiver_device, sender_eth_core, receiver_eth_core, test_config, MAX_BUFFER_SIZE)); + static_cast(this), sender_device, receiver_device, sender_eth_core, receiver_eth_core, test_config, MAX_BUFFER_SIZE)); } } } diff --git a/tests/tt_metal/tt_metal/eth/test_erisc_app_direct_send.cpp b/tests/tt_metal/tt_metal/eth/test_erisc_app_direct_send.cpp new file mode 100644 index 00000000000..84114813967 --- /dev/null +++ b/tests/tt_metal/tt_metal/eth/test_erisc_app_direct_send.cpp @@ -0,0 +1,835 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include +#include +#include + +#include "device_fixture.hpp" +#include "dispatch_fixture.hpp" +#include "multi_device_fixture.hpp" +#include "command_queue_fixture.hpp" +#include "tt_metal/common/logger.hpp" +#include "tt_metal/detail/tt_metal.hpp" +#include "tt_metal/host_api.hpp" +#include "tt_metal/test_utils/stimulus.hpp" + +namespace { +namespace CMAKE_UNIQUE_NAMESPACE { +constexpr std::int32_t WORD_SIZE = 16; // 16 bytes per eth send packet +constexpr std::int32_t MAX_NUM_WORDS = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_SIZE / WORD_SIZE; + +struct erisc_info_t { + volatile uint32_t num_bytes; + volatile uint32_t mode; + volatile uint32_t reserved_0_; + volatile uint32_t reserved_1_; + volatile uint32_t bytes_done; + volatile uint32_t reserverd_2_; + volatile uint32_t reserverd_3_; + volatile uint32_t reserverd_4_; +}; +} +} + +using namespace tt; +using namespace tt::test_utils; + +namespace unit_tests::erisc::direct_send { +const size_t get_rand_32_byte_aligned_address(const size_t& base, const size_t& max) { + TT_ASSERT(!(base & 0x1F) and !(max & 0x1F)); + size_t word_size = (max >> 5) - (base >> 5); + return (((rand() % word_size) << 5) + base); +} + +bool eth_direct_sender_receiver_kernels( + DispatchFixture* fixture, + tt_metal::Device* sender_device, + tt_metal::Device* receiver_device, + const size_t& byte_size, + const size_t& src_eth_l1_byte_address, + const size_t& dst_eth_l1_byte_address, + const CoreCoord& eth_sender_core, + const CoreCoord& eth_receiver_core, + uint32_t num_bytes_per_send = 16) { + bool pass = true; + log_debug( + tt::LogTest, + "Sending {} bytes from device {} eth core {} addr {} to device {} eth core {} addr {}", + byte_size, + sender_device->id(), + eth_sender_core.str(), + src_eth_l1_byte_address, + receiver_device->id(), + eth_receiver_core.str(), + dst_eth_l1_byte_address); + // Generate inputs + auto inputs = generate_uniform_random_vector(0, 100, byte_size / 
sizeof(uint32_t)); + llrt::write_hex_vec_to_core( + sender_device->id(), + sender_device->ethernet_core_from_logical_core(eth_sender_core), + inputs, + src_eth_l1_byte_address); + + // Clear expected value at ethernet L1 address + std::vector all_zeros(inputs.size(), 0); + llrt::write_hex_vec_to_core( + receiver_device->id(), + receiver_device->ethernet_core_from_logical_core(eth_receiver_core), + all_zeros, + dst_eth_l1_byte_address); + + //////////////////////////////////////////////////////////////////////////// + // Sender Device + //////////////////////////////////////////////////////////////////////////// + tt_metal::Program sender_program = tt_metal::Program(); + + auto eth_sender_kernel = tt_metal::CreateKernel( + sender_program, + "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/eth_l1_direct_send.cpp", + eth_sender_core, + tt_metal::EthernetConfig{ + .noc = tt_metal::NOC::NOC_0, + .compile_args = {uint32_t(num_bytes_per_send), uint32_t(num_bytes_per_send >> 4)}}); + + tt_metal::SetRuntimeArgs( + sender_program, + eth_sender_kernel, + eth_sender_core, + { + (uint32_t)src_eth_l1_byte_address, + (uint32_t)dst_eth_l1_byte_address, + (uint32_t)byte_size, + }); + + //////////////////////////////////////////////////////////////////////////// + // Receiver Device + //////////////////////////////////////////////////////////////////////////// + tt_metal::Program receiver_program = tt_metal::Program(); + + auto eth_receiver_kernel = tt_metal::CreateKernel( + receiver_program, + "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/eth_l1_direct_receive.cpp", + eth_receiver_core, + tt_metal::EthernetConfig{.noc = tt_metal::NOC::NOC_0}); // probably want to use NOC_1 here + + tt_metal::SetRuntimeArgs( + receiver_program, + eth_receiver_kernel, + eth_receiver_core, + { + (uint32_t)byte_size, + }); + + //////////////////////////////////////////////////////////////////////////// + // Execute Programs + 
//////////////////////////////////////////////////////////////////////////// + std::thread t1; + std::thread t2; + if (fixture->IsSlowDispatch()) { + t1 = std::thread([&]() { fixture->RunProgram(sender_device, sender_program); }); + t2 = std::thread([&]() { fixture->RunProgram(receiver_device, receiver_program); }); + } else { + fixture->RunProgram(sender_device, sender_program, true); + fixture->RunProgram(receiver_device, receiver_program, true); + } + + fixture->FinishCommands(sender_device); + fixture->FinishCommands(receiver_device); + + if (fixture->IsSlowDispatch()) { + t1.join(); + t2.join(); + } + + auto readback_vec = llrt::read_hex_vec_from_core( + receiver_device->id(), + receiver_device->ethernet_core_from_logical_core(eth_receiver_core), + dst_eth_l1_byte_address, + byte_size); + pass &= (readback_vec == inputs); + if (not pass) { + std::cout << "Mismatch at Core: " << eth_receiver_core.str() << std::endl; + std::cout << readback_vec[0] << std::endl; + } + return pass; +} + +// Tests ethernet direct send/receive from ERISC_L1_UNRESERVED_BASE +bool send_over_eth( + tt_metal::Device* sender_device, + tt_metal::Device* receiver_device, + const CoreCoord& sender_core, + const CoreCoord& receiver_core, + const size_t& byte_size) { + tt::log_debug( + tt::LogTest, + "Running direct send test with sender chip {} core {}, receiver chip {} core {}, sending {} bytes", + sender_device->id(), + sender_core.str(), + receiver_device->id(), + receiver_core.str(), + byte_size); + std::vector eth_cores = { + CoreCoord(9, 0), + CoreCoord(1, 0), + CoreCoord(8, 0), + CoreCoord(2, 0), + CoreCoord(9, 6), + CoreCoord(1, 6), + CoreCoord(8, 6), + CoreCoord(2, 6), + CoreCoord(7, 0), + CoreCoord(3, 0), + CoreCoord(6, 0), + CoreCoord(4, 0), + CoreCoord(7, 6), + CoreCoord(3, 6), + CoreCoord(6, 6), + CoreCoord(4, 6)}; + + // Disable all eth core runtime app flags, zero out data write counter + std::vector run_test_app_flag = {0x0}; + for (const auto& eth_core : eth_cores) { + 
llrt::write_hex_vec_to_core( + sender_device->id(), eth_core, run_test_app_flag, eth_l1_mem::address_map::LAUNCH_ERISC_APP_FLAG); + llrt::write_hex_vec_to_core( + receiver_device->id(), eth_core, run_test_app_flag, eth_l1_mem::address_map::LAUNCH_ERISC_APP_FLAG); + std::vector zero = {0, 0, 0, 0, 0, 0, 0, 0}; + llrt::write_hex_vec_to_core( + sender_device->id(), eth_core, zero, eth_l1_mem::address_map::ERISC_APP_SYNC_INFO_BASE); + llrt::write_hex_vec_to_core( + receiver_device->id(), eth_core, zero, eth_l1_mem::address_map::ERISC_APP_SYNC_INFO_BASE); + } + + // TODO: is it possible that receiver core app is stil running when we push inputs here??? + auto inputs = generate_uniform_random_vector(0, 100, byte_size / sizeof(uint32_t)); + llrt::write_hex_vec_to_core( + sender_device->id(), sender_core, inputs, eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE); + + // Zero out receiving address to ensure no stale data is causing tests to pass + std::vector all_zeros(inputs.size(), 0); + llrt::write_hex_vec_to_core( + receiver_device->id(), receiver_core, all_zeros, eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE); + + std::vector args_0 = {uint32_t(byte_size), 0}; + llrt::write_hex_vec_to_core(sender_device->id(), sender_core, args_0, eth_l1_mem::address_map::ERISC_APP_SYNC_INFO_BASE); + std::vector args_1 = {uint32_t(byte_size), 1}; + llrt::write_hex_vec_to_core(receiver_device->id(), receiver_core, args_1, eth_l1_mem::address_map::ERISC_APP_SYNC_INFO_BASE); + + // TODO: this should be updated to use kernel api + uint32_t active_eth_index = hal.get_programmable_core_type_index(HalProgrammableCoreType::ACTIVE_ETH); + ll_api::memory binary_mem_send = llrt::get_risc_binary( + sender_device->build_firmware_target_path(active_eth_index, 0, 0), active_eth_index, 0, 0); + ll_api::memory binary_mem_receive = llrt::get_risc_binary( + receiver_device->build_firmware_target_path(active_eth_index, 0, 0), active_eth_index, 0, 0); + + for (const auto& eth_core : eth_cores) { + 
llrt::write_hex_vec_to_core( + sender_device->id(), eth_core, binary_mem_send.data(), eth_l1_mem::address_map::FIRMWARE_BASE); + llrt::write_hex_vec_to_core( + receiver_device->id(), eth_core, binary_mem_receive.data(), eth_l1_mem::address_map::FIRMWARE_BASE); + } + + // Activate sender core runtime app + run_test_app_flag = {0x1}; + // send remote first, otherwise eth core may be blocked, very ugly for now... + if (receiver_device->id() == 1) { + llrt::write_hex_vec_to_core( + 1, receiver_core, run_test_app_flag, eth_l1_mem::address_map::LAUNCH_ERISC_APP_FLAG); + } else { + llrt::write_hex_vec_to_core(1, sender_core, run_test_app_flag, eth_l1_mem::address_map::LAUNCH_ERISC_APP_FLAG); + } + if (sender_device->id() == 0) { + llrt::write_hex_vec_to_core(0, sender_core, run_test_app_flag, eth_l1_mem::address_map::LAUNCH_ERISC_APP_FLAG); + } else { + llrt::write_hex_vec_to_core( + 0, receiver_core, run_test_app_flag, eth_l1_mem::address_map::LAUNCH_ERISC_APP_FLAG); + } + + bool pass = true; + auto readback_vec = llrt::read_hex_vec_from_core( + receiver_device->id(), receiver_core, eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE, byte_size); + pass &= (readback_vec == inputs); + + return pass; +} + +} // namespace unit_tests::erisc::direct_send + +TEST_F(N300DeviceFixture, ActiveEthSingleCoreDirectSendChip0ToChip1) { + using namespace CMAKE_UNIQUE_NAMESPACE; + GTEST_SKIP(); + const auto& device_0 = devices_.at(0); + const auto& device_1 = devices_.at(1); + CoreCoord sender_core_0 = CoreCoord(9, 6); + CoreCoord sender_core_1 = CoreCoord(1, 6); + + CoreCoord receiver_core_0 = CoreCoord(9, 0); + CoreCoord receiver_core_1 = CoreCoord(1, 0); + + ASSERT_TRUE( + unit_tests::erisc::direct_send::send_over_eth(device_0, device_1, sender_core_0, receiver_core_0, WORD_SIZE)); + ASSERT_TRUE( + unit_tests::erisc::direct_send::send_over_eth(device_0, device_1, sender_core_1, receiver_core_1, WORD_SIZE)); + ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( + device_0, 
device_1, sender_core_0, receiver_core_0, WORD_SIZE * 256)); + ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( + device_0, device_1, sender_core_1, receiver_core_1, WORD_SIZE * 256)); + ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( + device_0, device_1, sender_core_0, receiver_core_0, WORD_SIZE * 1024)); + ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( + device_0, device_1, sender_core_1, receiver_core_1, WORD_SIZE * 1024)); + ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( + device_0, device_1, sender_core_0, receiver_core_0, WORD_SIZE * MAX_NUM_WORDS)); + ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( + device_0, device_1, sender_core_1, receiver_core_1, WORD_SIZE * MAX_NUM_WORDS)); +} + +TEST_F(N300DeviceFixture, ActiveEthSingleCoreDirectSendChip1ToChip0) { + using namespace CMAKE_UNIQUE_NAMESPACE; + GTEST_SKIP(); + const auto& device_0 = devices_.at(0); + const auto& device_1 = devices_.at(1); + CoreCoord sender_core_0 = CoreCoord(9, 0); + CoreCoord sender_core_1 = CoreCoord(1, 0); + + CoreCoord receiver_core_0 = CoreCoord(9, 6); + CoreCoord receiver_core_1 = CoreCoord(1, 6); + + ASSERT_TRUE( + unit_tests::erisc::direct_send::send_over_eth(device_1, device_0, sender_core_0, receiver_core_0, WORD_SIZE)); + ASSERT_TRUE( + unit_tests::erisc::direct_send::send_over_eth(device_1, device_0, sender_core_1, receiver_core_1, WORD_SIZE)); + ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( + device_1, device_0, sender_core_0, receiver_core_0, WORD_SIZE * 256)); + ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( + device_1, device_0, sender_core_1, receiver_core_1, WORD_SIZE * 256)); + ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( + device_1, device_0, sender_core_0, receiver_core_0, WORD_SIZE * 1024)); + ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( + device_1, device_0, sender_core_1, receiver_core_1, WORD_SIZE * 1024)); + 
ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( + device_1, device_0, sender_core_0, receiver_core_0, WORD_SIZE * MAX_NUM_WORDS)); + ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( + device_1, device_0, sender_core_1, receiver_core_1, WORD_SIZE * MAX_NUM_WORDS)); +} + +TEST_F(N300DeviceFixture, ActiveEthBidirectionalCoreDirectSend) { + using namespace CMAKE_UNIQUE_NAMESPACE; + GTEST_SKIP(); + const auto& device_0 = devices_.at(0); + const auto& device_1 = devices_.at(1); + CoreCoord sender_core_0 = CoreCoord(9, 6); + CoreCoord sender_core_1 = CoreCoord(1, 6); + + CoreCoord receiver_core_0 = CoreCoord(9, 0); + CoreCoord receiver_core_1 = CoreCoord(1, 0); + + ASSERT_TRUE( + unit_tests::erisc::direct_send::send_over_eth(device_0, device_1, sender_core_0, receiver_core_0, WORD_SIZE)); + ASSERT_TRUE( + unit_tests::erisc::direct_send::send_over_eth(device_1, device_0, receiver_core_0, sender_core_0, WORD_SIZE)); + ASSERT_TRUE( + unit_tests::erisc::direct_send::send_over_eth(device_0, device_1, sender_core_1, receiver_core_1, WORD_SIZE)); + ASSERT_TRUE( + unit_tests::erisc::direct_send::send_over_eth(device_1, device_0, receiver_core_1, sender_core_1, WORD_SIZE)); + ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( + device_0, device_1, sender_core_0, receiver_core_0, WORD_SIZE * 256)); + ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( + device_1, device_0, receiver_core_0, sender_core_0, WORD_SIZE * 256)); + ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( + device_0, device_1, sender_core_1, receiver_core_1, WORD_SIZE * 256)); + ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( + device_1, device_0, receiver_core_1, sender_core_1, WORD_SIZE * 256)); + ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( + device_0, device_1, sender_core_0, receiver_core_0, WORD_SIZE * 1024)); + ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( + device_1, device_0, receiver_core_0, sender_core_0, WORD_SIZE * 
1024)); + ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( + device_0, device_1, sender_core_1, receiver_core_1, WORD_SIZE * 1024)); + ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( + device_1, device_0, receiver_core_1, sender_core_1, WORD_SIZE * 1024)); + ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( + device_0, device_1, sender_core_0, receiver_core_0, WORD_SIZE * MAX_NUM_WORDS)); + ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( + device_1, device_0, receiver_core_0, sender_core_0, WORD_SIZE * MAX_NUM_WORDS)); + ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( + device_0, device_1, sender_core_1, receiver_core_1, WORD_SIZE * MAX_NUM_WORDS)); + ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( + device_1, device_0, receiver_core_1, sender_core_1, WORD_SIZE * MAX_NUM_WORDS)); +} + +TEST_F(N300DeviceFixture, ActiveEthRandomDirectSendTests) { + using namespace CMAKE_UNIQUE_NAMESPACE; + GTEST_SKIP(); + srand(0); + + std::map, std::pair> connectivity = { + {{0, CoreCoord(9, 6)}, {1, CoreCoord(9, 0)}}, + {{1, CoreCoord(9, 0)}, {0, CoreCoord(9, 6)}}, + {{0, CoreCoord(1, 6)}, {1, CoreCoord(1, 0)}}, + {{1, CoreCoord(1, 0)}, {0, CoreCoord(1, 6)}}}; + for (int i = 0; i < 1000; i++) { + auto it = connectivity.begin(); + std::advance(it, rand() % (connectivity.size())); + + const auto& send_chip = devices_.at(std::get<0>(it->first)); + CoreCoord sender_core = std::get<1>(it->first); + const auto& receiver_chip = devices_.at(std::get<0>(it->second)); + CoreCoord receiver_core = std::get<1>(it->second); + int num_words = 0; + if constexpr (MAX_NUM_WORDS != 0) { + num_words = rand() % MAX_NUM_WORDS + 1; + } + + ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( + send_chip, receiver_chip, sender_core, receiver_core, WORD_SIZE * num_words)); + } +} + +TEST_F(N300DeviceFixture, ActiveEthKernelsDirectSendChip0ToChip1) { + using namespace CMAKE_UNIQUE_NAMESPACE; + GTEST_SKIP(); + const auto& device_0 = 
devices_.at(0); + const auto& device_1 = devices_.at(1); + + const size_t src_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; + const size_t dst_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; + + for (const auto& sender_core : device_0->get_active_ethernet_cores(true)) { + auto [device_id, receiver_core] = device_0->get_connected_ethernet_core(sender_core); + if (device_1->id() != device_id) { + continue; + } + ASSERT_TRUE(unit_tests::erisc::direct_send::eth_direct_sender_receiver_kernels( + static_cast(this), + device_0, + device_1, + WORD_SIZE, + src_eth_l1_byte_address, + dst_eth_l1_byte_address, + sender_core, + receiver_core)); + ASSERT_TRUE(unit_tests::erisc::direct_send::eth_direct_sender_receiver_kernels( + static_cast(this), + device_0, + device_1, + 4 * WORD_SIZE, + src_eth_l1_byte_address, + dst_eth_l1_byte_address, + sender_core, + receiver_core)); + ASSERT_TRUE(unit_tests::erisc::direct_send::eth_direct_sender_receiver_kernels( + static_cast(this), + device_0, + device_1, + 256 * WORD_SIZE, + src_eth_l1_byte_address, + dst_eth_l1_byte_address, + sender_core, + receiver_core)); + ASSERT_TRUE(unit_tests::erisc::direct_send::eth_direct_sender_receiver_kernels( + static_cast(this), + device_0, + device_1, + 1000 * WORD_SIZE, + src_eth_l1_byte_address, + dst_eth_l1_byte_address, + sender_core, + receiver_core)); + } +} + +TEST_F(N300DeviceFixture, ActiveEthKernelsDirectSendChip1ToChip0) { + using namespace CMAKE_UNIQUE_NAMESPACE; + GTEST_SKIP(); + const auto& device_0 = devices_.at(0); + const auto& device_1 = devices_.at(1); + + const size_t src_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; + const size_t dst_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; + + for (const auto& sender_core : device_1->get_active_ethernet_cores(true)) { + auto [device_id, receiver_core] = device_1->get_connected_ethernet_core(sender_core); + if (device_0->id() != device_id) 
{ + continue; + } + ASSERT_TRUE(unit_tests::erisc::direct_send::eth_direct_sender_receiver_kernels( + static_cast(this), + device_1, + device_0, + WORD_SIZE, + src_eth_l1_byte_address, + dst_eth_l1_byte_address, + sender_core, + receiver_core)); + ASSERT_TRUE(unit_tests::erisc::direct_send::eth_direct_sender_receiver_kernels( + static_cast(this), + device_1, + device_0, + 4 * WORD_SIZE, + src_eth_l1_byte_address, + dst_eth_l1_byte_address, + sender_core, + receiver_core)); + ASSERT_TRUE(unit_tests::erisc::direct_send::eth_direct_sender_receiver_kernels( + static_cast(this), + device_1, + device_0, + 256 * WORD_SIZE, + src_eth_l1_byte_address, + dst_eth_l1_byte_address, + sender_core, + receiver_core)); + ASSERT_TRUE(unit_tests::erisc::direct_send::eth_direct_sender_receiver_kernels( + static_cast(this), + device_1, + device_0, + 1000 * WORD_SIZE, + src_eth_l1_byte_address, + dst_eth_l1_byte_address, + sender_core, + receiver_core)); + } +} + +TEST_F(DeviceFixture, ActiveEthKernelsDirectSendAllConnectedChips) { + using namespace CMAKE_UNIQUE_NAMESPACE; + const size_t src_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; + const size_t dst_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; + for (const auto& sender_device : devices_) { + for (const auto& receiver_device : devices_) { + if (sender_device->id() == receiver_device->id()) { + continue; + } + for (const auto& sender_core : sender_device->get_active_ethernet_cores(true)) { + auto [device_id, receiver_core] = sender_device->get_connected_ethernet_core(sender_core); + if (receiver_device->id() != device_id) { + continue; + } + ASSERT_TRUE(unit_tests::erisc::direct_send::eth_direct_sender_receiver_kernels( + static_cast(this), + sender_device, + receiver_device, + WORD_SIZE, + src_eth_l1_byte_address, + dst_eth_l1_byte_address, + sender_core, + receiver_core)); + ASSERT_TRUE(unit_tests::erisc::direct_send::eth_direct_sender_receiver_kernels( + static_cast(this), + 
sender_device, + receiver_device, + 4 * WORD_SIZE, + src_eth_l1_byte_address, + dst_eth_l1_byte_address, + sender_core, + receiver_core)); + ASSERT_TRUE(unit_tests::erisc::direct_send::eth_direct_sender_receiver_kernels( + static_cast(this), + sender_device, + receiver_device, + 256 * WORD_SIZE, + src_eth_l1_byte_address, + dst_eth_l1_byte_address, + sender_core, + receiver_core)); + ASSERT_TRUE(unit_tests::erisc::direct_send::eth_direct_sender_receiver_kernels( + static_cast(this), + sender_device, + receiver_device, + 1000 * WORD_SIZE, + src_eth_l1_byte_address, + dst_eth_l1_byte_address, + sender_core, + receiver_core)); + } + } + } +} + +TEST_F(N300DeviceFixture, ActiveEthKernelsBidirectionalDirectSend) { + using namespace CMAKE_UNIQUE_NAMESPACE; + const auto& device_0 = devices_.at(0); + const auto& device_1 = devices_.at(1); + + const size_t src_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; + const size_t dst_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; + + for (const auto& sender_core : device_0->get_active_ethernet_cores(true)) { + CoreCoord receiver_core = std::get<1>(device_0->get_connected_ethernet_core(sender_core)); + ASSERT_TRUE(unit_tests::erisc::direct_send::eth_direct_sender_receiver_kernels( + static_cast(this), + device_0, + device_1, + WORD_SIZE, + src_eth_l1_byte_address, + dst_eth_l1_byte_address, + sender_core, + receiver_core)); + ASSERT_TRUE(unit_tests::erisc::direct_send::eth_direct_sender_receiver_kernels( + static_cast(this), + device_1, + device_0, + WORD_SIZE, + src_eth_l1_byte_address, + dst_eth_l1_byte_address, + receiver_core, + sender_core)); + } + for (const auto& sender_core : device_0->get_active_ethernet_cores(true)) { + CoreCoord receiver_core = std::get<1>(device_0->get_connected_ethernet_core(sender_core)); + ASSERT_TRUE(unit_tests::erisc::direct_send::eth_direct_sender_receiver_kernels( + static_cast(this), + device_0, + device_1, + WORD_SIZE * 256, + 
src_eth_l1_byte_address, + dst_eth_l1_byte_address, + sender_core, + receiver_core)); + ASSERT_TRUE(unit_tests::erisc::direct_send::eth_direct_sender_receiver_kernels( + static_cast(this), + device_1, + device_0, + WORD_SIZE * 256, + src_eth_l1_byte_address, + dst_eth_l1_byte_address, + receiver_core, + sender_core)); + } + for (const auto& sender_core : device_0->get_active_ethernet_cores(true)) { + CoreCoord receiver_core = std::get<1>(device_0->get_connected_ethernet_core(sender_core)); + ASSERT_TRUE(unit_tests::erisc::direct_send::eth_direct_sender_receiver_kernels( + static_cast(this), + device_0, + device_1, + WORD_SIZE * 1024, + src_eth_l1_byte_address, + dst_eth_l1_byte_address, + sender_core, + receiver_core)); + ASSERT_TRUE(unit_tests::erisc::direct_send::eth_direct_sender_receiver_kernels( + static_cast(this), + device_1, + device_0, + WORD_SIZE * 1024, + src_eth_l1_byte_address, + dst_eth_l1_byte_address, + receiver_core, + sender_core)); + } + for (const auto& sender_core : device_0->get_active_ethernet_cores(true)) { + CoreCoord receiver_core = std::get<1>(device_0->get_connected_ethernet_core(sender_core)); + ASSERT_TRUE(unit_tests::erisc::direct_send::eth_direct_sender_receiver_kernels( + static_cast(this), + device_0, + device_1, + WORD_SIZE * MAX_NUM_WORDS, + src_eth_l1_byte_address, + dst_eth_l1_byte_address, + sender_core, + receiver_core)); + ASSERT_TRUE(unit_tests::erisc::direct_send::eth_direct_sender_receiver_kernels( + static_cast(this), + device_1, + device_0, + WORD_SIZE * MAX_NUM_WORDS, + src_eth_l1_byte_address, + dst_eth_l1_byte_address, + receiver_core, + sender_core)); + } +} + +TEST_F(N300DeviceFixture, ActiveEthKernelsRepeatedDirectSends) { + using namespace CMAKE_UNIQUE_NAMESPACE; + const auto& device_0 = devices_.at(0); + const auto& device_1 = devices_.at(1); + + const size_t src_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; + const size_t dst_eth_l1_byte_address = 
eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; + + for (const auto& sender_core : device_0->get_active_ethernet_cores(true)) { + CoreCoord receiver_core = std::get<1>(device_0->get_connected_ethernet_core(sender_core)); + for (int i = 0; i < 10; i++) { + ASSERT_TRUE(unit_tests::erisc::direct_send::eth_direct_sender_receiver_kernels( + static_cast(this), + device_0, + device_1, + WORD_SIZE, + src_eth_l1_byte_address + WORD_SIZE * i, + dst_eth_l1_byte_address + WORD_SIZE * i, + sender_core, + receiver_core)); + } + for (int i = 0; i < 10; i++) { + ASSERT_TRUE(unit_tests::erisc::direct_send::eth_direct_sender_receiver_kernels( + static_cast(this), + device_1, + device_0, + WORD_SIZE, + src_eth_l1_byte_address + WORD_SIZE * i, + dst_eth_l1_byte_address + WORD_SIZE * i, + receiver_core, + sender_core)); + } + } +} + +TEST_F(N300DeviceFixture, ActiveEthKernelsRandomDirectSendTests) { + using namespace CMAKE_UNIQUE_NAMESPACE; + srand(0); + const auto& device_0 = devices_.at(0); + const auto& device_1 = devices_.at(1); + + std::map, std::tuple> connectivity = {}; + for (const auto& sender_core : device_0->get_active_ethernet_cores(true)) { + const auto& receiver_core = device_0->get_connected_ethernet_core(sender_core); + connectivity.insert({{0, sender_core}, receiver_core}); + } + for (const auto& sender_core : device_1->get_active_ethernet_cores(true)) { + const auto& receiver_core = device_1->get_connected_ethernet_core(sender_core); + connectivity.insert({{1, sender_core}, receiver_core}); + } + for (int i = 0; i < 1000; i++) { + auto it = connectivity.begin(); + std::advance(it, rand() % (connectivity.size())); + + const auto& send_chip = devices_.at(std::get<0>(it->first)); + CoreCoord sender_core = std::get<1>(it->first); + const auto& receiver_chip = devices_.at(std::get<0>(it->second)); + CoreCoord receiver_core = std::get<1>(it->second); + + const size_t src_eth_l1_byte_address = unit_tests::erisc::direct_send::get_rand_32_byte_aligned_address( + 
eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE, eth_l1_mem::address_map::MAX_L1_LOADING_SIZE); + const size_t dst_eth_l1_byte_address = unit_tests::erisc::direct_send::get_rand_32_byte_aligned_address( + eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE, eth_l1_mem::address_map::MAX_L1_LOADING_SIZE); + + int max_words = (eth_l1_mem::address_map::MAX_L1_LOADING_SIZE - + std::max(src_eth_l1_byte_address, dst_eth_l1_byte_address)) / + WORD_SIZE; + int num_words = rand() % max_words + 1; + + ASSERT_TRUE(unit_tests::erisc::direct_send::eth_direct_sender_receiver_kernels( + static_cast(this), + send_chip, + receiver_chip, + WORD_SIZE * num_words, + src_eth_l1_byte_address, + dst_eth_l1_byte_address, + sender_core, + receiver_core)); + } +} +TEST_F(N300DeviceFixture, ActiveEthKernelsRandomEthPacketSizeDirectSendTests) { + srand(0); + const auto& device_0 = devices_.at(0); + const auto& device_1 = devices_.at(1); + + std::map, std::tuple> connectivity = {}; + for (const auto& sender_core : device_0->get_active_ethernet_cores(true)) { + const auto& receiver_core = device_0->get_connected_ethernet_core(sender_core); + connectivity.insert({{0, sender_core}, receiver_core}); + } + for (const auto& sender_core : device_1->get_active_ethernet_cores(true)) { + const auto& receiver_core = device_1->get_connected_ethernet_core(sender_core); + connectivity.insert({{1, sender_core}, receiver_core}); + } + std::vector num_bytes_per_send_test_vals = { + 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536}; + for (const auto& num_bytes_per_send : num_bytes_per_send_test_vals) { + log_info(tt::LogTest, "Random eth send tests with {} bytes per packet", num_bytes_per_send); + for (int i = 0; i < 10; i++) { + auto it = connectivity.begin(); + std::advance(it, rand() % (connectivity.size())); + + const auto& send_chip = devices_.at(std::get<0>(it->first)); + CoreCoord sender_core = std::get<1>(it->first); + const auto& receiver_chip = 
devices_.at(std::get<0>(it->second)); + CoreCoord receiver_core = std::get<1>(it->second); + + const size_t src_eth_l1_byte_address = unit_tests::erisc::direct_send::get_rand_32_byte_aligned_address( + eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE, + eth_l1_mem::address_map::MAX_L1_LOADING_SIZE - 65536); + const size_t dst_eth_l1_byte_address = unit_tests::erisc::direct_send::get_rand_32_byte_aligned_address( + eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE, + eth_l1_mem::address_map::MAX_L1_LOADING_SIZE - 65536); + + int max_words = (eth_l1_mem::address_map::MAX_L1_LOADING_SIZE - + std::max(src_eth_l1_byte_address, dst_eth_l1_byte_address)) / + num_bytes_per_send; + int num_words = rand() % max_words + 1; + + ASSERT_TRUE(unit_tests::erisc::direct_send::eth_direct_sender_receiver_kernels( + static_cast(this), + send_chip, + receiver_chip, + num_bytes_per_send * num_words, + src_eth_l1_byte_address, + dst_eth_l1_byte_address, + sender_core, + receiver_core, + num_bytes_per_send)); + } + } +} + +TEST_F(CommandQueueMultiDeviceProgramFixture, ActiveEthKernelsDirectSendAllConnectedChips) { + using namespace CMAKE_UNIQUE_NAMESPACE; + const size_t src_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; + const size_t dst_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; + for (const auto& sender_device : devices_) { + for (const auto& receiver_device : devices_) { + if (sender_device->id() >= receiver_device->id()) { + continue; + } + for (const auto& sender_core : sender_device->get_active_ethernet_cores(true)) { + auto [device_id, receiver_core] = sender_device->get_connected_ethernet_core(sender_core); + if (receiver_device->id() != device_id) { + continue; + } + ASSERT_TRUE(unit_tests::erisc::direct_send::eth_direct_sender_receiver_kernels( + static_cast(this), + sender_device, + receiver_device, + WORD_SIZE, + src_eth_l1_byte_address, + dst_eth_l1_byte_address, + sender_core, + receiver_core)); + 
ASSERT_TRUE(unit_tests::erisc::direct_send::eth_direct_sender_receiver_kernels( + static_cast(this), + sender_device, + receiver_device, + 4 * WORD_SIZE, + src_eth_l1_byte_address, + dst_eth_l1_byte_address, + sender_core, + receiver_core)); + ASSERT_TRUE(unit_tests::erisc::direct_send::eth_direct_sender_receiver_kernels( + static_cast(this), + sender_device, + receiver_device, + 256 * WORD_SIZE, + src_eth_l1_byte_address, + dst_eth_l1_byte_address, + sender_core, + receiver_core)); + ASSERT_TRUE(unit_tests::erisc::direct_send::eth_direct_sender_receiver_kernels( + static_cast(this), + sender_device, + receiver_device, + 1000 * WORD_SIZE, + src_eth_l1_byte_address, + dst_eth_l1_byte_address, + sender_core, + receiver_core)); + } + } + } +} diff --git a/tests/tt_metal/tt_metal/unit_tests/ethernet/ring_gather_kernels.cpp b/tests/tt_metal/tt_metal/eth/test_ring_gather_kernels.cpp similarity index 99% rename from tests/tt_metal/tt_metal/unit_tests/ethernet/ring_gather_kernels.cpp rename to tests/tt_metal/tt_metal/eth/test_ring_gather_kernels.cpp index 24ea1924810..03c022dca95 100644 --- a/tests/tt_metal/tt_metal/unit_tests/ethernet/ring_gather_kernels.cpp +++ b/tests/tt_metal/tt_metal/eth/test_ring_gather_kernels.cpp @@ -9,7 +9,7 @@ #include #include "device_fixture.hpp" -#include "n300_device_fixture.hpp" +#include "multi_device_fixture.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/host_api.hpp" #include "tt_metal/impl/kernels/kernel.hpp" @@ -453,7 +453,7 @@ bool eth_interleaved_ring_gather_sender_receiver_kernels( } // namespace unit_tests::erisc::kernels -TEST_F(DeviceFixture, EthKernelsDirectRingGatherAllChips) { +TEST_F(DeviceFixture, ActiveEthKernelsDirectRingGatherAllChips) { const size_t src_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE + 32; const size_t dst_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE + 32; const size_t sem_l1_byte_address = 
eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; @@ -465,7 +465,7 @@ TEST_F(DeviceFixture, EthKernelsDirectRingGatherAllChips) { device_ring, WORD_SIZE, src_eth_l1_byte_address, dst_eth_l1_byte_address, sem_l1_byte_address)); } -TEST_F(DeviceFixture, EthKernelsInterleavedRingGatherAllChips) { +TEST_F(DeviceFixture, ActiveEthKernelsInterleavedRingGatherAllChips) { const size_t src_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE + 32; const size_t dst_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE + 32; const size_t sem_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; diff --git a/tests/tt_metal/tt_metal/gtest_smoke/test_basic_pipeline.cpp b/tests/tt_metal/tt_metal/gtest_smoke/test_basic_pipeline.cpp deleted file mode 120000 index 3584a28b68e..00000000000 --- a/tests/tt_metal/tt_metal/gtest_smoke/test_basic_pipeline.cpp +++ /dev/null @@ -1 +0,0 @@ -../unit_tests_fast_dispatch/pipelining/basic_pipeline.cpp \ No newline at end of file diff --git a/tests/tt_metal/tt_metal/gtest_smoke/test_device.cpp b/tests/tt_metal/tt_metal/gtest_smoke/test_device.cpp deleted file mode 120000 index 9ae4e0133cd..00000000000 --- a/tests/tt_metal/tt_metal/gtest_smoke/test_device.cpp +++ /dev/null @@ -1 +0,0 @@ -../unit_tests_common/basic/test_device_init.cpp \ No newline at end of file diff --git a/tests/tt_metal/tt_metal/gtest_smoke/test_flatten.cpp b/tests/tt_metal/tt_metal/gtest_smoke/test_flatten.cpp deleted file mode 120000 index dae6734bc34..00000000000 --- a/tests/tt_metal/tt_metal/gtest_smoke/test_flatten.cpp +++ /dev/null @@ -1 +0,0 @@ -../unit_tests_common/compute/test_flatten.cpp \ No newline at end of file diff --git a/tests/tt_metal/tt_metal/gtest_smoke/test_matmul_large_block.cpp b/tests/tt_metal/tt_metal/gtest_smoke/test_matmul_large_block.cpp deleted file mode 120000 index c649d4ab585..00000000000 --- a/tests/tt_metal/tt_metal/gtest_smoke/test_matmul_large_block.cpp +++ /dev/null @@ -1 +0,0 @@ 
-../unit_tests_common/compute/matmul/test_matmul_large_block.cpp \ No newline at end of file diff --git a/tests/tt_metal/tt_metal/gtest_smoke/test_matmul_multi_core_X_dram.cpp b/tests/tt_metal/tt_metal/gtest_smoke/test_matmul_multi_core_X_dram.cpp deleted file mode 120000 index 066de75928e..00000000000 --- a/tests/tt_metal/tt_metal/gtest_smoke/test_matmul_multi_core_X_dram.cpp +++ /dev/null @@ -1 +0,0 @@ -../unit_tests_common/compute/matmul/test_matmul_multi_core_X_dram.cpp \ No newline at end of file diff --git a/tests/tt_metal/tt_metal/gtest_smoke/tests_main.cpp b/tests/tt_metal/tt_metal/gtest_smoke/tests_main.cpp deleted file mode 100644 index 660438fe72c..00000000000 --- a/tests/tt_metal/tt_metal/gtest_smoke/tests_main.cpp +++ /dev/null @@ -1,9 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -#include "gtest/gtest.h" - -/* -All the tests in gtest_smoke are symlinks. This test suite is meant to be used for sanity checks. -*/ diff --git a/tests/tt_metal/tt_metal/integration/CMakeLists.txt b/tests/tt_metal/tt_metal/integration/CMakeLists.txt new file mode 100644 index 00000000000..45df1c02483 --- /dev/null +++ b/tests/tt_metal/tt_metal/integration/CMakeLists.txt @@ -0,0 +1,34 @@ +set(UNIT_TESTS_INTEGRATION_SRC + ${CMAKE_CURRENT_SOURCE_DIR}/test_autonomous_relay_streams.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_basic_pipeline.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_flatten.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_sfpu_compute.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/matmul/test_matmul_large_block.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/matmul/test_matmul_multi_core_multi_dram_in0_mcast_in1_mcast.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/matmul/test_matmul_multi_core_multi_dram_inX_mcast.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/matmul/test_matmul_multi_core_X_dram.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/matmul/test_matmul_single_core.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/matmul/test_matmul_X_tile.cpp +) + +add_executable(unit_tests_integration 
${UNIT_TESTS_INTEGRATION_SRC}) +TT_ENABLE_UNITY_BUILD(unit_tests_integration) + +target_link_libraries(unit_tests_integration PUBLIC test_metal_common_libs) +target_include_directories( + unit_tests_integration + PRIVATE + ${PROJECT_SOURCE_DIR} + ${PROJECT_SOURCE_DIR}/tt_metal + ${PROJECT_SOURCE_DIR}/tt_metal/common + ${PROJECT_SOURCE_DIR}/tests + ${PROJECT_SOURCE_DIR}/tests/tt_metal/tt_metal/common + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/common +) +set_target_properties( + unit_tests_integration + PROPERTIES + RUNTIME_OUTPUT_DIRECTORY + ${PROJECT_BINARY_DIR}/test/tt_metal +) diff --git a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_X_tile.cpp b/tests/tt_metal/tt_metal/integration/matmul/test_matmul_X_tile.cpp similarity index 97% rename from tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_X_tile.cpp rename to tests/tt_metal/tt_metal/integration/matmul/test_matmul_X_tile.cpp index 3c452584cdf..4af31133b7a 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_X_tile.cpp +++ b/tests/tt_metal/tt_metal/integration/matmul/test_matmul_X_tile.cpp @@ -2,11 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 -#include -#include -#include - -#include "tests/tt_metal/tt_metal/unit_tests_common/common/common_fixture.hpp" +#include "dispatch_fixture.hpp" #include "tt_metal/host_api.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "common/bfloat16.hpp" @@ -15,7 +11,7 @@ #include "test_tiles.hpp" #include "tests/tt_metal/test_utils/tilization.hpp" #include "tests/tt_metal/test_utils/print_helpers.hpp" -#include "tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/matmul_utils.hpp" +#include "matmul_test_utils.hpp" using std::vector; using namespace tt; @@ -83,7 +79,7 @@ void set_math_fid_masks(uint16_t &math_fid_mask, MathFidelity math_fidelity = Ma } } -void matmul_tile(CommonFixture *fixture, tt_metal::Device *device, const MatmulTileConfig &cfg, vector activations, vector 
weights, vector tensor_vals){ +void matmul_tile(DispatchFixture *fixture, tt_metal::Device *device, const MatmulTileConfig &cfg, vector activations, vector weights, vector tensor_vals){ tt_metal::Program program = tt_metal::CreateProgram(); CoreCoord core = {0, 0}; @@ -351,7 +347,7 @@ using namespace unit_tests_common::matmul::test_matmul_X_tile; } */ -TEST_F(CommonFixture, MatmulSingleTile){ +TEST_F(DispatchFixture, TensixMatmulSingleTile){ for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) { if (i == 1) continue; for (bool fp32_dest_acc_en : {true, false}) { @@ -377,7 +373,7 @@ TEST_F(CommonFixture, MatmulSingleTile){ } } -TEST_F(CommonFixture, MatmulMultiTile){ +TEST_F(DispatchFixture, TensixMatmulMultiTile){ for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) { if (i == 1) continue; for (bool fp32_dest_acc_en : {true, false}) { @@ -410,7 +406,7 @@ TEST_F(CommonFixture, MatmulMultiTile){ } } -TEST_F(CommonFixture, MatmulBlock){ +TEST_F(DispatchFixture, TensixMatmulBlock){ for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) { if (i == 1) continue; for (bool fp32_dest_acc_en : {true, false}) { @@ -441,7 +437,7 @@ TEST_F(CommonFixture, MatmulBlock){ } } -TEST_F(CommonFixture, MatmulBlockInitShort){ +TEST_F(DispatchFixture, TensixMatmulBlockInitShort){ for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) { if (i == 1) continue; for (bool fp32_dest_acc_en : {true, false}) { @@ -472,7 +468,7 @@ TEST_F(CommonFixture, MatmulBlockInitShort){ } } -TEST_F(CommonFixture, MatmulBlockInitShortWithDt){ +TEST_F(DispatchFixture, TensixMatmulBlockInitShortWithDt){ for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) { if (i == 1) continue; for (bool fp32_dest_acc_en : {true, false}) { diff --git a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_large_block.cpp 
b/tests/tt_metal/tt_metal/integration/matmul/test_matmul_large_block.cpp similarity index 97% rename from tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_large_block.cpp rename to tests/tt_metal/tt_metal/integration/matmul/test_matmul_large_block.cpp index 42061795180..88f84f8c1fc 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_large_block.cpp +++ b/tests/tt_metal/tt_metal/integration/matmul/test_matmul_large_block.cpp @@ -2,19 +2,14 @@ // // SPDX-License-Identifier: Apache-2.0 -#include -#include -#include - -#include "tests/tt_metal/tt_metal/unit_tests_common/common/common_fixture.hpp" +#include "dispatch_fixture.hpp" #include "tt_metal/host_api.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "common/bfloat16.hpp" #include "tt_metal/test_utils/deprecated/tensor.hpp" #include "test_tiles.hpp" #include "tests/tt_metal/test_utils/tilization.hpp" -#include "tests/tt_metal/test_utils/print_helpers.hpp" -#include "tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/matmul_utils.hpp" +#include "matmul_test_utils.hpp" using std::vector; using namespace tt; @@ -137,7 +132,7 @@ void create_CBs_for_fused_matmul(tt_metal::Program &program, tt_metal::Device* d } } -bool matmul_large_block(CommonFixture *fixture, tt_metal::Device *device, bool activations_rm, bool output_rm, MathFidelity math_fidelity = MathFidelity::HiFi4) { +bool matmul_large_block(DispatchFixture *fixture, tt_metal::Device *device, bool activations_rm, bool output_rm, MathFidelity math_fidelity = MathFidelity::HiFi4) { bool pass = true; tt_metal::Program program = tt_metal::CreateProgram(); @@ -365,7 +360,7 @@ bool matmul_large_block(CommonFixture *fixture, tt_metal::Device *device, bool a } -TEST_F(CommonFixture, MatmulLargeBlock) { +TEST_F(DispatchFixture, TensixMatmulLargeBlock) { for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) { if (i == 1) continue;; tt::log_info(tt::LogTest, "Math Fidelity = {}", 
i); diff --git a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_multi_core_X_dram.cpp b/tests/tt_metal/tt_metal/integration/matmul/test_matmul_multi_core_X_dram.cpp similarity index 97% rename from tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_multi_core_X_dram.cpp rename to tests/tt_metal/tt_metal/integration/matmul/test_matmul_multi_core_X_dram.cpp index 8371a43d96c..d54a2e9f546 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_multi_core_X_dram.cpp +++ b/tests/tt_metal/tt_metal/integration/matmul/test_matmul_multi_core_X_dram.cpp @@ -3,10 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 #include -#include -#include -#include "tests/tt_metal/tt_metal/unit_tests_common/common/common_fixture.hpp" +#include "dispatch_fixture.hpp" #include "tt_metal/host_api.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "common/bfloat16.hpp" @@ -14,8 +12,8 @@ #include "test_tiles.hpp" #include "tt_metal/impl/dispatch/command_queue.hpp" #include "tests/tt_metal/test_utils/tilization.hpp" -#include "tests/tt_metal/test_utils/print_helpers.hpp" -#include "tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/matmul_utils.hpp" +#include "matmul_test_utils.hpp" + using std::vector; using namespace tt; @@ -50,8 +48,8 @@ std::tuple c CoreCoord end_core = {(std::size_t)num_cores_c - 1, (std::size_t)num_cores_r - 1}; const CoreRange all_cores(start_core, end_core); - uint32_t ouput_cb_index = 16; // output operands start at index 16 - uint32_t interm0_cb_index = 24; + uint32_t ouput_cb_index = tt::CBIndex::c_16; + uint32_t interm0_cb_index = tt::CBIndex::c_24; std::map partials_and_out_data_format_spec = { {ouput_cb_index, tt::DataFormat::Float16_b}, {interm0_cb_index, tt::DataFormat::Float16_b} @@ -60,12 +58,12 @@ std::tuple c int cores_c = cfg.multi_dram ? 
1 : num_cores_c; for(int i = 0; i < cores_r; i++) { for(int j = 0; j < cores_c; j++) { - uint32_t src0_cb_index = 0; + uint32_t src0_cb_index = tt::CBIndex::c_0; uint32_t cb0_tiles = in0_block_tiles * 2; // double buffer tt_metal::CircularBufferConfig cb_src0_config = tt_metal::CircularBufferConfig(cb0_tiles * single_tile_size, {{src0_cb_index, tt::DataFormat::Float16_b}}) .set_page_size(src0_cb_index, single_tile_size); - uint32_t src1_cb_index = 1; + uint32_t src1_cb_index = tt::CBIndex::c_1; uint32_t cb1_tiles = in1_block_tiles * 2; // double buffer tt_metal::CircularBufferConfig cb_src1_config = tt_metal::CircularBufferConfig(cb1_tiles * single_tile_size, {{src1_cb_index, tt::DataFormat::Float16_b}}) .set_page_size(src1_cb_index, single_tile_size); @@ -368,7 +366,7 @@ bool assign_runtime_args_to_program( return pass; } -bool matmul_multi_core_multi_dram(CommonFixture *fixture, tt_metal::Device *device){ +bool matmul_multi_core_multi_dram(DispatchFixture *fixture, tt_metal::Device *device){ bool pass = true; int num_cores_r = device->compute_with_storage_grid_size().y; int num_cores_c = device->compute_with_storage_grid_size().x; @@ -498,7 +496,7 @@ bool matmul_multi_core_multi_dram(CommonFixture *fixture, tt_metal::Device *devi } -TEST_F(CommonFixture, MatmulMultiCoreSingleDRAM){ +TEST_F(DispatchFixture, TensixMatmulMultiCoreSingleDRAM){ const char* arch = getenv("ARCH_NAME"); if (!getenv("TT_METAL_SLOW_DISPATCH_MODE")){ log_info(LogTest, "This test is only supported in slow dispatch mode"); @@ -512,7 +510,7 @@ TEST_F(CommonFixture, MatmulMultiCoreSingleDRAM){ } } -TEST_F(CommonFixture, MatmulMultiCoreMultiDRAM){ +TEST_F(DispatchFixture, TensixMatmulMultiCoreMultiDRAM){ // need to update move_tiles_to_dram to support both slow and fast if (getenv("TT_METAL_SLOW_DISPATCH_MODE")){ log_info(LogTest, "This test is not supported in slow dispatch mode, need to update move_tiles_to_dram.."); diff --git 
a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_multi_core_multi_dram_in0_mcast_in1_mcast.cpp b/tests/tt_metal/tt_metal/integration/matmul/test_matmul_multi_core_multi_dram_in0_mcast_in1_mcast.cpp similarity index 97% rename from tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_multi_core_multi_dram_in0_mcast_in1_mcast.cpp rename to tests/tt_metal/tt_metal/integration/matmul/test_matmul_multi_core_multi_dram_in0_mcast_in1_mcast.cpp index 6dff35cf86f..624fd16edf4 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_multi_core_multi_dram_in0_mcast_in1_mcast.cpp +++ b/tests/tt_metal/tt_metal/integration/matmul/test_matmul_multi_core_multi_dram_in0_mcast_in1_mcast.cpp @@ -3,10 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 #include -#include -#include -#include "tests/tt_metal/tt_metal/unit_tests_common/common/common_fixture.hpp" +#include "dispatch_fixture.hpp" #include "tt_metal/host_api.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "common/bfloat16.hpp" @@ -14,8 +12,8 @@ #include "test_tiles.hpp" #include "hostdevcommon/common_values.hpp" #include "tests/tt_metal/test_utils/tilization.hpp" -#include "tests/tt_metal/test_utils/print_helpers.hpp" -#include "tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/matmul_utils.hpp" +#include "matmul_test_utils.hpp" + using std::vector; using namespace tt; namespace unit_tests_common::matmul::test_matmul_multi_core_multi_dram_in0_mcast_in1_mcast { @@ -86,8 +84,8 @@ create_program( {(std::size_t)start_core_x + 1, (std::size_t)start_core_y + 1}, {(std::size_t)start_core_x + num_cores_c - 1, (std::size_t)start_core_y + num_cores_r - 1}); - uint32_t ouput_cb_index = 16; // output operands start at index 16 - uint32_t interm0_cb_index = 24; + uint32_t ouput_cb_index = tt::CBIndex::c_16; + uint32_t interm0_cb_index = tt::CBIndex::c_24; std::map partials_and_out_data_format_spec = { {ouput_cb_index, tt::DataFormat::Float16_b}, {interm0_cb_index, 
tt::DataFormat::Float16_b} @@ -96,13 +94,13 @@ create_program( for(int j = 0; j < num_cores_c; j++) { CoreCoord core = {(std::size_t) start_core_x + j, (std::size_t) start_core_y + i}; - uint32_t src0_cb_index = 0; + uint32_t src0_cb_index = tt::CBIndex::c_0; uint32_t cb0_tiles = in0_block_tiles * 2; // double buffer tt_metal::CircularBufferConfig cb_src0_config = tt_metal::CircularBufferConfig(cb0_tiles * single_tile_size, {{src0_cb_index, tt::DataFormat::Float16_b}}) .set_page_size(src0_cb_index, single_tile_size); auto cb_src0 = tt_metal::CreateCircularBuffer(program, core, cb_src0_config); - uint32_t src1_cb_index = 1; + uint32_t src1_cb_index = tt::CBIndex::c_1; uint32_t cb1_tiles = in1_block_tiles * 2; // double buffer tt_metal::CircularBufferConfig cb_src1_config = tt_metal::CircularBufferConfig(cb1_tiles * single_tile_size, {{src1_cb_index, tt::DataFormat::Float16_b}}) .set_page_size(src1_cb_index, single_tile_size); @@ -473,7 +471,7 @@ bool matmul_multi_core_multi_dram_in0_mcast_in1_mcast(tt_metal::Device *device){ } // namespace unit_tests_common::matmul::test_matmul_multi_core_multi_dram_in0_mcast_in1_mcast -TEST_F(CommonFixture, MatmulMultiCoreMultiDRAMIn0MCastIn1MCast) { +TEST_F(DispatchFixture, TensixMatmulMultiCoreMultiDRAMIn0MCastIn1MCast) { if (!getenv("TT_METAL_SLOW_DISPATCH_MODE")){ tt::log_info(tt::LogTest, "This test is only supported in slow dispatch mode"); GTEST_SKIP(); diff --git a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_multi_core_multi_dram_inX_mcast.cpp b/tests/tt_metal/tt_metal/integration/matmul/test_matmul_multi_core_multi_dram_inX_mcast.cpp similarity index 97% rename from tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_multi_core_multi_dram_inX_mcast.cpp rename to tests/tt_metal/tt_metal/integration/matmul/test_matmul_multi_core_multi_dram_inX_mcast.cpp index 2dfa1ec9ba3..3ef8f727cee 100644 --- 
a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_multi_core_multi_dram_inX_mcast.cpp +++ b/tests/tt_metal/tt_metal/integration/matmul/test_matmul_multi_core_multi_dram_inX_mcast.cpp @@ -3,10 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 #include -#include -#include -#include "tests/tt_metal/tt_metal/unit_tests_common/common/common_fixture.hpp" +#include "dispatch_fixture.hpp" #include "tt_metal/host_api.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "common/bfloat16.hpp" @@ -14,8 +12,8 @@ #include "test_tiles.hpp" #include "hostdevcommon/common_values.hpp" #include "tests/tt_metal/test_utils/tilization.hpp" -#include "tests/tt_metal/test_utils/print_helpers.hpp" -#include "tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/matmul_utils.hpp" +#include "matmul_test_utils.hpp" + using std::vector; using namespace tt; @@ -64,8 +62,8 @@ std:: {(std::size_t)start_core_x + mcast_yx_offset, (std::size_t)start_core_y + mcast_xy_offset}, {(std::size_t)start_core_x + num_cores_c - 1, (std::size_t)start_core_y + num_cores_r - 1}); - uint32_t ouput_cb_index = 16; // output operands start at index 16 - uint32_t interm0_cb_index = 24; + uint32_t ouput_cb_index = tt::CBIndex::c_16; + uint32_t interm0_cb_index = tt::CBIndex::c_24; std::map partials_and_out_data_format_spec = { {ouput_cb_index, tt::DataFormat::Float16_b}, {interm0_cb_index, tt::DataFormat::Float16_b} @@ -75,13 +73,13 @@ std:: for(int j = 0; j < num_cores_c; j++) { CoreCoord core = {(std::size_t) start_core_x + j, (std::size_t) start_core_y + i}; - uint32_t src0_cb_index = 0; + uint32_t src0_cb_index = tt::CBIndex::c_0; uint32_t cb0_tiles = in0_block_tiles * 2; // double buffer tt_metal::CircularBufferConfig cb_src0_config = tt_metal::CircularBufferConfig(cb0_tiles * single_tile_size, {{src0_cb_index, tt::DataFormat::Float16_b}}) .set_page_size(src0_cb_index, single_tile_size); auto cb_src0 = tt_metal::CreateCircularBuffer(program, core, cb_src0_config); - uint32_t src1_cb_index = 
1; + uint32_t src1_cb_index = tt::CBIndex::c_1; uint32_t cb1_tiles = in1_block_tiles * 2; // double buffer tt_metal::CircularBufferConfig cb_src1_config = tt_metal::CircularBufferConfig(cb1_tiles * single_tile_size, {{src1_cb_index, tt::DataFormat::Float16_b}}) .set_page_size(src1_cb_index, single_tile_size); @@ -390,7 +388,7 @@ bool matmul_multi_core_multi_dram_inX_mcast(tt_metal::Device *device, int in1_or } } // namespace unit_tests_common::matmul::test_matmul_multi_core -TEST_F(CommonFixture, MatmulMultiCoreMultiDRAMIn0MCast) { +TEST_F(DispatchFixture, TensixMatmulMultiCoreMultiDRAMIn0MCast) { if (!getenv("TT_METAL_SLOW_DISPATCH_MODE")){ tt::log_info(tt::LogTest, "This test is only supported in slow dispatch mode"); GTEST_SKIP(); @@ -400,7 +398,7 @@ TEST_F(CommonFixture, MatmulMultiCoreMultiDRAMIn0MCast) { } } -TEST_F(CommonFixture, MatmulMultiCoreMultiDRAMIn1MCast) { +TEST_F(DispatchFixture, TensixMatmulMultiCoreMultiDRAMIn1MCast) { if (!getenv("TT_METAL_SLOW_DISPATCH_MODE")){ tt::log_info(tt::LogTest, "This test is only supported in slow dispatch mode"); GTEST_SKIP(); diff --git a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_single_core.cpp b/tests/tt_metal/tt_metal/integration/matmul/test_matmul_single_core.cpp similarity index 95% rename from tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_single_core.cpp rename to tests/tt_metal/tt_metal/integration/matmul/test_matmul_single_core.cpp index 167cfb880ed..69eb8edfc83 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/test_matmul_single_core.cpp +++ b/tests/tt_metal/tt_metal/integration/matmul/test_matmul_single_core.cpp @@ -2,11 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 -#include -#include -#include - -#include "tests/tt_metal/tt_metal/unit_tests_common/common/common_fixture.hpp" +#include "dispatch_fixture.hpp" #include "tt_metal/host_api.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "common/bfloat16.hpp" @@ -15,14 +11,14 @@ 
#include "test_tiles.hpp" #include "tests/tt_metal/test_utils/tilization.hpp" #include "tests/tt_metal/test_utils/print_helpers.hpp" -#include "tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/matmul_utils.hpp" +#include "matmul_test_utils.hpp" using std::vector; using namespace tt; namespace unit_tests_common::matmul::test_matmul_single_core{ -bool matmul_single_core(CommonFixture *fixture, tt_metal::Device *device, int M, int N, int K, int out_subblock_h, int out_subblock_w){ +bool matmul_single_core(DispatchFixture *fixture, tt_metal::Device *device, int M, int N, int K, int out_subblock_h, int out_subblock_w){ bool pass = true; tt_metal::Program program = tt_metal::CreateProgram(); @@ -82,8 +78,8 @@ bool matmul_single_core(CommonFixture *fixture, tt_metal::Device *device, int M, .set_page_size(src1_cb_index, single_tile_size); auto cb_src1 = tt_metal::CreateCircularBuffer(program, core, cb_src1_config); - uint32_t ouput_cb_index = 16; // output operands start at index 16 - uint32_t interm0_cb_index = 24; + uint32_t ouput_cb_index = tt::CBIndex::c_16; + uint32_t interm0_cb_index = tt::CBIndex::c_24; std::map partials_and_out_data_format_spec = { {ouput_cb_index, tt::DataFormat::Float16_b}, {interm0_cb_index, tt::DataFormat::Float16_b} @@ -216,7 +212,7 @@ bool matmul_single_core(CommonFixture *fixture, tt_metal::Device *device, int M, } } // namespace unit_tests_common::matmul::test_matmul_single_core -TEST_F (CommonFixture, MatmulSingleCoreSmall){ +TEST_F (DispatchFixture, TensixMatmulSingleCoreSmall){ uint32_t M = 4; uint32_t K = 4; uint32_t N = 4; @@ -227,7 +223,7 @@ TEST_F (CommonFixture, MatmulSingleCoreSmall){ } } -TEST_F (CommonFixture, MatmulSingleCore){ +TEST_F (DispatchFixture, TensixMatmulSingleCore){ if (!getenv("TT_METAL_SLOW_DISPATCH_MODE")){ log_info(LogTest, "Fast dispatch buffer memory issue, skipping for now"); GTEST_SKIP(); diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/streams/test_autonomous_relay_streams.cpp 
b/tests/tt_metal/tt_metal/integration/test_autonomous_relay_streams.cpp similarity index 97% rename from tests/tt_metal/tt_metal/unit_tests_fast_dispatch/streams/test_autonomous_relay_streams.cpp rename to tests/tt_metal/tt_metal/integration/test_autonomous_relay_streams.cpp index 74080be0bb8..9a10208dcfc 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/streams/test_autonomous_relay_streams.cpp +++ b/tests/tt_metal/tt_metal/integration/test_autonomous_relay_streams.cpp @@ -10,9 +10,9 @@ #include #include -#include "device/tt_arch_types.h" #include "gtest/gtest.h" -#include "tests/tt_metal/tt_metal/unit_tests_fast_dispatch/common/command_queue_fixture.hpp" +#include "device/tt_arch_types.h" +#include "command_queue_fixture.hpp" #include "tt_metal/common/logger.hpp" #include "impl/device/device.hpp" #include "impl/buffers/circular_buffer.hpp" @@ -26,7 +26,6 @@ #include "tt_metal/test_utils/comparison.hpp" #include "tt_metal/test_utils/df/df.hpp" #include "tt_metal/test_utils/env_vars.hpp" -// #include "tt_metal/test_utils/print_helpers.hpp" #include "tt_metal/detail/persistent_kernel_cache.hpp" #include "tt_metal/test_utils/stimulus.hpp" @@ -267,11 +266,6 @@ void build_and_run_autonomous_stream_test( uint32_t buffer_size_bytes = num_messages * page_size; auto inputs = test_utils::generate_uniform_random_vector(0, 100, buffer_size_bytes / sizeof(uint32_t)); std::iota(inputs.begin(), inputs.end(), 1); - // for (auto i = 0; i < inputs.size(); i += page_size) { - // for (auto ii = 0; ii < std::min(page_size, inputs.size() - i); ii++) { - // inputs.at(i + ii) = i + 1; - // } - // } auto zeroes_buffer = std::vector(buffer_size_bytes / sizeof(uint32_t), 0); std::vector outputs(buffer_size_bytes / sizeof(uint32_t), 0); @@ -288,7 +282,7 @@ void build_and_run_autonomous_stream_test( const uint32_t dram_input_buf_base_addr = input_buffer->address(); // For overlay blob on relay core - constexpr uint32_t dummy_cb_index3 = CB::c_in3; + constexpr uint32_t 
dummy_cb_index3 = CBIndex::c_3; auto const& relay_stream_overlay_blob_buffer_cb_config = tt_metal::CircularBufferConfig( relay_stream_overlay_blob_size_bytes, {{dummy_cb_index3, tt::DataFormat::Float16_b}}) @@ -299,7 +293,7 @@ void build_and_run_autonomous_stream_test( CreateCircularBuffer(program, second_relay_core, relay_stream_overlay_blob_buffer_cb_config); // Sender/Receiver CBs for pulling in/pushing out stimulus data taht we can output compare - constexpr uint32_t cb_index = CB::c_in0; + constexpr uint32_t cb_index = CBIndex::c_0; const uint32_t cb_size = page_size_plus_header * read_write_cb_num_pages; auto const& cb_config = tt_metal::CircularBufferConfig(cb_size, {{cb_index, tt::DataFormat::Float16_b}}) .set_page_size(cb_index, page_size_plus_header); @@ -307,7 +301,7 @@ void build_and_run_autonomous_stream_test( auto receiver_cb = CreateCircularBuffer(program, receiver_core, cb_config); // Stream Tile Header Buffers - constexpr uint32_t dummy_cb_index2 = CB::c_in2; + constexpr uint32_t dummy_cb_index2 = CBIndex::c_2; auto const& stream_tile_header_buffer_cb_config = tt_metal::CircularBufferConfig( stream_tile_header_buffer_size_bytes, {{dummy_cb_index2, tt::DataFormat::Float16_b}}) @@ -321,7 +315,7 @@ void build_and_run_autonomous_stream_test( auto receiver_stream_tile_header_buffer_cb = CreateCircularBuffer(program, receiver_core, stream_tile_header_buffer_cb_config); - constexpr uint32_t dummy_cb_index = CB::c_in1; + constexpr uint32_t dummy_cb_index = CBIndex::c_1; auto const& sender_stream_buffer_cb_config = tt_metal::CircularBufferConfig(sender_stream_buffer_size_bytes, {{dummy_cb_index, tt::DataFormat::Float16_b}}) .set_page_size(dummy_cb_index, sender_stream_buffer_size_bytes); @@ -648,7 +642,7 @@ void build_and_run_autonomous_stream_test( } // namespace tt -TEST_F(CommandQueueFixture, DISABLED_TestAutonomousRelayStreams) { +TEST_F(CommandQueueProgramFixture, DISABLED_TensixTestAutonomousRelayStreams) { auto arch = 
tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); auto num_devices = tt::tt_metal::GetNumAvailableDevices(); if (arch == tt::ARCH::GRAYSKULL) { @@ -691,7 +685,7 @@ TEST_F(CommandQueueFixture, DISABLED_TestAutonomousRelayStreams) { return; } -TEST_F(CommandQueueFixture, DISABLED_TestAutonomousRelayStreamsSmallPackets) { +TEST_F(CommandQueueProgramFixture, DISABLED_TensixTestAutonomousRelayStreamsSmallPackets) { auto arch = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); auto num_devices = tt::tt_metal::GetNumAvailableDevices(); if (arch == tt::ARCH::GRAYSKULL) { @@ -734,7 +728,7 @@ TEST_F(CommandQueueFixture, DISABLED_TestAutonomousRelayStreamsSmallPackets) { return; } -TEST_F(CommandQueueFixture, DISABLED_TestAutonomousRelayStreamsLoopingShort) { +TEST_F(CommandQueueProgramFixture, DISABLED_TensixTestAutonomousRelayStreamsLoopingShort) { auto arch = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); auto num_devices = tt::tt_metal::GetNumAvailableDevices(); if (arch == tt::ARCH::GRAYSKULL) { @@ -780,7 +774,7 @@ TEST_F(CommandQueueFixture, DISABLED_TestAutonomousRelayStreamsLoopingShort) { // Too long to run in post commit and these kernels are currently only live in these unit tests anyways // so we just enable a couple of the unit tests to ensure nobody accidentally introduces compile errors // or anything like that -TEST_F(CommandQueueFixture, DISABLED_TestAutonomousRelayStreamsLoopingRandomShort) { +TEST_F(CommandQueueProgramFixture, DISABLED_TensixTestAutonomousRelayStreamsLoopingRandomShort) { auto arch = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); auto num_devices = tt::tt_metal::GetNumAvailableDevices(); // if (num_devices != 8) { @@ -835,13 +829,9 @@ TEST_F(CommandQueueFixture, DISABLED_TestAutonomousRelayStreamsLoopingRandomShor // Too long to run in post commit and these kernels are currently only live in these unit tests anyways // so we just enable a couple of the unit tests to ensure nobody 
accidentally introduces compile errors // or anything like that -TEST_F(CommandQueueFixture, DISABLED_TestAutonomousRelayStreamsLoopingLong) { +TEST_F(CommandQueueProgramFixture, DISABLED_TensixTestAutonomousRelayStreamsLoopingLong) { auto arch = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); auto num_devices = tt::tt_metal::GetNumAvailableDevices(); - // if (num_devices != 8) { - // log_info(tt::LogTest, "Need at least 2 devices to run this test"); - // return; - // } if (arch == tt::ARCH::GRAYSKULL) { log_info(tt::LogTest, "Test must be run on WH"); return; @@ -885,7 +875,7 @@ TEST_F(CommandQueueFixture, DISABLED_TestAutonomousRelayStreamsLoopingLong) { // Too long to run in post commit and these kernels are currently only live in these unit tests anyways // so we just enable a couple of the unit tests to ensure nobody accidentally introduces compile errors // or anything like that -TEST_F(CommandQueueFixture, DISABLED_TestAutonomousRelayStreamsSweep) { +TEST_F(CommandQueueProgramFixture, DISABLED_TensixTestAutonomousRelayStreamsSweep) { auto arch = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); auto num_devices = tt::tt_metal::GetNumAvailableDevices(); if (arch == tt::ARCH::GRAYSKULL) { diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/pipelining/basic_pipeline.cpp b/tests/tt_metal/tt_metal/integration/test_basic_pipeline.cpp similarity index 96% rename from tests/tt_metal/tt_metal/unit_tests_fast_dispatch/pipelining/basic_pipeline.cpp rename to tests/tt_metal/tt_metal/integration/test_basic_pipeline.cpp index 3fc76d32d74..b5fff829dba 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/pipelining/basic_pipeline.cpp +++ b/tests/tt_metal/tt_metal/integration/test_basic_pipeline.cpp @@ -13,13 +13,10 @@ #include #include "tt_metal/common/bfloat16.hpp" -#include "tests/tt_metal/tt_metal/unit_tests_fast_dispatch/common/command_queue_fixture.hpp" +#include "command_queue_fixture.hpp" #include 
"tt_metal/detail/tt_metal.hpp" -#include "tt_metal/host_api.hpp" +#include "host_api.hpp" #include "tt_metal/impl/dispatch/command_queue.hpp" -#include "tt_metal/test_utils/env_vars.hpp" -#include "tt_metal/test_utils/print_helpers.hpp" -#include "tt_metal/test_utils/stimulus.hpp" #include "tt_metal/impl/device/device.hpp" using std::map; @@ -44,10 +41,6 @@ void create_and_run_row_pipeline(tt_metal::Device* device, const PipelineRowConf tt_metal::Program program = tt_metal::CreateProgram(); - // uint32_t num_tiles = 32; - // uint32_t block_size_tiles = 16; - // uint32_t num_blocks_in_CB = 2; - // uint32_t num_repetitions = 1; uint32_t num_cores = (uint32_t)test_config.num_cores; uint32_t num_tiles = (uint32_t)test_config.num_tiles; uint32_t block_size_tiles = (uint32_t)test_config.block_size_tiles; @@ -245,7 +238,7 @@ void create_and_run_row_pipeline(tt_metal::Device* device, const PipelineRowConf } // namespace unit_tests::create_pipeline -TEST_F(CommandQueueFixture, TestPipelineAcrossRows) { +TEST_F(CommandQueueProgramFixture, TensixTestPipelineAcrossRows) { if (this->arch_ != tt::ARCH::GRAYSKULL) { GTEST_SKIP(); } diff --git a/tests/tt_metal/tt_metal/unit_tests_common/compute/test_flatten.cpp b/tests/tt_metal/tt_metal/integration/test_flatten.cpp similarity index 53% rename from tests/tt_metal/tt_metal/unit_tests_common/compute/test_flatten.cpp rename to tests/tt_metal/tt_metal/integration/test_flatten.cpp index 5dbadc80812..af11b8b13f7 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/compute/test_flatten.cpp +++ b/tests/tt_metal/tt_metal/integration/test_flatten.cpp @@ -2,22 +2,16 @@ // // SPDX-License-Identifier: Apache-2.0 -#include -#include -#include - -#include "tests/tt_metal/tt_metal/unit_tests_common/common/common_fixture.hpp" +#include "dispatch_fixture.hpp" +#include "command_queue_fixture.hpp" #include "tt_metal/host_api.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "common/bfloat16.hpp" -#include "llrt/llrt.hpp" - - using std::vector; 
using namespace tt; -namespace gtest_smoke::test_flatten{ +namespace test_flatten { uint32_t prod(vector &shape) { uint32_t shape_prod = 1; @@ -66,7 +60,7 @@ inline std::vector gold_standard_flatten(std::vector src_vec return expected_dst_vec; } -bool flatten(CommonFixture *fixture, tt_metal::Device *device, uint32_t num_tiles_r = 5, uint32_t num_tiles_c = 5) { +bool flatten(DispatchFixture *fixture, tt_metal::Device *device, uint32_t num_tiles_r = 5, uint32_t num_tiles_c = 5) { bool pass = true; tt_metal::Program program = tt_metal::CreateProgram(); @@ -104,7 +98,7 @@ bool flatten(CommonFixture *fixture, tt_metal::Device *device, uint32_t num_tile .set_page_size(src0_cb_index, single_tile_size); auto cb_src0 = tt_metal::CreateCircularBuffer(program, core, cb_src0_config); - uint32_t ouput_cb_index = 16; // output operands start at index 16 + uint32_t ouput_cb_index = tt::CBIndex::c_16; uint32_t num_output_tiles = 1; tt_metal::CircularBufferConfig cb_output_config = tt_metal::CircularBufferConfig(num_output_tiles * single_tile_size, {{ouput_cb_index, tt::DataFormat::Float16_b}}) .set_page_size(ouput_cb_index, single_tile_size); @@ -189,22 +183,163 @@ bool flatten(CommonFixture *fixture, tt_metal::Device *device, uint32_t num_tile return pass; } +bool flatten_stress(Device *device, uint32_t num_tiles_r = 5, uint32_t num_tiles_c = 5) { + // Test Simulating Program Caching with Async Command Queues + bool pass = true; + // Create a program used across all loops + Program program = CreateProgram(); + + CoreCoord core = {0, 0}; + + uint32_t single_tile_size = 2 * 1024; + + uint32_t num_tiles = num_tiles_r * num_tiles_c; + uint32_t num_bytes_per_tensor_row = num_tiles_c * 64; + uint32_t num_bytes_per_tile = num_tiles * single_tile_size; + + uint32_t dram_buffer_size = single_tile_size * num_tiles * 32; + + InterleavedBufferConfig dram_config{ + .device=device, + .size = dram_buffer_size, + .page_size = dram_buffer_size, + .buffer_type = BufferType::DRAM + }; + uint32_t 
src0_cb_index = 0; + uint32_t num_input_tiles = 8; + CircularBufferConfig cb_src0_config = CircularBufferConfig(num_input_tiles * single_tile_size, {{src0_cb_index, tt::DataFormat::Float16_b}}) + .set_page_size(src0_cb_index, single_tile_size); + auto cb_src0 = CreateCircularBuffer(program, core, cb_src0_config); + + uint32_t ouput_cb_index = 16; + uint32_t num_output_tiles = 1; + CircularBufferConfig cb_output_config = CircularBufferConfig(num_output_tiles * single_tile_size, {{ouput_cb_index, tt::DataFormat::Float16_b}}) + .set_page_size(ouput_cb_index, single_tile_size); + auto cb_output = CreateCircularBuffer(program, core, cb_output_config); + + auto flatten_kernel = CreateKernel( + program, + "tests/tt_metal/tt_metal/test_kernels/dataflow/flatten.cpp", + core, + DataMovementConfig{.processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default}); + + auto unary_writer_kernel = CreateKernel( + program, + "tt_metal/kernels/dataflow/writer_unary.cpp", + core, + DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); + + vector compute_kernel_args = { + num_tiles * 32 + }; + + auto eltwise_unary_kernel = CreateKernel( + program, + "tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy.cpp", + core, + ComputeConfig{.compile_args = compute_kernel_args} + ); + + // Inside the loop, run async runtime functions + for (int i = 0; i < 1000; i++) { + // Create Device Buffers Asynchronously + auto src_dram_buffer = CreateBuffer(dram_config); + auto dst_dram_buffer = CreateBuffer(dram_config); + + auto dram_src_noc_xy = src_dram_buffer->noc_coordinates(); + auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); + // Create the source vector + std::shared_ptr> src_vec = std::make_shared>(create_random_vector_of_bfloat16( + dram_buffer_size, 100, std::chrono::system_clock::now().time_since_epoch().count())); + + std::vector golden = gold_standard_flatten(*src_vec, {num_tiles_r * 32, num_tiles_c * 32}); + // Set the 
runtime args asynchronously + std::shared_ptr writer_runtime_args = std::make_shared(); + std::shared_ptr compute_runtime_args = std::make_shared(); + *compute_runtime_args = { + src_dram_buffer.get(), + (std::uint32_t)dram_src_noc_xy.x, + (std::uint32_t)dram_src_noc_xy.y, + num_tiles_r, + num_tiles_c, + num_bytes_per_tensor_row + }; + *writer_runtime_args = { + dst_dram_buffer.get(), + (std::uint32_t)dram_dst_noc_xy.x, + (std::uint32_t)dram_dst_noc_xy.y, + num_tiles * 32 + }; + + SetRuntimeArgs( + device, + detail::GetKernel(program, flatten_kernel), + core, + compute_runtime_args); + + SetRuntimeArgs( + device, + detail::GetKernel(program, unary_writer_kernel), + core, + writer_runtime_args); + // Async write input + EnqueueWriteBuffer(device->command_queue(), src_dram_buffer, src_vec, false); + // Share ownership of buffer with program + AssignGlobalBufferToProgram(src_dram_buffer, program); + // Main thread gives up ownership of buffer and src data (this is what python does) + src_dram_buffer.reset(); + src_vec.reset(); + // Queue up program + EnqueueProgram(device->command_queue(), program, false); + // Blocking read + std::vector result_vec; + EnqueueReadBuffer(device->command_queue(), dst_dram_buffer, result_vec, true); + + // Validation of data + TT_FATAL(golden.size() == result_vec.size(), "Size mismatch between golden {} and result vec {}.", golden.size(), result_vec.size()); + pass &= (golden == result_vec); + + if (not pass) { + std::cout << "GOLDEN" << std::endl; + print_vec_of_uint32_as_packed_bfloat16(golden, num_tiles * 32); + + std::cout << "RESULT" << std::endl; + print_vec_of_uint32_as_packed_bfloat16(result_vec, num_tiles * 32); + } + } + return pass; } -TEST_F(CommonFixture, Flatten){ +} + +TEST_F(DispatchFixture, TensixFlatten){ // TODO: Re-enable when #7264 is fixed GTEST_SKIP(); uint32_t num_tiles_r = 2; uint32_t num_tiles_c = 2; - if (!getenv("TT_METAL_SLOW_DISPATCH_MODE")){ + if (!this->IsSlowDispatch()){ log_info(LogTest, "Flatten running 
with num_tiles_r=1, num_tiles_c=1"); num_tiles_r = 1; num_tiles_c = 1; } for (unsigned int id=0; id < devices_.size(); id++){ // TODO: #6097, fix this for fast dispatch remote device. - if (!this->slow_dispatch_ && id > 0) + if (!this->IsSlowDispatch() && id > 0) continue; - ASSERT_TRUE(gtest_smoke::test_flatten::flatten(this, devices_.at(id), num_tiles_r, num_tiles_c)); + ASSERT_TRUE(test_flatten::flatten(this, this->devices_.at(id), num_tiles_r, num_tiles_c)); + } +} + +TEST_F(CommandQueueProgramFixture, DISABLED_TensixTestAsyncFlattenStress) { + auto &command_queue = this->device_->command_queue(); + auto current_mode = CommandQueue::default_mode(); + command_queue.set_mode(CommandQueue::CommandQueueMode::ASYNC); + uint32_t num_tiles_r = 2; + uint32_t num_tiles_c = 2; + if (!this->IsSlowDispatch()) { + num_tiles_r = 1; + num_tiles_c = 1; } + ASSERT_TRUE(test_flatten::flatten_stress(this->device_, num_tiles_r, num_tiles_c)); + command_queue.set_mode(current_mode); } diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/compute/sfpu/sfpu_compute.cpp b/tests/tt_metal/tt_metal/integration/test_sfpu_compute.cpp similarity index 95% rename from tests/tt_metal/tt_metal/unit_tests_fast_dispatch/compute/sfpu/sfpu_compute.cpp rename to tests/tt_metal/tt_metal/integration/test_sfpu_compute.cpp index 06cd4a16177..2ed6a72cf0e 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/compute/sfpu/sfpu_compute.cpp +++ b/tests/tt_metal/tt_metal/integration/test_sfpu_compute.cpp @@ -6,16 +6,12 @@ #include #include -#include -#include #include "command_queue_fixture.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/host_api.hpp" #include "tt_metal/impl/dispatch/command_queue.hpp" #include "tt_metal/test_utils/comparison.hpp" -#include "tt_metal/test_utils/df/df.hpp" -#include "tt_metal/test_utils/print_helpers.hpp" #include "tt_metal/test_utils/stimulus.hpp" #include "tt_metal/impl/device/device.hpp" @@ -23,7 +19,6 @@ using std::map; using std::vector; 
using namespace tt; using namespace tt::test_utils; -using namespace tt::test_utils::df; using namespace tt::tt_metal; namespace unit_tests::sfpu_util { @@ -164,12 +159,12 @@ bool run_sfpu_all_same_buffer(CommandQueue & cq, const SfpuConfig& test_config) }; for (const CoreRange& core_range : test_config.cores.ranges()) { - tt_metal::CircularBufferConfig l1_input_cb_config = tt_metal::CircularBufferConfig(byte_size, {{0, test_config.l1_input_data_format}}) - .set_page_size(0, test_config.tile_byte_size); + tt_metal::CircularBufferConfig l1_input_cb_config = tt_metal::CircularBufferConfig(byte_size, {{tt::CBIndex::c_0, test_config.l1_input_data_format}}) + .set_page_size(tt::CBIndex::c_0, test_config.tile_byte_size); auto l1_input_cb = tt_metal::CreateCircularBuffer(program, core_range, l1_input_cb_config); - tt_metal::CircularBufferConfig l1_output_cb_config = tt_metal::CircularBufferConfig(byte_size, {{16, test_config.l1_output_data_format}}) - .set_page_size(16, test_config.tile_byte_size); + tt_metal::CircularBufferConfig l1_output_cb_config = tt_metal::CircularBufferConfig(byte_size, {{tt::CBIndex::c_16, test_config.l1_output_data_format}}) + .set_page_size(tt::CBIndex::c_16, test_config.tile_byte_size); auto l1_output_cb = tt_metal::CreateCircularBuffer(program, core_range, l1_output_cb_config); auto reader_kernel = tt_metal::CreateKernel( @@ -233,7 +228,7 @@ bool run_sfpu_all_same_buffer(CommandQueue & cq, const SfpuConfig& test_config) class SingleCoreSingleCardSfpuParameterizedFixture : public CommandQueueSingleCardFixture, public testing::WithParamInterface> { }; -TEST_P(SingleCoreSingleCardSfpuParameterizedFixture, SfpuCompute) { +TEST_P(SingleCoreSingleCardSfpuParameterizedFixture, TensixSfpuCompute) { for (Device* device_: devices_) { size_t num_tiles = std::get<0>(GetParam()); string sfpu_op = std::get<1>(GetParam()); @@ -279,7 +274,7 @@ class SingleCoreSingleCardSfpuParameterizedApproxFixture : public CommandQueueSingleCardFixture, public 
testing::WithParamInterface> {}; -TEST_P(SingleCoreSingleCardSfpuParameterizedApproxFixture, SfpuCompute) { +TEST_P(SingleCoreSingleCardSfpuParameterizedApproxFixture, TensixSfpuCompute) { for (Device* device_: devices_) { size_t num_tiles = std::get<0>(GetParam()); string sfpu_op = std::get<1>(GetParam()); @@ -326,7 +321,7 @@ class MultiCoreSingleCardSfpuParameterizedApproxFixture : public CommandQueueSingleCardFixture, public testing::WithParamInterface> {}; -TEST_P(MultiCoreSingleCardSfpuParameterizedApproxFixture, AllCoreMultiTileSfpuApproxCompute) { +TEST_P(MultiCoreSingleCardSfpuParameterizedApproxFixture, TensixAllCoreMultiTileSfpuApproxCompute) { for (Device* device_: devices_) { size_t num_tiles = std::get<0>(GetParam()); diff --git a/tests/tt_metal/tt_metal/llk/CMakeLists.txt b/tests/tt_metal/tt_metal/llk/CMakeLists.txt new file mode 100644 index 00000000000..e2b41060099 --- /dev/null +++ b/tests/tt_metal/tt_metal/llk/CMakeLists.txt @@ -0,0 +1,36 @@ +set(UNIT_TESTS_LLK_SRC + ${CMAKE_CURRENT_SOURCE_DIR}/test_broadcast.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_copy_block_matmul_partials.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_cumsum.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_dropout_sfpu_compute.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_golden_impls.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_reconfig.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_reduce.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_sfpu_compute.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_single_core_binary_compute.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_single_core_matmul_compute.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_transpose.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_untilize_tilize.cpp +) + +add_executable(unit_tests_llk ${UNIT_TESTS_LLK_SRC}) +TT_ENABLE_UNITY_BUILD(unit_tests_llk) + +target_link_libraries(unit_tests_llk PUBLIC test_metal_common_libs) +target_include_directories( + unit_tests_llk + PRIVATE + ${PROJECT_SOURCE_DIR} + ${PROJECT_SOURCE_DIR}/tt_metal + ${PROJECT_SOURCE_DIR}/tt_metal/common + 
${PROJECT_SOURCE_DIR}/tests + ${PROJECT_SOURCE_DIR}/tests/tt_metal/tt_metal/common + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/common +) +set_target_properties( + unit_tests_llk + PROPERTIES + RUNTIME_OUTPUT_DIRECTORY + ${PROJECT_BINARY_DIR}/test/tt_metal +) diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_broadcast.cpp b/tests/tt_metal/tt_metal/llk/test_broadcast.cpp similarity index 98% rename from tests/tt_metal/tt_metal/unit_tests/compute/test_broadcast.cpp rename to tests/tt_metal/tt_metal/llk/test_broadcast.cpp index 43963dc422e..5642aa3dfed 100644 --- a/tests/tt_metal/tt_metal/unit_tests/compute/test_broadcast.cpp +++ b/tests/tt_metal/tt_metal/llk/test_broadcast.cpp @@ -289,11 +289,11 @@ void run_single_core_broadcast(tt_metal::Device* device, const BroadcastConfig& } } -class BroadcastParametrizedDeviceFixture : public DeviceFixture, +class BroadcastParameterizedDeviceFixture : public DeviceFixture, public testing::WithParamInterface { }; -TEST_P(BroadcastParametrizedDeviceFixture, ComputeSingleTileBroadcast) { +TEST_P(BroadcastParameterizedDeviceFixture, TensixComputeSingleTileBroadcast) { unit_tests::compute::broadcast::BroadcastConfig test_config = GetParam(); for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) { if (i == 1) continue; @@ -307,7 +307,7 @@ using namespace unit_tests::compute::broadcast; INSTANTIATE_TEST_SUITE_P( ComputeSingleTileBroadcast, - BroadcastParametrizedDeviceFixture, + BroadcastParameterizedDeviceFixture, ::testing::Values( (BroadcastConfig){ApiConvention::DEFAULT, EltwiseOp::ADD, BroadcastDim::ROW}, (BroadcastConfig){ApiConvention::DEFAULT, EltwiseOp::ADD, BroadcastDim::COL}, diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_copy_block_matmul_partials.cpp b/tests/tt_metal/tt_metal/llk/test_copy_block_matmul_partials.cpp similarity index 98% rename from tests/tt_metal/tt_metal/unit_tests/compute/test_copy_block_matmul_partials.cpp rename to 
tests/tt_metal/tt_metal/llk/test_copy_block_matmul_partials.cpp index 54b747da19a..8fae6bd2f93 100644 --- a/tests/tt_metal/tt_metal/unit_tests/compute/test_copy_block_matmul_partials.cpp +++ b/tests/tt_metal/tt_metal/llk/test_copy_block_matmul_partials.cpp @@ -170,7 +170,7 @@ void run_single_core_copy_block_matmul_partials(tt_metal::Device* device, const // - matmul_pack_tile //////////////////////////////////////////////////////////////////////////// -TEST_F(DeviceFixture, DISABLED_ComputeCopyBlockSingle) { +TEST_F(DeviceFixture, DISABLED_TensixComputeCopyBlockSingle) { for (bool fp32_dest_acc_en : {true, false}) { // FP32 dest acc not possible for GS if ((fp32_dest_acc_en == true) && (this->arch_ == tt::ARCH::GRAYSKULL)) continue; @@ -185,7 +185,7 @@ TEST_F(DeviceFixture, DISABLED_ComputeCopyBlockSingle) { } } } -TEST_F(DeviceFixture, ComputeCopyBlockMultiple) { +TEST_F(DeviceFixture, TensixComputeCopyBlockMultiple) { for (bool fp32_dest_acc_en : {true, false}) { // FP32 dest acc not possible for GS if ((fp32_dest_acc_en == true) && (this->arch_ == tt::ARCH::GRAYSKULL)) continue; @@ -204,7 +204,7 @@ TEST_F(DeviceFixture, ComputeCopyBlockMultiple) { } } -TEST_F(DeviceFixture, ComputeCopyBlockComputeBottleneck) { +TEST_F(DeviceFixture, TensixComputeCopyBlockComputeBottleneck) { for (bool fp32_dest_acc_en : {true, false}) { // FP32 dest acc not possible for GS if ((fp32_dest_acc_en == true) && (this->arch_ == tt::ARCH::GRAYSKULL)) continue; diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_cumsum.cpp b/tests/tt_metal/tt_metal/llk/test_cumsum.cpp similarity index 98% rename from tests/tt_metal/tt_metal/unit_tests/compute/test_cumsum.cpp rename to tests/tt_metal/tt_metal/llk/test_cumsum.cpp index 66119879e6c..5bcd766d02c 100644 --- a/tests/tt_metal/tt_metal/unit_tests/compute/test_cumsum.cpp +++ b/tests/tt_metal/tt_metal/llk/test_cumsum.cpp @@ -179,7 +179,7 @@ void run_single_core_cumsum(tt_metal::Device* device, const CumsumConfig& test_c } } 
-TEST_F(DeviceFixture, ComputeCumsumColumnwise) { +TEST_F(DeviceFixture, TensixComputeCumsumColumnwise) { auto arch = this->arch_; if (arch == tt::ARCH::GRAYSKULL) { GTEST_SKIP(); // Not implemented for GRAYSKULL @@ -201,7 +201,7 @@ TEST_F(DeviceFixture, ComputeCumsumColumnwise) { } } -TEST_F(DeviceFixture, ComputeCumsumRowwise) { +TEST_F(DeviceFixture, TensixComputeCumsumRowwise) { auto arch = this->arch_; if (arch == tt::ARCH::GRAYSKULL) { GTEST_SKIP(); // Not implemented for GRAYSKULL diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_dropout_sfpu_compute.cpp b/tests/tt_metal/tt_metal/llk/test_dropout_sfpu_compute.cpp similarity index 98% rename from tests/tt_metal/tt_metal/unit_tests/compute/test_dropout_sfpu_compute.cpp rename to tests/tt_metal/tt_metal/llk/test_dropout_sfpu_compute.cpp index 655aeb87cfe..b8ea88df903 100644 --- a/tests/tt_metal/tt_metal/unit_tests/compute/test_dropout_sfpu_compute.cpp +++ b/tests/tt_metal/tt_metal/llk/test_dropout_sfpu_compute.cpp @@ -90,12 +90,12 @@ bool test_dropout_standalone(tt_metal::Device* device, float probability, uint32 * Use circular buffers to set input and output buffers that the * compute engine will use. 
*/ - constexpr uint32_t src0_cb_index = CB::c_in0; + constexpr uint32_t src0_cb_index = CBIndex::c_0; constexpr uint32_t num_input_tiles = 2; CircularBufferConfig cb_src0_config = CircularBufferConfig(num_input_tiles * single_tile_size, {{src0_cb_index, tt::DataFormat::Float16_b}}).set_page_size(src0_cb_index, single_tile_size); CBHandle cb_src0 = tt_metal::CreateCircularBuffer(program, core, cb_src0_config); - constexpr uint32_t output_cb_index = CB::c_out0; + constexpr uint32_t output_cb_index = CBIndex::c_16; constexpr uint32_t num_output_tiles = 2; CircularBufferConfig cb_output_config = CircularBufferConfig(num_output_tiles * single_tile_size, {{output_cb_index, tt::DataFormat::Float16_b}}).set_page_size(output_cb_index, single_tile_size); CBHandle cb_output = tt_metal::CreateCircularBuffer(program, core, cb_output_config); @@ -239,7 +239,7 @@ void test_dropout(tt_metal::Device* device, const DropoutConfig& test_config) { } -TEST_F(DeviceFixture, ComputeDropout) { +TEST_F(DeviceFixture, TensixComputeDropout) { if (this->arch_ != tt::ARCH::WORMHOLE_B0) { GTEST_SKIP(); } diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_golden_impls.cpp b/tests/tt_metal/tt_metal/llk/test_golden_impls.cpp similarity index 100% rename from tests/tt_metal/tt_metal/unit_tests/compute/test_golden_impls.cpp rename to tests/tt_metal/tt_metal/llk/test_golden_impls.cpp diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_golden_impls.hpp b/tests/tt_metal/tt_metal/llk/test_golden_impls.hpp similarity index 100% rename from tests/tt_metal/tt_metal/unit_tests/compute/test_golden_impls.hpp rename to tests/tt_metal/tt_metal/llk/test_golden_impls.hpp diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_reconfig.cpp b/tests/tt_metal/tt_metal/llk/test_reconfig.cpp similarity index 99% rename from tests/tt_metal/tt_metal/unit_tests/compute/test_reconfig.cpp rename to tests/tt_metal/tt_metal/llk/test_reconfig.cpp index b55c6329938..df7f9810809 100644 --- 
a/tests/tt_metal/tt_metal/unit_tests/compute/test_reconfig.cpp +++ b/tests/tt_metal/tt_metal/llk/test_reconfig.cpp @@ -324,7 +324,7 @@ bool single_core_reconfig(tt_metal::Device* device, const ReconfigConfig& test_c // - pack_reconfig_l1_acc //////////////////////////////////////////////////////////////////////////// -TEST_F(DeviceFixture, TileCopyReconfigExplicitSplitDstAcc) { +TEST_F(DeviceFixture, TensixTileCopyReconfigExplicitSplitDstAcc) { auto arch = this->arch_; if (arch == tt::ARCH::GRAYSKULL) { GTEST_SKIP(); @@ -363,7 +363,7 @@ TEST_F(DeviceFixture, TileCopyReconfigExplicitSplitDstAcc) { } } -TEST_F(DeviceFixture, TileCopyReconfigL1Acc) { +TEST_F(DeviceFixture, TensixTileCopyReconfigL1Acc) { auto arch = this->arch_; if (arch == tt::ARCH::GRAYSKULL) { GTEST_SKIP(); diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_reduce.cpp b/tests/tt_metal/tt_metal/llk/test_reduce.cpp similarity index 97% rename from tests/tt_metal/tt_metal/unit_tests/compute/test_reduce.cpp rename to tests/tt_metal/tt_metal/llk/test_reduce.cpp index 926af4510f7..02366a9d6f8 100644 --- a/tests/tt_metal/tt_metal/unit_tests/compute/test_reduce.cpp +++ b/tests/tt_metal/tt_metal/llk/test_reduce.cpp @@ -279,14 +279,14 @@ void run_single_core_reduce_program(tt_metal::Device* device, const ReduceConfig .set_page_size(src0_cb_index, single_tile_bytes); auto cb_src0 = tt_metal::CreateCircularBuffer(program, core, cb_src0_config); - uint32_t ouput_cb_index = 16; // output operands start at index 16 + uint32_t ouput_cb_index = tt::CBIndex::c_16; uint32_t num_output_buffer_tiles = 32; tt_metal::CircularBufferConfig cb_output_config = tt_metal::CircularBufferConfig(num_output_buffer_tiles * single_tile_bytes, {{ouput_cb_index, tt::DataFormat::Float16_b}}) .set_page_size(ouput_cb_index, single_tile_bytes); auto cb_output = tt_metal::CreateCircularBuffer(program, core, cb_output_config); - tt_metal::CircularBufferConfig cb_temp_reduce_tile_config = tt_metal::CircularBufferConfig(2 * 
single_tile_bytes, {{CB::c_in2, tt::DataFormat::Float16_b}}) - .set_page_size(CB::c_in2, single_tile_bytes); + tt_metal::CircularBufferConfig cb_temp_reduce_tile_config = tt_metal::CircularBufferConfig(2 * single_tile_bytes, {{CBIndex::c_2, tt::DataFormat::Float16_b}}) + .set_page_size(CBIndex::c_2, single_tile_bytes); auto cb_temp_reduce_tile = tt_metal::CreateCircularBuffer(program, core, cb_temp_reduce_tile_config); add_reader_writer_kernels(program, core, test_config, src_dram_buffer, dst_dram_buffer); @@ -384,7 +384,7 @@ void run_single_core_reduce_program(tt_metal::Device* device, const ReduceConfig using namespace unit_tests::compute::reduce; -TEST_F(DeviceFixture, ComputeReduceH) { +TEST_F(DeviceFixture, TensixComputeReduceH) { if (this->arch_ != tt::ARCH::BLACKHOLE) { // (issue #10181: disabling due to sporadic failures in slow dispatch mode) GTEST_SKIP(); @@ -422,7 +422,7 @@ TEST_F(DeviceFixture, ComputeReduceH) { } } -TEST_F(DeviceFixture, ComputeReduceW) { +TEST_F(DeviceFixture, TensixComputeReduceW) { std::vector shape = {1, 3, 17*TILE_HEIGHT, 19*TILE_WIDTH}; std::vector result_shape = {shape[0], shape[1], shape[2], 32}; for (uint8_t math_fid = uint8_t(MathFidelity::LoFi); math_fid <= uint8_t(MathFidelity::HiFi4); math_fid++) { @@ -457,7 +457,7 @@ TEST_F(DeviceFixture, ComputeReduceW) { } } // Disabled due to GH issue #14510 -TEST_F(DeviceFixture, DISABLED_ComputeReduceHW) { +TEST_F(DeviceFixture, DISABLED_TensixComputeReduceHW) { std::vector shape = {1, 2, 7*TILE_HEIGHT, 5*TILE_WIDTH}; std::vector result_shape = {shape[0], shape[1], 32, 32}; for (uint8_t math_fid = uint8_t(MathFidelity::LoFi); math_fid <= uint8_t(MathFidelity::HiFi4); math_fid++) { @@ -493,7 +493,7 @@ TEST_F(DeviceFixture, DISABLED_ComputeReduceHW) { } } -TEST_F(DeviceFixture, ComputeReduceHMathOnly) { +TEST_F(DeviceFixture, TensixComputeReduceHMathOnly) { if (this->arch_ != tt::ARCH::BLACKHOLE) { // (issue #10181: disabling due to sporadic failures in slow dispatch mode) 
GTEST_SKIP(); @@ -532,7 +532,7 @@ TEST_F(DeviceFixture, ComputeReduceHMathOnly) { } } -TEST_F(DeviceFixture, ComputeReduceWMathOnly) { +TEST_F(DeviceFixture, TensixComputeReduceWMathOnly) { std::vector shape = {1, 3, 17*TILE_HEIGHT, 19*TILE_WIDTH}; std::vector result_shape = {shape[0], shape[1], shape[2], 32}; for (uint8_t math_fid = uint8_t(MathFidelity::LoFi); math_fid <= uint8_t(MathFidelity::HiFi4); math_fid++) { @@ -568,7 +568,7 @@ TEST_F(DeviceFixture, ComputeReduceWMathOnly) { } } // Disabled due to GH issue #14510 -TEST_F(DeviceFixture, DISABLED_ComputeReduceHWMathOnly) { +TEST_F(DeviceFixture, DISABLED_TensixComputeReduceHWMathOnly) { std::vector shape = {1, 2, 7*TILE_HEIGHT, 5*TILE_WIDTH}; std::vector result_shape = {shape[0], shape[1], 32, 32}; for (uint8_t math_fid = uint8_t(MathFidelity::LoFi); math_fid <= uint8_t(MathFidelity::HiFi4); math_fid++) { @@ -605,7 +605,7 @@ TEST_F(DeviceFixture, DISABLED_ComputeReduceHWMathOnly) { } } -TEST_F(DeviceFixture, ComputeReduceHShortInit) { +TEST_F(DeviceFixture, TensixComputeReduceHShortInit) { if (this->arch_ != tt::ARCH::BLACKHOLE) { // (issue #10181: disabling due to sporadic failures in slow dispatch mode) GTEST_SKIP(); @@ -644,7 +644,7 @@ TEST_F(DeviceFixture, ComputeReduceHShortInit) { } } -TEST_F(DeviceFixture, ComputeReduceWShortInit) { +TEST_F(DeviceFixture, TensixComputeReduceWShortInit) { std::vector shape = {1, 3, 17*TILE_HEIGHT, 19*TILE_WIDTH}; std::vector result_shape = {shape[0], shape[1], shape[2], 32}; for (uint8_t math_fid = uint8_t(MathFidelity::LoFi); math_fid <= uint8_t(MathFidelity::HiFi4); math_fid++) { @@ -680,7 +680,7 @@ TEST_F(DeviceFixture, ComputeReduceWShortInit) { } } // Disabled due to GH issue #14510 -TEST_F(DeviceFixture, DISABLED_ComputeReduceHWShortInit) { +TEST_F(DeviceFixture, DISABLED_TensixComputeReduceHWShortInit) { std::vector shape = {1, 2, 7*TILE_HEIGHT, 5*TILE_WIDTH}; std::vector result_shape = {shape[0], shape[1], 32, 32}; for (uint8_t math_fid = 
uint8_t(MathFidelity::LoFi); math_fid <= uint8_t(MathFidelity::HiFi4); math_fid++) { diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_sfpu_compute.cpp b/tests/tt_metal/tt_metal/llk/test_sfpu_compute.cpp similarity index 95% rename from tests/tt_metal/tt_metal/unit_tests/compute/test_sfpu_compute.cpp rename to tests/tt_metal/tt_metal/llk/test_sfpu_compute.cpp index 35ffb316d01..b8ab4c42123 100644 --- a/tests/tt_metal/tt_metal/unit_tests/compute/test_sfpu_compute.cpp +++ b/tests/tt_metal/tt_metal/llk/test_sfpu_compute.cpp @@ -166,12 +166,12 @@ bool run_sfpu_all_same_buffer(tt_metal::Device* device, const SfpuConfig& test_c }; for (const CoreRange& core_range : test_config.cores.ranges()) { - tt_metal::CircularBufferConfig l1_input_cb_config = tt_metal::CircularBufferConfig(byte_size, {{0, test_config.l1_input_data_format}}) - .set_page_size(0, test_config.tile_byte_size); + tt_metal::CircularBufferConfig l1_input_cb_config = tt_metal::CircularBufferConfig(byte_size, {{tt::CBIndex::c_0, test_config.l1_input_data_format}}) + .set_page_size(tt::CBIndex::c_0, test_config.tile_byte_size); auto l1_input_cb = tt_metal::CreateCircularBuffer(program, core_range, l1_input_cb_config); - tt_metal::CircularBufferConfig l1_output_cb_config = tt_metal::CircularBufferConfig(byte_size, {{16, test_config.l1_output_data_format}}) - .set_page_size(16, test_config.tile_byte_size); + tt_metal::CircularBufferConfig l1_output_cb_config = tt_metal::CircularBufferConfig(byte_size, {{tt::CBIndex::c_16, test_config.l1_output_data_format}}) + .set_page_size(tt::CBIndex::c_16, test_config.tile_byte_size); auto l1_output_cb = tt_metal::CreateCircularBuffer(program, core_range, l1_output_cb_config); auto reader_kernel = tt_metal::CreateKernel( @@ -228,7 +228,7 @@ bool run_sfpu_all_same_buffer(tt_metal::Device* device, const SfpuConfig& test_c class SingleCoreSingleDeviceSfpuParameterizedFixture : public DeviceFixture, public testing::WithParamInterface> { }; 
-TEST_P(SingleCoreSingleDeviceSfpuParameterizedFixture, SfpuCompute) { +TEST_P(SingleCoreSingleDeviceSfpuParameterizedFixture, TensixSfpuCompute) { size_t num_tiles = std::get<0>(GetParam()); string sfpu_op = std::get<1>(GetParam()); @@ -272,7 +272,7 @@ class SingleCoreSingleDeviceSfpuParameterizedApproxFixture : public DeviceFixture, public testing::WithParamInterface> {}; -TEST_P(SingleCoreSingleDeviceSfpuParameterizedApproxFixture, SfpuCompute) { +TEST_P(SingleCoreSingleDeviceSfpuParameterizedApproxFixture, TensixSfpuCompute) { size_t num_tiles = std::get<0>(GetParam()); string sfpu_op = std::get<1>(GetParam()); @@ -318,7 +318,7 @@ INSTANTIATE_TEST_SUITE_P( std::make_tuple(4, "log"), std::make_tuple(4, "tanh"))); -TEST_F(DeviceFixture, DISABLED_MultiContinguousCoreSingleTileSfpuApproxCompute) { +TEST_F(DeviceFixture, DISABLED_TensixMultiContinguousCoreSingleTileSfpuApproxCompute) { CoreRange core_range({0, 0}, {1, 0}); CoreRangeSet core_range_set({core_range}); unit_tests::compute::sfpu::SfpuConfig test_config = { @@ -356,7 +356,7 @@ TEST_F(DeviceFixture, DISABLED_MultiContinguousCoreSingleTileSfpuApproxCompute) EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config)); } -TEST_F(DeviceFixture, DISABLED_MultiContinguousCoreMultiTileSfpuApproxCompute) { +TEST_F(DeviceFixture, DISABLED_TensixMultiContinguousCoreMultiTileSfpuApproxCompute) { CoreRange core_range({0, 0}, {1, 0}); CoreRangeSet core_range_set({core_range}); unit_tests::compute::sfpu::SfpuConfig test_config = { @@ -394,7 +394,7 @@ TEST_F(DeviceFixture, DISABLED_MultiContinguousCoreMultiTileSfpuApproxCompute) { test_config.sfpu_op = "tanh"; EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config)); } -TEST_F(DeviceFixture, DISABLED_AllCoreSingleTileSfpuApproxCompute) { +TEST_F(DeviceFixture, DISABLED_TensixAllCoreSingleTileSfpuApproxCompute) { unit_tests::compute::sfpu::SfpuConfig test_config = { .tile_byte_size = 2 * 32 * 32, .l1_input_data_format = tt::DataFormat::Float16_b, @@ 
-433,7 +433,7 @@ TEST_F(DeviceFixture, DISABLED_AllCoreSingleTileSfpuApproxCompute) { test_config.sfpu_op = "tanh"; EXPECT_TRUE(run_sfpu_all_same_buffer(devices_.at(0), test_config)); } -TEST_F(DeviceFixture, DISABLED_AllCoreMultiTileSfpuApproxCompute) { +TEST_F(DeviceFixture, DISABLED_TensixAllCoreMultiTileSfpuApproxCompute) { unit_tests::compute::sfpu::SfpuConfig test_config = { .tile_byte_size = 2 * 32 * 32, .l1_input_data_format = tt::DataFormat::Float16_b, diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_single_core_binary_compute.cpp b/tests/tt_metal/tt_metal/llk/test_single_core_binary_compute.cpp similarity index 95% rename from tests/tt_metal/tt_metal/unit_tests/compute/test_single_core_binary_compute.cpp rename to tests/tt_metal/tt_metal/llk/test_single_core_binary_compute.cpp index 3be28d9843e..721daa15c22 100644 --- a/tests/tt_metal/tt_metal/unit_tests/compute/test_single_core_binary_compute.cpp +++ b/tests/tt_metal/tt_metal/llk/test_single_core_binary_compute.cpp @@ -278,7 +278,7 @@ bool single_core_binary(tt_metal::Device* device, const SingleCoreBinaryConfig& } } // namespace unit_tests::compute::binary -TEST_F(DeviceFixture, BinaryComputeSingleCoreSingleTileAdd) { +TEST_F(DeviceFixture, TensixBinaryComputeSingleCoreSingleTileAdd) { for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) { if (i == 1) continue; unit_tests::compute::binary::SingleCoreBinaryConfig test_config = { @@ -296,7 +296,7 @@ TEST_F(DeviceFixture, BinaryComputeSingleCoreSingleTileAdd) { } } -TEST_F(DeviceFixture, BinaryComputeSingleCoreSingleTileSub) { +TEST_F(DeviceFixture, TensixBinaryComputeSingleCoreSingleTileSub) { for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) { if (i == 1) continue; unit_tests::compute::binary::SingleCoreBinaryConfig test_config = { @@ -314,7 +314,7 @@ TEST_F(DeviceFixture, BinaryComputeSingleCoreSingleTileSub) { } } -TEST_F(DeviceFixture, 
BinaryComputeSingleCoreSingleTileMul) { +TEST_F(DeviceFixture, TensixBinaryComputeSingleCoreSingleTileMul) { for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) { if (i == 1) continue; unit_tests::compute::binary::SingleCoreBinaryConfig test_config = { @@ -332,7 +332,7 @@ TEST_F(DeviceFixture, BinaryComputeSingleCoreSingleTileMul) { } } -TEST_F(DeviceFixture, BinaryComputeSingleCoreSingleTileAddFullInit) { +TEST_F(DeviceFixture, TensixBinaryComputeSingleCoreSingleTileAddFullInit) { for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) { if (i == 1) continue; unit_tests::compute::binary::SingleCoreBinaryConfig test_config = { @@ -351,7 +351,7 @@ TEST_F(DeviceFixture, BinaryComputeSingleCoreSingleTileAddFullInit) { } } -TEST_F(DeviceFixture, BinaryComputeSingleCoreSingleTileSubFullInit) { +TEST_F(DeviceFixture, TensixBinaryComputeSingleCoreSingleTileSubFullInit) { for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) { if (i == 1) continue; unit_tests::compute::binary::SingleCoreBinaryConfig test_config = { @@ -370,7 +370,7 @@ TEST_F(DeviceFixture, BinaryComputeSingleCoreSingleTileSubFullInit) { } } -TEST_F(DeviceFixture, BinaryComputeSingleCoreSingleTileMulFullInit) { +TEST_F(DeviceFixture, TensixBinaryComputeSingleCoreSingleTileMulFullInit) { for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) { if (i == 1) continue; unit_tests::compute::binary::SingleCoreBinaryConfig test_config = { @@ -389,7 +389,7 @@ TEST_F(DeviceFixture, BinaryComputeSingleCoreSingleTileMulFullInit) { } } -TEST_F(DeviceFixture, BinaryComputeSingleCoreMultiTileAddWithDestReuse) { +TEST_F(DeviceFixture, TensixBinaryComputeSingleCoreMultiTileAddWithDestReuse) { for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) { if (i == 1) continue; unit_tests::compute::binary::SingleCoreBinaryConfig test_config = { @@ -407,7 +407,7 @@ 
TEST_F(DeviceFixture, BinaryComputeSingleCoreMultiTileAddWithDestReuse) { } } -TEST_F(DeviceFixture, BinaryComputeSingleCoreMultiTileSubWithDestReuse) { +TEST_F(DeviceFixture, TensixBinaryComputeSingleCoreMultiTileSubWithDestReuse) { for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) { if (i == 1) continue; unit_tests::compute::binary::SingleCoreBinaryConfig test_config = { @@ -425,7 +425,7 @@ TEST_F(DeviceFixture, BinaryComputeSingleCoreMultiTileSubWithDestReuse) { } } -TEST_F(DeviceFixture, BinaryComputeSingleCoreMultiTileMulWithDestReuse) { +TEST_F(DeviceFixture, TensixBinaryComputeSingleCoreMultiTileMulWithDestReuse) { for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) { if (i == 1) continue; unit_tests::compute::binary::SingleCoreBinaryConfig test_config = { @@ -443,7 +443,7 @@ TEST_F(DeviceFixture, BinaryComputeSingleCoreMultiTileMulWithDestReuse) { } } -TEST_F(DeviceFixture, BinaryComputeSingleCoreMultiTileAdd) { +TEST_F(DeviceFixture, TensixBinaryComputeSingleCoreMultiTileAdd) { for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) { if (i == 1) continue; unit_tests::compute::binary::SingleCoreBinaryConfig test_config = { @@ -461,7 +461,7 @@ TEST_F(DeviceFixture, BinaryComputeSingleCoreMultiTileAdd) { } } -TEST_F(DeviceFixture, BinaryComputeSingleCoreMultiTileSub) { +TEST_F(DeviceFixture, TensixBinaryComputeSingleCoreMultiTileSub) { for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) { if (i == 1) continue; unit_tests::compute::binary::SingleCoreBinaryConfig test_config = { @@ -479,7 +479,7 @@ TEST_F(DeviceFixture, BinaryComputeSingleCoreMultiTileSub) { } } -TEST_F(DeviceFixture, BinaryComputeSingleCoreMultiTileMul) { +TEST_F(DeviceFixture, TensixBinaryComputeSingleCoreMultiTileMul) { for (uint8_t i = uint8_t(MathFidelity::LoFi); i <= uint8_t(MathFidelity::HiFi4); i++) { if (i == 1) continue; 
unit_tests::compute::binary::SingleCoreBinaryConfig test_config = { @@ -497,7 +497,7 @@ TEST_F(DeviceFixture, BinaryComputeSingleCoreMultiTileMul) { } } -TEST_F(DeviceFixture, BinaryComputeSingleCoreMultiTileAddDestAcc) { +TEST_F(DeviceFixture, TensixBinaryComputeSingleCoreMultiTileAddDestAcc) { auto arch = this->arch_; if (arch == tt::ARCH::GRAYSKULL) { GTEST_SKIP(); @@ -521,7 +521,7 @@ TEST_F(DeviceFixture, BinaryComputeSingleCoreMultiTileAddDestAcc) { } } -TEST_F(DeviceFixture, BinaryComputeSingleCoreMultiTileSubDestAcc) { +TEST_F(DeviceFixture, TensixBinaryComputeSingleCoreMultiTileSubDestAcc) { auto arch = this->arch_; if (arch == tt::ARCH::GRAYSKULL) { GTEST_SKIP(); @@ -545,7 +545,7 @@ TEST_F(DeviceFixture, BinaryComputeSingleCoreMultiTileSubDestAcc) { } } -TEST_F(DeviceFixture, BinaryComputeSingleCoreMultiTileMulDestAcc) { +TEST_F(DeviceFixture, TensixBinaryComputeSingleCoreMultiTileMulDestAcc) { auto arch = this->arch_; if (arch == tt::ARCH::GRAYSKULL) { GTEST_SKIP(); diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_single_core_matmul_compute.cpp b/tests/tt_metal/tt_metal/llk/test_single_core_matmul_compute.cpp similarity index 98% rename from tests/tt_metal/tt_metal/unit_tests/compute/test_single_core_matmul_compute.cpp rename to tests/tt_metal/tt_metal/llk/test_single_core_matmul_compute.cpp index 140874255df..df5583ecf29 100644 --- a/tests/tt_metal/tt_metal/unit_tests/compute/test_single_core_matmul_compute.cpp +++ b/tests/tt_metal/tt_metal/llk/test_single_core_matmul_compute.cpp @@ -604,22 +604,22 @@ bool blocked_matmul(tt_metal::Device* device, uint32_t M, uint32_t K, uint32_t N } } // namespace unit_tests::compute::matmul -TEST_F(DeviceFixture, TestSingleCoreSingleTileComputeMatmul) { +TEST_F(DeviceFixture, TensixTestSingleCoreSingleTileComputeMatmul) { for (unsigned int id = 0; id < num_devices_; id++) { ASSERT_TRUE(unit_tests::compute::matmul::single_tile_matmul(this->devices_.at(id))); } } -TEST_F(DeviceFixture, 
TestSingleCoreSingleBlockSingleTileComputeMatmul) { +TEST_F(DeviceFixture, TensixTestSingleCoreSingleBlockSingleTileComputeMatmul) { for (unsigned int id = 0; id < num_devices_; id++) { ASSERT_TRUE(unit_tests::compute::matmul::single_block_matmul(this->devices_.at(id), 1, 1, 1)); } } -TEST_F(DeviceFixture, TestSingleCoreSingleBlockSingleTileAccumulationComputeMatmul) { +TEST_F(DeviceFixture, TensixTestSingleCoreSingleBlockSingleTileAccumulationComputeMatmul) { for (unsigned int id = 0; id < num_devices_; id++) { ASSERT_TRUE(unit_tests::compute::matmul::single_block_matmul(this->devices_.at(id), 1, 2, 1)); } } -TEST_F(DeviceFixture, TestSingleCoreSingleBlockSingleTileNoAccumulationComputeMatmul) { +TEST_F(DeviceFixture, TensixTestSingleCoreSingleBlockSingleTileNoAccumulationComputeMatmul) { for (unsigned int id = 0; id < num_devices_; id++) { ASSERT_TRUE(unit_tests::compute::matmul::single_block_matmul(this->devices_.at(id), 2, 1, 2)); } diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_transpose.cpp b/tests/tt_metal/tt_metal/llk/test_transpose.cpp similarity index 97% rename from tests/tt_metal/tt_metal/unit_tests/compute/test_transpose.cpp rename to tests/tt_metal/tt_metal/llk/test_transpose.cpp index 26fcf5069d0..76fcf53ed9d 100644 --- a/tests/tt_metal/tt_metal/unit_tests/compute/test_transpose.cpp +++ b/tests/tt_metal/tt_metal/llk/test_transpose.cpp @@ -110,7 +110,7 @@ void run_single_core_transpose(tt_metal::Device* device, const TransposeConfig& .set_page_size(src0_cb_index, test_config.single_tile_size); auto cb_src0 = tt_metal::CreateCircularBuffer(program, core, cb_src0_config); - uint32_t ouput_cb_index = 16; // output operands start at index 16 + uint32_t ouput_cb_index = tt::CBIndex::c_16; uint32_t num_output_buffer_tiles = 32; tt_metal::CircularBufferConfig cb_output_config = tt_metal::CircularBufferConfig(num_output_buffer_tiles * test_config.single_tile_size, {{ouput_cb_index, tt::DataFormat::Float16_b}}) .set_page_size(ouput_cb_index, 
test_config.single_tile_size); @@ -187,7 +187,7 @@ void run_single_core_transpose(tt_metal::Device* device, const TransposeConfig& } // namespace unit_tests::compute::transpose -TEST_F(DeviceFixture, ComputeTransposeWH) { +TEST_F(DeviceFixture, TensixComputeTransposeWH) { unit_tests::compute::transpose::TransposeConfig test_config = { .short_init = false, .single_tile_size = 2 * 1024, @@ -196,7 +196,7 @@ TEST_F(DeviceFixture, ComputeTransposeWH) { unit_tests::compute::transpose::run_single_core_transpose(this->devices_.at(0), test_config); } -TEST_F(DeviceFixture, ComputeTransposeWHShortInit) { +TEST_F(DeviceFixture, TensixComputeTransposeWHShortInit) { unit_tests::compute::transpose::TransposeConfig test_config = { .short_init = true, .single_tile_size = 2 * 1024, diff --git a/tests/tt_metal/tt_metal/unit_tests/compute/test_untilize_tilize.cpp b/tests/tt_metal/tt_metal/llk/test_untilize_tilize.cpp similarity index 97% rename from tests/tt_metal/tt_metal/unit_tests/compute/test_untilize_tilize.cpp rename to tests/tt_metal/tt_metal/llk/test_untilize_tilize.cpp index c3add81f771..561e3e92031 100644 --- a/tests/tt_metal/tt_metal/unit_tests/compute/test_untilize_tilize.cpp +++ b/tests/tt_metal/tt_metal/llk/test_untilize_tilize.cpp @@ -95,7 +95,7 @@ void run_single_core_tilize_program(tt_metal::Device* device, const TestConfig& CoreCoord dram_src0_noc_xy = src0_dram_buffer->noc_coordinates(); CoreCoord dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); - uint32_t src0_cb_index = tt::CB::c_in0; + uint32_t src0_cb_index = tt::CBIndex::c_0; uint32_t num_input_tiles = num_tiles; tt_metal::CircularBufferConfig cb_src0_config = tt_metal::CircularBufferConfig(num_input_tiles * test_config.input_single_tile_size, {{src0_cb_index, tt::DataFormat::Float16_b}}) .set_page_size(src0_cb_index, test_config.input_single_tile_size); @@ -110,14 +110,14 @@ void run_single_core_tilize_program(tt_metal::Device* device, const TestConfig& dram_buffer_src1_addr = 
src1_dram_buffer->address(); dram_src1_noc_xy = src1_dram_buffer->noc_coordinates(); - uint32_t src1_cb_index = tt::CB::c_in1; + uint32_t src1_cb_index = tt::CBIndex::c_1; uint32_t num_input_tiles = num_tiles; tt_metal::CircularBufferConfig cb_src1_config = tt_metal::CircularBufferConfig(num_input_tiles * test_config.input_single_tile_size, {{src1_cb_index, tt::DataFormat::Float16_b}}) .set_page_size(src1_cb_index, test_config.input_single_tile_size); auto cb_src1 = tt_metal::CreateCircularBuffer(program, core, cb_src1_config); } - uint32_t ouput_cb_index = 16; // output operands start at index 16 + uint32_t ouput_cb_index = tt::CBIndex::c_16; uint32_t num_output_tiles = num_tiles; tt_metal::CircularBufferConfig cb_output_config = tt_metal::CircularBufferConfig( num_output_tiles * test_config.output_single_tile_size, @@ -307,7 +307,7 @@ void run_single_core_tilize_program(tt_metal::Device* device, const TestConfig& Following tests are for Unpack Tilize ***************************************/ -TEST_F(DeviceFixture, ComputeUnpackTilize) { +TEST_F(DeviceFixture, TensixComputeUnpackTilize) { vector > num_tiles = {{1, 1}, {1, 2}, {2, 1}, {1, 4}, {2, 2}, {4, 1}}; for(auto num_tile : num_tiles) { for (bool fp32_dest_acc_en : {true, false}) { @@ -330,7 +330,7 @@ TEST_F(DeviceFixture, ComputeUnpackTilize) { } } -TEST_F(DeviceFixture, ComputeUnpackTilizeA_B) { +TEST_F(DeviceFixture, TensixComputeUnpackTilizeA_B) { auto arch = this->arch_; if (arch == tt::ARCH::GRAYSKULL) { GTEST_SKIP(); @@ -349,7 +349,7 @@ TEST_F(DeviceFixture, ComputeUnpackTilizeA_B) { } } -TEST_F(DeviceFixture, ComputeUnpackTilizeShortInit) { +TEST_F(DeviceFixture, TensixComputeUnpackTilizeShortInit) { vector > num_tiles = {{1, 1}, {1, 2}, {2, 1}, {1, 4}, {2, 2}, {4, 1}}; for(auto num_tile : num_tiles) { for (bool fp32_dest_acc_en : {true, false}) { @@ -377,7 +377,7 @@ TEST_F(DeviceFixture, ComputeUnpackTilizeShortInit) { Following tests are for Unpack Untilize ***************************************/ 
-TEST_F(DeviceFixture, ComputeUnpackUntilize) { +TEST_F(DeviceFixture, TensixComputeUnpackUntilize) { vector > num_tiles = {{1, 1}, {1, 2}, {2, 1}, {1, 4}, {2, 2}, {4, 1}}; for(auto num_tile : num_tiles) { for (bool fp32_dest_acc_en : {true, false}) { @@ -400,7 +400,7 @@ TEST_F(DeviceFixture, ComputeUnpackUntilize) { } } -TEST_F(DeviceFixture, ComputeUnpackUntilizeShortInit) { +TEST_F(DeviceFixture, TensixComputeUnpackUntilizeShortInit) { vector > num_tiles = {{1, 1}, {1, 2}, {2, 1}, {1, 4}, {2, 2}, {4, 1}}; for(auto num_tile : num_tiles) { for (bool fp32_dest_acc_en : {true, false}) { @@ -427,7 +427,7 @@ TEST_F(DeviceFixture, ComputeUnpackUntilizeShortInit) { /************************************** Following tests are for pack untilize ***************************************/ -TEST_F(DeviceFixture, ComputePackUntilize) { +TEST_F(DeviceFixture, TensixComputePackUntilize) { vector > num_tiles = {{1, 1}, {1, 2}, {2, 1}, {1, 4}, {2, 2}, {4, 1}}; for(auto num_tile : num_tiles) { for (bool fp32_dest_acc_en : {true, false}) { @@ -450,7 +450,7 @@ TEST_F(DeviceFixture, ComputePackUntilize) { } } -TEST_F(DeviceFixture, ComputePackUntilizeShortInit) { +TEST_F(DeviceFixture, TensixComputePackUntilizeShortInit) { vector > num_tiles = {{1, 1}, {1, 2}, {2, 1}, {1, 4}, {2, 2}, {4, 1}}; for(auto num_tile : num_tiles) { for (bool fp32_dest_acc_en : {true, false}) { @@ -474,7 +474,7 @@ TEST_F(DeviceFixture, ComputePackUntilizeShortInit) { } } -TEST_F(DeviceFixture, ComputePackUntilizeDst) { +TEST_F(DeviceFixture, TensixComputePackUntilizeDst) { vector > num_tiles = {{1, 1}, {1, 2}, {2, 1}, {1, 4}, {2, 2}, {4, 1}}; for(auto num_tile : num_tiles) { for (bool dst_full_sync_en : {true, false}) { @@ -495,7 +495,7 @@ TEST_F(DeviceFixture, ComputePackUntilizeDst) { //Tests pack_untilize with tiny tile dims. 
//Row dim 1x32, which is faces = 2, rows = 1 //Row dim 1x16, which is faces = 1, rows = 1 -TEST_F(DeviceFixture, ComputePackUntilizeDstTinyTile) { +TEST_F(DeviceFixture, TensixComputePackUntilizeDstTinyTile) { vector > test_config_values = {{1, 1, 1, 1}, {1, 1, 2, 1}, {1, 2, 2, 1}}; uint32_t face_c_dim = 16; for(auto test_config_value : test_config_values) { diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/test_dram_read_remote_cb.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/test_dram_read_remote_cb.cpp index f914d3ca87b..39cf7eaa5af 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/test_dram_read_remote_cb.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/test_dram_read_remote_cb.cpp @@ -22,7 +22,7 @@ #include "tt_metal/common/work_split.hpp" #include "tests/tt_metal/test_utils/tilization.hpp" #include "tt_metal/test_utils/deprecated/tensor.hpp" -#include "tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/matmul_utils.hpp" +#include "tt_metal/tt_metal/common/matmul_test_utils.hpp" #include using std::vector; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/11_remote_cb_sync_matmul_single_core/kernels/bmm_large_block_zm_fused_bias_activation_copy.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/11_remote_cb_sync_matmul_single_core/kernels/bmm_large_block_zm_fused_bias_activation_copy.cpp index f3554b99224..58ab1c113c0 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/11_remote_cb_sync_matmul_single_core/kernels/bmm_large_block_zm_fused_bias_activation_copy.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/11_remote_cb_sync_matmul_single_core/kernels/bmm_large_block_zm_fused_bias_activation_copy.cpp @@ -20,10 +20,10 @@ void MAIN { constexpr uint32_t out_block_num_tiles = get_compile_time_arg_val(7); // out_subblock_h * out_subblock_w; constexpr uint32_t num_layers = get_compile_time_arg_val(8); // untilize 
output - constexpr uint32_t in0_cb_id = tt::CB::c_in0; - constexpr uint32_t in1_cb_id = tt::CB::c_in1; - constexpr uint32_t sync_cb_id = tt::CB::c_in2; - constexpr uint32_t out_cb_id = tt::CB::c_out0; + constexpr uint32_t in0_cb_id = tt::CBIndex::c_0; + constexpr uint32_t in1_cb_id = tt::CBIndex::c_1; + constexpr uint32_t sync_cb_id = tt::CBIndex::c_2; + constexpr uint32_t out_cb_id = tt::CBIndex::c_16; for (uint32_t l = 0; l < num_layers; ++l) { diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/11_remote_cb_sync_matmul_single_core/test_remote_cb_sync_matmul.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/11_remote_cb_sync_matmul_single_core/test_remote_cb_sync_matmul.cpp index 7af8eb29d35..932a66637ef 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/11_remote_cb_sync_matmul_single_core/test_remote_cb_sync_matmul.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/11_remote_cb_sync_matmul_single_core/test_remote_cb_sync_matmul.cpp @@ -22,7 +22,7 @@ #include "tt_metal/common/work_split.hpp" #include "tests/tt_metal/test_utils/tilization.hpp" #include "tt_metal/test_utils/deprecated/tensor.hpp" -#include "tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/matmul_utils.hpp" +#include "tt_metal/tt_metal/common/matmul_test_utils.hpp" using std::vector; using namespace tt; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/1_compute_mm/kernels/bmm_large_block_zm_fused_bias_activation.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/1_compute_mm/kernels/bmm_large_block_zm_fused_bias_activation.cpp index eeb4185f796..e8c1837c83a 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/1_compute_mm/kernels/bmm_large_block_zm_fused_bias_activation.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/1_compute_mm/kernels/bmm_large_block_zm_fused_bias_activation.cpp @@ -29,12 +29,12 @@ void MAIN { uint32_t out_subblock_num_tiles = get_compile_time_arg_val(10); // out_subblock_h * out_subblock_w; uint32_t batch = get_compile_time_arg_val(11); // 
batch dim - uint32_t in0_cb_id = tt::CB::c_in0; - uint32_t in1_cb_id = tt::CB::c_in1; - uint32_t out_cb_id = tt::CB::c_out0; - uint32_t mm_partials_cb_id = tt::CB::c_intermed0; - uint32_t mm_bias_intermediate_cb_id = tt::CB::c_intermed1; - uint32_t bias_cb_id = tt::CB::c_in3; + uint32_t in0_cb_id = tt::CBIndex::c_0; + uint32_t in1_cb_id = tt::CBIndex::c_1; + uint32_t out_cb_id = tt::CBIndex::c_16; + uint32_t mm_partials_cb_id = tt::CBIndex::c_24; + uint32_t mm_bias_intermediate_cb_id = tt::CBIndex::c_25; + uint32_t bias_cb_id = tt::CBIndex::c_3; #ifdef FUSE_BIAS init_bcast(mm_bias_intermediate_cb_id, bias_cb_id); diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/1_compute_mm/kernels/bmm_large_block_zm_fused_bias_activation_copy.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/1_compute_mm/kernels/bmm_large_block_zm_fused_bias_activation_copy.cpp index 26380349b70..dce7ad0966a 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/1_compute_mm/kernels/bmm_large_block_zm_fused_bias_activation_copy.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/1_compute_mm/kernels/bmm_large_block_zm_fused_bias_activation_copy.cpp @@ -94,15 +94,15 @@ void MAIN { constexpr uint32_t out_block_w = out_subblock_w*in1_num_subblocks; - constexpr uint32_t in0_cb_id = tt::CB::c_in0; - constexpr uint32_t in1_cb_id = tt::CB::c_in1; - constexpr uint32_t out_cb_id = tt::CB::c_out0; - constexpr uint32_t mm_partials_cb_id = tt::CB::c_intermed0; + constexpr uint32_t in0_cb_id = tt::CBIndex::c_0; + constexpr uint32_t in1_cb_id = tt::CBIndex::c_1; + constexpr uint32_t out_cb_id = tt::CBIndex::c_16; + constexpr uint32_t mm_partials_cb_id = tt::CBIndex::c_24; constexpr uint32_t untilize_mode_out_cb_id = untilize_out ? 
mm_partials_cb_id : out_cb_id; #ifdef FUSE_BIAS - constexpr uint32_t bias_cb_id = tt::CB::c_in3; + constexpr uint32_t bias_cb_id = tt::CBIndex::c_3; constexpr uint32_t mm_out_cb_id = mm_partials_cb_id; #else constexpr uint32_t mm_out_cb_id = untilize_mode_out_cb_id; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/1_compute_mm/test_compute_mm.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/1_compute_mm/test_compute_mm.cpp index d72ac2a08b1..ffd78a3b671 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/1_compute_mm/test_compute_mm.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/1_compute_mm/test_compute_mm.cpp @@ -22,10 +22,10 @@ #include "tt_metal/common/constants.hpp" #include -#include "tests/tt_metal/tt_metal/unit_tests_common/common/common_fixture.hpp" +#include "tests/tt_metal/tt_metal/common/dispatch_fixture.hpp" #include "tt_metal/test_utils/deprecated/tensor.hpp" #include "tests/tt_metal/test_utils/tilization.hpp" -#include "tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/matmul_utils.hpp" +#include "tt_metal/tt_metal/common/matmul_test_utils.hpp" #include "tt_metal/common/work_split.hpp" using std::vector; @@ -938,22 +938,22 @@ tt_metal::Program create_program_single_core ( {(std::size_t)0, (std::size_t)0}, {(std::size_t)core_range.x - 1, (std::size_t)core_range.y - 1}); // Create circular buffers - uint32_t src0_cb_index = 0; + uint32_t src0_cb_index = tt::CBIndex::c_0; tt_metal::CircularBufferConfig cb_src0_config = tt_metal::CircularBufferConfig(in0_CB_tiles * single_tile_size, {{src0_cb_index, cb_data_format}}) .set_page_size(src0_cb_index, single_tile_size) .set_globally_allocated_address(*in0_cb_addr); auto cb_src0 = tt_metal::CreateCircularBuffer(program, all_cores, cb_src0_config); - uint32_t src1_cb_index = 1; + uint32_t src1_cb_index = tt::CBIndex::c_1; tt_metal::CircularBufferConfig cb_src1_config = tt_metal::CircularBufferConfig(in1_CB_tiles * single_tile_size, {{src1_cb_index, cb_data_format}}) 
.set_page_size(src1_cb_index, single_tile_size) .set_globally_allocated_address(*in1_cb_addr); auto cb_src1 = tt_metal::CreateCircularBuffer(program, all_cores, cb_src1_config); - uint32_t out_cb_index = 16; // output operands start at index 16 - uint32_t interm0_cb_index = 24; + uint32_t out_cb_index = tt::CBIndex::c_16; + uint32_t interm0_cb_index = tt::CBIndex::c_24; if (fp32_dest_acc_en) { if (interm_cb_dtype == 1) { @@ -1123,13 +1123,13 @@ tt_metal::Program create_program( {(std::size_t)0, (std::size_t)0}, {(std::size_t)core_range.x - 1, (std::size_t)core_range.y - 1}); // Create circular buffers - uint32_t src0_cb_index = 0; + uint32_t src0_cb_index = tt::CBIndex::c_0; tt_metal::CircularBufferConfig cb_src0_config = tt_metal::CircularBufferConfig(in0_CB_tiles * single_tile_size, {{src0_cb_index, cb_data_format}}) .set_page_size(src0_cb_index, single_tile_size); auto cb_src0 = tt_metal::CreateCircularBuffer(program, all_cores, cb_src0_config); - uint32_t src1_cb_index = 1; + uint32_t src1_cb_index = tt::CBIndex::c_1; tt_metal::CircularBufferConfig cb_src1_config = tt_metal::CircularBufferConfig(in1_CB_tiles * single_tile_size, {{src1_cb_index, cb_data_format}}) .set_page_size(src1_cb_index, single_tile_size); @@ -1140,14 +1140,14 @@ tt_metal::Program create_program( // CB for padding; only need these in the senders // NOTE: For first core, initialize cb to the larger tile size to prevent // accidentally writing 0 to L1 space during cb init in the kernels - uint32_t src2_cb_index = 2; + uint32_t src2_cb_index = tt::CBIndex::c_2; tt_metal::CircularBufferConfig cb_src2_config = tt_metal::CircularBufferConfig(in2_CB_tiles * single_tile_size, {{src2_cb_index, cb_data_format}}) .set_page_size(src2_cb_index, single_tile_size); auto in0_in1_sender_cb_src2 = tt_metal::CreateCircularBuffer(program, all_cores, cb_src2_config); - uint32_t out_cb_index = 16; // output operands start at index 16 - uint32_t interm0_cb_index = 24; + uint32_t out_cb_index = tt::CBIndex::c_16; 
+ uint32_t interm0_cb_index = tt::CBIndex::c_24; std::map partials_and_out_data_format_spec = { {out_cb_index, cb_data_format}, {interm0_cb_index, cb_data_format}}; tt_metal::CircularBufferConfig cb_out_config = diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_workers_and_erisc_datamover_unidirectional.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_workers_and_erisc_datamover_unidirectional.cpp index ca52fc83771..77c833c47b2 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_workers_and_erisc_datamover_unidirectional.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_workers_and_erisc_datamover_unidirectional.cpp @@ -189,7 +189,7 @@ bool RunWriteBWTest( //////////////////////////////////////////////////////////////////////////// // WORKER CB CONFIG //////////////////////////////////////////////////////////////////////////// - uint32_t src0_cb_index = CB::c_in0; + uint32_t src0_cb_index = CBIndex::c_0; // Just want a dummy DF tt::DataFormat df = input_buffer_page_size == 1024 ? 
tt::DataFormat::Bfp8 : diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/kernels/bmm_large_block_zm_fused_bias_activation.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/kernels/bmm_large_block_zm_fused_bias_activation.cpp index 8b57508325c..173b0a8f86e 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/kernels/bmm_large_block_zm_fused_bias_activation.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/kernels/bmm_large_block_zm_fused_bias_activation.cpp @@ -29,12 +29,12 @@ void MAIN { uint32_t out_subblock_num_tiles = get_compile_time_arg_val(10); // out_subblock_h * out_subblock_w; uint32_t batch = get_compile_time_arg_val(11); // batch dim - uint32_t in0_cb_id = tt::CB::c_in0; - uint32_t in1_cb_id = tt::CB::c_in1; - uint32_t out_cb_id = tt::CB::c_out0; - uint32_t mm_partials_cb_id = tt::CB::c_intermed0; - uint32_t mm_bias_intermediate_cb_id = tt::CB::c_intermed1; - uint32_t bias_cb_id = tt::CB::c_in3; + uint32_t in0_cb_id = tt::CBIndex::c_0; + uint32_t in1_cb_id = tt::CBIndex::c_1; + uint32_t out_cb_id = tt::CBIndex::c_16; + uint32_t mm_partials_cb_id = tt::CBIndex::c_24; + uint32_t mm_bias_intermediate_cb_id = tt::CBIndex::c_25; + uint32_t bias_cb_id = tt::CBIndex::c_3; #ifdef FUSE_BIAS init_bcast(mm_bias_intermediate_cb_id, bias_cb_id); diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/kernels/compute_local_l1.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/kernels/compute_local_l1.cpp index 60d3136267d..cf6377c765b 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/kernels/compute_local_l1.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/kernels/compute_local_l1.cpp @@ -18,11 +18,11 @@ void MAIN { for (uint32_t nt = 0; nt < sub_Nt; ++nt) { acquire_dst(); for (uint32_t kt = 0; kt < Kt; ++kt) { - matmul_tiles(tt::CB::c_in0, tt::CB::c_in1, mt * Kt + kt, nt * Kt + kt, 0, false); + matmul_tiles(tt::CBIndex::c_0, tt::CBIndex::c_1, mt * Kt + 
kt, nt * Kt + kt, 0, false); } - cb_reserve_back(tt::CB::c_out0, onetile); - pack_tile(0, tt::CB::c_out0); - cb_push_back(tt::CB::c_out0, onetile); + cb_reserve_back(tt::CBIndex::c_16, onetile); + pack_tile(0, tt::CBIndex::c_16); + cb_push_back(tt::CBIndex::c_16, onetile); release_dst(); } } diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_global_l1.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_global_l1.cpp index fb239fe056b..d75c28a0f58 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_global_l1.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_global_l1.cpp @@ -518,18 +518,18 @@ tt_metal::Program create_program_mcast_in0_in1( .defines = mm_kernel_defines}); // Create circular buffers - uint32_t src0_cb_index = 0; + uint32_t src0_cb_index = tt::CBIndex::c_0; tt_metal::CircularBufferConfig src_cb0_config = tt_metal::CircularBufferConfig(in0_CB_size, {{src0_cb_index, in0_data_format}}) .set_page_size(src0_cb_index, in0_single_tile_size); auto cb_src0 = tt_metal::CreateCircularBuffer(program, all_cores, src_cb0_config); - uint32_t src1_cb_index = 1; + uint32_t src1_cb_index = tt::CBIndex::c_1; tt_metal::CircularBufferConfig src_cb1_config = tt_metal::CircularBufferConfig(in1_CB_size, {{src1_cb_index, in1_data_format}}) .set_page_size(src1_cb_index, in1_single_tile_size); auto cb_src1 = tt_metal::CreateCircularBuffer(program, all_cores, src_cb1_config); - uint32_t output_cb_index = 16; // output operands start at index 16 - uint32_t interm0_cb_index = 24; + uint32_t output_cb_index = tt::CBIndex::c_16; + uint32_t interm0_cb_index = tt::CBIndex::c_24; std::map interim_and_out_data_format_spec = { {output_cb_index, output_data_format}, {interm0_cb_index, output_data_format} diff --git a/tests/tt_metal/tt_metal/stl/CMakeLists.txt b/tests/tt_metal/tt_metal/stl/CMakeLists.txt new file mode 100644 index 00000000000..0f1100b0e6f --- /dev/null +++ 
b/tests/tt_metal/tt_metal/stl/CMakeLists.txt @@ -0,0 +1,26 @@ +set(UNIT_TESTS_STL_SRC + ${CMAKE_CURRENT_SOURCE_DIR}/test_any_range.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_slotmap.cpp +) + +add_executable(unit_tests_stl ${UNIT_TESTS_STL_SRC}) +TT_ENABLE_UNITY_BUILD(unit_tests_stl) + +target_link_libraries(unit_tests_stl PUBLIC test_metal_common_libs) +target_include_directories( + unit_tests_stl + PRIVATE + ${PROJECT_SOURCE_DIR} + ${PROJECT_SOURCE_DIR}/tt_metal + ${PROJECT_SOURCE_DIR}/tt_metal/common + ${PROJECT_SOURCE_DIR}/tests + ${PROJECT_SOURCE_DIR}/tests/tt_metal/tt_metal/common + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/common +) +set_target_properties( + unit_tests_stl + PROPERTIES + RUNTIME_OUTPUT_DIRECTORY + ${PROJECT_BINARY_DIR}/test/tt_metal +) diff --git a/tests/tt_metal/tt_metal/unit_tests/tt_stl/test_any_range.cpp b/tests/tt_metal/tt_metal/stl/test_any_range.cpp similarity index 100% rename from tests/tt_metal/tt_metal/unit_tests/tt_stl/test_any_range.cpp rename to tests/tt_metal/tt_metal/stl/test_any_range.cpp diff --git a/tests/tt_metal/tt_metal/unit_tests/tt_stl/slotmap.cpp b/tests/tt_metal/tt_metal/stl/test_slotmap.cpp similarity index 100% rename from tests/tt_metal/tt_metal/unit_tests/tt_stl/slotmap.cpp rename to tests/tt_metal/tt_metal/stl/test_slotmap.cpp diff --git a/tests/tt_metal/tt_metal/test_bcast.cpp b/tests/tt_metal/tt_metal/test_bcast.cpp index d758cf02877..6cbb5be70e4 100644 --- a/tests/tt_metal/tt_metal/test_bcast.cpp +++ b/tests/tt_metal/tt_metal/test_bcast.cpp @@ -142,7 +142,7 @@ int main(int argc, char **argv) { .set_page_size(src1_cb_index, single_tile_bytes); auto cb_src1 = tt_metal::CreateCircularBuffer(program, core, cb_src1_config); - uint32_t ouput_cb_index = 16; // output operands start at index 16 + uint32_t ouput_cb_index = tt::CBIndex::c_16; uint32_t num_output_buffer_tiles = 2; // this buffer is used in writer_unary.cpp BRISC kernel tt_metal::CircularBufferConfig cb_output_config = 
tt_metal::CircularBufferConfig(num_output_buffer_tiles * single_tile_bytes, {{ouput_cb_index, tt::DataFormat::Float16_b}}) diff --git a/tests/tt_metal/tt_metal/test_bmm.cpp b/tests/tt_metal/tt_metal/test_bmm.cpp index f1bb93d0182..4a0f5eae1c1 100644 --- a/tests/tt_metal/tt_metal/test_bmm.cpp +++ b/tests/tt_metal/tt_metal/test_bmm.cpp @@ -87,7 +87,7 @@ int main(int argc, char **argv) { .set_page_size(src1_cb_index, single_tile_size); auto cb_src1 = tt_metal::CreateCircularBuffer(program, core, cb_src1_config); - uint32_t ouput_cb_index = 16; // output operands start at index 16 + uint32_t ouput_cb_index = tt::CBIndex::c_16; uint32_t num_output_tiles = 2; tt_metal::CircularBufferConfig cb_output_config = tt_metal::CircularBufferConfig(num_output_tiles * single_tile_size, {{ouput_cb_index, tt::DataFormat::Float16_b}}) .set_page_size(ouput_cb_index, single_tile_size); diff --git a/tests/tt_metal/tt_metal/test_compile_program.cpp b/tests/tt_metal/tt_metal/test_compile_program.cpp index a093bb91fcf..97607f896ba 100644 --- a/tests/tt_metal/tt_metal/test_compile_program.cpp +++ b/tests/tt_metal/tt_metal/test_compile_program.cpp @@ -91,8 +91,8 @@ struct ProgramAttributes { NOC reader_noc = NOC::RISCV_1_default; NOC writer_noc = NOC::RISCV_0_default; tt::DataFormat data_format = tt::DataFormat::Float16_b; - uint32_t src_cb_index = 0; - uint32_t output_cb_index = 16; + uint32_t src_cb_index = tt::CBIndex::c_0; + uint32_t output_cb_index = tt::CBIndex::c_16; }; Program create_program(Device *device, const ProgramAttributes &program_attributes) { @@ -109,7 +109,6 @@ Program create_program(Device *device, const ProgramAttributes &program_attribut .set_page_size(program_attributes.src_cb_index, single_tile_size); auto cb_src0 = tt_metal::CreateCircularBuffer(program, core, cb_src0_config); - // output operands start at index 16 uint32_t num_output_tiles = 1; tt_metal::CircularBufferConfig cb_output_config = tt_metal::CircularBufferConfig(num_output_tiles * single_tile_size, 
{{program_attributes.output_cb_index, program_attributes.data_format}}) .set_page_size(program_attributes.output_cb_index, single_tile_size); diff --git a/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp b/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp index f526d16cff2..960bc014316 100644 --- a/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp +++ b/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp @@ -64,7 +64,7 @@ void construct_program(Program& program, Device * device, CoreCoord& core) { // input CB is larger than the output CB, to test the backpressure from the output CB all the way into the // input CB CB_out size = 1 forces the serialization of packer and writer kernel, generating backpressure to // math kernel, input CB and reader - uint32_t src0_cb_index = 0; + uint32_t src0_cb_index = tt::CBIndex::c_0; uint32_t num_input_tiles = 8; tt_metal::CircularBufferConfig cb_src0_config = tt_metal::CircularBufferConfig( @@ -72,7 +72,7 @@ void construct_program(Program& program, Device * device, CoreCoord& core) { .set_page_size(src0_cb_index, single_tile_size); auto cb_src0 = tt_metal::CreateCircularBuffer(program, core, cb_src0_config); - uint32_t ouput_cb_index = 16; // output operands start at index 16 + uint32_t ouput_cb_index = tt::CBIndex::c_16; uint32_t num_output_tiles = 1; tt_metal::CircularBufferConfig cb_output_config = tt_metal::CircularBufferConfig( diff --git a/tests/tt_metal/tt_metal/test_core_range_set.cpp b/tests/tt_metal/tt_metal/test_core_range_set.cpp index ec0051f7d0e..8f2891e16e9 100644 --- a/tests/tt_metal/tt_metal/test_core_range_set.cpp +++ b/tests/tt_metal/tt_metal/test_core_range_set.cpp @@ -110,13 +110,13 @@ bool test_program_specified_with_core_range_set(tt_metal::Device *device, tt_met // input CB is larger than the output CB, to test the backpressure from the output CB all the way into the input CB // CB_out size = 1 forces the serialization of packer and writer kernel, generating 
backpressure to math kernel, input CB and reader - uint32_t src0_cb_index = 0; + uint32_t src0_cb_index = tt::CBIndex::c_0; uint32_t num_input_tiles = 8; tt_metal::CircularBufferConfig cb_src0_config = tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size, {{src0_cb_index, tt::DataFormat::Float16_b}}) .set_page_size(src0_cb_index, single_tile_size); auto cb_src0 = tt_metal::CreateCircularBuffer(program, core_range_set, cb_src0_config); - uint32_t ouput_cb_index = 16; // output operands start at index 16 + uint32_t ouput_cb_index = tt::CBIndex::c_16; uint32_t num_output_tiles = 1; tt_metal::CircularBufferConfig cb_output_config = tt_metal::CircularBufferConfig(num_output_tiles * single_tile_size, {{ouput_cb_index, tt::DataFormat::Float16_b}}) .set_page_size(ouput_cb_index, single_tile_size); diff --git a/tests/tt_metal/tt_metal/test_datacopy.cpp b/tests/tt_metal/tt_metal/test_datacopy.cpp index 911916a5d0e..1abd55e17db 100644 --- a/tests/tt_metal/tt_metal/test_datacopy.cpp +++ b/tests/tt_metal/tt_metal/test_datacopy.cpp @@ -71,13 +71,13 @@ int main(int argc, char **argv) { // input CB is larger than the output CB, to test the backpressure from the output CB all the way into the input CB // CB_out size = 1 forces the serialization of packer and writer kernel, generating backpressure to math kernel, input CB and reader - uint32_t src0_cb_index = 0; + uint32_t src0_cb_index = tt::CBIndex::c_0; uint32_t num_input_tiles = 8; tt_metal::CircularBufferConfig cb_src0_config = tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size, {{src0_cb_index, tt::DataFormat::Float16_b}}) .set_page_size(src0_cb_index, single_tile_size); auto cb_src0 = tt_metal::CreateCircularBuffer(program, core, cb_src0_config); - uint32_t ouput_cb_index = 16; // output operands start at index 16 + uint32_t ouput_cb_index = tt::CBIndex::c_16; uint32_t output_cb_addr = 300 * 1024; uint32_t num_output_tiles = 1; tt_metal::CircularBufferConfig cb_output_config = 
tt_metal::CircularBufferConfig(num_output_tiles * single_tile_size, {{ouput_cb_index, tt::DataFormat::Float16_b}}) diff --git a/tests/tt_metal/tt_metal/test_datacopy_bfp8b.cpp b/tests/tt_metal/tt_metal/test_datacopy_bfp8b.cpp index 59f9d90fe27..45cc1427f51 100644 --- a/tests/tt_metal/tt_metal/test_datacopy_bfp8b.cpp +++ b/tests/tt_metal/tt_metal/test_datacopy_bfp8b.cpp @@ -70,7 +70,7 @@ int main(int argc, char **argv) { .set_page_size(src0_cb_index, single_tile_size); auto cb_src0 = tt_metal::CreateCircularBuffer(program, core, cb_src0_config); - uint32_t ouput_cb_index = 16; // output operands start at index 16 + uint32_t ouput_cb_index = tt::CBIndex::c_16; uint32_t num_output_tiles = 1; tt_metal::CircularBufferConfig cb_output_config = tt_metal::CircularBufferConfig(num_output_tiles * single_tile_size, {{ouput_cb_index, tt::DataFormat::Bfp8_b}}) .set_page_size(ouput_cb_index, single_tile_size); diff --git a/tests/tt_metal/tt_metal/test_datacopy_output_in_l1.cpp b/tests/tt_metal/tt_metal/test_datacopy_output_in_l1.cpp index d66f5441fc4..70ec17a01ef 100644 --- a/tests/tt_metal/tt_metal/test_datacopy_output_in_l1.cpp +++ b/tests/tt_metal/tt_metal/test_datacopy_output_in_l1.cpp @@ -79,13 +79,13 @@ int main(int argc, char **argv) { // input CB is larger than the output CB, to test the backpressure from the output CB all the way into the input CB // CB_out size = 1 forces the serialization of packer and writer kernel, generating backpressure to math kernel, input CB and reader - uint32_t src0_cb_index = 0; + uint32_t src0_cb_index = tt::CBIndex::c_0; uint32_t num_input_tiles = 8; tt_metal::CircularBufferConfig cb_src0_config = tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size, {{src0_cb_index, tt::DataFormat::Float16_b}}) .set_page_size(src0_cb_index, single_tile_size); auto cb_src0 = tt_metal::CreateCircularBuffer(program, core, cb_src0_config); - uint32_t ouput_cb_index = 16; // output operands start at index 16 + uint32_t ouput_cb_index = 
tt::CBIndex::c_16; uint32_t num_output_tiles = 1; tt_metal::CircularBufferConfig cb_output_config = tt_metal::CircularBufferConfig(num_output_tiles * single_tile_size, {{ouput_cb_index, tt::DataFormat::Float16_b}}) .set_page_size(ouput_cb_index, single_tile_size); diff --git a/tests/tt_metal/tt_metal/test_eltwise_binary.cpp b/tests/tt_metal/tt_metal/test_eltwise_binary.cpp index 7abd5863f8f..7ed80cc959c 100644 --- a/tests/tt_metal/tt_metal/test_eltwise_binary.cpp +++ b/tests/tt_metal/tt_metal/test_eltwise_binary.cpp @@ -85,18 +85,18 @@ int main(int argc, char** argv) { auto dram_src1_noc_xy = src1_dram_buffer->noc_coordinates(); auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); - uint32_t src0_cb_index = 0; + uint32_t src0_cb_index = tt::CBIndex::c_0; uint32_t num_input_tiles = 2; tt_metal::CircularBufferConfig cb_src0_config = tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size, {{src0_cb_index, tt::DataFormat::Float16_b}}) .set_page_size(src0_cb_index, single_tile_size); auto cb_src0 = tt_metal::CreateCircularBuffer(program, core, cb_src0_config); - uint32_t src1_cb_index = 1; + uint32_t src1_cb_index = tt::CBIndex::c_1; tt_metal::CircularBufferConfig cb_src1_config = tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size, {{src1_cb_index, tt::DataFormat::Float16_b}}) .set_page_size(src1_cb_index, single_tile_size); auto cb_src1 = tt_metal::CreateCircularBuffer(program, core, cb_src1_config); - uint32_t ouput_cb_index = 16; // output operands start at index 16 + uint32_t ouput_cb_index = tt::CBIndex::c_16; uint32_t num_output_tiles = 2; tt_metal::CircularBufferConfig cb_output_config = tt_metal::CircularBufferConfig(num_output_tiles * single_tile_size, {{ouput_cb_index, tt::DataFormat::Float16_b}}) .set_page_size(ouput_cb_index, single_tile_size); diff --git a/tests/tt_metal/tt_metal/tt_dispatch/test_enqueue_program.cpp b/tests/tt_metal/tt_metal/test_enqueue_program.cpp similarity index 98% rename from 
tests/tt_metal/tt_metal/tt_dispatch/test_enqueue_program.cpp rename to tests/tt_metal/tt_metal/test_enqueue_program.cpp index 5b3f3cd7851..7eedfc6f359 100644 --- a/tests/tt_metal/tt_metal/tt_dispatch/test_enqueue_program.cpp +++ b/tests/tt_metal/tt_metal/test_enqueue_program.cpp @@ -49,7 +49,7 @@ tt_metal::Program generate_eltwise_unary_program(Device *device) { .set_page_size(src0_cb_index, single_tile_size); auto cb_src0 = tt_metal::CreateCircularBuffer(program, core, src_cb_config); - uint32_t ouput_cb_index = 16; // output operands start at index 16 + uint32_t ouput_cb_index = tt::CBIndex::c_16; uint32_t num_output_tiles = 2; tt_metal::CircularBufferConfig cb_output_config = tt_metal::CircularBufferConfig(num_output_tiles * single_tile_size, {{ouput_cb_index, tt::DataFormat::Float16_b}}) .set_page_size(ouput_cb_index, single_tile_size); diff --git a/tests/tt_metal/tt_metal/test_flatten.cpp b/tests/tt_metal/tt_metal/test_flatten.cpp index 6322da0dc4e..87fc6f4c0bf 100644 --- a/tests/tt_metal/tt_metal/test_flatten.cpp +++ b/tests/tt_metal/tt_metal/test_flatten.cpp @@ -117,13 +117,13 @@ int main(int argc, char **argv) { // input CB is larger than the output CB, to test the backpressure from the output CB all the way into the input CB // CB_out size = 1 forces the serialization of packer and writer kernel, generating backpressure to math kernel, input CB and reader - uint32_t src0_cb_index = 0; + uint32_t src0_cb_index = tt::CBIndex::c_0; uint32_t num_input_tiles = 8; tt_metal::CircularBufferConfig cb_src0_config = tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size, {{src0_cb_index, tt::DataFormat::Float16_b}}) .set_page_size(src0_cb_index, single_tile_size); auto cb_src0 = tt_metal::CreateCircularBuffer(program, core, cb_src0_config); - uint32_t ouput_cb_index = 16; // output operands start at index 16 + uint32_t ouput_cb_index = tt::CBIndex::c_16; uint32_t num_output_tiles = 1; tt_metal::CircularBufferConfig cb_output_config = 
tt_metal::CircularBufferConfig(num_output_tiles * single_tile_size, {{ouput_cb_index, tt::DataFormat::Float16_b}}) .set_page_size(ouput_cb_index, single_tile_size); diff --git a/tests/tt_metal/tt_metal/test_generic_binary_reader_matmul_large_block.cpp b/tests/tt_metal/tt_metal/test_generic_binary_reader_matmul_large_block.cpp index ee218eab250..9b01bbac6ff 100644 --- a/tests/tt_metal/tt_metal/test_generic_binary_reader_matmul_large_block.cpp +++ b/tests/tt_metal/tt_metal/test_generic_binary_reader_matmul_large_block.cpp @@ -205,7 +205,7 @@ int main(int argc, char **argv) { .set_page_size(src1_cb_index, single_tile_size); auto cb_src1 = tt_metal::CreateCircularBuffer(program, core, cb_src1_config); - uint32_t ouput_cb_index = 16; // output operands start at index 16 + uint32_t ouput_cb_index = tt::CBIndex::c_16; uint32_t interm0_cb_index = 24; uint32_t num_output_tiles = M * N; CoreRangeSet cores(std::set{CoreRange(core, core)}); diff --git a/tests/tt_metal/tt_metal/test_interleaved_layouts.cpp b/tests/tt_metal/tt_metal/test_interleaved_layouts.cpp index c5677750107..3b757ec3039 100644 --- a/tests/tt_metal/tt_metal/test_interleaved_layouts.cpp +++ b/tests/tt_metal/tt_metal/test_interleaved_layouts.cpp @@ -136,13 +136,13 @@ bool interleaved_stick_reader_single_bank_tilized_writer_datacopy_test(const tt: // input CB is larger than the output CB, to test the backpressure from the output CB all the way into the input CB // CB_out size = 1 forces the serialization of packer and writer kernel, generating backpressure to math kernel, input CB and reader - uint32_t src0_cb_index = 0; + uint32_t src0_cb_index = tt::CBIndex::c_0; uint32_t num_input_tiles = num_tiles_c; tt_metal::CircularBufferConfig cb_src0_config = tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size, {{src0_cb_index, tt::DataFormat::Float16_b}}) .set_page_size(src0_cb_index, single_tile_size); auto cb_src0 = tt_metal::CreateCircularBuffer(program, core, cb_src0_config); - uint32_t 
ouput_cb_index = 16; // output operands start at index 16 + uint32_t ouput_cb_index = tt::CBIndex::c_16; tt_metal::CircularBufferConfig cb_output_config = tt_metal::CircularBufferConfig(single_tile_size, {{ouput_cb_index, tt::DataFormat::Float16_b}}) .set_page_size(ouput_cb_index, single_tile_size); auto cb_output = tt_metal::CreateCircularBuffer(program, core, cb_output_config); @@ -315,7 +315,7 @@ bool interleaved_tilized_reader_interleaved_stick_writer_datacopy_test(const tt: .set_page_size(src0_cb_index, single_tile_size); auto cb_src0 = tt_metal::CreateCircularBuffer(program, core, cb_src0_config); - uint32_t ouput_cb_index = 16; // output operands start at index 16 + uint32_t ouput_cb_index = tt::CBIndex::c_16; tt_metal::CircularBufferConfig cb_output_config = tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size, {{ouput_cb_index, tt::DataFormat::Float16_b}}) .set_page_size(ouput_cb_index, single_tile_size); auto cb_output = tt_metal::CreateCircularBuffer(program, core, cb_output_config); diff --git a/tests/tt_metal/tt_metal/test_kernel_path_env_var.cpp b/tests/tt_metal/tt_metal/test_kernel_path_env_var.cpp deleted file mode 100644 index aceb624577e..00000000000 --- a/tests/tt_metal/tt_metal/test_kernel_path_env_var.cpp +++ /dev/null @@ -1,134 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#include - -#include -#include - -#include "assert.hpp" -#include "core_coord.hpp" -#include "detail/tt_metal.hpp" -#include "host_api.hpp" -#include "impl/kernels/data_types.hpp" -#include "impl/program/program.hpp" -#include "llrt/rtoptions.hpp" -#include "tt_cluster_descriptor_types.h" - -using namespace tt; -using namespace tt::tt_metal; -using namespace tt::llrt; - -class CompileProgramWithKernelPathEnvVarFixture : public ::testing::Test { - protected: - void SetUp() override { - this->validate_preconditions(); - - const chip_id_t device_id = 0; - this->device_ = CreateDevice(device_id); - this->program_ = CreateProgram(); - } - - void TearDown() override { CloseDevice(this->device_); } - - void create_kernel(const string &kernel_file) { - CoreCoord core(0, 0); - tt_metal::CreateKernel( - this->program_, - kernel_file, - core, - tt_metal::DataMovementConfig{.processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default}); - } - - void setup_kernel_dir(const string &orig_kernel_file, const string &new_kernel_file) { - const string &kernel_dir = OptionsG.get_kernel_dir(); - const std::filesystem::path &kernel_file_path_under_kernel_dir(kernel_dir + new_kernel_file); - const std::filesystem::path &dirs_under_kernel_dir = kernel_file_path_under_kernel_dir.parent_path(); - std::filesystem::create_directories(dirs_under_kernel_dir); - - const string &metal_root = OptionsG.get_root_dir(); - const std::filesystem::path &kernel_file_path_under_metal_root(metal_root + orig_kernel_file); - std::filesystem::copy(kernel_file_path_under_metal_root, kernel_file_path_under_kernel_dir); - } - - void cleanup_kernel_dir() { - const string &kernel_dir = OptionsG.get_kernel_dir(); - for (const std::filesystem::directory_entry &entry : std::filesystem::directory_iterator(kernel_dir)) { - std::filesystem::remove_all(entry); - } - } - - Device *device_; - Program program_; - - private: - void validate_preconditions() { - 
this->validate_env_vars_are_set(); - this->validate_kernel_dir_is_valid(); - } - - void validate_env_vars_are_set() { - if (!OptionsG.is_root_dir_specified()) { - GTEST_SKIP() << "Skipping test: TT_METAL_HOME must be set"; - } - if (!OptionsG.is_kernel_dir_specified()) { - GTEST_SKIP() << "Skipping test: TT_METAL_KERNEL_PATH must be set"; - } - } - - void validate_kernel_dir_is_valid() { - const string &kernel_dir = llrt::OptionsG.get_kernel_dir(); - if (!this->does_path_exist(kernel_dir) || !this->is_path_a_directory(kernel_dir) || - !this->is_dir_empty(kernel_dir)) { - GTEST_SKIP() << "Skipping test: TT_METAL_KERNEL_PATH must be an existing, empty directory"; - } - } - - bool does_path_exist(const string &path) { - const std::filesystem::path &file_path(path); - return std::filesystem::exists(file_path); - } - - bool is_path_a_directory(const string &path) { - TT_FATAL(this->does_path_exist(path), "{} does not exist", path); - const std::filesystem::path &file_path(path); - return std::filesystem::is_directory(file_path); - } - - bool is_dir_empty(const string &path) { - TT_FATAL(this->does_path_exist(path), "{} does not exist", path); - TT_FATAL(this->is_path_a_directory(path), "{} is not a directory", path); - const std::filesystem::path &file_path(path); - return std::filesystem::is_empty(file_path); - } -}; - -TEST_F(CompileProgramWithKernelPathEnvVarFixture, KernelUnderMetalRootDir) { - const string &kernel_file = "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_push_4.cpp"; - create_kernel(kernel_file); - detail::CompileProgram(this->device_, this->program_); -} - -TEST_F(CompileProgramWithKernelPathEnvVarFixture, KernelUnderKernelRootDir) { - const string &orig_kernel_file = "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_push_4.cpp"; - const string &new_kernel_file = "tests/tt_metal/tt_metal/test_kernels/dataflow/new_kernel.cpp"; - this->setup_kernel_dir(orig_kernel_file, new_kernel_file); - this->create_kernel(new_kernel_file); - 
detail::CompileProgram(this->device_, this->program_); - this->cleanup_kernel_dir(); -} - -TEST_F(CompileProgramWithKernelPathEnvVarFixture, KernelUnderMetalRootDirAndKernelRootDir) { - const string &kernel_file = "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_push_4.cpp"; - this->setup_kernel_dir(kernel_file, kernel_file); - this->create_kernel(kernel_file); - detail::CompileProgram(this->device_, this->program_); - this->cleanup_kernel_dir(); -} - -TEST_F(CompileProgramWithKernelPathEnvVarFixture, NonExistentKernel) { - const string &kernel_file = "tests/tt_metal/tt_metal/test_kernels/dataflow/non_existent_kernel.cpp"; - this->create_kernel(kernel_file); - EXPECT_THROW(detail::CompileProgram(this->device_, this->program_), std::exception); -} diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/bcast_h.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/bcast_h.cpp index 1220f3e935d..0d5e2230d28 100644 --- a/tests/tt_metal/tt_metal/test_kernels/compute/bcast_h.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/compute/bcast_h.cpp @@ -12,7 +12,7 @@ void MAIN { uint32_t B = get_arg_val(0); uint32_t Ht = get_arg_val(1); uint32_t Wt = get_arg_val(2); - init_bcast(tt::CB::c_in0, tt::CB::c_in1); + init_bcast(tt::CBIndex::c_0, tt::CBIndex::c_1); for (uint32_t b = 0; b < B; b++) { for (uint32_t h = 0; h < Ht; h++) { @@ -20,23 +20,23 @@ void MAIN { // For this bcast-h op the reader will wrap the RHS source tile around at Wt // so here we just linearly read 2 parallel arrays and apply bcast op per tile // (bcast_h propagates the op down the H dimension, so it can be though of as bcast to H) - cb_wait_front(tt::CB::c_in1, onetile); + cb_wait_front(tt::CBIndex::c_1, onetile); - cb_reserve_back(tt::CB::c_out0, onetile); + cb_reserve_back(tt::CBIndex::c_16, onetile); acquire_dst(); - cb_wait_front(tt::CB::c_in0, onetile); + cb_wait_front(tt::CBIndex::c_0, onetile); - BCAST_OP(tt::CB::c_in0, tt::CB::c_in1, 0, 0, 0); - pack_tile(0, tt::CB::c_out0); + 
BCAST_OP(tt::CBIndex::c_0, tt::CBIndex::c_1, 0, 0, 0); + pack_tile(0, tt::CBIndex::c_16); - cb_pop_front(tt::CB::c_in0, onetile); + cb_pop_front(tt::CBIndex::c_0, onetile); release_dst(); - cb_push_back(tt::CB::c_out0, onetile); - cb_pop_front(tt::CB::c_in1, onetile); + cb_push_back(tt::CBIndex::c_16, onetile); + cb_pop_front(tt::CBIndex::c_1, onetile); } } } } } // NAMESPACE diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/bcast_hw.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/bcast_hw.cpp index 499afa82fad..30965b343f1 100644 --- a/tests/tt_metal/tt_metal/test_kernels/compute/bcast_hw.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/compute/bcast_hw.cpp @@ -12,34 +12,34 @@ void MAIN { uint32_t B = get_arg_val(0); uint32_t Ht = get_arg_val(1); uint32_t Wt = get_arg_val(2); - init_bcast(tt::CB::c_in0, tt::CB::c_in1); + init_bcast(tt::CBIndex::c_0, tt::CBIndex::c_1); #ifdef BCAST_SCALAR - cb_wait_front(tt::CB::c_in1, onetile); + cb_wait_front(tt::CBIndex::c_1, onetile); #endif for (uint32_t b = 0; b < B; b++) { for (uint32_t h = 0; h < Ht; h++) { for (uint32_t w = 0; w < Wt; w++) { #ifndef BCAST_SCALAR - cb_wait_front(tt::CB::c_in1, onetile); + cb_wait_front(tt::CBIndex::c_1, onetile); #endif - cb_reserve_back(tt::CB::c_out0, onetile); + cb_reserve_back(tt::CBIndex::c_16, onetile); acquire_dst(); - cb_wait_front(tt::CB::c_in0, onetile); + cb_wait_front(tt::CBIndex::c_0, onetile); - BCAST_OP(tt::CB::c_in0, tt::CB::c_in1, 0, 0, 0); - pack_tile(0, tt::CB::c_out0); + BCAST_OP(tt::CBIndex::c_0, tt::CBIndex::c_1, 0, 0, 0); + pack_tile(0, tt::CBIndex::c_16); - cb_pop_front(tt::CB::c_in0, onetile); + cb_pop_front(tt::CBIndex::c_0, onetile); #ifndef BCAST_SCALAR - cb_pop_front(tt::CB::c_in1, onetile); + cb_pop_front(tt::CBIndex::c_1, onetile); #endif release_dst(); - cb_push_back(tt::CB::c_out0, onetile); + cb_push_back(tt::CBIndex::c_16, onetile); } } } } diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/bcast_w.cpp 
b/tests/tt_metal/tt_metal/test_kernels/compute/bcast_w.cpp index ec6f71c0023..1f79464d180 100644 --- a/tests/tt_metal/tt_metal/test_kernels/compute/bcast_w.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/compute/bcast_w.cpp @@ -14,28 +14,28 @@ void MAIN { uint32_t Ht = get_arg_val(1); uint32_t Wt = get_arg_val(2); - init_bcast(tt::CB::c_in0, tt::CB::c_in1); + init_bcast(tt::CBIndex::c_0, tt::CBIndex::c_1); for (uint32_t b = 0; b < B; b++) { for (uint32_t h = 0; h < Ht; h++) { - cb_wait_front(tt::CB::c_in1, onetile); + cb_wait_front(tt::CBIndex::c_1, onetile); for (uint32_t w = 0; w < Wt; w++) { - cb_reserve_back(tt::CB::c_out0, onetile); + cb_reserve_back(tt::CBIndex::c_16, onetile); acquire_dst(); - cb_wait_front(tt::CB::c_in0, onetile); - BCAST_OP(tt::CB::c_in0, tt::CB::c_in1, 0, 0, 0); - pack_tile(0, tt::CB::c_out0); - cb_pop_front(tt::CB::c_in0, onetile); + cb_wait_front(tt::CBIndex::c_0, onetile); + BCAST_OP(tt::CBIndex::c_0, tt::CBIndex::c_1, 0, 0, 0); + pack_tile(0, tt::CBIndex::c_16); + cb_pop_front(tt::CBIndex::c_0, onetile); release_dst(); - cb_push_back(tt::CB::c_out0, onetile); + cb_push_back(tt::CBIndex::c_16, onetile); } - cb_pop_front(tt::CB::c_in1, onetile); + cb_pop_front(tt::CBIndex::c_1, onetile); }} } } // NAMESPACE diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/bmm.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/bmm.cpp index d62a8e06e98..32bf8674741 100644 --- a/tests/tt_metal/tt_metal/test_kernels/compute/bmm.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/compute/bmm.cpp @@ -33,18 +33,18 @@ void MAIN { { acquire_dst(); for (uint32_t kt = 0; kt < Kt; kt++) { - cb_wait_front(tt::CB::c_in0, onetile); - cb_wait_front(tt::CB::c_in1, onetile); + cb_wait_front(tt::CBIndex::c_0, onetile); + cb_wait_front(tt::CBIndex::c_1, onetile); - matmul_tiles(tt::CB::c_in0, tt::CB::c_in1, 0, 0, 0, false); + matmul_tiles(tt::CBIndex::c_0, tt::CBIndex::c_1, 0, 0, 0, false); - cb_pop_front(tt::CB::c_in0, onetile); - cb_pop_front(tt::CB::c_in1, onetile); 
+ cb_pop_front(tt::CBIndex::c_0, onetile); + cb_pop_front(tt::CBIndex::c_1, onetile); } - cb_reserve_back(tt::CB::c_out0, onetile); - pack_tile(0, tt::CB::c_out0); - cb_push_back(tt::CB::c_out0, onetile); + cb_reserve_back(tt::CBIndex::c_16, onetile); + pack_tile(0, tt::CBIndex::c_16); + cb_push_back(tt::CBIndex::c_16, onetile); release_dst(); } diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/bmm_large_block_zm.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/bmm_large_block_zm.cpp index 2ab808f2f32..58db4b37817 100644 --- a/tests/tt_metal/tt_metal/test_kernels/compute/bmm_large_block_zm.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/compute/bmm_large_block_zm.cpp @@ -34,8 +34,8 @@ void MAIN { { bool last_out = block == (num_blocks-1); - cb_wait_front(tt::CB::c_in0, in0_block_num_tiles); - cb_wait_front(tt::CB::c_in1, in1_block_num_tiles); + cb_wait_front(tt::CBIndex::c_0, in0_block_num_tiles); + cb_wait_front(tt::CBIndex::c_1, in1_block_num_tiles); int in0_index_subblock_offset = 0; for (uint32_t in0_subblock = 0; in0_subblock < in0_num_subblocks; in0_subblock++) { int in1_index_subblock_offset = 0; @@ -45,11 +45,11 @@ void MAIN { if (enable_reload) { copy_tile_to_dst_init_short(); - cb_wait_front(tt::CB::c_intermed0, out_subblock_num_tiles); + cb_wait_front(tt::CBIndex::c_24, out_subblock_num_tiles); for (uint32_t i = 0; i < out_subblock_num_tiles; i++) { - copy_tile(tt::CB::c_intermed0, i, i); + copy_tile(tt::CBIndex::c_24, i, i); } - cb_pop_front(tt::CB::c_intermed0, out_subblock_num_tiles); + cb_pop_front(tt::CBIndex::c_24, out_subblock_num_tiles); mm_init_short(); } @@ -62,7 +62,7 @@ void MAIN { for (uint32_t inner_dim = 0; inner_dim < in0_block_w; inner_dim++) { int in0_index = in0_index_subblock_offset + in0_index_h_offset + inner_dim; int in1_index = in1_index_subblock_offset + in1_index_inner_dim_offset + w; - matmul_tiles(tt::CB::c_in0, tt::CB::c_in1, in0_index, in1_index, dst_index, false /* transpose */); + matmul_tiles(tt::CBIndex::c_0, 
tt::CBIndex::c_1, in0_index, in1_index, dst_index, false /* transpose */); in1_index_inner_dim_offset += in1_per_core_w; } dst_index++; @@ -72,23 +72,23 @@ void MAIN { if (last_out) { // Pack out to output buffer - cb_reserve_back(tt::CB::c_out0, out_subblock_num_tiles); + cb_reserve_back(tt::CBIndex::c_16, out_subblock_num_tiles); for (uint32_t i = 0; i < out_subblock_num_tiles; i++) { - pack_tile(i, tt::CB::c_out0); + pack_tile(i, tt::CBIndex::c_16); } - cb_push_back(tt::CB::c_out0, out_subblock_num_tiles); + cb_push_back(tt::CBIndex::c_16, out_subblock_num_tiles); } else { // Wait for tiles in output buffer to be written out since interm and output share memory if (block == 0) { - cb_reserve_back(tt::CB::c_out0, out_num_tiles_to_wait); + cb_reserve_back(tt::CBIndex::c_16, out_num_tiles_to_wait); out_num_tiles_to_wait += out_subblock_num_tiles; } // Move partial result to interm buffer - cb_reserve_back(tt::CB::c_intermed0, out_subblock_num_tiles); + cb_reserve_back(tt::CBIndex::c_24, out_subblock_num_tiles); for (uint32_t i = 0; i < out_subblock_num_tiles; i++) { - pack_tile(i, tt::CB::c_intermed0); + pack_tile(i, tt::CBIndex::c_24); } - cb_push_back(tt::CB::c_intermed0, out_subblock_num_tiles); + cb_push_back(tt::CBIndex::c_24, out_subblock_num_tiles); } release_dst(); @@ -99,8 +99,8 @@ void MAIN { if (spill) enable_reload = true; - cb_pop_front(tt::CB::c_in0, in0_block_num_tiles); - cb_pop_front(tt::CB::c_in1, in1_block_num_tiles); + cb_pop_front(tt::CBIndex::c_0, in0_block_num_tiles); + cb_pop_front(tt::CBIndex::c_1, in1_block_num_tiles); } } diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp index 8b57508325c..173b0a8f86e 100644 --- a/tests/tt_metal/tt_metal/test_kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp 
@@ -29,12 +29,12 @@ void MAIN { uint32_t out_subblock_num_tiles = get_compile_time_arg_val(10); // out_subblock_h * out_subblock_w; uint32_t batch = get_compile_time_arg_val(11); // batch dim - uint32_t in0_cb_id = tt::CB::c_in0; - uint32_t in1_cb_id = tt::CB::c_in1; - uint32_t out_cb_id = tt::CB::c_out0; - uint32_t mm_partials_cb_id = tt::CB::c_intermed0; - uint32_t mm_bias_intermediate_cb_id = tt::CB::c_intermed1; - uint32_t bias_cb_id = tt::CB::c_in3; + uint32_t in0_cb_id = tt::CBIndex::c_0; + uint32_t in1_cb_id = tt::CBIndex::c_1; + uint32_t out_cb_id = tt::CBIndex::c_16; + uint32_t mm_partials_cb_id = tt::CBIndex::c_24; + uint32_t mm_bias_intermediate_cb_id = tt::CBIndex::c_25; + uint32_t bias_cb_id = tt::CBIndex::c_3; #ifdef FUSE_BIAS init_bcast(mm_bias_intermediate_cb_id, bias_cb_id); diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/bmm_large_block_zm_mixed_precision.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/bmm_large_block_zm_mixed_precision.cpp index 632fc69018f..a97ff85a9fc 100644 --- a/tests/tt_metal/tt_metal/test_kernels/compute/bmm_large_block_zm_mixed_precision.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/compute/bmm_large_block_zm_mixed_precision.cpp @@ -23,10 +23,10 @@ void MAIN { uint32_t out_subblock_num_tiles = get_compile_time_arg_val(10); // out_subblock_h * out_subblock_w; uint32_t batch = get_compile_time_arg_val(11); // batch dim - uint32_t in0_cb_id = tt::CB::c_in0; - uint32_t in1_cb_id = tt::CB::c_in1; - uint32_t out_cb_id = tt::CB::c_out0; - uint32_t mm_partials_cb_id = tt::CB::c_intermed0; + uint32_t in0_cb_id = tt::CBIndex::c_0; + uint32_t in1_cb_id = tt::CBIndex::c_1; + uint32_t out_cb_id = tt::CBIndex::c_16; + uint32_t mm_partials_cb_id = tt::CBIndex::c_24; mm_init(in0_cb_id, in1_cb_id, out_cb_id); diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/bmm_tilize_untilize.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/bmm_tilize_untilize.cpp index 09c351363fd..69d94efe242 100644 --- 
a/tests/tt_metal/tt_metal/test_kernels/compute/bmm_tilize_untilize.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/compute/bmm_tilize_untilize.cpp @@ -129,18 +129,18 @@ void MAIN { bool spill = in0_num_blocks_w > 1; // CB indices - constexpr uint32_t in0_cb_id = tt::CB::c_in0; - constexpr uint32_t in1_cb_id = tt::CB::c_in1; - constexpr uint32_t matmul_partials_cb = tt::CB::c_intermed0; - constexpr uint32_t tilized_in0_cb_id = tt::CB::c_intermed1; - constexpr uint32_t untilize_mode_final_matmul_partials_cb = tt::CB::c_intermed2; - constexpr uint32_t untilize_mode_reblock_cb = tt::CB::c_intermed3; - constexpr uint32_t out_cb_id = tt::CB::c_out0; + constexpr uint32_t in0_cb_id = tt::CBIndex::c_0; + constexpr uint32_t in1_cb_id = tt::CBIndex::c_1; + constexpr uint32_t matmul_partials_cb = tt::CBIndex::c_24; + constexpr uint32_t tilized_in0_cb_id = tt::CBIndex::c_25; + constexpr uint32_t untilize_mode_final_matmul_partials_cb = tt::CBIndex::c_26; + constexpr uint32_t untilize_mode_reblock_cb = tt::CBIndex::c_27; + constexpr uint32_t out_cb_id = tt::CBIndex::c_16; #ifdef FUSE_BIAS uint32_t bias_ntiles_w = get_compile_time_arg_val(16); - constexpr uint32_t bias_cb_id = tt::CB::c_in2; - constexpr uint32_t out_for_bias_cb_id = tt::CB::c_intermed4; + constexpr uint32_t bias_cb_id = tt::CBIndex::c_2; + constexpr uint32_t out_for_bias_cb_id = tt::CBIndex::c_28; init_bcast(out_for_bias_cb_id, bias_cb_id, out_cb_id); #endif diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/broadcast.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/broadcast.cpp index 267be6ebc2e..dd50bfc8cf1 100644 --- a/tests/tt_metal/tt_metal/test_kernels/compute/broadcast.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/compute/broadcast.cpp @@ -11,27 +11,27 @@ void MAIN { constexpr uint32_t onetile = 1; #ifndef BCAST_OP_INIT - init_bcast(tt::CB::c_in0, tt::CB::c_in1); + init_bcast(tt::CBIndex::c_0, tt::CBIndex::c_1); #else - binary_op_init_common(tt::CB::c_in0, tt::CB::c_in1); - 
BCAST_OP_INIT(tt::CB::c_in0, tt::CB::c_in1); + binary_op_init_common(tt::CBIndex::c_0, tt::CBIndex::c_1); + BCAST_OP_INIT(tt::CBIndex::c_0, tt::CBIndex::c_1); #endif - cb_wait_front(tt::CB::c_in1, onetile); - cb_reserve_back(tt::CB::c_out0, onetile); + cb_wait_front(tt::CBIndex::c_1, onetile); + cb_reserve_back(tt::CBIndex::c_16, onetile); acquire_dst(); - cb_wait_front(tt::CB::c_in0, onetile); + cb_wait_front(tt::CBIndex::c_0, onetile); #ifndef BCAST_SPECIFIC - BCAST_OP(tt::CB::c_in0, tt::CB::c_in1, 0, 0, 0); + BCAST_OP(tt::CBIndex::c_0, tt::CBIndex::c_1, 0, 0, 0); #else - BCAST_OP(tt::CB::c_in0, tt::CB::c_in1, 0, 0, 0); + BCAST_OP(tt::CBIndex::c_0, tt::CBIndex::c_1, 0, 0, 0); #endif - pack_tile(0, tt::CB::c_out0); + pack_tile(0, tt::CBIndex::c_16); - cb_pop_front(tt::CB::c_in0, onetile); + cb_pop_front(tt::CBIndex::c_0, onetile); release_dst(); - cb_push_back(tt::CB::c_out0, onetile); - cb_pop_front(tt::CB::c_in1, onetile); + cb_push_back(tt::CBIndex::c_16, onetile); + cb_pop_front(tt::CBIndex::c_1, onetile); } } // NAMESPACE diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/cumsum.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/cumsum.cpp index c72464280c5..c72a4172a70 100644 --- a/tests/tt_metal/tt_metal/test_kernels/compute/cumsum.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/compute/cumsum.cpp @@ -20,24 +20,24 @@ void MAIN { constexpr uint32_t NC = get_compile_time_arg_val(2); #ifndef ROWWISE - init_sfpu(tt::CB::c_in0); + init_sfpu(tt::CBIndex::c_0, tt::CBIndex::c_16); #else - transpose_wh_init(tt::CB::c_in0); + transpose_wh_init(tt::CBIndex::c_0); #endif cumsum_tile_init(); for (uint32_t nc = 0; nc < NC; ++nc) { for(uint32_t wt = 0; wt < Wt; ++wt) { for(uint32_t ht = 0; ht < Ht; ++ht) { - cb_reserve_back(tt::CB::c_out0, onetile); + cb_reserve_back(tt::CBIndex::c_16, onetile); acquire_dst(); - cb_wait_front(tt::CB::c_in0, onetile); + cb_wait_front(tt::CBIndex::c_0, onetile); #ifndef ROWWISE - copy_tile(tt::CB::c_in0, 0, 0); + 
copy_tile(tt::CBIndex::c_0, 0, 0); #else - transpose_wh_init_short(tt::CB::c_in0); - transpose_wh_tile(tt::CB::c_in0, 0, 0); + transpose_wh_init_short(tt::CBIndex::c_0); + transpose_wh_tile(tt::CBIndex::c_0, 0, 0); #endif cumsum_tile(0, ht == 0); #ifdef ROWWISE @@ -45,11 +45,11 @@ void MAIN { transpose_wh_dest(0); #endif - pack_tile(0, tt::CB::c_out0); + pack_tile(0, tt::CBIndex::c_16); - cb_pop_front(tt::CB::c_in0, onetile); + cb_pop_front(tt::CBIndex::c_0, onetile); release_dst(); - cb_push_back(tt::CB::c_out0, onetile); + cb_push_back(tt::CBIndex::c_16, onetile); } } } diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/dropout_sfpu.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/dropout_sfpu.cpp index 5f43fc0b346..55e4b67c8aa 100644 --- a/tests/tt_metal/tt_metal/test_kernels/compute/dropout_sfpu.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/compute/dropout_sfpu.cpp @@ -16,27 +16,27 @@ void MAIN { uint32_t int_probability = get_compile_time_arg_val(3); uint32_t int_scale_factor = get_compile_time_arg_val(4); - init_sfpu(tt::CB::c_in0); + init_sfpu(tt::CBIndex::c_0, tt::CBIndex::c_16); dropout_tile_init(seed); for (uint32_t block_index = 0; block_index < per_core_block_cnt; block_index++) { - cb_reserve_back(tt::CB::c_out0, per_core_block_dim); + cb_reserve_back(tt::CBIndex::c_16, per_core_block_dim); for(uint32_t tile_index = 0; tile_index < per_core_block_dim; ++tile_index) { acquire_dst(); // Pop tile after tile, copy to DST and pack - cb_wait_front(tt::CB::c_in0, 1); + cb_wait_front(tt::CBIndex::c_0, 1); - copy_tile(tt::CB::c_in0, 0, 0); + copy_tile(tt::CBIndex::c_0, 0, 0); dropout_tile(0, int_probability, int_scale_factor); - pack_tile(0, tt::CB::c_out0); + pack_tile(0, tt::CBIndex::c_16); - cb_pop_front(tt::CB::c_in0, 1); + cb_pop_front(tt::CBIndex::c_0, 1); release_dst(); } - cb_push_back(tt::CB::c_out0, per_core_block_dim); + cb_push_back(tt::CBIndex::c_16, per_core_block_dim); } } } diff --git 
a/tests/tt_metal/tt_metal/test_kernels/compute/dst_untilize.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/dst_untilize.cpp index e613c901b28..68b205aa2fb 100644 --- a/tests/tt_metal/tt_metal/test_kernels/compute/dst_untilize.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/compute/dst_untilize.cpp @@ -17,26 +17,26 @@ void MAIN { constexpr uint32_t num_faces = get_compile_time_arg_val(2); constexpr uint32_t num_rows_per_face = get_compile_time_arg_val(3); - unary_op_init_common(tt::CB::c_in0, tt::CB::c_out0); - copy_tile_to_dst_init_short(tt::CB::c_in0); - pack_untilize_dst_init_short(tt::CB::c_out0, num_rows_per_face, num_faces); + unary_op_init_common(tt::CBIndex::c_0, tt::CBIndex::c_16); + copy_tile_to_dst_init_short(tt::CBIndex::c_0); + pack_untilize_dst_init_short(tt::CBIndex::c_16, num_rows_per_face, num_faces); for(uint32_t b = 0; b < per_core_block_cnt; ++ b) { - cb_wait_front(tt::CB::c_in0, per_core_block_tile_cnt); - cb_reserve_back(tt::CB::c_out0, per_core_block_tile_cnt); + cb_wait_front(tt::CBIndex::c_0, per_core_block_tile_cnt); + cb_reserve_back(tt::CBIndex::c_16, per_core_block_tile_cnt); tile_regs_acquire(); for (uint32_t i = 0; i < per_core_block_tile_cnt; ++i) { - copy_tile(tt::CB::c_in0, i, i); + copy_tile(tt::CBIndex::c_0, i, i); } tile_regs_commit(); tile_regs_wait(); - pack_untilize_dst(tt::CB::c_out0, 1, 0, num_rows_per_face, num_faces); + pack_untilize_dst(tt::CBIndex::c_16, 1, 0, num_rows_per_face, num_faces); tile_regs_release(); - cb_push_back(tt::CB::c_out0, per_core_block_tile_cnt); - cb_pop_front(tt::CB::c_in0, per_core_block_tile_cnt); + cb_push_back(tt::CBIndex::c_16, per_core_block_tile_cnt); + cb_pop_front(tt::CBIndex::c_0, per_core_block_tile_cnt); } pack_untilize_uninit(); diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy.cpp index 41e494d29b8..e8178160017 100644 --- a/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy.cpp +++ 
b/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy.cpp @@ -12,20 +12,20 @@ namespace NAMESPACE { void MAIN { uint32_t per_core_tile_cnt = get_compile_time_arg_val(0); - unary_op_init_common(tt::CB::c_in0); + unary_op_init_common(tt::CBIndex::c_0); for(uint32_t b=0;b(remap); cfg_reg_rmw_tensix(swizzle); #endif acquire_dst(); - cb_wait_front(tt::CB::c_in0, per_core_tile_cnt); - cb_reserve_back(tt::CB::c_out0, per_core_tile_cnt); + cb_wait_front(tt::CBIndex::c_0, per_core_tile_cnt); + cb_reserve_back(tt::CBIndex::c_16, per_core_tile_cnt); for (uint32_t b = 0; b < per_core_tile_cnt; ++b) { - copy_tile(tt::CB::c_in0, b, b); + copy_tile(tt::CBIndex::c_0, b, b); dprint_tensix_dest_reg(b); } for (uint32_t b = 0; b < per_core_tile_cnt; ++b) { - pack_tile(b, tt::CB::c_out0); - cb_pop_front(tt::CB::c_in0, 1); - cb_push_back(tt::CB::c_out0, 1); + pack_tile(b, tt::CBIndex::c_16); + cb_pop_front(tt::CBIndex::c_0, 1); + cb_push_back(tt::CBIndex::c_16, 1); } release_dst(); diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_sfpi.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_sfpi.cpp index 0e98deb9fc4..c23b80c7341 100644 --- a/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_sfpi.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/compute/eltwise_sfpi.cpp @@ -13,29 +13,29 @@ void MAIN { INIT_RELU for (uint32_t block_index = 0; block_index < per_core_block_cnt; block_index++) { - cb_reserve_back(CB::c_out0, per_core_block_dim); + cb_reserve_back(CBIndex::c_16, per_core_block_dim); for(uint32_t tile_index = 0; tile_index < per_core_block_dim; ++tile_index) { acquire_dst(); // Pop tile after tile, copy to DST and pack - cb_wait_front(CB::c_in0, 1); + cb_wait_front(CBIndex::c_0, 1); - copy_tile(CB::c_in0, 0, 0); + copy_tile(CBIndex::c_0, 0, 0); // SFPU_OP expected to be defined via add_define as one of // exp_tile, gelu_tile, recip_tile. 
etc followed by pack_tile // (except for relu because the llk is fused for relu) - // "sfpu_gelu(0); pack_tile(0, CB::c_out0);" + // "sfpu_gelu(0); pack_tile(0, CBIndex::c_16);" SFPI_OP_AND_PACK // comes from add_define in kernel config - // Also is epxected to include pack_tile(0, CB::c_out0); for non-relu + // Also is epxected to include pack_tile(0, CBIndex::c_16); for non-relu // For relu it expects the hlk_pack_relu variant - cb_pop_front(CB::c_in0, 1); + cb_pop_front(CBIndex::c_0, 1); release_dst(); } - cb_push_back(CB::c_out0, per_core_block_dim); + cb_push_back(CBIndex::c_16, per_core_block_dim); } DEINIT_RELU // expands to hlk_relu_config(nullptr, 0); for relu only diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/layernorm.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/layernorm.cpp index a0df80a2e9e..edc9dab2312 100644 --- a/tests/tt_metal/tt_metal/test_kernels/compute/layernorm.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/compute/layernorm.cpp @@ -31,9 +31,9 @@ void MAIN { #ifdef FUSE_PRE_ADD - binary_op_init_common(tt::CB::c_in0, tt::CB::c_in1); + binary_op_init_common(tt::CBIndex::c_0, tt::CBIndex::c_1); #else - binary_op_init_common(tt::CB::c_in0, tt::CB::c_in0); + binary_op_init_common(tt::CBIndex::c_0, tt::CBIndex::c_0); #endif constexpr uint32_t onetile = 1; @@ -41,31 +41,31 @@ void MAIN { // TODO(AP): check that if DST is indeed zeroed by release_dst (and initially), we can use it as zeroes // Note that the entire W dimension must fit in the intermed0 CB for this kernel to be correct - constexpr auto cb_scaler = tt::CB::c_in2; // single tile generated by the reader - constexpr auto cb_eps = tt::CB::c_in3; // single tile generated by the reader - constexpr auto cb_xmm = tt::CB::c_intermed0; // x minus mean, this is a large buffer, see setup code in layernorm_op.cpp - constexpr auto cb_ex = tt::CB::c_intermed1; // E[x] - constexpr auto cb_ex2 = tt::CB::c_intermed2; // E[(x-E[x])^2] - constexpr auto cb_xmm2 = tt::CB::c_intermed3; // 
xmm^2 - constexpr auto cb_ex2pe = tt::CB::c_intermed4; // E[(x-E[x])^2]+eps - constexpr auto cb_in = tt::CB::c_in0; // input x or a for fused pre-add (x=a+b) - constexpr auto cb_inb = tt::CB::c_in1; // input b for fused pre-add - constexpr auto cb_out = tt::CB::c_out0; // output - constexpr auto cb_gamma = tt::CB::c_in5; - constexpr auto cb_beta = tt::CB::c_in6; - constexpr auto cb_fusion = tt::CB::c_intermed5; // stream gamma/beta + constexpr auto cb_scaler = tt::CBIndex::c_2; // single tile generated by the reader + constexpr auto cb_eps = tt::CBIndex::c_3; // single tile generated by the reader + constexpr auto cb_xmm = tt::CBIndex::c_24; // x minus mean, this is a large buffer, see setup code in layernorm_op.cpp + constexpr auto cb_ex = tt::CBIndex::c_25; // E[x] + constexpr auto cb_ex2 = tt::CBIndex::c_26; // E[(x-E[x])^2] + constexpr auto cb_xmm2 = tt::CBIndex::c_27; // xmm^2 + constexpr auto cb_ex2pe = tt::CBIndex::c_28; // E[(x-E[x])^2]+eps + constexpr auto cb_in = tt::CBIndex::c_0; // input x or a for fused pre-add (x=a+b) + constexpr auto cb_inb = tt::CBIndex::c_1; // input b for fused pre-add + constexpr auto cb_out = tt::CBIndex::c_16; // output + constexpr auto cb_gamma = tt::CBIndex::c_5; + constexpr auto cb_beta = tt::CBIndex::c_6; + constexpr auto cb_fusion = tt::CBIndex::c_29; // stream gamma/beta constexpr auto scaler0 = 0; #ifdef FUSE_PRE_ADD - constexpr auto cb_x = tt::CB::c_intermed6; + constexpr auto cb_x = tt::CBIndex::c_30; #else - constexpr auto cb_x = tt::CB::c_in0; + constexpr auto cb_x = tt::CBIndex::c_0; #endif cb_wait_front(cb_scaler, 1); // comes from the reader cb_wait_front(cb_eps, 1); // comes from the reader - constexpr int cb_im_or_out = (do_gamma|do_beta) ? cb_fusion : tt::CB::c_out0; + constexpr int cb_im_or_out = (do_gamma|do_beta) ? cb_fusion : tt::CBIndex::c_16; for (uint32_t ncht = 0; ncht < NCHt; ncht++) { @@ -218,7 +218,7 @@ void MAIN { if (do_gamma) { ACQ(); - uint32_t cb_outg = do_beta ? 
cb_fusion : tt::CB::c_out0; + uint32_t cb_outg = do_beta ? cb_fusion : tt::CBIndex::c_16; mul_bcast_rows_init_short(); cb_reserve_back(cb_outg, blk); cb_wait_front(cb_gamma, wt+blk); // we don't pop, TODO: only wait on first ht @@ -236,16 +236,16 @@ void MAIN { if (do_beta) { ACQ(); add_bcast_rows_init_short(); - cb_reserve_back(tt::CB::c_out0, blk); + cb_reserve_back(tt::CBIndex::c_16, blk); cb_wait_front(cb_beta, wt+blk); // TODO: optimization - only wait on first ht cb_wait_front(cb_fusion, blk); for (uint32_t wtr = 0; wtr < blk; wtr++) { add_tiles_bcast_rows(cb_fusion, cb_beta, wtr, wt+wtr, wtr); // tile *= 1/(sum(exp(x))) - pack_tile(wtr, tt::CB::c_out0); // pack either to intermediate (cb_fusion or out0) + pack_tile(wtr, tt::CBIndex::c_16); // pack either to intermediate (cb_fusion or out0) } cb_pop_front(cb_fusion, blk); // We don't pop beta since it's 1,1,1,Wt and we reuse it for all NCHt - cb_push_back(tt::CB::c_out0, blk); + cb_push_back(tt::CBIndex::c_16, blk); REL(); } } diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/matmul.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/matmul.cpp index 47b719c700b..66cf049ad4e 100644 --- a/tests/tt_metal/tt_metal/test_kernels/compute/matmul.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/compute/matmul.cpp @@ -23,8 +23,8 @@ void MAIN { acquire_dst(); for(uint32_t b=0;b(tt::CB::c_in0, tt::CB::c_out0); + unary_op_init_common(tt::CBIndex::c_0, tt::CBIndex::c_16); + pack_untilize_init_short(tt::CBIndex::c_0, tt::CBIndex::c_16); #else - pack_untilize_init(tt::CB::c_in0, tt::CB::c_out0); + pack_untilize_init(tt::CBIndex::c_0, tt::CBIndex::c_16); #endif for(uint32_t b = 0; b < per_core_block_cnt; ++ b) { - cb_wait_front(tt::CB::c_in0, per_core_block_tile_cnt); - cb_reserve_back(tt::CB::c_out0, per_core_block_tile_cnt); + cb_wait_front(tt::CBIndex::c_0, per_core_block_tile_cnt); + cb_reserve_back(tt::CBIndex::c_16, per_core_block_tile_cnt); - pack_untilize_block(tt::CB::c_in0, 1, tt::CB::c_out0); + 
pack_untilize_block(tt::CBIndex::c_0, 1, tt::CBIndex::c_16); - cb_push_back(tt::CB::c_out0, per_core_block_tile_cnt); - cb_pop_front(tt::CB::c_in0, per_core_block_tile_cnt); + cb_push_back(tt::CBIndex::c_16, per_core_block_tile_cnt); + cb_pop_front(tt::CBIndex::c_0, per_core_block_tile_cnt); } pack_untilize_uninit(); diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/reconfig.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/reconfig.cpp index c3706439495..d508a17648f 100644 --- a/tests/tt_metal/tt_metal/test_kernels/compute/reconfig.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/compute/reconfig.cpp @@ -15,11 +15,11 @@ void MAIN { uint32_t num_tiles = get_arg_val(0); uint32_t ublock_size_tiles = get_arg_val(1); - constexpr auto cb_in0 = tt::CB::c_in0; // Bfp8_b - constexpr auto cb_in1 = tt::CB::c_in1; // Bfp16_b - constexpr auto cb_in2 = tt::CB::c_in2; // Bfp16_b - constexpr auto cb_out0 = tt::CB::c_out0; // Fp32 - constexpr auto cb_out1 = tt::CB::c_out1; // Bfp8_b + constexpr auto cb_in0 = tt::CBIndex::c_0; // Bfp8_b + constexpr auto cb_in1 = tt::CBIndex::c_1; // Bfp16_b + constexpr auto cb_in2 = tt::CBIndex::c_2; // Bfp16_b + constexpr auto cb_out0 = tt::CBIndex::c_16; // Fp32 + constexpr auto cb_out1 = tt::CBIndex::c_17; // Bfp8_b binary_op_init_common(cb_in0, cb_in1, cb_out0); diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/reduce_h.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/reduce_h.cpp index 1f37badba4d..8eeec54140c 100644 --- a/tests/tt_metal/tt_metal/test_kernels/compute/reduce_h.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/compute/reduce_h.cpp @@ -46,14 +46,14 @@ void MAIN { constexpr uint32_t Wt = get_compile_time_arg_val(1); constexpr uint32_t NC = get_compile_time_arg_val(2); constexpr bool at_start = get_compile_time_arg_val(3); - dummy_init(tt::CB::c_in0, tt::CB::c_in2); + dummy_init(tt::CBIndex::c_0, tt::CBIndex::c_2); #ifndef SHORT_INIT - reduce_init(tt::CB::c_in0, tt::CB::c_in2); + reduce_init(tt::CBIndex::c_0, 
tt::CBIndex::c_2); #else - reduce_init_delta(tt::CB::c_out0, tt::CB::c_in0, tt::CB::c_in2); + reduce_init_delta(tt::CBIndex::c_16, tt::CBIndex::c_0, tt::CBIndex::c_2); #endif - cb_wait_front(tt::CB::c_in2, 1); // scaler tile from the reader + cb_wait_front(tt::CBIndex::c_2, 1); // scaler tile from the reader for (uint32_t nc = 0; nc < NC; nc++) { constexpr int onetile = 1; @@ -64,26 +64,26 @@ void MAIN { // in this case we just sequentially add to accumulator all the H-tiles in a column acquire_dst(); for(uint32_t ht = 0; ht < Ht; ++ht) { - cb_wait_front(tt::CB::c_in0, onetile); + cb_wait_front(tt::CBIndex::c_0, onetile); #if (MATH_ONLY == 1) - UNPACK(( llk_unpack_AB(tt::CB::c_in0, tt::CB::c_in2, 0, 0) )); + UNPACK(( llk_unpack_AB(tt::CBIndex::c_0, tt::CBIndex::c_2, 0, 0) )); // REDUCE_OP is expected to come from add_define reduce_tile_math(reduce_dst_idx); #elif (MATH_ONLY == 0) // REDUCE_OP is expected to come from add_define - reduce_tile(tt::CB::c_in0, tt::CB::c_in2, 0, 0, reduce_dst_idx); + reduce_tile(tt::CBIndex::c_0, tt::CBIndex::c_2, 0, 0, reduce_dst_idx); #endif - cb_pop_front(tt::CB::c_in0, onetile); + cb_pop_front(tt::CBIndex::c_0, onetile); } - cb_reserve_back(tt::CB::c_out0, onetile); - pack_tile(reduce_dst_idx, tt::CB::c_out0); - cb_push_back(tt::CB::c_out0, onetile); + cb_reserve_back(tt::CBIndex::c_16, onetile); + pack_tile(reduce_dst_idx, tt::CBIndex::c_16); + cb_push_back(tt::CBIndex::c_16, onetile); release_dst(); } } #ifdef SHORT_INIT - reduce_revert_delta(tt::CB::c_out0); + reduce_revert_delta(tt::CBIndex::c_16); #endif } } diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/reduce_hw.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/reduce_hw.cpp index 868dbcce3d9..9446c5ace09 100644 --- a/tests/tt_metal/tt_metal/test_kernels/compute/reduce_hw.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/compute/reduce_hw.cpp @@ -46,14 +46,14 @@ void MAIN { constexpr uint32_t Wt = get_compile_time_arg_val(1); constexpr uint32_t NC = 
get_compile_time_arg_val(2); constexpr bool at_start = get_compile_time_arg_val(3); - dummy_init(tt::CB::c_in0, tt::CB::c_in2); + dummy_init(tt::CBIndex::c_0, tt::CBIndex::c_2); #ifndef SHORT_INIT - reduce_init(tt::CB::c_in0, tt::CB::c_in2); + reduce_init(tt::CBIndex::c_0, tt::CBIndex::c_2); #else - reduce_init_delta(tt::CB::c_out0, tt::CB::c_in0, tt::CB::c_in2); + reduce_init_delta(tt::CBIndex::c_16, tt::CBIndex::c_0, tt::CBIndex::c_2); #endif - cb_wait_front(tt::CB::c_in2, 1); // scaler tile from the reader + cb_wait_front(tt::CBIndex::c_2, 1); // scaler tile from the reader for (uint32_t nc = 0; nc < NC; nc++) { constexpr int onetile = 1; @@ -64,25 +64,25 @@ void MAIN { // reducing in W means out[h][0] = sum(w=0..W-1, in[h][w]) // in this case we just sequentially add to accumulator all the W-tiles in a row for(uint32_t wt = 0; wt < Wt; ++wt) { - cb_wait_front(tt::CB::c_in0, onetile); + cb_wait_front(tt::CBIndex::c_0, onetile); #if (MATH_ONLY == 1) - UNPACK(( llk_unpack_AB(tt::CB::c_in0, tt::CB::c_in2, 0, 0) )); + UNPACK(( llk_unpack_AB(tt::CBIndex::c_0, tt::CBIndex::c_2, 0, 0) )); // REDUCE_OP is expected to come from add_define reduce_tile_math(reduce_dst_idx); #elif (MATH_ONLY == 0) // REDUCE_OP is expected to come from add_define - reduce_tile(tt::CB::c_in0, tt::CB::c_in2, 0, 0, reduce_dst_idx); + reduce_tile(tt::CBIndex::c_0, tt::CBIndex::c_2, 0, 0, reduce_dst_idx); #endif - cb_pop_front(tt::CB::c_in0, onetile); + cb_pop_front(tt::CBIndex::c_0, onetile); } } - cb_reserve_back(tt::CB::c_out0, onetile); - pack_tile(reduce_dst_idx, tt::CB::c_out0); - cb_push_back(tt::CB::c_out0, onetile); + cb_reserve_back(tt::CBIndex::c_16, onetile); + pack_tile(reduce_dst_idx, tt::CBIndex::c_16); + cb_push_back(tt::CBIndex::c_16, onetile); release_dst(); } #ifdef SHORT_INIT - reduce_revert_delta(tt::CB::c_out0); + reduce_revert_delta(tt::CBIndex::c_16); #endif } } diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/reduce_w.cpp 
b/tests/tt_metal/tt_metal/test_kernels/compute/reduce_w.cpp index 47bca05e080..7f44d24cd24 100644 --- a/tests/tt_metal/tt_metal/test_kernels/compute/reduce_w.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/compute/reduce_w.cpp @@ -46,14 +46,14 @@ void MAIN { constexpr uint32_t Wt = get_compile_time_arg_val(1); constexpr uint32_t NC = get_compile_time_arg_val(2); constexpr bool at_start = get_compile_time_arg_val(3); - dummy_init(tt::CB::c_in0, tt::CB::c_in2); + dummy_init(tt::CBIndex::c_0, tt::CBIndex::c_2); #ifndef SHORT_INIT - reduce_init(tt::CB::c_in0, tt::CB::c_in2); + reduce_init(tt::CBIndex::c_0, tt::CBIndex::c_2); #else - reduce_init_delta(tt::CB::c_out0, tt::CB::c_in0, tt::CB::c_in2); + reduce_init_delta(tt::CBIndex::c_16, tt::CBIndex::c_0, tt::CBIndex::c_2); #endif - cb_wait_front(tt::CB::c_in2, 1); // scaler tile from the reader + cb_wait_front(tt::CBIndex::c_2, 1); // scaler tile from the reader for (uint32_t nc = 0; nc < NC; nc++) { constexpr int onetile = 1; @@ -64,26 +64,26 @@ void MAIN { // in this case we just sequentially add to accumulator all the W-tiles in a row acquire_dst(); for(uint32_t wt = 0; wt < Wt; ++wt) { - cb_wait_front(tt::CB::c_in0, onetile); + cb_wait_front(tt::CBIndex::c_0, onetile); #if (MATH_ONLY == 1) - UNPACK(( llk_unpack_AB(tt::CB::c_in0, tt::CB::c_in2, 0, 0) )); + UNPACK(( llk_unpack_AB(tt::CBIndex::c_0, tt::CBIndex::c_2, 0, 0) )); // REDUCE_OP is expected to come from add_define reduce_tile_math(reduce_dst_idx); #elif (MATH_ONLY == 0) // REDUCE_OP is expected to come from add_define - reduce_tile(tt::CB::c_in0, tt::CB::c_in2, 0, 0, reduce_dst_idx); + reduce_tile(tt::CBIndex::c_0, tt::CBIndex::c_2, 0, 0, reduce_dst_idx); #endif - cb_pop_front(tt::CB::c_in0, onetile); + cb_pop_front(tt::CBIndex::c_0, onetile); } - cb_reserve_back(tt::CB::c_out0, onetile); - pack_tile(reduce_dst_idx, tt::CB::c_out0); - cb_push_back(tt::CB::c_out0, onetile); + cb_reserve_back(tt::CBIndex::c_16, onetile); + pack_tile(reduce_dst_idx, 
tt::CBIndex::c_16); + cb_push_back(tt::CBIndex::c_16, onetile); release_dst(); } } #ifdef SHORT_INIT - reduce_revert_delta(tt::CB::c_out0); + reduce_revert_delta(tt::CBIndex::c_16); #endif } } diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/rmsnorm.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/rmsnorm.cpp index 01821689f86..1f915fcd4bb 100644 --- a/tests/tt_metal/tt_metal/test_kernels/compute/rmsnorm.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/compute/rmsnorm.cpp @@ -30,9 +30,9 @@ void MAIN { #ifdef FUSE_PRE_ADD - binary_op_init_common(tt::CB::c_in0, tt::CB::c_in1); + binary_op_init_common(tt::CBIndex::c_0, tt::CBIndex::c_1); #else - binary_op_init_common(tt::CB::c_in0, tt::CB::c_in0); + binary_op_init_common(tt::CBIndex::c_0, tt::CBIndex::c_0); #endif constexpr uint32_t onetile = 1; @@ -40,30 +40,30 @@ void MAIN { // TODO(AP): check that if DST is indeed zeroed by release_dst (and initially), we can use it as zeroes // Note that the entire W dimension must fit in the intermed0 CB for this kernel to be correct - constexpr auto cb_scaler = tt::CB::c_in2; // single tile generated by the reader - constexpr auto cb_eps = tt::CB::c_in3; // single tile generated by the reader - constexpr auto cb_ex = tt::CB::c_intermed1; // E[x] - constexpr auto cb_ex2 = tt::CB::c_intermed2; // E[(x-E[x])^2] - constexpr auto cb_x2 = tt::CB::c_intermed3; // x^2 - constexpr auto cb_ex2pe = tt::CB::c_intermed4; // E[(x-E[x])^2]+eps - constexpr auto cb_in = tt::CB::c_in0; // input x or a for fused pre-add (x=a+b) - constexpr auto cb_inb = tt::CB::c_in1; // input b for fused pre-add - constexpr auto cb_out = tt::CB::c_out0; // output - constexpr auto cb_gamma = tt::CB::c_in5; - constexpr auto cb_beta = tt::CB::c_in6; - constexpr auto cb_fusion = tt::CB::c_intermed5; // stream gamma/beta + constexpr auto cb_scaler = tt::CBIndex::c_2; // single tile generated by the reader + constexpr auto cb_eps = tt::CBIndex::c_3; // single tile generated by the reader + constexpr auto cb_ex 
= tt::CBIndex::c_25; // E[x] + constexpr auto cb_ex2 = tt::CBIndex::c_26; // E[(x-E[x])^2] + constexpr auto cb_x2 = tt::CBIndex::c_27; // x^2 + constexpr auto cb_ex2pe = tt::CBIndex::c_28; // E[(x-E[x])^2]+eps + constexpr auto cb_in = tt::CBIndex::c_0; // input x or a for fused pre-add (x=a+b) + constexpr auto cb_inb = tt::CBIndex::c_1; // input b for fused pre-add + constexpr auto cb_out = tt::CBIndex::c_16; // output + constexpr auto cb_gamma = tt::CBIndex::c_5; + constexpr auto cb_beta = tt::CBIndex::c_6; + constexpr auto cb_fusion = tt::CBIndex::c_29; // stream gamma/beta constexpr auto scaler0 = 0; #ifdef FUSE_PRE_ADD - constexpr auto cb_x = tt::CB::c_intermed6; + constexpr auto cb_x = tt::CBIndex::c_30; #else - constexpr auto cb_x = tt::CB::c_in0; + constexpr auto cb_x = tt::CBIndex::c_0; #endif cb_wait_front(cb_scaler, 1); // comes from the reader cb_wait_front(cb_eps, 1); // comes from the reader - constexpr int cb_im_or_out = (do_gamma|do_beta) ? cb_fusion : tt::CB::c_out0; + constexpr int cb_im_or_out = (do_gamma|do_beta) ? cb_fusion : tt::CBIndex::c_16; for (uint32_t ncht = 0; ncht < NCHt; ncht++) { @@ -175,7 +175,7 @@ void MAIN { if (do_gamma) { ACQ(); - uint32_t cb_outg = do_beta ? cb_fusion : tt::CB::c_out0; + uint32_t cb_outg = do_beta ? 
cb_fusion : tt::CBIndex::c_16; mul_bcast_rows_init_short(); cb_reserve_back(cb_outg, blk); cb_wait_front(cb_gamma, wt+blk); // we don't pop, TODO: only wait on first ht @@ -193,16 +193,16 @@ void MAIN { if (do_beta) { ACQ(); add_bcast_rows_init_short(); - cb_reserve_back(tt::CB::c_out0, blk); + cb_reserve_back(tt::CBIndex::c_16, blk); cb_wait_front(cb_beta, wt+blk); // TODO: optimization - only wait on first ht cb_wait_front(cb_fusion, blk); for (uint32_t wtr = 0; wtr < blk; wtr++) { add_tiles_bcast_rows(cb_fusion, cb_beta, wtr, wt+wtr, wtr); // tile *= 1/(sum(exp(x))) - pack_tile(wtr, tt::CB::c_out0); // pack either to intermediate (cb_fusion or out0) + pack_tile(wtr, tt::CBIndex::c_16); // pack either to intermediate (cb_fusion or out0) } cb_pop_front(cb_fusion, blk); // We don't pop beta since it's 1,1,1,Wt and we reuse it for all NCHt - cb_push_back(tt::CB::c_out0, blk); + cb_push_back(tt::CBIndex::c_16, blk); REL(); } } diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/softmax.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/softmax.cpp index 7b594e63ea3..295aec15d4e 100644 --- a/tests/tt_metal/tt_metal/test_kernels/compute/softmax.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/compute/softmax.cpp @@ -31,21 +31,21 @@ void MAIN { const uint32_t Wt = get_arg_val(2); const uint32_t ndst = get_arg_val(3); const uint32_t start_ht = get_arg_val(4); - binary_op_init_common(tt::CB::c_in0, tt::CB::c_in2); + binary_op_init_common(tt::CBIndex::c_0, tt::CBIndex::c_2); constexpr uint32_t onetile = 1; // reserve one tile for zeros on cb_in2 // We only do the reserve for the intermediates once and use pack_tile // So effectively these are used as pre-allocated arrays // Note that the entire W dimension must fit in the intermed0 CB for this kernel to be correct - constexpr auto cb_bcast_scaler = tt::CB::c_in2; - constexpr auto cb_fused_scale = tt::CB::c_in3; - constexpr auto cb_fused_attn = tt::CB::c_in4; - constexpr auto cb_exps = tt::CB::c_intermed0; - constexpr 
auto cb_scale_mask = tt::CB::c_intermed3; - constexpr auto cb_recipsumexps = tt::CB::c_intermed1; - constexpr auto cb_in0 = tt::CB::c_in0; - constexpr auto cb_out0 = tt::CB::c_out0; + constexpr auto cb_bcast_scaler = tt::CBIndex::c_2; + constexpr auto cb_fused_scale = tt::CBIndex::c_3; + constexpr auto cb_fused_attn = tt::CBIndex::c_4; + constexpr auto cb_exps = tt::CBIndex::c_24; + constexpr auto cb_scale_mask = tt::CBIndex::c_27; + constexpr auto cb_recipsumexps = tt::CBIndex::c_25; + constexpr auto cb_in0 = tt::CBIndex::c_0; + constexpr auto cb_out0 = tt::CBIndex::c_16; cb_wait_front(cb_bcast_scaler, 1); // comes from the reader @@ -150,13 +150,13 @@ void MAIN { mul_bcast_cols_init_short(); for (uint32_t wt = 0; wt < Wt; wt += ndst) { ACQ(); - cb_reserve_back(tt::CB::c_out0, ndst); + cb_reserve_back(tt::CBIndex::c_16, ndst); for (uint32_t wt8 = 0; wt8 < ndst; wt8++) { // wt+wt8 since we pop Wt after the entire loop mul_tiles_bcast(cb_exps, cb_recipsumexps, wt+wt8, 0, wt8); // tile *= 1/(sum(exp(x))) - pack_tile(wt8, tt::CB::c_out0); + pack_tile(wt8, tt::CBIndex::c_16); } - cb_push_back(tt::CB::c_out0, ndst); + cb_push_back(tt::CBIndex::c_16, ndst); REL(); } cb_pop_front(cb_recipsumexps, 1); diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/tilize.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/tilize.cpp index aea8beac4b4..dac81861895 100644 --- a/tests/tt_metal/tt_metal/test_kernels/compute/tilize.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/compute/tilize.cpp @@ -13,23 +13,23 @@ void MAIN { uint32_t per_core_block_cnt = get_compile_time_arg_val(0); uint32_t per_core_block_tile_cnt = get_compile_time_arg_val(1); #ifndef SHORT_INIT - tilize_init(tt::CB::c_in0, per_core_block_tile_cnt, tt::CB::c_out0); + tilize_init(tt::CBIndex::c_0, per_core_block_tile_cnt, tt::CBIndex::c_16); #else - unary_op_init_common(tt::CB::c_in0, tt::CB::c_out0); - tilize_init_short(tt::CB::c_in0, per_core_block_tile_cnt, tt::CB::c_out0); + 
unary_op_init_common(tt::CBIndex::c_0, tt::CBIndex::c_16); + tilize_init_short(tt::CBIndex::c_0, per_core_block_tile_cnt, tt::CBIndex::c_16); #endif for(uint32_t b=0;b(2); uint32_t num_tiles = get_arg_val(3); - constexpr uint32_t cb_id_in0 = 0; + constexpr uint32_t cb_id_in0 = tt::CBIndex::c_0; // ublocks size defined in tiles constexpr uint32_t ublock_size_tiles = 4; diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/streams/stream_relay_remote_receiver.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/streams/stream_relay_remote_receiver.cpp index a21474a048a..e52bb827040 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/streams/stream_relay_remote_receiver.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/streams/stream_relay_remote_receiver.cpp @@ -232,7 +232,7 @@ void kernel_main() { advance_phase(remote_noc_info_desc, stream_state, stream_id); - auto cb = tt::CB::c_in0; + auto cb = tt::CBIndex::c_0; stream_state.local_buffer_base_addr = stream_buffer_addr; for (uint32_t i = 0; i < num_messages_to_forward; i++) { diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/streams/stream_relay_remote_receiver_writer.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/streams/stream_relay_remote_receiver_writer.cpp index 470ef6a4264..18eb7b7927d 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/streams/stream_relay_remote_receiver_writer.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/streams/stream_relay_remote_receiver_writer.cpp @@ -21,7 +21,7 @@ void kernel_main() { const InterleavedAddrGen dest_addr_gen = { .bank_base_address = output_buffer_addr, .page_size = write_page_size}; - auto cb = tt::CB::c_in0; + auto cb = tt::CBIndex::c_0; for (uint32_t i = 0; i < num_pages; i++) { cb_wait_front(cb, 1); // NOTE THAT msg_hdr_size is doubled on host side to maintain alignment for DRAM reads/writes in THIS TEST ONLY diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/streams/stream_relay_remote_sender.cpp 
b/tests/tt_metal/tt_metal/test_kernels/dataflow/streams/stream_relay_remote_sender.cpp index 606930d73ff..1052c0f1c3c 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/streams/stream_relay_remote_sender.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/streams/stream_relay_remote_sender.cpp @@ -315,7 +315,7 @@ void kernel_main() { hang_toggle(hang_toggle_semaphore); - auto cb = tt::CB::c_in0; + auto cb = tt::CBIndex::c_0; bool very_first_message = true; uint32_t message_id = 0; diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/streams/stream_relay_remote_sender_reader.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/streams/stream_relay_remote_sender_reader.cpp index 2127013baac..312bbbc31c8 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/streams/stream_relay_remote_sender_reader.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/streams/stream_relay_remote_sender_reader.cpp @@ -29,7 +29,7 @@ void kernel_main() { const uint32_t read_page_size = cb_page_size - msg_hdr_size; const InterleavedAddrGen src_addr_gen = {.bank_base_address = input_buffer_addr, .page_size = read_page_size}; - auto cb = tt::CB::c_in0; + auto cb = tt::CBIndex::c_0; uint32_t sub_index = 0; diff --git a/tests/tt_metal/tt_metal/test_kernels/misc/brisc_print.cpp b/tests/tt_metal/tt_metal/test_kernels/misc/brisc_print.cpp index 0364e951131..19276d69312 100644 --- a/tests/tt_metal/tt_metal/test_kernels/misc/brisc_print.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/misc/brisc_print.cpp @@ -11,7 +11,7 @@ void kernel_main() { // Write some data to the CB that will be used to test TSLICE. 
- constexpr uint32_t cb_id_in0 = tt::CB::c_in0; + constexpr uint32_t cb_id_in0 = tt::CBIndex::c_0; cb_reserve_back(cb_id_in0, 1); auto ptr = reinterpret_cast(get_write_ptr(cb_id_in0)); uint16_t bfloat16_base = 0x3dfb; diff --git a/tests/tt_metal/tt_metal/test_kernels/misc/print_tile.cpp b/tests/tt_metal/tt_metal/test_kernels/misc/print_tile.cpp index dfebc6c9cf4..f88eb7ec631 100644 --- a/tests/tt_metal/tt_metal/test_kernels/misc/print_tile.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/misc/print_tile.cpp @@ -31,7 +31,7 @@ namespace NAMESPACE { void MAIN { #endif // Read out the tile we want to print using BRISC, put it in c_in0 - constexpr uint32_t cb_id = tt::CB::c_in0; + constexpr uint32_t cb_id = tt::CBIndex::c_0; #if defined(COMPILE_FOR_BRISC) uint32_t src_addr = get_arg_val(0); uint32_t src_noc_x = get_arg_val(1); diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/sub_device/kernels/incrementer.cpp b/tests/tt_metal/tt_metal/test_kernels/misc/sub_device/incrementer.cpp similarity index 100% rename from tests/tt_metal/tt_metal/unit_tests_fast_dispatch/sub_device/kernels/incrementer.cpp rename to tests/tt_metal/tt_metal/test_kernels/misc/sub_device/incrementer.cpp diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/sub_device/kernels/persistent_remote_waiter.cpp b/tests/tt_metal/tt_metal/test_kernels/misc/sub_device/persistent_remote_waiter.cpp similarity index 100% rename from tests/tt_metal/tt_metal/unit_tests_fast_dispatch/sub_device/kernels/persistent_remote_waiter.cpp rename to tests/tt_metal/tt_metal/test_kernels/misc/sub_device/persistent_remote_waiter.cpp diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/sub_device/kernels/persistent_waiter.cpp b/tests/tt_metal/tt_metal/test_kernels/misc/sub_device/persistent_waiter.cpp similarity index 100% rename from tests/tt_metal/tt_metal/unit_tests_fast_dispatch/sub_device/kernels/persistent_waiter.cpp rename to 
tests/tt_metal/tt_metal/test_kernels/misc/sub_device/persistent_waiter.cpp diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/sub_device/kernels/syncer.cpp b/tests/tt_metal/tt_metal/test_kernels/misc/sub_device/syncer.cpp similarity index 100% rename from tests/tt_metal/tt_metal/unit_tests_fast_dispatch/sub_device/kernels/syncer.cpp rename to tests/tt_metal/tt_metal/test_kernels/misc/sub_device/syncer.cpp diff --git a/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram.cpp b/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram.cpp index 1fdfe90b865..0d39fe1ac3e 100644 --- a/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram.cpp +++ b/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram.cpp @@ -129,8 +129,8 @@ std::tuple cr .set_page_size(src1_cb_index, single_tile_size); auto cb_src1 = tt_metal::CreateCircularBuffer(program, all_cores, cb_src1_config); - uint32_t ouput_cb_index = 16; // output operands start at index 16 - uint32_t interm0_cb_index = 24; + uint32_t ouput_cb_index = tt::CBIndex::c_16; + uint32_t interm0_cb_index = tt::CBIndex::c_24; std::map partials_and_out_data_format_spec = { {ouput_cb_index, tt::DataFormat::Float16_b}, {interm0_cb_index, tt::DataFormat::Float16_b} diff --git a/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram_in0_mcast.cpp b/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram_in0_mcast.cpp index 5aaec9b6e6c..0b46f32291e 100644 --- a/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram_in0_mcast.cpp +++ b/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram_in0_mcast.cpp @@ -139,8 +139,8 @@ std::tuple output_cb_data_format_spec = { {ouput_cb_index, tt::DataFormat::Float16_b}, {interm0_cb_index, tt::DataFormat::Float16_b}, diff --git a/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram_in0_mcast_in1_mcast.cpp b/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram_in0_mcast_in1_mcast.cpp index a373f154e8e..6d651936f6e 100644 --- 
a/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram_in0_mcast_in1_mcast.cpp +++ b/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram_in0_mcast_in1_mcast.cpp @@ -138,8 +138,8 @@ std::tuple partials_and_out_data_format_spec = { {ouput_cb_index, tt::DataFormat::Float16_b}, {interm0_cb_index, tt::DataFormat::Float16_b} diff --git a/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram_in1_mcast.cpp b/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram_in1_mcast.cpp index 320844cb6f6..7d485cf8688 100644 --- a/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram_in1_mcast.cpp +++ b/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram_in1_mcast.cpp @@ -122,8 +122,8 @@ std::tuple partials_and_out_data_format_spec = { {ouput_cb_index, tt::DataFormat::Float16_b}, {interm0_cb_index, tt::DataFormat::Float16_b} diff --git a/tests/tt_metal/tt_metal/test_matmul_multi_core_single_dram.cpp b/tests/tt_metal/tt_metal/test_matmul_multi_core_single_dram.cpp index 9a82aeb6474..9297e1eba7a 100644 --- a/tests/tt_metal/tt_metal/test_matmul_multi_core_single_dram.cpp +++ b/tests/tt_metal/tt_metal/test_matmul_multi_core_single_dram.cpp @@ -170,8 +170,8 @@ std::tuple c CoreCoord end_core = {(std::size_t)num_cores_c - 1, (std::size_t)num_cores_r - 1}; CoreRange all_cores(start_core, end_core); - uint32_t ouput_cb_index = 16; // output operands start at index 16 - uint32_t interm0_cb_index = 24; + uint32_t ouput_cb_index = tt::CBIndex::c_16; + uint32_t interm0_cb_index = tt::CBIndex::c_24; std::map partials_and_out_data_format_spec = { {ouput_cb_index, tt::DataFormat::Float16_b}, {interm0_cb_index, tt::DataFormat::Float16_b} diff --git a/tests/tt_metal/tt_metal/test_matmul_multi_tile.cpp b/tests/tt_metal/tt_metal/test_matmul_multi_tile.cpp index 51ba3d613f9..7b1e4dd30be 100644 --- a/tests/tt_metal/tt_metal/test_matmul_multi_tile.cpp +++ b/tests/tt_metal/tt_metal/test_matmul_multi_tile.cpp @@ -178,8 +178,8 @@ bool run_matmul(const tt::ARCH& arch, const bool 
with_bias) { } // NOTE: intermediate and output CB share same address space since we operate it on it sequentially, not in parallel - uint32_t ouput_cb_index = 16; // output operands start at index 16 - uint32_t intermediate_cb_index = 24; + uint32_t ouput_cb_index = tt::CBIndex::c_16; + uint32_t intermediate_cb_index = tt::CBIndex::c_24; std::map partials_and_out_data_format_spec = { {ouput_cb_index, tt::DataFormat::Float16_b}, {intermediate_cb_index, tt::DataFormat::Float16_b} diff --git a/tests/tt_metal/tt_metal/test_matmul_single_core.cpp b/tests/tt_metal/tt_metal/test_matmul_single_core.cpp index e9c962c9bb7..58d975741ae 100644 --- a/tests/tt_metal/tt_metal/test_matmul_single_core.cpp +++ b/tests/tt_metal/tt_metal/test_matmul_single_core.cpp @@ -226,8 +226,8 @@ int main(int argc, char **argv) { .set_page_size(src1_cb_index, single_tile_size); auto cb_src1 = tt_metal::CreateCircularBuffer(program, core, cb_src1_config); - uint32_t ouput_cb_index = 16; // output operands start at index 16 - uint32_t interm0_cb_index = 24; + uint32_t ouput_cb_index = tt::CBIndex::c_16; + uint32_t interm0_cb_index = tt::CBIndex::c_24; std::map partials_and_out_data_format_spec = { {ouput_cb_index, tt::DataFormat::Float16_b}, {interm0_cb_index, tt::DataFormat::Float16_b} diff --git a/tests/tt_metal/tt_metal/test_matmul_single_core_small.cpp b/tests/tt_metal/tt_metal/test_matmul_single_core_small.cpp index cc92641abcf..4f56cd8998d 100644 --- a/tests/tt_metal/tt_metal/test_matmul_single_core_small.cpp +++ b/tests/tt_metal/tt_metal/test_matmul_single_core_small.cpp @@ -227,8 +227,8 @@ int main(int argc, char **argv) { .set_page_size(src1_cb_index, single_tile_size); auto cb_src1 = tt_metal::CreateCircularBuffer(program, core, cb_src1_config); - uint32_t ouput_cb_index = 16; // output operands start at index 16 - uint32_t interm0_cb_index = 24; + uint32_t ouput_cb_index = tt::CBIndex::c_16; + uint32_t interm0_cb_index = tt::CBIndex::c_24; std::map partials_and_out_data_format_spec = { 
{ouput_cb_index, tt::DataFormat::Float16_b}, {interm0_cb_index, tt::DataFormat::Float16_b} diff --git a/tests/tt_metal/tt_metal/test_matmul_single_tile.cpp b/tests/tt_metal/tt_metal/test_matmul_single_tile.cpp index 29be08e74a3..090e52f631f 100644 --- a/tests/tt_metal/tt_metal/test_matmul_single_tile.cpp +++ b/tests/tt_metal/tt_metal/test_matmul_single_tile.cpp @@ -71,7 +71,7 @@ int main(int argc, char **argv) { .set_page_size(src1_cb_index, single_tile_size); auto cb_src1 = tt_metal::CreateCircularBuffer(program, core, cb_src1_config); - uint32_t ouput_cb_index = 16; // output operands start at index 16 + uint32_t ouput_cb_index = tt::CBIndex::c_16; uint32_t num_output_tiles = 2; tt_metal::CircularBufferConfig cb_output_config = tt_metal::CircularBufferConfig(num_output_tiles * single_tile_size, {{ouput_cb_index, tt::DataFormat::Float16_b}}) .set_page_size(ouput_cb_index, single_tile_size); diff --git a/tests/tt_metal/tt_metal/test_matmul_single_tile_bfp8b.cpp b/tests/tt_metal/tt_metal/test_matmul_single_tile_bfp8b.cpp index df3273a197b..e108096da97 100644 --- a/tests/tt_metal/tt_metal/test_matmul_single_tile_bfp8b.cpp +++ b/tests/tt_metal/tt_metal/test_matmul_single_tile_bfp8b.cpp @@ -72,7 +72,7 @@ int main(int argc, char **argv) { .set_page_size(src1_cb_index, single_tile_size); auto cb_src1 = tt_metal::CreateCircularBuffer(program, core, cb_src1_config); - uint32_t ouput_cb_index = 16; // output operands start at index 16 + uint32_t ouput_cb_index = tt::CBIndex::c_16; uint32_t num_output_tiles = 1; tt_metal::CircularBufferConfig cb_output_config = tt_metal::CircularBufferConfig(num_output_tiles * single_tile_size, {{ouput_cb_index, tt::DataFormat::Bfp8_b}}) .set_page_size(ouput_cb_index, single_tile_size); diff --git a/tests/tt_metal/tt_metal/test_matmul_single_tile_output_in_l1.cpp b/tests/tt_metal/tt_metal/test_matmul_single_tile_output_in_l1.cpp index 7c3c6379aae..89d46864115 100644 --- a/tests/tt_metal/tt_metal/test_matmul_single_tile_output_in_l1.cpp +++ 
b/tests/tt_metal/tt_metal/test_matmul_single_tile_output_in_l1.cpp @@ -79,7 +79,7 @@ int main(int argc, char **argv) { .set_page_size(src1_cb_index, single_tile_size); auto cb_src1 = tt_metal::CreateCircularBuffer(program, core, cb_src1_config); - uint32_t ouput_cb_index = 16; // output operands start at index 16 + uint32_t ouput_cb_index = tt::CBIndex::c_16; uint32_t num_output_tiles = 1; tt_metal::CircularBufferConfig cb_output_config = tt_metal::CircularBufferConfig(num_output_tiles * single_tile_size, {{ouput_cb_index, tt::DataFormat::Float16_b}}) .set_page_size(ouput_cb_index, single_tile_size); diff --git a/tests/tt_metal/tt_metal/test_multi_core_kernel.cpp b/tests/tt_metal/tt_metal/test_multi_core_kernel.cpp index 7a24596843b..84e5cb959b1 100644 --- a/tests/tt_metal/tt_metal/test_multi_core_kernel.cpp +++ b/tests/tt_metal/tt_metal/test_multi_core_kernel.cpp @@ -39,7 +39,7 @@ std::tuple cr .set_page_size(src0_cb_index, single_tile_size); auto cb_src0 = tt_metal::CreateCircularBuffer(program, core, cb_src0_config); - uint32_t ouput_cb_index = 16; // output operands start at index 16 + uint32_t ouput_cb_index = tt::CBIndex::c_16; uint32_t num_output_tiles = 1; tt_metal::CircularBufferConfig cb_output_config = tt_metal::CircularBufferConfig(num_output_tiles * single_tile_size, {{ouput_cb_index, tt::DataFormat::Float16_b}}) .set_page_size(ouput_cb_index, single_tile_size); diff --git a/tests/tt_metal/tt_metal/test_multiple_programs.cpp b/tests/tt_metal/tt_metal/test_multiple_programs.cpp index 78d85be1acc..41a5de98edf 100644 --- a/tests/tt_metal/tt_metal/test_multiple_programs.cpp +++ b/tests/tt_metal/tt_metal/test_multiple_programs.cpp @@ -50,7 +50,7 @@ std::tuple se .set_page_size(src1_cb_index, single_tile_size); auto cb_src1 = tt_metal::CreateCircularBuffer(program, core, cb_src1_config); - uint32_t ouput_cb_index = 16; // output operands start at index 16 + uint32_t ouput_cb_index = tt::CBIndex::c_16; uint32_t num_output_tiles = 2; 
tt_metal::CircularBufferConfig cb_output_config = tt_metal::CircularBufferConfig(num_output_tiles * single_tile_size, {{ouput_cb_index, tt::DataFormat::Float16_b}}) .set_page_size(ouput_cb_index, single_tile_size); @@ -103,7 +103,7 @@ std::tuple se .set_page_size(src1_cb_index, single_tile_size); auto cb_src1 = tt_metal::CreateCircularBuffer(program, core, cb_src1_config); - uint32_t ouput_cb_index = 16; // output operands start at index 16 + uint32_t ouput_cb_index = tt::CBIndex::c_16; uint32_t num_output_tiles = 2; tt_metal::CircularBufferConfig cb_output_config = tt_metal::CircularBufferConfig(num_output_tiles * single_tile_size, {{ouput_cb_index, tt::DataFormat::Float16_b}}) .set_page_size(ouput_cb_index, single_tile_size); diff --git a/tests/tt_metal/tt_metal/test_transpose_hc.cpp b/tests/tt_metal/tt_metal/test_transpose_hc.cpp index ec7fe06a8f5..d21051f913d 100644 --- a/tests/tt_metal/tt_metal/test_transpose_hc.cpp +++ b/tests/tt_metal/tt_metal/test_transpose_hc.cpp @@ -86,7 +86,7 @@ int main(int argc, char **argv) { .set_page_size(src0_cb_index, single_tile_bytes); auto cb_src0 = tt_metal::CreateCircularBuffer(program, core, cb_src0_config); - uint32_t ouput_cb_index = 16; // output operands start at index 16 + uint32_t ouput_cb_index = tt::CBIndex::c_16; // this buffer is used in writer_unary.cpp BRISC kernel tt_metal::CircularBufferConfig cb_output_config = tt_metal::CircularBufferConfig(num_buffer_tiles * single_tile_bytes, {{ouput_cb_index, tt::DataFormat::Float16_b}}) .set_page_size(ouput_cb_index, single_tile_bytes); diff --git a/tests/tt_metal/tt_metal/test_untilize_eltwise_binary.cpp b/tests/tt_metal/tt_metal/test_untilize_eltwise_binary.cpp index 6d783347e7b..a3ae4a9eaf5 100644 --- a/tests/tt_metal/tt_metal/test_untilize_eltwise_binary.cpp +++ b/tests/tt_metal/tt_metal/test_untilize_eltwise_binary.cpp @@ -149,7 +149,7 @@ int main(int argc, char **argv) { .set_page_size(src1_cb_index, single_tile_size); auto cb_src1 = 
tt_metal::CreateCircularBuffer(program, core, cb_src1_config); - uint32_t ouput_cb_index = 16; // output operands start at index 16 + uint32_t ouput_cb_index = tt::CBIndex::c_16; uint32_t num_output_tiles = num_tiles_c; tt_metal::CircularBufferConfig cb_output_config = tt_metal::CircularBufferConfig(num_output_tiles * single_tile_size, {{ouput_cb_index, tt::DataFormat::Float16_b}}) .set_page_size(ouput_cb_index, single_tile_size); diff --git a/tests/tt_metal/tt_metal/unit_tests/CMakeLists.txt b/tests/tt_metal/tt_metal/unit_tests/CMakeLists.txt deleted file mode 100644 index 863ee7786e1..00000000000 --- a/tests/tt_metal/tt_metal/unit_tests/CMakeLists.txt +++ /dev/null @@ -1,97 +0,0 @@ -set(UNIT_TESTS_SRC - ${CMAKE_CURRENT_SOURCE_DIR}/allocator/test_free_list_allocator.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/allocator/test_l1_banking_allocator.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/basic/device.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/basic/initialize_semaphores.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/basic/runtime_args.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/basic/test_noc.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/basic/test_soc_descriptor.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/buffer/test_banked.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/buffer/test_buffer_utils.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/buffer/test_sharded_l1.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/buffer/test_simple_dram_buffer.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/buffer/test_simple_l1_buffer.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/circular_buffer/test_CircularBuffer_allocation.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/circular_buffer/test_CircularBuffer_creation.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/circular_buffer/test_CircularBuffer_non_blocking.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/compute/test_golden_impls.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/compute/test_reduce.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/compute/test_single_core_binary_compute.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/compute/test_single_core_matmul_compute.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/compute/test_sfpu_compute.cpp - 
${CMAKE_CURRENT_SOURCE_DIR}/compute/test_dropout_sfpu_compute.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/compute/test_untilize_tilize.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/compute/test_copy_block_matmul_partials.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/compute/test_reconfig.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/compute/test_transpose.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/compute/test_broadcast.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/compute/test_cumsum.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/core_coord/test_CoreRange_adjacent.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/core_coord/test_CoreRange_contains.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/core_coord/test_CoreRange_intersects.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/core_coord/test_CoreRange_iterator.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/core_coord/test_CoreRange_merge.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/core_coord/test_CoreRangeSet_construct.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/core_coord/test_CoreRangeSet_contains.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/core_coord/test_CoreRangeSet_intersects.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/core_coord/test_CoreRangeSet_merge.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dram/direct.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/host_apis/test_tilize_untilize.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/ethernet/basic_eth_kernels.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/ethernet/buffer_movement_kernels.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/ethernet/device_cluster_api.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/ethernet/erisc_app_direct_send.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/ethernet/ring_gather_kernels.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/global_semaphore/test_global_semaphores.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/tt_stl/test_any_range.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/tt_stl/slotmap.cpp -) - -add_executable( - unit_tests - ${UNIT_TESTS_SRC} - $ -) -TT_ENABLE_UNITY_BUILD(unit_tests) -add_executable(unit_tests_galaxy ${CMAKE_CURRENT_SOURCE_DIR}/ethernet/galaxy_cluster_api.cpp) - -target_link_libraries( - unit_tests - PRIVATE - test_metal_common_libs - Boost::smart_ptr -) -target_link_libraries( - unit_tests_galaxy 
- PRIVATE - test_metal_common_libs - Boost::smart_ptr -) - -target_include_directories( - unit_tests - PRIVATE - ${PROJECT_SOURCE_DIR} - ${PROJECT_SOURCE_DIR}/tt_metal - ${PROJECT_SOURCE_DIR}/tests - ${CMAKE_CURRENT_SOURCE_DIR} - ${CMAKE_CURRENT_SOURCE_DIR}/common - ${CMAKE_CURRENT_SOURCE_DIR}/circular_buffer -) -target_include_directories( - unit_tests_galaxy - PRIVATE - ${PROJECT_SOURCE_DIR} - ${PROJECT_SOURCE_DIR}/tt_metal - ${PROJECT_SOURCE_DIR}/tests - ${CMAKE_CURRENT_SOURCE_DIR} - ${CMAKE_CURRENT_SOURCE_DIR}/common -) - -set_target_properties( - unit_tests - unit_tests_galaxy - PROPERTIES - RUNTIME_OUTPUT_DIRECTORY - ${PROJECT_BINARY_DIR}/test/tt_metal -) diff --git a/tests/tt_metal/tt_metal/unit_tests/README.md b/tests/tt_metal/tt_metal/unit_tests/README.md deleted file mode 100644 index 55aab607296..00000000000 --- a/tests/tt_metal/tt_metal/unit_tests/README.md +++ /dev/null @@ -1,44 +0,0 @@ -# Summary -Unit testing uses the doctest framework. See https://github.com/doctest/doctest/ -Generally, there are three main levels of organization: -* TEST_SUITE - Used to group main areas of tests -* TEST_CASE - How Test case and sub-case gets split up is at test-writer discretion, but see the test_case section -* SUB_CASE - - -## Build && Execution -### Build -`make tests/tt_metal/unit_tests` -### Get Help -`./build/test/tt_metal/unit_tests --help` -### Execute all tests -`./build/test/tt_metal/unit_tests` -### Execute filtered test-suite -`./build/test/tt_metal/unit_tests -ts="*Sfpu*"` -### List all test-suite with filter -`./build/test/tt_metal/unit_tests -ts="*Sfpu*" -lts` - -## Folder Structure -General structure of the tests are as follows, more sub-folders can be added -
-Directory Structure - Please add any new-tests to a corresponding folder. -
-
-tt_metal/unit_tests/
-  > test_main.cpp
-  > basic/
-    > # Any basic test files can exist here, will be automatically added to test_main
-  > common/
-    > # Used to hold any common structures across all test suites like fixtures
-  > dram/
-    > # Any dram unit/stress test files can exist here, will be automatically added to test_main
-  > compute/
-    > # Any basic test files can exist here, will be automatically added to test_main
-  > new_folders/
-    > # Any test files can exist here, will be automatically added to test_main
-test_utils/
-  > comparison.cpp # Useful utils for comparing, see example usages in unit tests
-  > print_helpers.cpp # Useful utils for printin
-  > stimulus.cpp # Useful utils for generating random vectors or specific vectors, see example usages in unit tests
-  > tilization.cpp # Useful utils for converting between tiled vectors or not, see example usages in unit tests
-
diff --git a/tests/tt_metal/tt_metal/unit_tests/buffer/test_buffer_utils.hpp b/tests/tt_metal/tt_metal/unit_tests/buffer/test_buffer_utils.hpp deleted file mode 100644 index e3b46a78266..00000000000 --- a/tests/tt_metal/tt_metal/unit_tests/buffer/test_buffer_utils.hpp +++ /dev/null @@ -1,15 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "tt_metal/host_api.hpp" - -namespace tt::test::buffer::detail { -void writeL1Backdoor(tt::tt_metal::Device* device, CoreCoord coord, uint32_t address, std::vector& data); -void readL1Backdoor(tt::tt_metal::Device* device, CoreCoord coord, uint32_t address, uint32_t byte_size, std::vector& data); -void writeDramBackdoor(tt::tt_metal::Device* device, uint32_t channel, uint32_t address, std::vector& data); -void readDramBackdoor( - tt::tt_metal::Device* device, uint32_t channel, uint32_t address, uint32_t byte_size, std::vector& data); -} // namespace tt::test::buffer::detail diff --git a/tests/tt_metal/tt_metal/unit_tests/common/basic_fixture.hpp b/tests/tt_metal/tt_metal/unit_tests/common/basic_fixture.hpp deleted file mode 100644 index f4603b7ec37..00000000000 --- a/tests/tt_metal/tt_metal/unit_tests/common/basic_fixture.hpp +++ /dev/null @@ -1,33 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include -#include "tt_metal/common/assert.hpp" -#include "tt_metal/test_utils/env_vars.hpp" - -class BasicFixture : public ::testing::Test { - protected: - void SetUp() override { - auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); - if (not slow_dispatch) { - TT_THROW("This suite can only be run with TT_METAL_SLOW_DISPATCH_MODE set"); - GTEST_SKIP(); - } - } - -}; - -class FDBasicFixture : public ::testing::Test { - protected: - void SetUp() override { - auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); - if (slow_dispatch) { - TT_THROW("This suite can only be run with FD runtime"); - GTEST_SKIP(); - } - } - -}; diff --git a/tests/tt_metal/tt_metal/unit_tests/common/device_fixture.hpp b/tests/tt_metal/tt_metal/unit_tests/common/device_fixture.hpp deleted file mode 100644 index 7e638470b1c..00000000000 --- a/tests/tt_metal/tt_metal/unit_tests/common/device_fixture.hpp +++ /dev/null @@ -1,186 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include - -#include "tt_metal/host_api.hpp" -#include "tt_metal/detail/tt_metal.hpp" -#include "tt_metal/test_utils/env_vars.hpp" -#include "tt_metal/impl/device/device_pool.hpp" - -class DeviceFixture : public ::testing::Test { - protected: - void SetUp() override { - auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); - if (not slow_dispatch) { - TT_THROW("This suite can only be run with TT_METAL_SLOW_DISPATCH_MODE set"); - GTEST_SKIP(); - } - arch_ = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); - - num_devices_ = tt::tt_metal::GetNumAvailableDevices(); - - // Some CI machines have lots of cards, running all tests on all cards is slow - // Coverage for multidevices is decent if we just confirm 2 work - if (arch_ == tt::ARCH::GRAYSKULL && num_devices_ > 2) { - num_devices_ = 2; - } - - std::vector ids; - for (unsigned int id = 0; id < num_devices_; id++) { - ids.push_back(id); - } - const auto &dispatch_core_type = tt::llrt::OptionsG.get_dispatch_core_type(); - tt::DevicePool::initialize(ids, 1, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, dispatch_core_type); - devices_ = tt::DevicePool::instance().get_all_active_devices(); - } - - void TearDown() override { - tt::Cluster::instance().set_internal_routing_info_for_ethernet_cores(false); - for (unsigned int id = 0; id < devices_.size(); id++) { - if (devices_.at(id)->is_initialized()) { - tt::tt_metal::CloseDevice(devices_.at(id)); - } - } - } - - std::vector devices_; - tt::ARCH arch_; - size_t num_devices_; -}; - - -class DeviceSingleCardFixture : public ::testing::Test { - protected: - void SetUp() override { - auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); - if (not slow_dispatch) { - TT_THROW("This suite can only be run with TT_METAL_SLOW_DISPATCH_MODE set"); - GTEST_SKIP(); - } - arch_ = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); - - const chip_id_t mmio_device_id = 0; - 
reserved_devices_ = tt::tt_metal::detail::CreateDevices({mmio_device_id}); - device_ = reserved_devices_.at(mmio_device_id); - - - num_devices_ = reserved_devices_.size(); - } - - void TearDown() override { tt::tt_metal::detail::CloseDevices(reserved_devices_); } - - tt::tt_metal::Device* device_; - std::map reserved_devices_; - tt::ARCH arch_; - size_t num_devices_; -}; - -class BlackholeSingleCardFixture : public DeviceSingleCardFixture { - protected: - void SetUp() override { - auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); - if (not slow_dispatch) { - TT_THROW("This suite can only be run with TT_METAL_SLOW_DISPATCH_MODE set"); - GTEST_SKIP(); - } - arch_ = tt::get_arch_from_string(tt::test_utils::get_env_arch_name()); - if (arch_ != tt::ARCH::BLACKHOLE) { - GTEST_SKIP(); - } - - const chip_id_t mmio_device_id = 0; - reserved_devices_ = tt::tt_metal::detail::CreateDevices({mmio_device_id}); - device_ = reserved_devices_.at(mmio_device_id); - - num_devices_ = reserved_devices_.size(); - } -}; - -class GalaxyFixture : public ::testing::Test { - protected: - void SkipTestSuiteIfNotGalaxyMotherboard() - { - const tt::ARCH arch = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); - const size_t num_devices = tt::tt_metal::GetNumAvailableDevices(); - if (!(arch == tt::ARCH::WORMHOLE_B0 && num_devices >= 32)) - { - GTEST_SKIP(); - } - } - - void InitializeDevices() - { - const size_t num_devices = tt::tt_metal::GetNumAvailableDevices(); - std::vector ids; - for (uint32_t id = 0; id < num_devices; id++) - { - ids.push_back(id); - } - this->device_ids_to_devices_ = tt::tt_metal::detail::CreateDevices(ids); - this->devices_ = tt::DevicePool::instance().get_all_active_devices(); - } - - void SetUp() override - { - this->SkipTestSuiteIfNotGalaxyMotherboard(); - this->InitializeDevices(); - } - - void TearDown() override - { - tt::tt_metal::detail::CloseDevices(this->device_ids_to_devices_); - this->device_ids_to_devices_.clear(); - 
this->devices_.clear(); - } - - std::vector devices_; - - private: - std::map device_ids_to_devices_; -}; - -class TGFixture : public GalaxyFixture -{ - protected: - void SkipTestSuiteIfNotTG() - { - this->SkipTestSuiteIfNotGalaxyMotherboard(); - const size_t num_devices = tt::tt_metal::GetNumAvailableDevices(); - const size_t num_pcie_devices = tt::tt_metal::GetNumPCIeDevices(); - if (!(num_devices == 32 && num_pcie_devices == 4)) - { - GTEST_SKIP(); - } - } - - void SetUp() override - { - this->SkipTestSuiteIfNotTG(); - this->InitializeDevices(); - } -}; - -class TGGFixture : public GalaxyFixture -{ - protected: - void SkipTestSuiteIfNotTGG() - { - this->SkipTestSuiteIfNotGalaxyMotherboard(); - const size_t num_devices = tt::tt_metal::GetNumAvailableDevices(); - const size_t num_pcie_devices = tt::tt_metal::GetNumPCIeDevices(); - if (!(num_devices == 64 && num_pcie_devices == 8)) - { - GTEST_SKIP(); - } - } - - void SetUp() override - { - this->SkipTestSuiteIfNotTGG(); - this->InitializeDevices(); - } -}; diff --git a/tests/tt_metal/tt_metal/unit_tests/common/n300_device_fixture.hpp b/tests/tt_metal/tt_metal/unit_tests/common/n300_device_fixture.hpp deleted file mode 100644 index 08e57a5cb2a..00000000000 --- a/tests/tt_metal/tt_metal/unit_tests/common/n300_device_fixture.hpp +++ /dev/null @@ -1,50 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include - -#include "tt_metal/host_api.hpp" -#include "tt_metal/test_utils/env_vars.hpp" -#include "tt_metal/impl/device/device_pool.hpp" - -class N300DeviceFixture : public ::testing::Test { - protected: - void SetUp() override { - auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); - if (not slow_dispatch) { - TT_THROW("This suite can only be run with TT_METAL_SLOW_DISPATCH_MODE set"); - GTEST_SKIP(); - } - arch_ = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); - - num_devices_ = tt::tt_metal::GetNumAvailableDevices(); - if (arch_ == tt::ARCH::WORMHOLE_B0 and tt::tt_metal::GetNumAvailableDevices() == 2 and - tt::tt_metal::GetNumPCIeDevices() == 1) { - std::vector ids; - for (unsigned int id = 0; id < num_devices_; id++) { - ids.push_back(id); - } - - const auto &dispatch_core_type = tt::llrt::OptionsG.get_dispatch_core_type(); - tt::DevicePool::initialize(ids, 1, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, dispatch_core_type); - devices_ = tt::DevicePool::instance().get_all_active_devices(); - - } else { - GTEST_SKIP(); - } - } - - void TearDown() override { - tt::Cluster::instance().set_internal_routing_info_for_ethernet_cores(false); - for (unsigned int id = 0; id < devices_.size(); id++) { - tt::tt_metal::CloseDevice(devices_.at(id)); - } - } - - std::vector devices_; - tt::ARCH arch_; - size_t num_devices_; -}; diff --git a/tests/tt_metal/tt_metal/unit_tests/ethernet/basic_eth_kernels.cpp b/tests/tt_metal/tt_metal/unit_tests/ethernet/basic_eth_kernels.cpp deleted file mode 100644 index 4b35868852e..00000000000 --- a/tests/tt_metal/tt_metal/unit_tests/ethernet/basic_eth_kernels.cpp +++ /dev/null @@ -1,895 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#include - -#include -#include -#include - -#include "device_fixture.hpp" -#include "n300_device_fixture.hpp" -#include "tt_metal/detail/tt_metal.hpp" -#include "tt_metal/host_api.hpp" -#include "tt_metal/impl/kernels/kernel.hpp" -#include "tt_metal/test_utils/comparison.hpp" -#include "tt_metal/test_utils/df/df.hpp" -#include "tt_metal/test_utils/print_helpers.hpp" -#include "tt_metal/test_utils/stimulus.hpp" - -using namespace tt; -using namespace tt::test_utils; -using namespace tt::test_utils::df; - -namespace { -namespace CMAKE_UNIQUE_NAMESPACE { -constexpr std::int32_t WORD_SIZE = 16; // 16 bytes per eth send packet -constexpr std::int32_t MAX_NUM_WORDS = - (eth_l1_mem::address_map::MAX_L1_LOADING_SIZE - eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE) / WORD_SIZE; -} -} - -namespace unit_tests::erisc::kernels { - -const size_t get_rand_32_byte_aligned_address(const size_t& base, const size_t& max) { - TT_ASSERT(!(base & 0x1F) and !(max & 0x1F)); - size_t word_size = (max >> 5) - (base >> 5); - return (((rand() % word_size) << 5) + base); -} - -/* - * ███╗░░██╗░█████╗░░█████╗░ - * ████╗░██║██╔══██╗██╔══██╗ - * ██╔██╗██║██║░░██║██║░░╚═╝ - * ██║╚████║██║░░██║██║░░██╗ - * ██║░╚███║╚█████╔╝╚█████╔╝ - * ╚═╝░░╚══╝░╚════╝░░╚════╝░ - */ - -bool reader_kernel_no_send( - tt_metal::Device* device, - const size_t& byte_size, - const size_t& eth_l1_byte_address, - const CoreCoord& eth_reader_core, - const tt_metal::EthernetConfig ðernet_config = tt_metal::EthernetConfig{.noc = tt_metal::NOC::NOC_0}) { - bool pass = true; - //////////////////////////////////////////////////////////////////////////// - // Application Setup - //////////////////////////////////////////////////////////////////////////// - tt_metal::Program program = tt_metal::Program(); - - tt::tt_metal::InterleavedBufferConfig dram_config{ - .device=device, - .size = byte_size, - .page_size = byte_size, - .buffer_type = tt::tt_metal::BufferType::DRAM - }; - - 
auto input_dram_buffer = CreateBuffer(dram_config); - uint32_t dram_byte_address = input_dram_buffer->address(); - auto dram_noc_xy = input_dram_buffer->noc_coordinates(); - auto eth_noc_xy = device->ethernet_core_from_logical_core(eth_reader_core); - log_debug( - tt::LogTest, - "Device {}: reading {} bytes from dram {} addr {} to ethernet core {} addr {}", - device->id(), - byte_size, - dram_noc_xy.str(), - dram_byte_address, - eth_reader_core.str(), - eth_l1_byte_address); - - auto eth_reader_kernel = tt_metal::CreateKernel( - program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/direct_reader_dram_to_l1.cpp", - eth_reader_core, - ethernet_config); - - //////////////////////////////////////////////////////////////////////////// - // Compile and Execute Application - //////////////////////////////////////////////////////////////////////////// - - auto inputs = generate_uniform_random_vector(0, 100, byte_size / sizeof(uint32_t)); - tt_metal::detail::WriteToBuffer(input_dram_buffer, inputs); - - // Clear expected value at ethernet L1 address - std::vector all_zeros(inputs.size(), 0); - llrt::write_hex_vec_to_core(device->id(), eth_noc_xy, all_zeros, eth_l1_byte_address); - - tt_metal::SetRuntimeArgs( - program, - eth_reader_kernel, - eth_reader_core, - { - (uint32_t)dram_byte_address, - (uint32_t)dram_noc_xy.x, - (uint32_t)dram_noc_xy.y, - (uint32_t)byte_size, - (uint32_t)eth_l1_byte_address, - }); - - tt_metal::detail::LaunchProgram(device, program); - - auto readback_vec = llrt::read_hex_vec_from_core(device->id(), eth_noc_xy, eth_l1_byte_address, byte_size); - pass &= (readback_vec == inputs); - if (not pass) { - std::cout << "Mismatch at Core: " << eth_noc_xy.str() << std::endl; - } - return pass; -} - -bool writer_kernel_no_receive( - tt_metal::Device* device, - const size_t& byte_size, - const size_t& eth_l1_byte_address, - const CoreCoord& eth_writer_core, - const tt_metal::EthernetConfig ðernet_config = tt_metal::EthernetConfig{.noc = 
tt_metal::NOC::NOC_0}) { - bool pass = true; - //////////////////////////////////////////////////////////////////////////// - // Application Setup - //////////////////////////////////////////////////////////////////////////// - tt_metal::Program program = tt_metal::Program(); - - tt::tt_metal::InterleavedBufferConfig dram_config{ - .device=device, - .size = byte_size, - .page_size = byte_size, - .buffer_type = tt::tt_metal::BufferType::DRAM - }; - - auto output_dram_buffer = CreateBuffer(dram_config); - uint32_t dram_byte_address = output_dram_buffer->address(); - auto dram_noc_xy = output_dram_buffer->noc_coordinates(); - auto eth_noc_xy = device->ethernet_core_from_logical_core(eth_writer_core); - log_debug( - tt::LogTest, - "Device {}: writing {} bytes from ethernet core {} addr {} to dram {} addr {}", - device->id(), - byte_size, - eth_writer_core.str(), - eth_l1_byte_address, - dram_noc_xy.str(), - dram_byte_address); - - auto eth_writer_kernel = tt_metal::CreateKernel( - program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/direct_writer_l1_to_dram.cpp", - eth_writer_core, - ethernet_config); - - //////////////////////////////////////////////////////////////////////////// - // Compile and Execute Application - //////////////////////////////////////////////////////////////////////////// - - auto inputs = generate_uniform_random_vector(0, 100, byte_size / sizeof(uint32_t)); - llrt::write_hex_vec_to_core(device->id(), eth_noc_xy, inputs, eth_l1_byte_address); - - // Clear expected value at ethernet L1 address - std::vector all_zeros(inputs.size(), 0); - tt_metal::detail::WriteToBuffer(output_dram_buffer, all_zeros); - - tt_metal::SetRuntimeArgs( - program, - eth_writer_kernel, - eth_writer_core, - { - (uint32_t)dram_byte_address, - (uint32_t)dram_noc_xy.x, - (uint32_t)dram_noc_xy.y, - (uint32_t)byte_size, - (uint32_t)eth_l1_byte_address, - }); - - tt_metal::detail::LaunchProgram(device, program); - - auto readback_vec = 
llrt::read_hex_vec_from_core(device->id(), dram_noc_xy, dram_byte_address, byte_size); - pass &= (readback_vec == inputs); - if (not pass) { - std::cout << "Mismatch at Core: " << dram_noc_xy.str() << std::endl; - } - return pass; -} - -bool noc_reader_and_writer_kernels( - tt_metal::Device *device, - const uint32_t byte_size, - const uint32_t eth_dst_l1_address, - const uint32_t eth_src_l1_address, - const CoreCoord &logical_eth_core, - const tt_metal::EthernetConfig &reader_eth_config, - const tt_metal::EthernetConfig &writer_eth_config) { - bool pass = true; - - tt_metal::Program program = tt_metal::Program(); - - tt_metal::InterleavedBufferConfig dram_config{ - .device=device, - .size = byte_size, - .page_size = byte_size, - .buffer_type = tt_metal::BufferType::DRAM - }; - - auto reader_dram_buffer = CreateBuffer(dram_config); - auto writer_dram_buffer = CreateBuffer(dram_config); - - auto reader_dram_noc_xy = reader_dram_buffer->noc_coordinates(); - auto writer_dram_noc_xy = writer_dram_buffer->noc_coordinates(); - - log_debug( - tt::LogTest, - "Device {}: reading {} bytes from dram {} addr {} to ethernet core {} addr {}", - device->id(), - byte_size, - reader_dram_noc_xy.str(), - reader_dram_buffer->address(), - logical_eth_core.str(), - eth_dst_l1_address); - log_debug( - tt::LogTest, - "Device {}: writing {} bytes from ethernet core {} addr {} to dram {} addr {}", - device->id(), - byte_size, - logical_eth_core.str(), - eth_src_l1_address, - writer_dram_noc_xy.str(), - writer_dram_buffer->address()); - - auto eth_noc_xy = device->ethernet_core_from_logical_core(logical_eth_core); - - auto eth_reader_kernel = tt_metal::CreateKernel( - program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/direct_reader_dram_to_l1.cpp", - logical_eth_core, - reader_eth_config); - - tt_metal::SetRuntimeArgs( - program, - eth_reader_kernel, - logical_eth_core, - { - (uint32_t)reader_dram_buffer->address(), - (uint32_t)reader_dram_noc_xy.x, - 
(uint32_t)reader_dram_noc_xy.y, - (uint32_t)byte_size, - (uint32_t)eth_dst_l1_address, - }); - - auto eth_writer_kernel = tt_metal::CreateKernel( - program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/direct_writer_l1_to_dram.cpp", - logical_eth_core, - writer_eth_config); - - tt_metal::SetRuntimeArgs( - program, - eth_writer_kernel, - logical_eth_core, - { - (uint32_t)writer_dram_buffer->address(), - (uint32_t)writer_dram_noc_xy.x, - (uint32_t)writer_dram_noc_xy.y, - (uint32_t)byte_size, - (uint32_t)eth_src_l1_address, - }); - - auto reader_inputs = generate_uniform_random_vector(0, 100, byte_size / sizeof(uint32_t)); - tt_metal::detail::WriteToBuffer(reader_dram_buffer, reader_inputs); - - auto writer_inputs = generate_uniform_random_vector(0, 100, byte_size / sizeof(uint32_t)); - llrt::write_hex_vec_to_core(device->id(), eth_noc_xy, writer_inputs, eth_src_l1_address); - - // Clear expected values at output locations - std::vector all_zeros(byte_size / sizeof(uint32_t), 0); - llrt::write_hex_vec_to_core(device->id(), eth_noc_xy, all_zeros, eth_dst_l1_address); - tt_metal::detail::WriteToBuffer(writer_dram_buffer, all_zeros); - - tt_metal::detail::LaunchProgram(device, program); - - auto eth_readback_vec = llrt::read_hex_vec_from_core(device->id(), eth_noc_xy, eth_dst_l1_address, byte_size); - pass &= (eth_readback_vec == reader_inputs); - if (not pass) { - log_info(tt::LogTest, "Mismatch at eth core: {}, eth kernel read incorrect values from DRAM", logical_eth_core.str()); - } - std::vector dram_readback_vec; - tt_metal::detail::ReadFromBuffer(writer_dram_buffer, dram_readback_vec); - pass &= (dram_readback_vec == writer_inputs); - if (not pass) { - log_info(tt::LogTest, "Mismatch at eth core: {}, eth kernel wrote incorrect values to DRAM", logical_eth_core.str()); - } - - return pass; -} - -TEST_F(N300DeviceFixture, EthKernelsNocReadNoSend) { - using namespace CMAKE_UNIQUE_NAMESPACE; - GTEST_SKIP(); - const auto& device_0 = devices_.at(0); 
- const auto& device_1 = devices_.at(1); - - const size_t src_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; - - for (const auto& eth_core : device_0->get_active_ethernet_cores(true)) { - ASSERT_TRUE( - unit_tests::erisc::kernels::reader_kernel_no_send(device_0, WORD_SIZE, src_eth_l1_byte_address, eth_core)); - ASSERT_TRUE(unit_tests::erisc::kernels::reader_kernel_no_send( - device_0, WORD_SIZE * 1024, src_eth_l1_byte_address, eth_core)); - ASSERT_TRUE(unit_tests::erisc::kernels::reader_kernel_no_send( - device_0, WORD_SIZE * 2048, src_eth_l1_byte_address, eth_core)); - } - - for (const auto& eth_core : device_1->get_active_ethernet_cores(true)) { - ASSERT_TRUE( - unit_tests::erisc::kernels::reader_kernel_no_send(device_1, WORD_SIZE, src_eth_l1_byte_address, eth_core)); - ASSERT_TRUE(unit_tests::erisc::kernels::reader_kernel_no_send( - device_1, WORD_SIZE * 1024, src_eth_l1_byte_address, eth_core)); - ASSERT_TRUE(unit_tests::erisc::kernels::reader_kernel_no_send( - device_1, WORD_SIZE * 2048, src_eth_l1_byte_address, eth_core)); - } -} - -TEST_F(N300DeviceFixture, EthKernelsNocWriteNoReceive) { - using namespace CMAKE_UNIQUE_NAMESPACE; - GTEST_SKIP(); - const auto& device_0 = devices_.at(0); - const auto& device_1 = devices_.at(1); - - const size_t src_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; - - for (const auto& eth_core : device_0->get_active_ethernet_cores(true)) { - ASSERT_TRUE(unit_tests::erisc::kernels::writer_kernel_no_receive( - device_0, WORD_SIZE, src_eth_l1_byte_address, eth_core)); - ASSERT_TRUE(unit_tests::erisc::kernels::writer_kernel_no_receive( - device_0, WORD_SIZE * 1024, src_eth_l1_byte_address, eth_core)); - ASSERT_TRUE(unit_tests::erisc::kernels::writer_kernel_no_receive( - device_0, WORD_SIZE * 2048, src_eth_l1_byte_address, eth_core)); - } - - for (const auto& eth_core : device_1->get_active_ethernet_cores(true)) { - ASSERT_TRUE(unit_tests::erisc::kernels::writer_kernel_no_receive( - 
device_1, WORD_SIZE, src_eth_l1_byte_address, eth_core)); - ASSERT_TRUE(unit_tests::erisc::kernels::writer_kernel_no_receive( - device_1, WORD_SIZE * 1024, src_eth_l1_byte_address, eth_core)); - ASSERT_TRUE(unit_tests::erisc::kernels::writer_kernel_no_receive( - device_1, WORD_SIZE * 2048, src_eth_l1_byte_address, eth_core)); - } -} - -/* - * - * ███████╗████████╗██╗░░██╗ - * ██╔════╝╚══██╔══╝██║░░██║ - * █████╗░░░░░██║░░░███████║ - * ██╔══╝░░░░░██║░░░██╔══██║ - * ███████╗░░░██║░░░██║░░██║ - * ╚══════╝░░░╚═╝░░░╚═╝░░╚═╝ - */ -bool eth_direct_sender_receiver_kernels( - tt_metal::Device* sender_device, - tt_metal::Device* receiver_device, - const size_t& byte_size, - const size_t& src_eth_l1_byte_address, - const size_t& dst_eth_l1_byte_address, - const CoreCoord& eth_sender_core, - const CoreCoord& eth_receiver_core, - uint32_t num_bytes_per_send = 16) { - bool pass = true; - log_debug( - tt::LogTest, - "Sending {} bytes from device {} eth core {} addr {} to device {} eth core {} addr {}", - byte_size, - sender_device->id(), - eth_sender_core.str(), - src_eth_l1_byte_address, - receiver_device->id(), - eth_receiver_core.str(), - dst_eth_l1_byte_address); - // Generate inputs - auto inputs = generate_uniform_random_vector(0, 100, byte_size / sizeof(uint32_t)); - llrt::write_hex_vec_to_core( - sender_device->id(), - sender_device->ethernet_core_from_logical_core(eth_sender_core), - inputs, - src_eth_l1_byte_address); - - // Clear expected value at ethernet L1 address - std::vector all_zeros(inputs.size(), 0); - llrt::write_hex_vec_to_core( - receiver_device->id(), - receiver_device->ethernet_core_from_logical_core(eth_receiver_core), - all_zeros, - dst_eth_l1_byte_address); - - //////////////////////////////////////////////////////////////////////////// - // Sender Device - //////////////////////////////////////////////////////////////////////////// - tt_metal::Program sender_program = tt_metal::Program(); - - auto eth_sender_kernel = tt_metal::CreateKernel( - 
sender_program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/eth_l1_direct_send.cpp", - eth_sender_core, - tt_metal::EthernetConfig{ - .noc = tt_metal::NOC::NOC_0, - .compile_args = {uint32_t(num_bytes_per_send), uint32_t(num_bytes_per_send >> 4)}}); - - tt_metal::SetRuntimeArgs( - sender_program, - eth_sender_kernel, - eth_sender_core, - { - (uint32_t)src_eth_l1_byte_address, - (uint32_t)dst_eth_l1_byte_address, - (uint32_t)byte_size, - }); - - //////////////////////////////////////////////////////////////////////////// - // Receiver Device - //////////////////////////////////////////////////////////////////////////// - tt_metal::Program receiver_program = tt_metal::Program(); - - auto eth_receiver_kernel = tt_metal::CreateKernel( - receiver_program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/eth_l1_direct_receive.cpp", - eth_receiver_core, - tt_metal::EthernetConfig{.noc = tt_metal::NOC::NOC_0}); // probably want to use NOC_1 here - - tt_metal::SetRuntimeArgs( - receiver_program, - eth_receiver_kernel, - eth_receiver_core, - { - (uint32_t)byte_size, - }); - - //////////////////////////////////////////////////////////////////////////// - // Execute Programs - //////////////////////////////////////////////////////////////////////////// - - std::thread th1 = std::thread([&] { - tt_metal::detail::LaunchProgram(sender_device, sender_program); - }); - std::thread th2 = std::thread([&] { - tt_metal::detail::LaunchProgram(receiver_device, receiver_program); - }); - - th1.join(); - th2.join(); - // tt_metal::ReadFromBuffer(l1_buffer, dest_core_data); - auto readback_vec = llrt::read_hex_vec_from_core( - receiver_device->id(), - receiver_device->ethernet_core_from_logical_core(eth_receiver_core), - dst_eth_l1_byte_address, - byte_size); - pass &= (readback_vec == inputs); - if (not pass) { - std::cout << "Mismatch at Core: " << eth_receiver_core.str() << std::endl; - std::cout << readback_vec[0] << std::endl; - } - return pass; 
-} - - - -} // namespace unit_tests::erisc::kernels - -TEST_F(N300DeviceFixture, EthKernelsDirectSendChip0ToChip1) { - using namespace CMAKE_UNIQUE_NAMESPACE; - GTEST_SKIP(); - const auto& device_0 = devices_.at(0); - const auto& device_1 = devices_.at(1); - - const size_t src_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; - const size_t dst_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; - - for (const auto& sender_core : device_0->get_active_ethernet_cores(true)) { - auto [device_id, receiver_core] = device_0->get_connected_ethernet_core(sender_core); - if (device_1->id() != device_id) { - continue; - } - ASSERT_TRUE(unit_tests::erisc::kernels::eth_direct_sender_receiver_kernels( - device_0, - device_1, - WORD_SIZE, - src_eth_l1_byte_address, - dst_eth_l1_byte_address, - sender_core, - receiver_core)); - ASSERT_TRUE(unit_tests::erisc::kernels::eth_direct_sender_receiver_kernels( - device_0, - device_1, - 4 * WORD_SIZE, - src_eth_l1_byte_address, - dst_eth_l1_byte_address, - sender_core, - receiver_core)); - ASSERT_TRUE(unit_tests::erisc::kernels::eth_direct_sender_receiver_kernels( - device_0, - device_1, - 256 * WORD_SIZE, - src_eth_l1_byte_address, - dst_eth_l1_byte_address, - sender_core, - receiver_core)); - ASSERT_TRUE(unit_tests::erisc::kernels::eth_direct_sender_receiver_kernels( - device_0, - device_1, - 1000 * WORD_SIZE, - src_eth_l1_byte_address, - dst_eth_l1_byte_address, - sender_core, - receiver_core)); - } -} - -TEST_F(N300DeviceFixture, EthKernelsDirectSendChip1ToChip0) { - using namespace CMAKE_UNIQUE_NAMESPACE; - GTEST_SKIP(); - const auto& device_0 = devices_.at(0); - const auto& device_1 = devices_.at(1); - - const size_t src_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; - const size_t dst_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; - - for (const auto& sender_core : device_1->get_active_ethernet_cores(true)) { - auto [device_id, 
receiver_core] = device_1->get_connected_ethernet_core(sender_core); - if (device_0->id() != device_id) { - continue; - } - ASSERT_TRUE(unit_tests::erisc::kernels::eth_direct_sender_receiver_kernels( - device_1, - device_0, - WORD_SIZE, - src_eth_l1_byte_address, - dst_eth_l1_byte_address, - sender_core, - receiver_core)); - ASSERT_TRUE(unit_tests::erisc::kernels::eth_direct_sender_receiver_kernels( - device_1, - device_0, - 4 * WORD_SIZE, - src_eth_l1_byte_address, - dst_eth_l1_byte_address, - sender_core, - receiver_core)); - ASSERT_TRUE(unit_tests::erisc::kernels::eth_direct_sender_receiver_kernels( - device_1, - device_0, - 256 * WORD_SIZE, - src_eth_l1_byte_address, - dst_eth_l1_byte_address, - sender_core, - receiver_core)); - ASSERT_TRUE(unit_tests::erisc::kernels::eth_direct_sender_receiver_kernels( - device_1, - device_0, - 1000 * WORD_SIZE, - src_eth_l1_byte_address, - dst_eth_l1_byte_address, - sender_core, - receiver_core)); - } -} - -TEST_F(DeviceFixture, EthKernelsDirectSendAllConnectedChips) { - using namespace CMAKE_UNIQUE_NAMESPACE; - const size_t src_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; - const size_t dst_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; - for (const auto& sender_device : devices_) { - for (const auto& receiver_device : devices_) { - if (sender_device->id() == receiver_device->id()) { - continue; - } - for (const auto& sender_core : sender_device->get_active_ethernet_cores(true)) { - auto [device_id, receiver_core] = sender_device->get_connected_ethernet_core(sender_core); - if (receiver_device->id() != device_id) { - continue; - } - ASSERT_TRUE(unit_tests::erisc::kernels::eth_direct_sender_receiver_kernels( - sender_device, - receiver_device, - WORD_SIZE, - src_eth_l1_byte_address, - dst_eth_l1_byte_address, - sender_core, - receiver_core)); - ASSERT_TRUE(unit_tests::erisc::kernels::eth_direct_sender_receiver_kernels( - sender_device, - receiver_device, - 4 * WORD_SIZE, - 
src_eth_l1_byte_address, - dst_eth_l1_byte_address, - sender_core, - receiver_core)); - ASSERT_TRUE(unit_tests::erisc::kernels::eth_direct_sender_receiver_kernels( - sender_device, - receiver_device, - 256 * WORD_SIZE, - src_eth_l1_byte_address, - dst_eth_l1_byte_address, - sender_core, - receiver_core)); - ASSERT_TRUE(unit_tests::erisc::kernels::eth_direct_sender_receiver_kernels( - sender_device, - receiver_device, - 1000 * WORD_SIZE, - src_eth_l1_byte_address, - dst_eth_l1_byte_address, - sender_core, - receiver_core)); - } - } - } -} - -TEST_F(N300DeviceFixture, EthKernelsBidirectionalDirectSend) { - using namespace CMAKE_UNIQUE_NAMESPACE; - const auto& device_0 = devices_.at(0); - const auto& device_1 = devices_.at(1); - - const size_t src_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; - const size_t dst_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; - - for (const auto& sender_core : device_0->get_active_ethernet_cores(true)) { - CoreCoord receiver_core = std::get<1>(device_0->get_connected_ethernet_core(sender_core)); - ASSERT_TRUE(unit_tests::erisc::kernels::eth_direct_sender_receiver_kernels( - device_0, - device_1, - WORD_SIZE, - src_eth_l1_byte_address, - dst_eth_l1_byte_address, - sender_core, - receiver_core)); - ASSERT_TRUE(unit_tests::erisc::kernels::eth_direct_sender_receiver_kernels( - device_1, - device_0, - WORD_SIZE, - src_eth_l1_byte_address, - dst_eth_l1_byte_address, - receiver_core, - sender_core)); - } - for (const auto& sender_core : device_0->get_active_ethernet_cores(true)) { - CoreCoord receiver_core = std::get<1>(device_0->get_connected_ethernet_core(sender_core)); - ASSERT_TRUE(unit_tests::erisc::kernels::eth_direct_sender_receiver_kernels( - device_0, - device_1, - WORD_SIZE * 256, - src_eth_l1_byte_address, - dst_eth_l1_byte_address, - sender_core, - receiver_core)); - ASSERT_TRUE(unit_tests::erisc::kernels::eth_direct_sender_receiver_kernels( - device_1, - device_0, - WORD_SIZE * 
256, - src_eth_l1_byte_address, - dst_eth_l1_byte_address, - receiver_core, - sender_core)); - } - for (const auto& sender_core : device_0->get_active_ethernet_cores(true)) { - CoreCoord receiver_core = std::get<1>(device_0->get_connected_ethernet_core(sender_core)); - ASSERT_TRUE(unit_tests::erisc::kernels::eth_direct_sender_receiver_kernels( - device_0, - device_1, - WORD_SIZE * 1024, - src_eth_l1_byte_address, - dst_eth_l1_byte_address, - sender_core, - receiver_core)); - ASSERT_TRUE(unit_tests::erisc::kernels::eth_direct_sender_receiver_kernels( - device_1, - device_0, - WORD_SIZE * 1024, - src_eth_l1_byte_address, - dst_eth_l1_byte_address, - receiver_core, - sender_core)); - } - for (const auto& sender_core : device_0->get_active_ethernet_cores(true)) { - CoreCoord receiver_core = std::get<1>(device_0->get_connected_ethernet_core(sender_core)); - ASSERT_TRUE(unit_tests::erisc::kernels::eth_direct_sender_receiver_kernels( - device_0, - device_1, - WORD_SIZE * MAX_NUM_WORDS, - src_eth_l1_byte_address, - dst_eth_l1_byte_address, - sender_core, - receiver_core)); - ASSERT_TRUE(unit_tests::erisc::kernels::eth_direct_sender_receiver_kernels( - device_1, - device_0, - WORD_SIZE * MAX_NUM_WORDS, - src_eth_l1_byte_address, - dst_eth_l1_byte_address, - receiver_core, - sender_core)); - } -} - -TEST_F(N300DeviceFixture, EthKernelsRepeatedDirectSends) { - using namespace CMAKE_UNIQUE_NAMESPACE; - const auto& device_0 = devices_.at(0); - const auto& device_1 = devices_.at(1); - - const size_t src_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; - const size_t dst_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; - - for (const auto& sender_core : device_0->get_active_ethernet_cores(true)) { - CoreCoord receiver_core = std::get<1>(device_0->get_connected_ethernet_core(sender_core)); - for (int i = 0; i < 10; i++) { - ASSERT_TRUE(unit_tests::erisc::kernels::eth_direct_sender_receiver_kernels( - device_0, - device_1, - 
WORD_SIZE, - src_eth_l1_byte_address + WORD_SIZE * i, - dst_eth_l1_byte_address + WORD_SIZE * i, - sender_core, - receiver_core)); - } - for (int i = 0; i < 10; i++) { - ASSERT_TRUE(unit_tests::erisc::kernels::eth_direct_sender_receiver_kernels( - device_1, - device_0, - WORD_SIZE, - src_eth_l1_byte_address + WORD_SIZE * i, - dst_eth_l1_byte_address + WORD_SIZE * i, - receiver_core, - sender_core)); - } - } -} - -TEST_F(N300DeviceFixture, EthKernelsRandomDirectSendTests) { - using namespace CMAKE_UNIQUE_NAMESPACE; - srand(0); - const auto& device_0 = devices_.at(0); - const auto& device_1 = devices_.at(1); - - std::map, std::tuple> connectivity = {}; - for (const auto& sender_core : device_0->get_active_ethernet_cores(true)) { - const auto& receiver_core = device_0->get_connected_ethernet_core(sender_core); - connectivity.insert({{0, sender_core}, receiver_core}); - } - for (const auto& sender_core : device_1->get_active_ethernet_cores(true)) { - const auto& receiver_core = device_1->get_connected_ethernet_core(sender_core); - connectivity.insert({{1, sender_core}, receiver_core}); - } - for (int i = 0; i < 1000; i++) { - auto it = connectivity.begin(); - std::advance(it, rand() % (connectivity.size())); - - const auto& send_chip = devices_.at(std::get<0>(it->first)); - CoreCoord sender_core = std::get<1>(it->first); - const auto& receiver_chip = devices_.at(std::get<0>(it->second)); - CoreCoord receiver_core = std::get<1>(it->second); - - const size_t src_eth_l1_byte_address = unit_tests::erisc::kernels::get_rand_32_byte_aligned_address( - eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE, eth_l1_mem::address_map::MAX_L1_LOADING_SIZE); - const size_t dst_eth_l1_byte_address = unit_tests::erisc::kernels::get_rand_32_byte_aligned_address( - eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE, eth_l1_mem::address_map::MAX_L1_LOADING_SIZE); - - int max_words = (eth_l1_mem::address_map::MAX_L1_LOADING_SIZE - - std::max(src_eth_l1_byte_address, dst_eth_l1_byte_address)) / 
- WORD_SIZE; - int num_words = rand() % max_words + 1; - - ASSERT_TRUE(unit_tests::erisc::kernels::eth_direct_sender_receiver_kernels( - send_chip, - receiver_chip, - WORD_SIZE * num_words, - src_eth_l1_byte_address, - dst_eth_l1_byte_address, - sender_core, - receiver_core)); - } -} -TEST_F(N300DeviceFixture, EthKernelsRandomEthPacketSizeDirectSendTests) { - srand(0); - const auto& device_0 = devices_.at(0); - const auto& device_1 = devices_.at(1); - - std::map, std::tuple> connectivity = {}; - for (const auto& sender_core : device_0->get_active_ethernet_cores(true)) { - const auto& receiver_core = device_0->get_connected_ethernet_core(sender_core); - connectivity.insert({{0, sender_core}, receiver_core}); - } - for (const auto& sender_core : device_1->get_active_ethernet_cores(true)) { - const auto& receiver_core = device_1->get_connected_ethernet_core(sender_core); - connectivity.insert({{1, sender_core}, receiver_core}); - } - std::vector num_bytes_per_send_test_vals = { - 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536}; - for (const auto& num_bytes_per_send : num_bytes_per_send_test_vals) { - log_info(tt::LogTest, "Random eth send tests with {} bytes per packet", num_bytes_per_send); - for (int i = 0; i < 10; i++) { - auto it = connectivity.begin(); - std::advance(it, rand() % (connectivity.size())); - - const auto& send_chip = devices_.at(std::get<0>(it->first)); - CoreCoord sender_core = std::get<1>(it->first); - const auto& receiver_chip = devices_.at(std::get<0>(it->second)); - CoreCoord receiver_core = std::get<1>(it->second); - - const size_t src_eth_l1_byte_address = unit_tests::erisc::kernels::get_rand_32_byte_aligned_address( - eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE, - eth_l1_mem::address_map::MAX_L1_LOADING_SIZE - 65536); - const size_t dst_eth_l1_byte_address = unit_tests::erisc::kernels::get_rand_32_byte_aligned_address( - eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE, - 
eth_l1_mem::address_map::MAX_L1_LOADING_SIZE - 65536); - - int max_words = (eth_l1_mem::address_map::MAX_L1_LOADING_SIZE - - std::max(src_eth_l1_byte_address, dst_eth_l1_byte_address)) / - num_bytes_per_send; - int num_words = rand() % max_words + 1; - - ASSERT_TRUE(unit_tests::erisc::kernels::eth_direct_sender_receiver_kernels( - send_chip, - receiver_chip, - num_bytes_per_send * num_words, - src_eth_l1_byte_address, - dst_eth_l1_byte_address, - sender_core, - receiver_core, - num_bytes_per_send)); - } - } -} - -// TODO #14640: Run this on WH when i$ flush issue is addressed -TEST_F(BlackholeSingleCardFixture, EthKernelOnIdleErisc0) { - using namespace CMAKE_UNIQUE_NAMESPACE; - uint32_t eth_l1_address = hal.get_dev_addr(HalProgrammableCoreType::IDLE_ETH, HalL1MemAddrType::UNRESERVED); - tt_metal::EthernetConfig noc0_ethernet_config{.eth_mode = Eth::IDLE, .noc = tt_metal::NOC::NOC_0, .processor = tt_metal::DataMovementProcessor::RISCV_0}; - tt_metal::EthernetConfig noc1_ethernet_config{.eth_mode = Eth::IDLE, .noc = tt_metal::NOC::NOC_1, .processor = tt_metal::DataMovementProcessor::RISCV_0}; - - for (const auto& eth_core : device_->get_inactive_ethernet_cores()) { - ASSERT_TRUE(unit_tests::erisc::kernels::reader_kernel_no_send( - device_, WORD_SIZE * 2048, eth_l1_address, eth_core, noc0_ethernet_config)); - ASSERT_TRUE(unit_tests::erisc::kernels::reader_kernel_no_send( - device_, WORD_SIZE * 2048, eth_l1_address, eth_core, noc1_ethernet_config)); - ASSERT_TRUE(unit_tests::erisc::kernels::writer_kernel_no_receive( - device_, WORD_SIZE * 2048, eth_l1_address, eth_core, noc0_ethernet_config)); - ASSERT_TRUE(unit_tests::erisc::kernels::writer_kernel_no_receive( - device_, WORD_SIZE * 2048, eth_l1_address, eth_core, noc1_ethernet_config)); - } -} - -TEST_F(BlackholeSingleCardFixture, EthKernelOnIdleErisc1) { - using namespace CMAKE_UNIQUE_NAMESPACE; - uint32_t eth_l1_address = hal.get_dev_addr(HalProgrammableCoreType::IDLE_ETH, HalL1MemAddrType::UNRESERVED); - 
tt_metal::EthernetConfig noc0_ethernet_config{.eth_mode = Eth::IDLE, .noc = tt_metal::NOC::NOC_0, .processor = tt_metal::DataMovementProcessor::RISCV_1}; - tt_metal::EthernetConfig noc1_ethernet_config{.eth_mode = Eth::IDLE, .noc = tt_metal::NOC::NOC_1, .processor = tt_metal::DataMovementProcessor::RISCV_1}; - - for (const auto& eth_core : device_->get_inactive_ethernet_cores()) { - ASSERT_TRUE(unit_tests::erisc::kernels::reader_kernel_no_send( - device_, WORD_SIZE * 2048, eth_l1_address, eth_core, noc0_ethernet_config)); - ASSERT_TRUE(unit_tests::erisc::kernels::reader_kernel_no_send( - device_, WORD_SIZE * 2048, eth_l1_address, eth_core, noc1_ethernet_config)); - ASSERT_TRUE(unit_tests::erisc::kernels::writer_kernel_no_receive( - device_, WORD_SIZE * 2048, eth_l1_address, eth_core, noc0_ethernet_config)); - ASSERT_TRUE(unit_tests::erisc::kernels::writer_kernel_no_receive( - device_, WORD_SIZE * 2048, eth_l1_address, eth_core, noc1_ethernet_config)); - } -} - -TEST_F(BlackholeSingleCardFixture, EthKernelOnBothIdleEriscs) { - using namespace CMAKE_UNIQUE_NAMESPACE; - uint32_t read_write_size_bytes = WORD_SIZE * 2048; - uint32_t reader_dst_address = hal.get_dev_addr(HalProgrammableCoreType::IDLE_ETH, HalL1MemAddrType::UNRESERVED); - uint32_t writer_src_address = reader_dst_address + read_write_size_bytes; - tt_metal::EthernetConfig erisc0_ethernet_config{.eth_mode = Eth::IDLE, .noc = tt_metal::NOC::NOC_0, .processor = tt_metal::DataMovementProcessor::RISCV_0}; - tt_metal::EthernetConfig erisc1_ethernet_config{.eth_mode = Eth::IDLE, .noc = tt_metal::NOC::NOC_0, .processor = tt_metal::DataMovementProcessor::RISCV_1}; - - for (const auto& eth_core : device_->get_inactive_ethernet_cores()) { - ASSERT_TRUE(unit_tests::erisc::kernels::noc_reader_and_writer_kernels( - device_, read_write_size_bytes, reader_dst_address, writer_src_address, eth_core, erisc0_ethernet_config, erisc1_ethernet_config - )); - erisc0_ethernet_config.noc = tt_metal::NOC::NOC_1; - 
erisc1_ethernet_config.noc = tt_metal::NOC::NOC_1; - ASSERT_TRUE(unit_tests::erisc::kernels::noc_reader_and_writer_kernels( - device_, read_write_size_bytes, reader_dst_address, writer_src_address, eth_core, erisc0_ethernet_config, erisc1_ethernet_config - )); - } -} diff --git a/tests/tt_metal/tt_metal/unit_tests/ethernet/erisc_app_direct_send.cpp b/tests/tt_metal/tt_metal/unit_tests/ethernet/erisc_app_direct_send.cpp deleted file mode 100644 index 01f63153840..00000000000 --- a/tests/tt_metal/tt_metal/unit_tests/ethernet/erisc_app_direct_send.cpp +++ /dev/null @@ -1,278 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -#include - -#include -#include -#include - -#include "n300_device_fixture.hpp" -#include "tt_metal/common/logger.hpp" -#include "tt_metal/detail/tt_metal.hpp" -#include "tt_metal/host_api.hpp" -#include "tt_metal/test_utils/comparison.hpp" -#include "tt_metal/test_utils/df/df.hpp" -#include "tt_metal/test_utils/print_helpers.hpp" -#include "tt_metal/test_utils/stimulus.hpp" - -namespace { -namespace CMAKE_UNIQUE_NAMESPACE { -constexpr std::int32_t WORD_SIZE = 16; // 16 bytes per eth send packet -constexpr std::int32_t MAX_NUM_WORDS = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_SIZE / WORD_SIZE; - -struct erisc_info_t { - volatile uint32_t num_bytes; - volatile uint32_t mode; - volatile uint32_t reserved_0_; - volatile uint32_t reserved_1_; - volatile uint32_t bytes_done; - volatile uint32_t reserverd_2_; - volatile uint32_t reserverd_3_; - volatile uint32_t reserverd_4_; -}; -} -} - -using namespace tt; -using namespace tt::test_utils; -using namespace tt::test_utils::df; - -namespace unit_tests::erisc::direct_send { -// Tests ethernet direct send/receive from ERISC_L1_UNRESERVED_BASE -bool send_over_eth( - tt_metal::Device* sender_device, - tt_metal::Device* receiver_device, - const CoreCoord& sender_core, - const CoreCoord& receiver_core, - const size_t& byte_size) { - tt::log_debug( - 
tt::LogTest, - "Running direct send test with sender chip {} core {}, receiver chip {} core {}, sending {} bytes", - sender_device->id(), - sender_core.str(), - receiver_device->id(), - receiver_core.str(), - byte_size); - std::vector eth_cores = { - CoreCoord(9, 0), - CoreCoord(1, 0), - CoreCoord(8, 0), - CoreCoord(2, 0), - CoreCoord(9, 6), - CoreCoord(1, 6), - CoreCoord(8, 6), - CoreCoord(2, 6), - CoreCoord(7, 0), - CoreCoord(3, 0), - CoreCoord(6, 0), - CoreCoord(4, 0), - CoreCoord(7, 6), - CoreCoord(3, 6), - CoreCoord(6, 6), - CoreCoord(4, 6)}; - - // Disable all eth core runtime app flags, zero out data write counter - std::vector run_test_app_flag = {0x0}; - for (const auto& eth_core : eth_cores) { - llrt::write_hex_vec_to_core( - sender_device->id(), eth_core, run_test_app_flag, eth_l1_mem::address_map::LAUNCH_ERISC_APP_FLAG); - llrt::write_hex_vec_to_core( - receiver_device->id(), eth_core, run_test_app_flag, eth_l1_mem::address_map::LAUNCH_ERISC_APP_FLAG); - std::vector zero = {0, 0, 0, 0, 0, 0, 0, 0}; - llrt::write_hex_vec_to_core( - sender_device->id(), eth_core, zero, eth_l1_mem::address_map::ERISC_APP_SYNC_INFO_BASE); - llrt::write_hex_vec_to_core( - receiver_device->id(), eth_core, zero, eth_l1_mem::address_map::ERISC_APP_SYNC_INFO_BASE); - } - - // TODO: is it possible that receiver core app is stil running when we push inputs here??? 
- auto inputs = generate_uniform_random_vector(0, 100, byte_size / sizeof(uint32_t)); - llrt::write_hex_vec_to_core( - sender_device->id(), sender_core, inputs, eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE); - - // Zero out receiving address to ensure no stale data is causing tests to pass - std::vector all_zeros(inputs.size(), 0); - llrt::write_hex_vec_to_core( - receiver_device->id(), receiver_core, all_zeros, eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE); - - std::vector args_0 = {uint32_t(byte_size), 0}; - llrt::write_hex_vec_to_core(sender_device->id(), sender_core, args_0, eth_l1_mem::address_map::ERISC_APP_SYNC_INFO_BASE); - std::vector args_1 = {uint32_t(byte_size), 1}; - llrt::write_hex_vec_to_core(receiver_device->id(), receiver_core, args_1, eth_l1_mem::address_map::ERISC_APP_SYNC_INFO_BASE); - - // TODO: this should be updated to use kernel api - uint32_t active_eth_index = hal.get_programmable_core_type_index(HalProgrammableCoreType::ACTIVE_ETH); - ll_api::memory binary_mem_send = llrt::get_risc_binary( - sender_device->build_firmware_target_path(active_eth_index, 0, 0), active_eth_index, 0, 0); - ll_api::memory binary_mem_receive = llrt::get_risc_binary( - receiver_device->build_firmware_target_path(active_eth_index, 0, 0), active_eth_index, 0, 0); - - for (const auto& eth_core : eth_cores) { - llrt::write_hex_vec_to_core( - sender_device->id(), eth_core, binary_mem_send.data(), eth_l1_mem::address_map::FIRMWARE_BASE); - llrt::write_hex_vec_to_core( - receiver_device->id(), eth_core, binary_mem_receive.data(), eth_l1_mem::address_map::FIRMWARE_BASE); - } - - // Activate sender core runtime app - run_test_app_flag = {0x1}; - // send remote first, otherwise eth core may be blocked, very ugly for now... 
- if (receiver_device->id() == 1) { - llrt::write_hex_vec_to_core( - 1, receiver_core, run_test_app_flag, eth_l1_mem::address_map::LAUNCH_ERISC_APP_FLAG); - } else { - llrt::write_hex_vec_to_core(1, sender_core, run_test_app_flag, eth_l1_mem::address_map::LAUNCH_ERISC_APP_FLAG); - } - if (sender_device->id() == 0) { - llrt::write_hex_vec_to_core(0, sender_core, run_test_app_flag, eth_l1_mem::address_map::LAUNCH_ERISC_APP_FLAG); - } else { - llrt::write_hex_vec_to_core( - 0, receiver_core, run_test_app_flag, eth_l1_mem::address_map::LAUNCH_ERISC_APP_FLAG); - } - - bool pass = true; - auto readback_vec = llrt::read_hex_vec_from_core( - receiver_device->id(), receiver_core, eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE, byte_size); - pass &= (readback_vec == inputs); - - return pass; -} - -} // namespace unit_tests::erisc::direct_send - -TEST_F(N300DeviceFixture, SingleEthCoreDirectSendChip0ToChip1) { - using namespace CMAKE_UNIQUE_NAMESPACE; - GTEST_SKIP(); - ASSERT_TRUE(this->num_devices_ == 2); - const auto& device_0 = devices_.at(0); - const auto& device_1 = devices_.at(1); - CoreCoord sender_core_0 = CoreCoord(9, 6); - CoreCoord sender_core_1 = CoreCoord(1, 6); - - CoreCoord receiver_core_0 = CoreCoord(9, 0); - CoreCoord receiver_core_1 = CoreCoord(1, 0); - - ASSERT_TRUE( - unit_tests::erisc::direct_send::send_over_eth(device_0, device_1, sender_core_0, receiver_core_0, WORD_SIZE)); - ASSERT_TRUE( - unit_tests::erisc::direct_send::send_over_eth(device_0, device_1, sender_core_1, receiver_core_1, WORD_SIZE)); - ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( - device_0, device_1, sender_core_0, receiver_core_0, WORD_SIZE * 256)); - ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( - device_0, device_1, sender_core_1, receiver_core_1, WORD_SIZE * 256)); - ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( - device_0, device_1, sender_core_0, receiver_core_0, WORD_SIZE * 1024)); - 
ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( - device_0, device_1, sender_core_1, receiver_core_1, WORD_SIZE * 1024)); - ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( - device_0, device_1, sender_core_0, receiver_core_0, WORD_SIZE * MAX_NUM_WORDS)); - ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( - device_0, device_1, sender_core_1, receiver_core_1, WORD_SIZE * MAX_NUM_WORDS)); -} - -TEST_F(N300DeviceFixture, SingleEthCoreDirectSendChip1ToChip0) { - using namespace CMAKE_UNIQUE_NAMESPACE; - GTEST_SKIP(); - ASSERT_TRUE(this->num_devices_ == 2); - const auto& device_0 = devices_.at(0); - const auto& device_1 = devices_.at(1); - CoreCoord sender_core_0 = CoreCoord(9, 0); - CoreCoord sender_core_1 = CoreCoord(1, 0); - - CoreCoord receiver_core_0 = CoreCoord(9, 6); - CoreCoord receiver_core_1 = CoreCoord(1, 6); - - ASSERT_TRUE( - unit_tests::erisc::direct_send::send_over_eth(device_1, device_0, sender_core_0, receiver_core_0, WORD_SIZE)); - ASSERT_TRUE( - unit_tests::erisc::direct_send::send_over_eth(device_1, device_0, sender_core_1, receiver_core_1, WORD_SIZE)); - ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( - device_1, device_0, sender_core_0, receiver_core_0, WORD_SIZE * 256)); - ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( - device_1, device_0, sender_core_1, receiver_core_1, WORD_SIZE * 256)); - ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( - device_1, device_0, sender_core_0, receiver_core_0, WORD_SIZE * 1024)); - ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( - device_1, device_0, sender_core_1, receiver_core_1, WORD_SIZE * 1024)); - ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( - device_1, device_0, sender_core_0, receiver_core_0, WORD_SIZE * MAX_NUM_WORDS)); - ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( - device_1, device_0, sender_core_1, receiver_core_1, WORD_SIZE * MAX_NUM_WORDS)); -} - -TEST_F(N300DeviceFixture, 
BidirectionalEthCoreDirectSend) { - using namespace CMAKE_UNIQUE_NAMESPACE; - GTEST_SKIP(); - ASSERT_TRUE(this->num_devices_ == 2); - const auto& device_0 = devices_.at(0); - const auto& device_1 = devices_.at(1); - CoreCoord sender_core_0 = CoreCoord(9, 6); - CoreCoord sender_core_1 = CoreCoord(1, 6); - - CoreCoord receiver_core_0 = CoreCoord(9, 0); - CoreCoord receiver_core_1 = CoreCoord(1, 0); - - ASSERT_TRUE( - unit_tests::erisc::direct_send::send_over_eth(device_0, device_1, sender_core_0, receiver_core_0, WORD_SIZE)); - ASSERT_TRUE( - unit_tests::erisc::direct_send::send_over_eth(device_1, device_0, receiver_core_0, sender_core_0, WORD_SIZE)); - ASSERT_TRUE( - unit_tests::erisc::direct_send::send_over_eth(device_0, device_1, sender_core_1, receiver_core_1, WORD_SIZE)); - ASSERT_TRUE( - unit_tests::erisc::direct_send::send_over_eth(device_1, device_0, receiver_core_1, sender_core_1, WORD_SIZE)); - ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( - device_0, device_1, sender_core_0, receiver_core_0, WORD_SIZE * 256)); - ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( - device_1, device_0, receiver_core_0, sender_core_0, WORD_SIZE * 256)); - ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( - device_0, device_1, sender_core_1, receiver_core_1, WORD_SIZE * 256)); - ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( - device_1, device_0, receiver_core_1, sender_core_1, WORD_SIZE * 256)); - ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( - device_0, device_1, sender_core_0, receiver_core_0, WORD_SIZE * 1024)); - ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( - device_1, device_0, receiver_core_0, sender_core_0, WORD_SIZE * 1024)); - ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( - device_0, device_1, sender_core_1, receiver_core_1, WORD_SIZE * 1024)); - ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( - device_1, device_0, receiver_core_1, sender_core_1, WORD_SIZE * 1024)); - 
ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( - device_0, device_1, sender_core_0, receiver_core_0, WORD_SIZE * MAX_NUM_WORDS)); - ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( - device_1, device_0, receiver_core_0, sender_core_0, WORD_SIZE * MAX_NUM_WORDS)); - ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( - device_0, device_1, sender_core_1, receiver_core_1, WORD_SIZE * MAX_NUM_WORDS)); - ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( - device_1, device_0, receiver_core_1, sender_core_1, WORD_SIZE * MAX_NUM_WORDS)); -} - -TEST_F(N300DeviceFixture, RandomDirectSendTests) { - using namespace CMAKE_UNIQUE_NAMESPACE; - GTEST_SKIP(); - srand(0); - ASSERT_TRUE(this->num_devices_ == 2); - - std::map, std::pair> connectivity = { - {{0, CoreCoord(9, 6)}, {1, CoreCoord(9, 0)}}, - {{1, CoreCoord(9, 0)}, {0, CoreCoord(9, 6)}}, - {{0, CoreCoord(1, 6)}, {1, CoreCoord(1, 0)}}, - {{1, CoreCoord(1, 0)}, {0, CoreCoord(1, 6)}}}; - for (int i = 0; i < 1000; i++) { - auto it = connectivity.begin(); - std::advance(it, rand() % (connectivity.size())); - - const auto& send_chip = devices_.at(std::get<0>(it->first)); - CoreCoord sender_core = std::get<1>(it->first); - const auto& receiver_chip = devices_.at(std::get<0>(it->second)); - CoreCoord receiver_core = std::get<1>(it->second); - int num_words = 0; - if constexpr (MAX_NUM_WORDS != 0) { - num_words = rand() % MAX_NUM_WORDS + 1; - } - - ASSERT_TRUE(unit_tests::erisc::direct_send::send_over_eth( - send_chip, receiver_chip, sender_core, receiver_core, WORD_SIZE * num_words)); - } -} diff --git a/tests/tt_metal/tt_metal/unit_tests/fast_dispatch_kernels/test_write_host.cpp b/tests/tt_metal/tt_metal/unit_tests/fast_dispatch_kernels/test_write_host.cpp deleted file mode 100644 index f70039820a5..00000000000 --- a/tests/tt_metal/tt_metal/unit_tests/fast_dispatch_kernels/test_write_host.cpp +++ /dev/null @@ -1,260 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#include - -#include "gtest/gtest.h" -#include "tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h" -#include "tests/tt_metal/tt_metal/unit_tests/common/device_fixture.hpp" -#include "tt_metal/detail/tt_metal.hpp" -#include "tt_metal/host_api.hpp" -#include "tt_metal/test_utils/env_vars.hpp" -#include "tt_metal/common/math.hpp" - -using std::vector; -using namespace tt::tt_metal; - -// TODO: Remove dependency on "tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h" and remove globals -bool debug_g = false; -// Page size 4096 bytes -uint32_t log_dispatch_buffer_page_size_g = 12; -uint32_t dispatch_buffer_page_size_g = 1 << log_dispatch_buffer_page_size_g; -bool use_coherent_data_g = false; -uint32_t hugepage_buffer_size_g = 256 * 1024 * 1024; -uint32_t dev_hugepage_base = dispatch_buffer_page_size_g; -std::pair default_ptrs = std::make_pair(dev_hugepage_base, 0); -uint32_t hugepage_issue_buffer_size_g; - -inline void gen_dispatcher_pad_to_page(vector& cmds, uint32_t page_size) { - uint32_t num_words_in_page = page_size / sizeof(uint32_t); - uint32_t num_pad_words = tt::round_up(cmds.size(), num_words_in_page) - cmds.size(); - for (uint32_t i = 0; i < num_pad_words; ++i) { - cmds.push_back(0); - } -} - -inline bool validate_results( - std::vector& dev_data, - uint32_t num_words, - void *host_hugepage_base, - uint32_t dev_hugepage_base, - uint32_t dev_hugepage_start, - uint32_t hugepage_buffer_size_g) { - bool failed = false; - - log_info(tt::LogTest, "Validating {} bytes from hugepage", num_words * sizeof(uint32_t)); - - uint32_t *results = ((uint32_t *)host_hugepage_base); // 8 = 32B / sizeof(uint32_t) - uint32_t dev_hugepage_start_diff_uint = (dev_hugepage_start - dev_hugepage_base) / sizeof(uint32_t); - uint32_t hugepage_buffer_size_g_uint = hugepage_buffer_size_g / sizeof(uint32_t); - int fail_count = 0; - - for (int i = 0; i < num_words; ++i) { - uint32_t hugepage_idx = 
(dev_hugepage_start_diff_uint + i) % hugepage_buffer_size_g_uint; - if (results[hugepage_idx] != dev_data[i]) { - if (!failed) { - tt::log_fatal("Data mismatch"); - fprintf(stderr, "First 20 failures for each core: [idx] expected->read\n"); - } - if (fail_count == 0) { - fprintf(stderr, "Failures reading hugepage\n"); - } - - fprintf(stderr, " [%02d] 0x%08x->0x%08x\n", i, (unsigned int)dev_data[i], (unsigned int)results[hugepage_idx]); - - failed = true; - fail_count++; - if (fail_count > 20) { - break; - } - } - } - - return !failed; -} - -namespace local_test_functions { - -bool test_write_host(Device *device, uint32_t data_size, std::pair write_ptr_start = default_ptrs, std::pair read_ptr_start = default_ptrs, std::optional> read_ptr_update = std::nullopt) { - CoreCoord spoof_prefetch_core = {0, 0}; - CoreCoord dispatch_core = {4, 0}; - CoreCoord phys_spoof_prefetch_core = device->worker_core_from_logical_core(spoof_prefetch_core); - CoreCoord phys_dispatch_core = device->worker_core_from_logical_core(dispatch_core); - - tt::tt_metal::Program program = tt::tt_metal::CreateProgram(); - - uint32_t dispatch_buffer_size_blocks_g = 4; - - uint32_t total_size = data_size + sizeof(CQDispatchCmd); - - // NOTE: this test hijacks hugepage - // Start after first page since ptrs are at the start of hugepage - - void *host_hugepage_base; - chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device->id()); - uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device->id()); - host_hugepage_base = (void *)tt::Cluster::instance().host_dma_address(0, mmio_device_id, channel); - host_hugepage_base = (void *)((uint8_t *)host_hugepage_base + dev_hugepage_base); - - uint32_t l1_unreserved_base = devices_.at(id)->get_base_allocator_addr(HalMemType::L1); - uint32_t l1_buf_base = align(l1_unreserved_base, dispatch_buffer_page_size_g); - - std::vector dispatch_cmds; - CQDispatchCmd cmd; - memset(&cmd, 0, sizeof(CQDispatchCmd)); - 
cmd.base.cmd_id = CQ_DISPATCH_CMD_WRITE_LINEAR_H_HOST; - cmd.write_linear_host.length = data_size + sizeof(CQDispatchCmd); - add_dispatcher_cmd(dispatch_cmds, cmd, data_size); - gen_dispatcher_pad_to_page(dispatch_cmds, dispatch_buffer_page_size_g); - uint32_t dev_output_num_words = total_size / sizeof(uint32_t); - gen_dispatcher_terminate_cmd(dispatch_cmds); - - uint32_t cmd_cb_pages = tt::div_up(dispatch_cmds.size() * sizeof(uint32_t), dispatch_buffer_page_size_g); - - // Make full blocks - uint32_t dispatch_buffer_pages = tt::round_up(cmd_cb_pages, dispatch_buffer_size_blocks_g); - uint32_t dispatch_buffer_size_g = dispatch_buffer_pages * dispatch_buffer_page_size_g; - TT_FATAL(l1_buf_base + dispatch_buffer_size_g <= device->l1_size_per_core(), "Does not fit in L1"); - - std::vector write_ptr_val = {(write_ptr_start.first >> 4) | (write_ptr_start.second << 31)}; - std::vector read_ptr_val = {(read_ptr_start.first >> 4) | (read_ptr_start.second << 31)}; - - uint32_t completion_q_wr_ptr = dispatch_constants::get(CoreType::WORKER).get_device_command_queue_addr(CommandQueueDeviceAddrType::COMPLETION_Q_WR); - uint32_t completion_q_rd_ptr = dispatch_constants::get(dispatch_core_type).get_device_command_queue_addr(CommandQueueDeviceAddrType::COMPLETION_Q_RD); - // Write the read and write ptrs - tt::llrt::write_hex_vec_to_core( - device->id(), phys_dispatch_core, write_ptr_val, completion_q_wr_ptr); - tt::llrt::write_hex_vec_to_core( - device->id(), phys_dispatch_core, read_ptr_val, completion_q_rd_ptr); - - tt::llrt::write_hex_vec_to_core(device->id(), phys_spoof_prefetch_core, dispatch_cmds, l1_buf_base); - tt::Cluster::instance().l1_barrier(device->id()); - - const uint32_t spoof_prefetch_core_sem_0_id = - tt::tt_metal::CreateSemaphore(program, {spoof_prefetch_core}, dispatch_buffer_pages); - const uint32_t dispatch_core_sem_id = tt::tt_metal::CreateSemaphore(program, {dispatch_core}, 0); - TT_ASSERT(spoof_prefetch_core_sem_0_id == dispatch_core_sem_id); - const 
uint32_t dispatch_cb_sem = spoof_prefetch_core_sem_0_id; - - const uint32_t spoof_prefetch_core_sem_1_id = tt::tt_metal::CreateSemaphore(program, {spoof_prefetch_core}, 0); - const uint32_t prefetch_sync_sem = spoof_prefetch_core_sem_1_id; - - std::vector dispatch_compile_args = { - l1_buf_base, - log_dispatch_buffer_page_size_g, - dispatch_buffer_pages, - dispatch_cb_sem, - dispatch_cb_sem, // ugly, share an address - dispatch_buffer_size_blocks_g, - prefetch_sync_sem, - default_ptrs.second, - dev_hugepage_base, - hugepage_buffer_size_g, - 0, // unused downstream_cb_base - 0, // unused downstream_cb_size - 0, // unused my_downstream_cb_sem_id - 0, // unused downstream_cb_sem_id - 0, // unused split_dispatch_page_preamble_size - true, - true}; - std::vector spoof_prefetch_compile_args = { - l1_buf_base, - log_dispatch_buffer_page_size_g, - dispatch_buffer_pages, - dispatch_cb_sem, - l1_buf_base, - cmd_cb_pages, - // Hardcode page_batch_size to 1 to force the inner loops to only run once - 1, - prefetch_sync_sem, - }; - - std::map prefetch_defines = { - {"MY_NOC_X", std::to_string(phys_spoof_prefetch_core.x)}, - {"MY_NOC_Y", std::to_string(phys_spoof_prefetch_core.y)}, - {"DISPATCH_NOC_X", std::to_string(phys_dispatch_core.x)}, - {"DISPATCH_NOC_Y", std::to_string(phys_dispatch_core.y)}, - }; - - auto sp1 = tt::tt_metal::CreateKernel( - program, - "tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/kernels/spoof_prefetch.cpp", - {spoof_prefetch_core}, - tt::tt_metal::DataMovementConfig{ - .processor = tt::tt_metal::DataMovementProcessor::RISCV_1, - .noc = tt::tt_metal::NOC::RISCV_0_default, - .compile_args = spoof_prefetch_compile_args, - .defines = prefetch_defines}); - - // Hardcode outer loop to 1 - vector args = {1}; - tt::tt_metal::SetRuntimeArgs(program, sp1, spoof_prefetch_core, args); - - constexpr NOC my_noc_index = NOC::NOC_0; - constexpr NOC dispatch_upstream_noc_index = NOC::NOC_1; - - configure_kernel_variant(program, - 
"tt_metal/impl/dispatch/kernels/cq_dispatch.cpp", - dispatch_compile_args, - dispatch_core, - phys_dispatch_core, - phys_spoof_prefetch_core, - {0, 0}, - device, - my_noc_index, - my_noc_index, - my_noc_index); - - // Need a separate thread for SD - if (read_ptr_update.has_value()) { - std::thread t1 ([&]() { - uint64_t run_mailbox_address = GET_MAILBOX_ADDRESS_HOST(launch.run); - std::vector run_mailbox_read_val; - uint8_t run; - do { - run_mailbox_read_val = tt::llrt::read_hex_vec_from_core(device->id(), phys_dispatch_core, run_mailbox_address & ~0x3, sizeof(uint32_t)); - run = run_mailbox_read_val[0] >> (8 * (offsetof(launch_msg_t, run) & 3)); - } while (run != RUN_MSG_GO); - sleep(1); - std::vector read_ptr_update_val = {(read_ptr_update.value().first >> 4) | (read_ptr_update.value().second << 31)}; - tt::llrt::write_hex_vec_to_core( - device->id(), phys_dispatch_core, read_ptr_update_val, completion_q_rd_ptr); - }); - tt::tt_metal::detail::LaunchProgram(device, program); - t1.join(); - } else { - tt::tt_metal::detail::LaunchProgram(device, program); - } - - // Validation - bool pass = validate_results( - dispatch_cmds, dev_output_num_words, host_hugepage_base, dev_hugepage_base, write_ptr_start.first, hugepage_buffer_size_g); - return pass; -} - -} // end namespace local_test_functions - -namespace basic_tests { - -TEST_F(DeviceSingleCardFixture, TestWriteHostBasic) { - EXPECT_TRUE(local_test_functions::test_write_host(device_, dispatch_buffer_page_size_g - sizeof(CQDispatchCmd))); - EXPECT_TRUE(local_test_functions::test_write_host(device_, dispatch_buffer_page_size_g)); - EXPECT_TRUE(local_test_functions::test_write_host(device_, 256)); - EXPECT_TRUE(local_test_functions::test_write_host(device_, 3 * dispatch_buffer_page_size_g)); - EXPECT_TRUE(local_test_functions::test_write_host(device_, 10 * dispatch_buffer_page_size_g)); -} - -TEST_F(DeviceSingleCardFixture, TestWriteHostWrap) { - EXPECT_TRUE(local_test_functions::test_write_host(device_, 10 * 
dispatch_buffer_page_size_g, {hugepage_buffer_size_g - 1 * dispatch_buffer_page_size_g + dev_hugepage_base, 0}, {hugepage_buffer_size_g - 1 * dispatch_buffer_page_size_g + dev_hugepage_base, 0})); - EXPECT_TRUE(local_test_functions::test_write_host(device_, 10 * dispatch_buffer_page_size_g, {hugepage_buffer_size_g - 2 * dispatch_buffer_page_size_g + dev_hugepage_base, 0}, {hugepage_buffer_size_g - 2 * dispatch_buffer_page_size_g + dev_hugepage_base, 0})); - EXPECT_TRUE(local_test_functions::test_write_host(device_, 10 * dispatch_buffer_page_size_g, {hugepage_buffer_size_g - 3 * dispatch_buffer_page_size_g + dev_hugepage_base, 0}, {hugepage_buffer_size_g - 3 * dispatch_buffer_page_size_g + dev_hugepage_base, 0})); -} - -TEST_F(DeviceSingleCardFixture, TestWriteHostStall) { - EXPECT_TRUE(local_test_functions::test_write_host(device_, 10 * dispatch_buffer_page_size_g, {dev_hugepage_base, 1}, {dev_hugepage_base, 0}, std::make_pair(dev_hugepage_base + 11 * dispatch_buffer_page_size_g, 0))); - EXPECT_TRUE(local_test_functions::test_write_host(device_, 10 * dispatch_buffer_page_size_g, {dev_hugepage_base, 1}, {dev_hugepage_base + 5 * dispatch_buffer_page_size_g, 0}, std::make_pair(dev_hugepage_base + 11 * dispatch_buffer_page_size_g, 0))); - EXPECT_TRUE(local_test_functions::test_write_host(device_, 10 * dispatch_buffer_page_size_g, {dev_hugepage_base + 3 * dispatch_buffer_page_size_g, 1}, {dev_hugepage_base + 3 * dispatch_buffer_page_size_g, 0}, std::make_pair(dev_hugepage_base + 3 * dispatch_buffer_page_size_g, 1))); -} - -} // namespace basic_tests diff --git a/tests/tt_metal/tt_metal/unit_tests/tests_main.cpp b/tests/tt_metal/tt_metal/unit_tests/tests_main.cpp deleted file mode 100644 index 1e42f41a46c..00000000000 --- a/tests/tt_metal/tt_metal/unit_tests/tests_main.cpp +++ /dev/null @@ -1,5 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#include "gtest/gtest.h" diff --git a/tests/tt_metal/tt_metal/unit_tests_common/basic/test_kernel_creation.cpp b/tests/tt_metal/tt_metal/unit_tests_common/basic/test_kernel_creation.cpp deleted file mode 100644 index 68dc974545a..00000000000 --- a/tests/tt_metal/tt_metal/unit_tests_common/basic/test_kernel_creation.cpp +++ /dev/null @@ -1,79 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -#include "tests/tt_metal/tt_metal/unit_tests_common/common/common_fixture.hpp" -#include "gtest/gtest.h" -#include "tt_metal/host_api.hpp" -#include "tt_metal/detail/tt_metal.hpp" -#include "tt_metal/test_utils/env_vars.hpp" -#include "tt_metal/impl/dispatch/command_queue.hpp" -#include "tt_metal/common/logger.hpp" - - -using namespace tt; - -// Ensures we can successfully create kernels on available compute grid -TEST_F(CommonFixture, CreateKernelsOnComputeCores) { - for (unsigned int id = 0; id < devices_.size(); id++) { - tt_metal::Program program = CreateProgram(); - CoreCoord compute_grid = devices_.at(id)->compute_with_storage_grid_size(); - EXPECT_NO_THROW( - auto test_kernel = tt_metal::CreateKernel( - program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy.cpp", - CoreRange(CoreCoord(0, 0), CoreCoord(compute_grid.x, compute_grid.y)), - {.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default} - ); - ); - } -} - -// Ensure we cannot create kernels on storage cores -TEST_F(CommonFixture, CreateKernelsOnStorageCores) { - for (unsigned int id=0; id < devices_.size(); id++) { - if (devices_.at(id)->storage_only_cores().empty()) { - GTEST_SKIP() << "This test only runs on devices with storage only cores"; - } - CoreRangeSet storage_core_range_set = CoreRangeSet(devices_.at(id)->storage_only_cores()); - EXPECT_ANY_THROW( - auto test_kernel = tt_metal::CreateKernel( - program, - 
"tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy.cpp", - storage_core_range_set, - {.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default} - ); - ); - } -} - -TEST_F(CommonFixture, CreateKernelsOnDispatchCores) { - if (getenv("TT_METAL_SLOW_DISPATCH_MODE")) { - GTEST_SKIP() << "This test is only supported in fast dispatch mode"; - } - for (unsigned int id=0; id < devices_.size(); id++) { - std::vector dispatch_cores = tt::get_logical_dispatch_cores(device->id(), device->num_hw_cqs()); - CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(device->id()); - std::set dispatch_core_range_set(dispatch_cores.begin(), dispatch_cores.end()); - - if (dispatch_core_type == CoreType::WORKER) { - EXPECT_ANY_THROW( - auto test_kernel = tt_metal::CreateKernel( - program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy.cpp", - dispatch_core_range_set, - {.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default} - ); - ); - } else if (dispatch_core_type == CoreType::ETH) { - EXPECT_ANY_THROW( - auto test_kernel = tt_metal::CreateKernel( - program, - "tests/tt_metal/tt_metal/test_kernels/misc/erisc_print.cpp", - dispatch_core_range_set, - {.noc = tt_metal::NOC::NOC_0, .eth_mode = Eth::IDLE} - ); - ); - } - } -} diff --git a/tests/tt_metal/tt_metal/unit_tests_common/common/dprint_fixture.hpp b/tests/tt_metal/tt_metal/unit_tests_common/common/dprint_fixture.hpp deleted file mode 100644 index 829a9feb140..00000000000 --- a/tests/tt_metal/tt_metal/unit_tests_common/common/dprint_fixture.hpp +++ /dev/null @@ -1,95 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "common_fixture.hpp" -#include "impl/debug/dprint_server.hpp" -#include "tt_metal/common/core_descriptor.hpp" - -// A version of CommonFixture with DPrint enabled on all cores. 
-class DPrintFixture: public CommonFixture { -public: - inline static const string dprint_file_name = "gtest_dprint_log.txt"; - - // A function to run a program, according to which dispatch mode is set. - void RunProgram(Device* device, Program& program) { - // Only difference is that we need to wait for the print server to catch - // up after running a test. - CommonFixture::RunProgram(device, program); - tt::DprintServerAwait(); - } - -protected: - // Running with dprint + watcher enabled can make the code size blow up, so let's force watcher - // disabled for DPRINT tests. - bool watcher_previous_enabled; - void SetUp() override { - // The core range (physical) needs to be set >= the set of all cores - // used by all tests using this fixture, so set dprint enabled for - // all cores and all devices - tt::llrt::OptionsG.set_feature_enabled(tt::llrt::RunTimeDebugFeatureDprint, true); - tt::llrt::OptionsG.set_feature_all_cores( - tt::llrt::RunTimeDebugFeatureDprint, CoreType::WORKER, tt::llrt::RunTimeDebugClassWorker); - tt::llrt::OptionsG.set_feature_all_cores( - tt::llrt::RunTimeDebugFeatureDprint, CoreType::ETH, tt::llrt::RunTimeDebugClassWorker); - tt::llrt::OptionsG.set_feature_all_chips(tt::llrt::RunTimeDebugFeatureDprint, true); - // Send output to a file so the test can check after program is run. - tt::llrt::OptionsG.set_feature_file_name(tt::llrt::RunTimeDebugFeatureDprint, dprint_file_name); - tt::llrt::OptionsG.set_test_mode_enabled(true); - watcher_previous_enabled = tt::llrt::OptionsG.get_watcher_enabled(); - tt::llrt::OptionsG.set_watcher_enabled(false); - - ExtraSetUp(); - - // Parent class initializes devices and any necessary flags - CommonFixture::SetUp(); - } - - void TearDown() override { - // Parent class tears down devices - CommonFixture::TearDown(); - - // Remove the DPrint output file after the test is finished. 
- std::remove(dprint_file_name.c_str()); - - // Reset DPrint settings - tt::llrt::OptionsG.set_feature_cores(tt::llrt::RunTimeDebugFeatureDprint, {}); - tt::llrt::OptionsG.set_feature_enabled(tt::llrt::RunTimeDebugFeatureDprint, false); - tt::llrt::OptionsG.set_feature_all_cores( - tt::llrt::RunTimeDebugFeatureDprint, CoreType::WORKER, tt::llrt::RunTimeDebugClassNoneSpecified); - tt::llrt::OptionsG.set_feature_all_cores( - tt::llrt::RunTimeDebugFeatureDprint, CoreType::ETH, tt::llrt::RunTimeDebugClassNoneSpecified); - tt::llrt::OptionsG.set_feature_all_chips(tt::llrt::RunTimeDebugFeatureDprint, false); - tt::llrt::OptionsG.set_feature_file_name(tt::llrt::RunTimeDebugFeatureDprint, ""); - tt::llrt::OptionsG.set_test_mode_enabled(false); - tt::llrt::OptionsG.set_watcher_enabled(watcher_previous_enabled); - } - - void RunTestOnDevice( - const std::function& run_function, - Device* device - ) { - auto run_function_no_args = [=]() { - run_function(this, device); - }; - CommonFixture::RunTestOnDevice(run_function_no_args, device); - tt::DPrintServerClearLogFile(); - tt::DPrintServerClearSignals(); - } - - // Override this function in child classes for additional setup commands between DPRINT setup - // and device creation. - virtual void ExtraSetUp() {} -}; - -// For usage by tests that need the dprint server devices disabled. 
-class DPrintFixtureDisableDevices: public DPrintFixture { -protected: - void ExtraSetUp() override { - // For this test, mute each devices using the environment variable - tt::llrt::OptionsG.set_feature_all_chips(tt::llrt::RunTimeDebugFeatureDprint, false); - tt::llrt::OptionsG.set_feature_chip_ids(tt::llrt::RunTimeDebugFeatureDprint, {}); - } -}; diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/CMakeLists.txt b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/CMakeLists.txt deleted file mode 100644 index 75cc62aeabb..00000000000 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/CMakeLists.txt +++ /dev/null @@ -1,40 +0,0 @@ -set(UNIT_TESTS_FD_SRC - ${CMAKE_CURRENT_SOURCE_DIR}/command_queue/test_CommandQueue.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/command_queue/test_EnqueueProgram.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/command_queue/test_EnqueueTrace.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/command_queue/test_events.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/command_queue/test_HostAsyncCQ.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/command_queue/test_worker_config_buffer.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/compute/sfpu/sfpu_compute.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/multichip/test_device_pool.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/multichip/test_eth_EnqueueProgram.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/multichip/test_eth_ring_gather_EnqueueProgram.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/pipelining/basic_pipeline.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/streams/test_autonomous_relay_streams.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/sub_device/test_sub_device.cpp -) - -add_executable( - unit_tests_fast_dispatch - ${UNIT_TESTS_FD_SRC} - $ -) -TT_ENABLE_UNITY_BUILD(unit_tests_fast_dispatch) - -target_link_libraries(unit_tests_fast_dispatch PUBLIC test_metal_common_libs) -target_include_directories( - unit_tests_fast_dispatch - PRIVATE - ${PROJECT_SOURCE_DIR} - ${PROJECT_SOURCE_DIR}/tt_metal - ${PROJECT_SOURCE_DIR}/tests - 
${CMAKE_CURRENT_SOURCE_DIR} - ${CMAKE_CURRENT_SOURCE_DIR}/common -) -set_target_properties( - unit_tests_fast_dispatch - PROPERTIES - RUNTIME_OUTPUT_DIRECTORY - ${PROJECT_BINARY_DIR}/test/tt_metal -) diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/README.md b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/README.md deleted file mode 100644 index 55aab607296..00000000000 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/README.md +++ /dev/null @@ -1,44 +0,0 @@ -# Summary -Unit testing uses the doctest framework. See https://github.com/doctest/doctest/ -Generally, there are three main levels of organization: -* TEST_SUITE - Used to group main areas of tests -* TEST_CASE - How Test case and sub-case gets split up is at test-writer discretion, but see the test_case section -* SUB_CASE - - -## Build && Execution -### Build -`make tests/tt_metal/unit_tests` -### Get Help -`./build/test/tt_metal/unit_tests --help` -### Execute all tests -`./build/test/tt_metal/unit_tests` -### Execute filtered test-suite -`./build/test/tt_metal/unit_tests -ts="*Sfpu*"` -### List all test-suite with filter -`./build/test/tt_metal/unit_tests -ts="*Sfpu*" -lts` - -## Folder Structure -General structure of the tests are as follows, more sub-folders can be added -
-Directory Structure - Please add any new-tests to a corresponding folder. -
-
-tt_metal/unit_tests/
-  > test_main.cpp
-  > basic/
-    > # Any basic test files can exist here, will be automatically added to test_main
-  > common/
-    > # Used to hold any common structures across all test suites like fixtures
-  > dram/
-    > # Any dram unit/stress test files can exist here, will be automatically added to test_main
-  > compute/
-    > # Any basic test files can exist here, will be automatically added to test_main
-  > new_folders/
-    > # Any test files can exist here, will be automatically added to test_main
-test_utils/
-  > comparison.cpp # Useful utils for comparing, see example usages in unit tests
-  > print_helpers.cpp # Useful utils for printin
-  > stimulus.cpp # Useful utils for generating random vectors or specific vectors, see example usages in unit tests
-  > tilization.cpp # Useful utils for converting between tiled vectors or not, see example usages in unit tests
-
diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/command_queue_test_utils.hpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/command_queue_test_utils.hpp deleted file mode 100644 index 7aa1811ecd5..00000000000 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/command_queue_test_utils.hpp +++ /dev/null @@ -1,32 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "tt_metal/host_api.hpp" -#include "tt_metal/common/bfloat16.hpp" -#include "tt_metal/impl/buffers/buffer.hpp" - -struct TestBufferConfig { - uint32_t num_pages; - uint32_t page_size; - tt::tt_metal::BufferType buftype; -}; - -inline std::pair, std::vector> EnqueueWriteBuffer_prior_to_wrap(tt::tt_metal::Device* device, tt::tt_metal::CommandQueue& cq, const TestBufferConfig& config) { - // This function just enqueues a buffer (which should be large in the config) - // write as a precursor to testing the wrap mechanism - size_t buf_size = config.num_pages * config.page_size; - auto buffer = Buffer::create(device, buf_size, config.page_size, config.buftype); - - std::vector src = create_random_vector_of_bfloat16( - buf_size, 100, std::chrono::system_clock::now().time_since_epoch().count()); - - EnqueueWriteBuffer(cq, *buffer, src, false); - return std::make_pair(std::move(buffer), src); -} - -inline bool does_device_have_active_eth_cores(const Device *device) { - return !(device->get_active_ethernet_cores(true).empty()); -} diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_CommandQueue.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_CommandQueue.cpp deleted file mode 100644 index 8017f70fb27..00000000000 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_CommandQueue.cpp +++ /dev/null @@ -1,73 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#include - -#include "command_queue_fixture.hpp" -#include "command_queue_test_utils.hpp" -#include "gtest/gtest.h" -#include "tt_metal/host_api.hpp" -#include "tt_metal/impl/device/device.hpp" -#include "tt_metal/test_utils/env_vars.hpp" -#include "tt_metal/test_utils/stimulus.hpp" -#include "tt_metal/test_utils/print_helpers.hpp" - -using namespace tt::tt_metal; - -namespace host_tests { - -namespace multi_device_tests { -TEST_F(CommandQueueMultiDeviceFixture, DISABLED_TestAccessCommandQueue) { - for (unsigned int device_id = 0; device_id < num_devices_; device_id++) { - EXPECT_NO_THROW(devices_[device_id]->command_queue()); - } -} - -TEST(FastDispatchHostSuite, TestCannotAccessCommandQueueForClosedDevice) { - auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); - if (slow_dispatch) { - TT_THROW("This suite can only be run with fast dispatch or TT_METAL_SLOW_DISPATCH_MODE unset"); - GTEST_SKIP(); - } - const unsigned int device_id = 0; - const auto &dispatch_core_type = tt::llrt::OptionsG.get_dispatch_core_type(); - Device* device = tt::tt_metal::CreateDevice(device_id, tt::llrt::OptionsG.get_num_hw_cqs(), DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, dispatch_core_type); - - EXPECT_NO_THROW(device->command_queue()); - CloseDevice(device); - EXPECT_ANY_THROW(device->command_queue()); -} - -TEST_F(CommandQueueMultiDeviceFixture, DISABLED_TestDirectedLoopbackToUniqueHugepage) { - std::unordered_map> golden_data; - - const uint32_t byte_size = 2048 * 16; - const uint64_t address = 0; - - for (chip_id_t device_id = 0; device_id < num_devices_; device_id++) { - std::vector data = - tt::test_utils::generate_uniform_random_vector(0, UINT32_MAX, byte_size / sizeof(uint32_t)); - - chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device_id); - uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device_id); - tt::Cluster::instance().write_sysmem(data.data(), 
data.size() * sizeof(uint32_t), address, mmio_device_id, channel); - - golden_data[device_id] = data; - } - - std::vector readback_data; - readback_data.resize(byte_size / sizeof(uint32_t)); - for (chip_id_t device_id = 0; device_id < num_devices_; device_id++) { - chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device_id); - uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device_id); - tt::Cluster::instance().read_sysmem(readback_data.data(), byte_size, address, mmio_device_id, channel); - EXPECT_EQ(readback_data, golden_data.at(device_id)); - } -} -} - - - - -} // namespace host_tests diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_HostAsyncCQ.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_HostAsyncCQ.cpp index e4eceaffb9c..6ebe8f64da2 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_HostAsyncCQ.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_HostAsyncCQ.cpp @@ -92,13 +92,13 @@ bool flatten(Device *device, uint32_t num_tiles_r = 5, uint32_t num_tiles_c = 5) .page_size = dram_buffer_size, .buffer_type = BufferType::DRAM }; - uint32_t src0_cb_index = 0; + uint32_t src0_cb_index = tt::CBIndex::c_0; uint32_t num_input_tiles = 8; CircularBufferConfig cb_src0_config = CircularBufferConfig(num_input_tiles * single_tile_size, {{src0_cb_index, tt::DataFormat::Float16_b}}) .set_page_size(src0_cb_index, single_tile_size); auto cb_src0 = CreateCircularBuffer(program, core, cb_src0_config); - uint32_t ouput_cb_index = 16; + uint32_t ouput_cb_index = tt::CBIndex::c_16; uint32_t num_output_tiles = 1; CircularBufferConfig cb_output_config = CircularBufferConfig(num_output_tiles * single_tile_size, {{ouput_cb_index, tt::DataFormat::Float16_b}}) .set_page_size(ouput_cb_index, single_tile_size); diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/multichip/test_device_pool.cpp 
b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/multichip/test_device_pool.cpp deleted file mode 100644 index 85af4f9396f..00000000000 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/multichip/test_device_pool.cpp +++ /dev/null @@ -1,136 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -#include - -#include "tests/tt_metal/tt_metal/unit_tests/common/basic_fixture.hpp" -#include "tt_metal/host_api.hpp" -#include "tt_metal/test_utils/env_vars.hpp" -#include "tt_metal/test_utils/print_helpers.hpp" -#include "tt_metal/test_utils/stimulus.hpp" -#include "tt_metal/impl/device/device_pool.hpp" - -using namespace tt; -using namespace tt::test_utils; - -TEST_F(FDBasicFixture, DevicePoolOpenClose) { - std::vector device_ids{0}; - int num_hw_cqs = 1; - int l1_small_size = 1024; - const auto &dispatch_core_type = tt::llrt::OptionsG.get_dispatch_core_type(); - tt::DevicePool::initialize(device_ids, num_hw_cqs, l1_small_size, DEFAULT_TRACE_REGION_SIZE, dispatch_core_type); - auto devices = tt::DevicePool::instance().get_all_active_devices(); - for (const auto& dev: devices) { - ASSERT_TRUE((int)(dev->get_l1_small_size()) == l1_small_size); - ASSERT_TRUE((int)(dev->num_hw_cqs()) == num_hw_cqs); - ASSERT_TRUE(dev->is_initialized()); - } - - // Close then get devices again - for (const auto& dev: devices) { - dev->close(); - } - devices = tt::DevicePool::instance().get_all_active_devices(); - for (const auto& dev: devices) { - ASSERT_TRUE((int)(dev->get_l1_small_size()) == l1_small_size); - ASSERT_TRUE((int)(dev->num_hw_cqs()) == num_hw_cqs); - ASSERT_TRUE(dev->is_initialized()); - } - for (const auto& dev: devices) { - dev->close(); - } -} - -TEST_F(FDBasicFixture, DevicePoolReconfigDevices) { - std::vector device_ids{0}; - int num_hw_cqs = 1; - int l1_small_size = 1024; - const auto &dispatch_core_type = tt::llrt::OptionsG.get_dispatch_core_type(); - tt::DevicePool::initialize(device_ids, num_hw_cqs, 
l1_small_size, DEFAULT_TRACE_REGION_SIZE, dispatch_core_type); - auto devices = tt::DevicePool::instance().get_all_active_devices(); - for (const auto& dev: devices) { - ASSERT_TRUE((int)(dev->get_l1_small_size()) == l1_small_size); - ASSERT_TRUE((int)(dev->num_hw_cqs()) == num_hw_cqs); - ASSERT_TRUE(dev->is_initialized()); - } - - // Close then get devices with different configs - for (const auto& dev: devices) { - dev->close(); - } - l1_small_size = 2048; - tt::DevicePool::initialize(device_ids, num_hw_cqs, l1_small_size, DEFAULT_TRACE_REGION_SIZE, dispatch_core_type); - devices = tt::DevicePool::instance().get_all_active_devices(); - for (const auto& dev: devices) { - ASSERT_TRUE((int)(dev->get_l1_small_size()) == l1_small_size); - ASSERT_TRUE(dev->is_initialized()); - } - for (const auto& dev: devices) { - dev->close(); - } -} - -TEST_F(FDBasicFixture, DevicePoolAddDevices) { - if (tt::tt_metal::GetNumAvailableDevices() != 8) { - GTEST_SKIP(); - } - std::vector device_ids{0}; - int num_hw_cqs = 1; - int l1_small_size = 1024; - const auto &dispatch_core_type = tt::llrt::OptionsG.get_dispatch_core_type(); - tt::DevicePool::initialize(device_ids, num_hw_cqs, l1_small_size, DEFAULT_TRACE_REGION_SIZE, dispatch_core_type); - auto devices = tt::DevicePool::instance().get_all_active_devices(); - for (const auto& dev: devices) { - ASSERT_TRUE((int)(dev->get_l1_small_size()) == l1_small_size); - ASSERT_TRUE((int)(dev->num_hw_cqs()) == num_hw_cqs); - ASSERT_TRUE(dev->is_initialized()); - } - - // Close then get more devices - for (const auto& dev: devices) { - dev->close(); - } - device_ids = {0, 1, 2, 3}; - tt::DevicePool::initialize(device_ids, num_hw_cqs, l1_small_size, DEFAULT_TRACE_REGION_SIZE, dispatch_core_type); - devices = tt::DevicePool::instance().get_all_active_devices(); - ASSERT_TRUE(devices.size() >= 4); - for (const auto& dev: devices) { - ASSERT_TRUE((int)(dev->get_l1_small_size()) == l1_small_size); - ASSERT_TRUE((int)(dev->num_hw_cqs()) == num_hw_cqs); 
- ASSERT_TRUE(dev->is_initialized()); - } - for (const auto& dev: devices) { - dev->close(); - } -} - -TEST_F(FDBasicFixture, DevicePoolReduceDevices) { - if (tt::tt_metal::GetNumAvailableDevices() != 8) { - GTEST_SKIP(); - } - std::vector device_ids{0, 1, 2, 3}; - int num_hw_cqs = 1; - int l1_small_size = 1024; - const auto &dispatch_core_type = tt::llrt::OptionsG.get_dispatch_core_type(); - tt::DevicePool::initialize(device_ids, num_hw_cqs, l1_small_size, DEFAULT_TRACE_REGION_SIZE, dispatch_core_type); - const auto devices = tt::DevicePool::instance().get_all_active_devices(); - for (const auto& dev: devices) { - ASSERT_TRUE((int)(dev->get_l1_small_size()) == l1_small_size); - ASSERT_TRUE((int)(dev->num_hw_cqs()) == num_hw_cqs); - ASSERT_TRUE(dev->is_initialized()); - } - - // Close then get less devices - for (const auto& dev: devices) { - dev->close(); - } - device_ids = {0}; - tt::DevicePool::initialize(device_ids, num_hw_cqs, l1_small_size, DEFAULT_TRACE_REGION_SIZE, dispatch_core_type); - auto dev = tt::DevicePool::instance().get_active_device(0); - ASSERT_TRUE(dev->id() == 0); - ASSERT_TRUE((int)(dev->get_l1_small_size()) == l1_small_size); - ASSERT_TRUE((int)(dev->num_hw_cqs()) == num_hw_cqs); - ASSERT_TRUE(dev->is_initialized()); - tt::DevicePool::instance().close_device(0); -} diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/multichip/test_eth_EnqueueProgram.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/multichip/test_eth_EnqueueProgram.cpp deleted file mode 100644 index ef05c731489..00000000000 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/multichip/test_eth_EnqueueProgram.cpp +++ /dev/null @@ -1,735 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#include - -#include -#include -#include - -#include "command_queue_fixture.hpp" -#include "tt_metal/detail/tt_metal.hpp" -#include "tt_metal/host_api.hpp" -#include "tt_metal/impl/kernels/kernel.hpp" -#include "tt_metal/impl/buffers/buffer.hpp" -#include "tt_metal/impl/device/device.hpp" - -#include "tt_metal/test_utils/comparison.hpp" -#include "tt_metal/test_utils/df/df.hpp" -#include "tt_metal/test_utils/print_helpers.hpp" -#include "tt_metal/test_utils/stimulus.hpp" - - -using std::vector; -using namespace tt; -using namespace tt::test_utils; -using namespace tt::test_utils::df; - -namespace { -namespace CMAKE_UNIQUE_NAMESPACE { -constexpr std::int32_t WORD_SIZE = 16; // 16 bytes per eth send packet -constexpr std::int32_t MAX_NUM_WORDS = - (eth_l1_mem::address_map::MAX_L1_LOADING_SIZE - eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE) / WORD_SIZE; -constexpr std::int32_t MAX_BUFFER_SIZE = - (eth_l1_mem::address_map::MAX_L1_LOADING_SIZE - eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE); - -struct BankedConfig { - size_t num_pages = 1; - size_t size_bytes = 1 * 2 * 32 * 32; - size_t page_size_bytes = 2 * 32 * 32; - tt_metal::BufferType input_buffer_type = tt_metal::BufferType::L1; - tt_metal::BufferType output_buffer_type = tt_metal::BufferType::L1; - tt::DataFormat l1_data_format = tt::DataFormat::Float16_b; -}; -} -} - -namespace fd_unit_tests::erisc::kernels { - -const size_t get_rand_32_byte_aligned_address(const size_t& base, const size_t& max) { - TT_ASSERT(!(base & 0x1F) and !(max & 0x1F)); - size_t word_size = (max >> 5) - (base >> 5); - return (((rand() % word_size) << 5) + base); -} - -bool test_dummy_EnqueueProgram_with_runtime_args(Device* device, const CoreCoord& eth_core_coord) { - Program program; - bool pass = true; - auto eth_noc_xy = device->ethernet_core_from_logical_core(eth_core_coord); - - constexpr uint32_t num_runtime_args0 = 9; - constexpr uint32_t rta_base0 = 
eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; - std::map dummy_defines0 = {{"DATA_MOVEMENT", "1"}, - {"NUM_RUNTIME_ARGS", std::to_string(num_runtime_args0)}, - {"RESULTS_ADDR", std::to_string(rta_base0)}}; - auto dummy_kernel0 = CreateKernel( - program, - "tests/tt_metal/tt_metal/test_kernels/misc/runtime_args_kernel.cpp", - eth_core_coord, - tt_metal::EthernetConfig{.noc = tt_metal::NOC::NOC_0, .defines = dummy_defines0}); - - vector dummy_kernel0_args = {0, 1, 2, 3, 4, 5, 6, 7, 8}; - tt::tt_metal::SetRuntimeArgs(program, dummy_kernel0, eth_core_coord, dummy_kernel0_args); - - tt::tt_metal::detail::CompileProgram(device, program); - auto& cq = device->command_queue(); - EnqueueProgram(cq, program, false); - Finish(cq); - - vector dummy_kernel0_args_readback = llrt::read_hex_vec_from_core( - device->id(), - eth_noc_xy, - eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE, - dummy_kernel0_args.size() * sizeof(uint32_t)); - - pass &= (dummy_kernel0_args == dummy_kernel0_args_readback); - - return pass; -} - -bool reader_kernel_no_send( - tt_metal::Device* device, - const size_t& byte_size, - const size_t& eth_l1_byte_address, - const CoreCoord& eth_reader_core) { - bool pass = true; - //////////////////////////////////////////////////////////////////////////// - // Application Setup - //////////////////////////////////////////////////////////////////////////// - tt_metal::Program program = tt_metal::Program(); - - tt::tt_metal::InterleavedBufferConfig dram_config{ - .device = device, .size = byte_size, .page_size = byte_size, .buffer_type = tt::tt_metal::BufferType::DRAM}; - - auto input_dram_buffer = CreateBuffer(dram_config); - uint32_t dram_byte_address = input_dram_buffer->address(); - auto dram_noc_xy = input_dram_buffer->noc_coordinates(); - auto eth_noc_xy = device->ethernet_core_from_logical_core(eth_reader_core); - log_debug( - tt::LogTest, - "Device {}: reading {} bytes from dram {} addr {} to ethernet core {} addr {}", - device->id(), - byte_size, - 
dram_noc_xy.str(), - dram_byte_address, - eth_reader_core.str(), - eth_l1_byte_address); - - auto eth_reader_kernel = tt_metal::CreateKernel( - program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/direct_reader_dram_to_l1.cpp", - eth_reader_core, - tt_metal::EthernetConfig{.noc = tt_metal::NOC::NOC_0}); - - //////////////////////////////////////////////////////////////////////////// - // Compile and Execute Application - //////////////////////////////////////////////////////////////////////////// - - auto inputs = generate_uniform_random_vector(0, 100, byte_size / sizeof(uint32_t)); - tt_metal::detail::WriteToBuffer(input_dram_buffer, inputs); - - // Clear expected value at ethernet L1 address - std::vector all_zeros(inputs.size(), 0); - llrt::write_hex_vec_to_core(device->id(), eth_noc_xy, all_zeros, eth_l1_byte_address); - - tt_metal::SetRuntimeArgs( - program, - eth_reader_kernel, - eth_reader_core, - { - (uint32_t)dram_byte_address, - (uint32_t)dram_noc_xy.x, - (uint32_t)dram_noc_xy.y, - (uint32_t)byte_size, - (uint32_t)eth_l1_byte_address, - }); - - auto& cq = device->command_queue(); - tt::tt_metal::detail::CompileProgram(device, program); - EnqueueProgram(cq, program, false); - Finish(cq); - - auto readback_vec = llrt::read_hex_vec_from_core(device->id(), eth_noc_xy, eth_l1_byte_address, byte_size); - pass &= (readback_vec == inputs); - if (not pass) { - std::cout << "Mismatch at Core: " << eth_noc_xy.str() << std::endl; - } - return pass; -} - -bool writer_kernel_no_receive( - tt_metal::Device* device, - const size_t& byte_size, - const size_t& eth_l1_byte_address, - const CoreCoord& eth_writer_core) { - bool pass = true; - //////////////////////////////////////////////////////////////////////////// - // Application Setup - //////////////////////////////////////////////////////////////////////////// - tt_metal::Program program = tt_metal::Program(); - - tt::tt_metal::InterleavedBufferConfig dram_config{ - .device = device, .size = 
byte_size, .page_size = byte_size, .buffer_type = tt::tt_metal::BufferType::DRAM}; - - auto output_dram_buffer = CreateBuffer(dram_config); - uint32_t dram_byte_address = output_dram_buffer->address(); - auto dram_noc_xy = output_dram_buffer->noc_coordinates(); - auto eth_noc_xy = device->ethernet_core_from_logical_core(eth_writer_core); - log_debug( - tt::LogTest, - "Device {}: writing {} bytes from ethernet core {} addr {} to dram {} addr {}", - device->id(), - byte_size, - eth_writer_core.str(), - eth_l1_byte_address, - dram_noc_xy.str(), - dram_byte_address); - - auto eth_writer_kernel = tt_metal::CreateKernel( - program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/direct_writer_l1_to_dram.cpp", - eth_writer_core, - tt_metal::EthernetConfig{.noc = tt_metal::NOC::NOC_0}); - - //////////////////////////////////////////////////////////////////////////// - // Compile and Execute Application - //////////////////////////////////////////////////////////////////////////// - - auto inputs = generate_uniform_random_vector(0, 100, byte_size / sizeof(uint32_t)); - llrt::write_hex_vec_to_core(device->id(), eth_noc_xy, inputs, eth_l1_byte_address); - - // Clear expected value at ethernet L1 address - std::vector all_zeros(inputs.size(), 0); - tt_metal::detail::WriteToBuffer(output_dram_buffer, all_zeros); - - tt_metal::SetRuntimeArgs( - program, - eth_writer_kernel, - eth_writer_core, - { - (uint32_t)dram_byte_address, - (uint32_t)dram_noc_xy.x, - (uint32_t)dram_noc_xy.y, - (uint32_t)byte_size, - (uint32_t)eth_l1_byte_address, - }); - - auto& cq = device->command_queue(); - tt::tt_metal::detail::CompileProgram(device, program); - EnqueueProgram(cq, program, false); - Finish(cq); - - auto readback_vec = llrt::read_hex_vec_from_core(device->id(), dram_noc_xy, dram_byte_address, byte_size); - pass &= (readback_vec == inputs); - if (not pass) { - std::cout << "Mismatch at Core: " << dram_noc_xy.str() << std::endl; - } - return pass; -} - -bool 
eth_direct_sender_receiver_kernels( - tt_metal::Device* sender_device, - tt_metal::Device* receiver_device, - const size_t& byte_size, - const size_t& src_eth_l1_byte_address, - const size_t& dst_eth_l1_byte_address, - const CoreCoord& eth_sender_core, - const CoreCoord& eth_receiver_core, - uint32_t num_bytes_per_send = 16) { - bool pass = true; - log_debug( - tt::LogTest, - "Sending {} bytes from device {} eth core {} addr {} to device {} eth core {} addr {}", - byte_size, - sender_device->id(), - eth_sender_core.str(), - src_eth_l1_byte_address, - receiver_device->id(), - eth_receiver_core.str(), - dst_eth_l1_byte_address); - // Generate inputs - auto inputs = generate_uniform_random_vector(0, 100, byte_size / sizeof(uint32_t)); - llrt::write_hex_vec_to_core( - sender_device->id(), - sender_device->ethernet_core_from_logical_core(eth_sender_core), - inputs, - src_eth_l1_byte_address); - - // Clear expected value at ethernet L1 address - std::vector all_zeros(inputs.size(), 0); - llrt::write_hex_vec_to_core( - receiver_device->id(), - receiver_device->ethernet_core_from_logical_core(eth_receiver_core), - all_zeros, - dst_eth_l1_byte_address); - - //////////////////////////////////////////////////////////////////////////// - // Sender Device - //////////////////////////////////////////////////////////////////////////// - tt_metal::Program sender_program = tt_metal::Program(); - - auto eth_sender_kernel = tt_metal::CreateKernel( - sender_program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/eth_l1_direct_send.cpp", - eth_sender_core, - tt_metal::EthernetConfig{ - .noc = tt_metal::NOC::NOC_0, - .compile_args = {uint32_t(num_bytes_per_send), uint32_t(num_bytes_per_send >> 4)}}); - - tt_metal::SetRuntimeArgs( - sender_program, - eth_sender_kernel, - eth_sender_core, - { - (uint32_t)src_eth_l1_byte_address, - (uint32_t)dst_eth_l1_byte_address, - (uint32_t)byte_size, - }); - - 
//////////////////////////////////////////////////////////////////////////// - // Receiver Device - //////////////////////////////////////////////////////////////////////////// - tt_metal::Program receiver_program = tt_metal::Program(); - - auto eth_receiver_kernel = tt_metal::CreateKernel( - receiver_program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/eth_l1_direct_receive.cpp", - eth_receiver_core, - tt_metal::EthernetConfig{.noc = tt_metal::NOC::NOC_0}); // probably want to use NOC_1 here - - tt_metal::SetRuntimeArgs( - receiver_program, - eth_receiver_kernel, - eth_receiver_core, - { - (uint32_t)byte_size, - }); - - //////////////////////////////////////////////////////////////////////////// - // Compile and Execute Application - //////////////////////////////////////////////////////////////////////////// - - tt::tt_metal::detail::CompileProgram(sender_device, sender_program); - tt::tt_metal::detail::CompileProgram(receiver_device, receiver_program); - - EnqueueProgram(sender_device->command_queue(), sender_program, false); - EnqueueProgram(receiver_device->command_queue(), receiver_program, false); - Finish(sender_device->command_queue()); - Finish(receiver_device->command_queue()); - - auto readback_vec = llrt::read_hex_vec_from_core( - receiver_device->id(), - receiver_device->ethernet_core_from_logical_core(eth_receiver_core), - dst_eth_l1_byte_address, - byte_size); - pass &= (readback_vec == inputs); - if (not pass) { - std::cout << "Mismatch at Core: " << eth_receiver_core.str() << std::endl; - std::cout << readback_vec[0] << std::endl; - } - return pass; -} - -bool chip_to_chip_dram_buffer_transfer( - tt_metal::Device* sender_device, - tt_metal::Device* receiver_device, - const CoreCoord& eth_sender_core, - const CoreCoord& eth_receiver_core, - const size_t& byte_size) { - bool pass = true; - - tt::tt_metal::InterleavedBufferConfig sender_dram_config{ - .device = sender_device, - .size = byte_size, - .page_size = byte_size, - 
.buffer_type = tt::tt_metal::BufferType::DRAM}; - tt::tt_metal::InterleavedBufferConfig receiver_dram_config{ - .device = receiver_device, - .size = byte_size, - .page_size = byte_size, - .buffer_type = tt::tt_metal::BufferType::DRAM}; - - // Create source buffer on sender device - auto input_dram_buffer = CreateBuffer(sender_dram_config); - uint32_t input_dram_byte_address = input_dram_buffer->address(); - auto input_dram_noc_xy = input_dram_buffer->noc_coordinates(); - - // Create dest buffer on receiver device - auto output_dram_buffer = CreateBuffer(receiver_dram_config); - uint32_t output_dram_byte_address = output_dram_buffer->address(); - auto output_dram_noc_xy = output_dram_buffer->noc_coordinates(); - - // Generate inputs - auto inputs = generate_uniform_random_vector(0, 100, byte_size / sizeof(uint32_t)); - - tt_metal::detail::WriteToBuffer(input_dram_buffer, inputs); - - const uint32_t MAX_BUFFER = - (eth_l1_mem::address_map::MAX_L1_LOADING_SIZE - eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE); - uint32_t num_loops = (uint32_t)(byte_size / MAX_BUFFER); - uint32_t remaining_bytes = (uint32_t)(byte_size % MAX_BUFFER); - // Clear expected value at ethernet L1 address - std::vector all_zeros(inputs.size(), 0); - - tt_metal::detail::WriteToBuffer(output_dram_buffer, all_zeros); - - //////////////////////////////////////////////////////////////////////////// - // Sender Device - //////////////////////////////////////////////////////////////////////////// - tt_metal::Program sender_program = tt_metal::Program(); - - auto eth_sender_kernel = tt_metal::CreateKernel( - sender_program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/direct_dram_to_dram_sender.cpp", - eth_sender_core, - tt_metal::EthernetConfig{.noc = tt_metal::NOC::NOC_0}); - - tt_metal::SetRuntimeArgs( - sender_program, - eth_sender_kernel, - eth_sender_core, - { - (uint32_t)input_dram_byte_address, - (uint32_t)input_dram_noc_xy.x, - (uint32_t)input_dram_noc_xy.y, - 
(uint32_t)remaining_bytes, - (uint32_t)num_loops, - (uint32_t)MAX_BUFFER, - }); - - //////////////////////////////////////////////////////////////////////////// - // Receiver Device - //////////////////////////////////////////////////////////////////////////// - tt_metal::Program receiver_program = tt_metal::Program(); - - auto eth_receiver_kernel = tt_metal::CreateKernel( - receiver_program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/direct_dram_to_dram_receiver.cpp", - eth_receiver_core, - tt_metal::EthernetConfig{.noc = tt_metal::NOC::NOC_0}); // probably want to use NOC_1 here - - tt_metal::SetRuntimeArgs( - receiver_program, - eth_receiver_kernel, - eth_receiver_core, - { - (uint32_t)output_dram_byte_address, - (uint32_t)output_dram_noc_xy.x, - (uint32_t)output_dram_noc_xy.y, - (uint32_t)remaining_bytes, - (uint32_t)num_loops, - (uint32_t)MAX_BUFFER, - }); - - //////////////////////////////////////////////////////////////////////////// - // Compile and Execute Application - //////////////////////////////////////////////////////////////////////////// - - tt::tt_metal::detail::CompileProgram(sender_device, sender_program); - tt::tt_metal::detail::CompileProgram(receiver_device, receiver_program); - - EnqueueProgram(sender_device->command_queue(), sender_program, false); - EnqueueProgram(receiver_device->command_queue(), receiver_program, false); - Finish(sender_device->command_queue()); - Finish(receiver_device->command_queue()); - - std::vector dest_dram_data; - tt_metal::detail::ReadFromBuffer(output_dram_buffer, dest_dram_data); - pass &= (dest_dram_data == inputs); - if (not pass) { - std::cout << "Mismatch at Core: " << output_dram_noc_xy.str() << std::endl; - std::cout << dest_dram_data[0] << std::endl; - } - return pass; -} - -bool chip_to_chip_interleaved_buffer_transfer( - tt_metal::Device* sender_device, - tt_metal::Device* receiver_device, - const CoreCoord& eth_sender_core, - const CoreCoord& eth_receiver_core, - const 
CMAKE_UNIQUE_NAMESPACE::BankedConfig& cfg, - const uint32_t& max_transfer_size) { - bool pass = true; - - const uint32_t input0_cb_index = 0; - const uint32_t output_cb_index = 16; - - TT_FATAL(cfg.num_pages * cfg.page_size_bytes == cfg.size_bytes, "Error"); - constexpr uint32_t num_pages_cb = 1; - - //////////////////////////////////////////////////////////////////////////// - // Sender Device - //////////////////////////////////////////////////////////////////////////// - tt_metal::Program sender_program = tt_metal::Program(); - - auto input_packed = generate_uniform_random_vector(0, 100, cfg.size_bytes / sizeof(uint32_t)); - - tt::tt_metal::InterleavedBufferConfig sender_config{ - .device = sender_device, - .size = cfg.size_bytes, - .page_size = cfg.page_size_bytes, - .buffer_type = cfg.input_buffer_type}; - tt::tt_metal::InterleavedBufferConfig receiver_config{ - .device = receiver_device, - .size = cfg.size_bytes, - .page_size = cfg.page_size_bytes, - .buffer_type = cfg.output_buffer_type}; - auto input_buffer = CreateBuffer(sender_config); - bool input_is_dram = cfg.input_buffer_type == BufferType::DRAM; - - tt_metal::detail::WriteToBuffer(input_buffer, input_packed); - - const uint32_t max_buffer = round_down(max_transfer_size, cfg.page_size_bytes); - uint32_t pages_per_loop = max_buffer / cfg.page_size_bytes; - uint32_t num_loops = (uint32_t)(cfg.size_bytes / max_buffer); - uint32_t remaining_bytes = (uint32_t)(cfg.size_bytes % max_buffer); - uint32_t remaining_pages = remaining_bytes / cfg.page_size_bytes; - - auto eth_sender_kernel = tt_metal::CreateKernel( - sender_program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/interleaved_buffer_to_buffer_sender.cpp", - eth_sender_core, - tt_metal::EthernetConfig{.noc = tt_metal::NOC::NOC_0, .compile_args = {(uint32_t)input_is_dram}}); - - tt_metal::SetRuntimeArgs( - sender_program, - eth_sender_kernel, - eth_sender_core, - {(uint32_t)input_buffer->address(), - (uint32_t)cfg.page_size_bytes, 
- (uint32_t)max_buffer, - (uint32_t)num_loops, - (uint32_t)pages_per_loop, - (uint32_t)remaining_bytes, - (uint32_t)remaining_pages}); - - //////////////////////////////////////////////////////////////////////////// - // Receiver Device - //////////////////////////////////////////////////////////////////////////// - tt_metal::Program receiver_program = tt_metal::Program(); - - auto output_buffer = CreateBuffer(receiver_config); - bool output_is_dram = cfg.output_buffer_type == BufferType::DRAM; - std::vector all_zeros(cfg.size_bytes / sizeof(uint32_t), 0); - - tt_metal::detail::WriteToBuffer(output_buffer, all_zeros); - - auto eth_receiver_kernel = tt_metal::CreateKernel( - receiver_program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/interleaved_buffer_to_buffer_receiver.cpp", - eth_receiver_core, - tt_metal::EthernetConfig{.noc = tt_metal::NOC::NOC_1, .compile_args = {(uint32_t)output_is_dram}}); - - tt_metal::SetRuntimeArgs( - receiver_program, - eth_receiver_kernel, - eth_receiver_core, - { - (uint32_t)output_buffer->address(), - (uint32_t)cfg.page_size_bytes, - (uint32_t)max_buffer, - (uint32_t)num_loops, - (uint32_t)pages_per_loop, - (uint32_t)remaining_bytes, - (uint32_t)remaining_pages, - }); - - //////////////////////////////////////////////////////////////////////////// - // Compile and Execute Application - //////////////////////////////////////////////////////////////////////////// - - tt::tt_metal::detail::CompileProgram(sender_device, sender_program); - tt::tt_metal::detail::CompileProgram(receiver_device, receiver_program); - - EnqueueProgram(sender_device->command_queue(), sender_program, false); - EnqueueProgram(receiver_device->command_queue(), receiver_program, false); - Finish(sender_device->command_queue()); - Finish(receiver_device->command_queue()); - - std::vector dest_buffer_data; - tt_metal::detail::ReadFromBuffer(output_buffer, dest_buffer_data); - pass &= input_packed == dest_buffer_data; - return pass; -} -} // 
namespace fd_unit_tests::erisc::kernels - -TEST_F(CommandQueueSingleCardFixture, EnqueueDummyProgramOnEthCore) { - for (const auto& device : devices_) { - for (const auto& eth_core : device->get_active_ethernet_cores(true)) { - ASSERT_TRUE(fd_unit_tests::erisc::kernels::test_dummy_EnqueueProgram_with_runtime_args(device, eth_core)); - } - } -} - -TEST_F(CommandQueueSingleCardFixture, EthKernelsNocReadNoSend) { - using namespace CMAKE_UNIQUE_NAMESPACE; - const size_t src_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; - - for (const auto& device : devices_) { - for (const auto& eth_core : device->get_active_ethernet_cores(true)) { - ASSERT_TRUE(fd_unit_tests::erisc::kernels::reader_kernel_no_send( - device, WORD_SIZE, src_eth_l1_byte_address, eth_core)); - ASSERT_TRUE(fd_unit_tests::erisc::kernels::reader_kernel_no_send( - device, WORD_SIZE * 1024, src_eth_l1_byte_address, eth_core)); - ASSERT_TRUE(fd_unit_tests::erisc::kernels::reader_kernel_no_send( - device, WORD_SIZE * 2048, src_eth_l1_byte_address, eth_core)); - } - } -} - -TEST_F(CommandQueueSingleCardFixture, EthKernelsNocWriteNoReceive) { - using namespace CMAKE_UNIQUE_NAMESPACE; - const size_t src_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; - - for (const auto& device : devices_) { - for (const auto& eth_core : device->get_active_ethernet_cores(true)) { - ASSERT_TRUE(fd_unit_tests::erisc::kernels::writer_kernel_no_receive( - device, WORD_SIZE, src_eth_l1_byte_address, eth_core)); - ASSERT_TRUE(fd_unit_tests::erisc::kernels::writer_kernel_no_receive( - device, WORD_SIZE * 1024, src_eth_l1_byte_address, eth_core)); - ASSERT_TRUE(fd_unit_tests::erisc::kernels::writer_kernel_no_receive( - device, WORD_SIZE * 2048, src_eth_l1_byte_address, eth_core)); - } - } -} - -TEST_F(CommandQueueMultiDeviceFixture, EthKernelsDirectSendAllConnectedChips) { - using namespace CMAKE_UNIQUE_NAMESPACE; - const size_t src_eth_l1_byte_address = 
eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; - const size_t dst_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; - for (const auto& sender_device : devices_) { - for (const auto& receiver_device : devices_) { - if (sender_device->id() >= receiver_device->id()) { - continue; - } - for (const auto& sender_core : sender_device->get_active_ethernet_cores(true)) { - auto [device_id, receiver_core] = sender_device->get_connected_ethernet_core(sender_core); - if (receiver_device->id() != device_id) { - continue; - } - ASSERT_TRUE(fd_unit_tests::erisc::kernels::eth_direct_sender_receiver_kernels( - sender_device, - receiver_device, - WORD_SIZE, - src_eth_l1_byte_address, - dst_eth_l1_byte_address, - sender_core, - receiver_core)); - ASSERT_TRUE(fd_unit_tests::erisc::kernels::eth_direct_sender_receiver_kernels( - sender_device, - receiver_device, - 4 * WORD_SIZE, - src_eth_l1_byte_address, - dst_eth_l1_byte_address, - sender_core, - receiver_core)); - ASSERT_TRUE(fd_unit_tests::erisc::kernels::eth_direct_sender_receiver_kernels( - sender_device, - receiver_device, - 256 * WORD_SIZE, - src_eth_l1_byte_address, - dst_eth_l1_byte_address, - sender_core, - receiver_core)); - ASSERT_TRUE(fd_unit_tests::erisc::kernels::eth_direct_sender_receiver_kernels( - sender_device, - receiver_device, - 1000 * WORD_SIZE, - src_eth_l1_byte_address, - dst_eth_l1_byte_address, - sender_core, - receiver_core)); - } - } - } -} - -TEST_F(CommandQueueMultiDeviceFixture, EthKernelsSendDramBufferAllConnectedChips) { - for (const auto& sender_device : devices_) { - for (const auto& receiver_device : devices_) { - if (sender_device->id() >= receiver_device->id()) { - continue; - } - for (const auto& sender_eth_core : sender_device->get_active_ethernet_cores(true)) { - auto [device_id, receiver_eth_core] = sender_device->get_connected_ethernet_core(sender_eth_core); - if (receiver_device->id() != device_id) { - continue; - } - log_info( - tt::LogTest, - "Sending dram buffer 
from device {} to device {}, using eth core {} and {}", - sender_device->id(), - receiver_device->id(), - sender_eth_core.str(), - receiver_eth_core.str()); - - ASSERT_TRUE(fd_unit_tests::erisc::kernels::chip_to_chip_dram_buffer_transfer( - sender_device, receiver_device, sender_eth_core, receiver_eth_core, 16)); - ASSERT_TRUE(fd_unit_tests::erisc::kernels::chip_to_chip_dram_buffer_transfer( - sender_device, receiver_device, sender_eth_core, receiver_eth_core, 1024)); - ASSERT_TRUE(fd_unit_tests::erisc::kernels::chip_to_chip_dram_buffer_transfer( - sender_device, receiver_device, sender_eth_core, receiver_eth_core, 16 * 1024)); - ASSERT_TRUE(fd_unit_tests::erisc::kernels::chip_to_chip_dram_buffer_transfer( - sender_device, receiver_device, sender_eth_core, receiver_eth_core, 1000 * 1024)); - } - } - } -} - -TEST_F(CommandQueueMultiDeviceFixture, EthKernelsSendInterleavedBufferAllConnectedChips) { - using namespace CMAKE_UNIQUE_NAMESPACE; - for (const auto& sender_device : devices_) { - for (const auto& receiver_device : devices_) { - if (sender_device->id() >= receiver_device->id()) { - continue; - } - for (const auto& sender_eth_core : sender_device->get_active_ethernet_cores(true)) { - auto [device_id, receiver_eth_core] = sender_device->get_connected_ethernet_core(sender_eth_core); - if (receiver_device->id() != device_id) { - continue; - } - - log_info( - tt::LogTest, - "Sending interleaved buffer from device {} to device {}, using eth core {} and {}", - sender_device->id(), - receiver_device->id(), - sender_eth_core.str(), - receiver_eth_core.str()); - BankedConfig test_config = BankedConfig{ - .num_pages = 200, - .size_bytes = 200 * 2 * 32 * 32, - .page_size_bytes = 2 * 32 * 32, - .input_buffer_type = BufferType::L1, - .output_buffer_type = BufferType::DRAM}; - - ASSERT_TRUE(fd_unit_tests::erisc::kernels::chip_to_chip_interleaved_buffer_transfer( - sender_device, - receiver_device, - sender_eth_core, - receiver_eth_core, - test_config, - 
test_config.page_size_bytes)); - ASSERT_TRUE(fd_unit_tests::erisc::kernels::chip_to_chip_interleaved_buffer_transfer( - sender_device, receiver_device, sender_eth_core, receiver_eth_core, test_config, MAX_BUFFER_SIZE)); - test_config = BankedConfig{ - .num_pages = 200, - .size_bytes = 200 * 2 * 32 * 32, - .page_size_bytes = 2 * 32 * 32, - .input_buffer_type = BufferType::DRAM, - .output_buffer_type = BufferType::L1}; - ASSERT_TRUE(fd_unit_tests::erisc::kernels::chip_to_chip_interleaved_buffer_transfer( - sender_device, - receiver_device, - sender_eth_core, - receiver_eth_core, - test_config, - test_config.page_size_bytes)); - ASSERT_TRUE(fd_unit_tests::erisc::kernels::chip_to_chip_interleaved_buffer_transfer( - sender_device, receiver_device, sender_eth_core, receiver_eth_core, test_config, MAX_BUFFER_SIZE)); - } - } - } -} diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/multichip/test_eth_ring_gather_EnqueueProgram.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/multichip/test_eth_ring_gather_EnqueueProgram.cpp deleted file mode 100644 index 3f0441da418..00000000000 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/multichip/test_eth_ring_gather_EnqueueProgram.cpp +++ /dev/null @@ -1,495 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#include - -#include -#include -#include - -#include "command_queue_fixture.hpp" -#include "tt_metal/detail/tt_metal.hpp" -#include "tt_metal/host_api.hpp" -#include "tt_metal/impl/device/device.hpp" -#include "tt_metal/impl/kernels/kernel.hpp" -#include "tt_metal/impl/buffers/buffer.hpp" -#include "tt_metal/test_utils/comparison.hpp" -#include "tt_metal/test_utils/df/df.hpp" -#include "tt_metal/test_utils/print_helpers.hpp" -#include "tt_metal/test_utils/stimulus.hpp" - -using std::vector; -using namespace tt; -using namespace tt::test_utils; -using namespace tt::test_utils::df; - -namespace { -namespace CMAKE_UNIQUE_NAMESPACE { -constexpr std::int32_t WORD_SIZE = 16; // 16 bytes per eth send packet -constexpr std::int32_t MAX_NUM_WORDS = - (eth_l1_mem::address_map::MAX_L1_LOADING_SIZE - eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE) / WORD_SIZE; -constexpr std::int32_t MAX_BUFFER_SIZE = - (eth_l1_mem::address_map::MAX_L1_LOADING_SIZE - eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE); - -struct BankedConfig { - size_t num_pages = 1; - size_t size_bytes = 1 * 2 * 32 * 32; - size_t page_size_bytes = 2 * 32 * 32; - tt_metal::BufferType input_buffer_type = tt_metal::BufferType::L1; - tt_metal::BufferType output_buffer_type = tt_metal::BufferType::L1; - tt::DataFormat l1_data_format = tt::DataFormat::Float16_b; -}; - -std::vector get_hamiltonian_cycle(vector>& adj, int N, int s = 0) { - std::vector> dp(N, std::vector(1 << N, -1)); - - for (int i = 0; i < N; ++i) { - if (adj[s][i]) { - dp[i][(1 << i)] = i; - } - } - - for (int i = 0; i < (1 << N); ++i) { - for (int j = 0; j < N; ++j) { - if (i & (1 << j)) { - for (int k = 0; k < N; ++k) { - if (i & (1 << k) && adj[k][j] && j != k && dp[k][i ^ (1 << j)] != -1) { - dp[j][i] = k; - break; - } - } - } - } - } - - for (int i = 0; i < N; ++i) { - int m = (1 << N) - 1; - - if (dp[i][m] != -1 && i == s) { - std::vector path; - path.reserve(N + 1); - path.push_back(i); - - 
for (int j = 0; j < N - 1; ++j) { - path.push_back(dp[*path.rbegin()][m]); - m ^= 1 << *(path.rbegin() + 1); - } - path.push_back(s); - return path; - } - } - return {}; -} - -std::vector get_device_ring(std::vector devices) { - std::vector> adj(devices.size(), std::vector(devices.size(), 0)); - for (uint32_t i = 0; i < devices.size(); ++i) { - const auto& device = devices[i]; - for (const auto& connected_device_id : device->get_ethernet_connected_device_ids()) { - for (uint32_t j = 0; j < devices.size(); ++j) { - if (devices[j]->id() == connected_device_id) { - adj[i][j] = 1; - } - } - } - } - - const auto& device_ring_idx = get_hamiltonian_cycle(adj, devices.size(), 0); - std::vector device_ring; - device_ring.reserve(device_ring_idx.size()); - for (const auto& device_idx : device_ring_idx) { - device_ring.push_back(devices[device_idx]); - } - return device_ring; -} - -std::vector> get_sender_receiver_cores( - std::vector device_ring) { - std::vector> sender_receivers; - sender_receivers.reserve(device_ring.size() - 1); - - // Special case for 2 devices to ensure core pairs are not the same for send and receive - if (device_ring.size() - 1 == 2) { - const auto& first_device = device_ring[0]; - const auto& second_device = device_ring[1]; - uint32_t i = 0; - for (const auto& first_eth_core : first_device->get_active_ethernet_cores(true)) { - auto [device_id, second_eth_core] = first_device->get_connected_ethernet_core(first_eth_core); - if (second_device->id() == device_id) { - tt_metal::Device *sender_device, *receiver_device; - CoreCoord sender_eth_core, receiver_eth_core; - if (i == 0) { - sender_device = first_device, receiver_device = second_device; - sender_eth_core = first_eth_core, receiver_eth_core = second_eth_core; - } else { - sender_device = second_device, receiver_device = first_device; - sender_eth_core = second_eth_core, receiver_eth_core = first_eth_core; - } - sender_receivers.push_back({sender_device, receiver_device, sender_eth_core, 
receiver_eth_core}); - log_info( - tt::LogTest, - "Sender: {} Receiver: {} Sender Eth: {} Receiver Eth: {}", - sender_device->id(), - receiver_device->id(), - sender_eth_core.str(), - receiver_eth_core.str()); - if (i > 0) { - break; - } - i++; - } - } - } else { - for (uint32_t i = 0; i < device_ring.size() - 1; ++i) { - const auto& sender_device = device_ring[i]; - const auto& receiver_device = device_ring[i + 1]; - for (const auto& sender_eth_core : sender_device->get_active_ethernet_cores(true)) { - auto [device_id, receiver_eth_core] = sender_device->get_connected_ethernet_core(sender_eth_core); - if (receiver_device->id() == device_id) { - sender_receivers.push_back({sender_device, receiver_device, sender_eth_core, receiver_eth_core}); - log_info( - tt::LogTest, - "Sender: {} Receiver: {} Sender Eth: {} Receiver Eth: {}", - sender_device->id(), - receiver_device->id(), - sender_eth_core.str(), - receiver_eth_core.str()); - break; - } - } - } - } - return sender_receivers; -} -} -} - -namespace fd_unit_tests::erisc::kernels { - -bool eth_direct_ring_gather_sender_receiver_kernels( - std::vector device_ring, - const size_t& byte_size_per_device, - const size_t& src_eth_l1_byte_address, - const size_t& dst_eth_l1_byte_address, - const size_t& sem_l1_byte_address, - uint32_t num_bytes_per_send = 16) { - using namespace CMAKE_UNIQUE_NAMESPACE; - bool pass = true; - const auto& sender_receivers = get_sender_receiver_cores(device_ring); - - // Generate inputs - uint32_t numel = byte_size_per_device / sizeof(uint32_t); - std::vector> inputs; - inputs.reserve(sender_receivers.size()); - std::vector all_zeros(numel * sender_receivers.size(), 0); - std::map programs; - std::vector full_input; - full_input.reserve(numel * sender_receivers.size()); - - for (uint32_t i = 0; i < sender_receivers.size(); ++i) { - inputs.emplace_back( - generate_uniform_random_vector(0, 100, byte_size_per_device / sizeof(uint32_t), i)); - full_input.insert(full_input.begin() + i * numel, 
inputs[i].begin(), inputs[i].end()); - - //////////////////////////////////////////////////////////////////////////// - // Sender Device - //////////////////////////////////////////////////////////////////////////// - const auto& [sender_device, receiver_device, eth_sender_core, eth_receiver_core] = sender_receivers[i]; - auto& sender_program = programs[sender_device->id()]; - auto& receiver_program = programs[receiver_device->id()]; - CoreCoord sender_receiver_core; - for (uint32_t j = 0; j < sender_receivers.size(); ++j) { - if (std::get<1>(sender_receivers[j])->id() == sender_device->id()) { - sender_receiver_core = sender_device->ethernet_core_from_logical_core(std::get<3>(sender_receivers[j])); - } - } - auto eth_sender_kernel = tt_metal::CreateKernel( - sender_program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/eth_l1_direct_ring_gather_send.cpp", - eth_sender_core, - tt_metal::EthernetConfig{ - .noc = tt_metal::NOC::NOC_0, - .compile_args = { - uint32_t(num_bytes_per_send), - uint32_t(num_bytes_per_send >> 4), - uint32_t(sender_receiver_core.x), - uint32_t(sender_receiver_core.y)}}); - - tt_metal::SetRuntimeArgs( - sender_program, - eth_sender_kernel, - eth_sender_core, - {(uint32_t)(src_eth_l1_byte_address + (sender_receivers.size() - 1) * byte_size_per_device), - (uint32_t)dst_eth_l1_byte_address, - (uint32_t)byte_size_per_device, - (uint32_t)sender_receivers.size() - 1, - (uint32_t)(src_eth_l1_byte_address + i * byte_size_per_device), - (uint32_t)i, - (uint32_t)sem_l1_byte_address}); - - llrt::write_hex_vec_to_core( - sender_device->id(), - sender_device->ethernet_core_from_logical_core(eth_sender_core), - inputs[i], - src_eth_l1_byte_address + i * byte_size_per_device); - llrt::write_hex_vec_to_core( - sender_device->id(), - sender_device->ethernet_core_from_logical_core(eth_sender_core), - std::vector{INVALID}, - sem_l1_byte_address); - - //////////////////////////////////////////////////////////////////////////// - // Receiver 
Device - //////////////////////////////////////////////////////////////////////////// - // Clear expected value at ethernet L1 address - CoreCoord receiver_sender_core; - for (uint32_t j = 0; j < sender_receivers.size(); ++j) { - if (std::get<0>(sender_receivers[j])->id() == receiver_device->id()) { - receiver_sender_core = - receiver_device->ethernet_core_from_logical_core(std::get<2>(sender_receivers[j])); - } - } - - llrt::write_hex_vec_to_core( - receiver_device->id(), - receiver_device->ethernet_core_from_logical_core(eth_receiver_core), - all_zeros, - dst_eth_l1_byte_address); - llrt::write_hex_vec_to_core( - receiver_device->id(), - receiver_device->ethernet_core_from_logical_core(eth_receiver_core), - std::vector{INVALID}, - sem_l1_byte_address); - auto eth_receiver_kernel = tt_metal::CreateKernel( - receiver_program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/eth_l1_direct_ring_gather_receive.cpp", - eth_receiver_core, - tt_metal::EthernetConfig{ - .noc = tt_metal::NOC::NOC_1, - .compile_args = { - uint32_t(receiver_sender_core.x), - uint32_t(receiver_sender_core.y)}}); // probably want to use NOC_1 here - - tt_metal::SetRuntimeArgs( - receiver_program, - eth_receiver_kernel, - eth_receiver_core, - {(uint32_t)byte_size_per_device, (uint32_t)sender_receivers.size() - 1, (uint32_t)sem_l1_byte_address}); - } - - //////////////////////////////////////////////////////////////////////////// - // Compile and Execute Application - //////////////////////////////////////////////////////////////////////////// - - std::vector> cqs; - for (uint32_t i = 0; i < sender_receivers.size(); ++i) { - const auto& device = std::get<0>(sender_receivers[i]); - tt::tt_metal::detail::CompileProgram(device, programs.at(device->id())); - auto& cq = device->command_queue(); - - EnqueueProgram(cq, programs.at(device->id()), false); - cqs.emplace_back(cq); - } - for (auto& cq : cqs) { - Finish(cq); - } - - for (uint32_t i = 0; i < sender_receivers.size(); ++i) { - 
const auto& device = std::get<0>(sender_receivers[i]); - const auto& core = std::get<2>(sender_receivers[i]); - auto readback_vec = llrt::read_hex_vec_from_core( - device->id(), - device->ethernet_core_from_logical_core(core), - src_eth_l1_byte_address, - byte_size_per_device * sender_receivers.size()); - auto a = std::mismatch(full_input.begin(), full_input.end(), readback_vec.begin()); - bool p = (a.first == full_input.end()); - pass &= p; - if (not p) { - log_error(tt::LogTest, "Mismatch on Device {} at Core: {}", device->id(), core.str()); - log_error( - tt::LogTest, "Position: {} Expected: {} Read: {}", a.first - full_input.begin(), *a.first, *a.second); - } - } - - return pass; -} - -bool eth_interleaved_ring_gather_sender_receiver_kernels( - std::vector device_ring, - const CMAKE_UNIQUE_NAMESPACE::BankedConfig& cfg, - const size_t& src_eth_l1_byte_address, - const size_t& dst_eth_l1_byte_address, - const size_t& sem_l1_byte_address, - uint32_t num_bytes_per_send = 16) { - using namespace CMAKE_UNIQUE_NAMESPACE; - bool pass = true; - const auto& sender_receivers = get_sender_receiver_cores(device_ring); - - // Generate inputs - uint32_t numel = cfg.size_bytes / sizeof(uint32_t); - std::vector> inputs; - inputs.reserve(sender_receivers.size()); - std::vector all_zeros(numel * sender_receivers.size(), 0); - std::map programs; - std::vector full_input; - full_input.reserve(numel * sender_receivers.size()); - - std::vector> output_buffers; - output_buffers.reserve(sender_receivers.size()); - - for (uint32_t i = 0; i < sender_receivers.size(); ++i) { - inputs.emplace_back( - tt::test_utils::generate_packed_uniform_random_vector( - -1.0f, 1.0f, cfg.size_bytes / bfloat16::SIZEOF, i)); - full_input.insert(full_input.begin() + i * numel, inputs[i].begin(), inputs[i].end()); - - const auto& device = std::get<0>(sender_receivers[i]); - const auto& eth_sender_core = std::get<2>(sender_receivers[i]); - CoreCoord eth_receiver_core; - for (uint32_t j = 0; j < 
sender_receivers.size(); ++j) { - if (std::get<1>(sender_receivers[j])->id() == device->id()) { - eth_receiver_core = std::get<3>(sender_receivers[j]); - break; - } - } - - auto& program = programs[device->id()]; - - auto input_buffer = - CreateBuffer(InterleavedBufferConfig{device, cfg.size_bytes, cfg.page_size_bytes, cfg.input_buffer_type}); - bool input_is_dram = cfg.input_buffer_type == tt_metal::BufferType::DRAM; - tt_metal::detail::WriteToBuffer(input_buffer, inputs[i]); - output_buffers.emplace_back(CreateBuffer(InterleavedBufferConfig{ - device, cfg.size_bytes * sender_receivers.size(), cfg.page_size_bytes, cfg.output_buffer_type})); - tt_metal::detail::WriteToBuffer(output_buffers[i], all_zeros); - - auto eth_sender_kernel = tt_metal::CreateKernel( - program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/interleaved_eth_ring_gather_send.cpp", - eth_sender_core, - tt_metal::EthernetConfig{ - .noc = tt_metal::NOC::NOC_0, - .compile_args = { - uint32_t(num_bytes_per_send), - uint32_t(num_bytes_per_send >> 4), - uint32_t(device->ethernet_core_from_logical_core(eth_receiver_core).x), - uint32_t(device->ethernet_core_from_logical_core(eth_receiver_core).y), - uint32_t(input_buffer->buffer_type() == tt_metal::BufferType::DRAM), - uint32_t(output_buffers[i]->buffer_type() == tt_metal::BufferType::DRAM)}}); - - tt_metal::SetRuntimeArgs( - program, - eth_sender_kernel, - eth_sender_core, - {(uint32_t)(src_eth_l1_byte_address), - (uint32_t)dst_eth_l1_byte_address, - (uint32_t)cfg.size_bytes + 32, // + 32 for idx - (uint32_t)sender_receivers.size() - 1, - (uint32_t)(i * cfg.num_pages), - (uint32_t)input_buffer->address(), - (uint32_t)output_buffers[i]->address(), - (uint32_t)cfg.num_pages, - (uint32_t)cfg.page_size_bytes, - (uint32_t)sem_l1_byte_address}); - llrt::write_hex_vec_to_core( - device->id(), device->ethernet_core_from_logical_core(eth_sender_core), std::vector{INVALID}, sem_l1_byte_address); - - llrt::write_hex_vec_to_core( - 
device->id(), device->ethernet_core_from_logical_core(eth_receiver_core), std::vector{INVALID}, sem_l1_byte_address); - - auto eth_receiver_kernel = tt_metal::CreateKernel( - program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/interleaved_eth_ring_gather_receive.cpp", - eth_receiver_core, - tt_metal::EthernetConfig{ - .noc = tt_metal::NOC::NOC_1, - .compile_args = { - uint32_t(device->ethernet_core_from_logical_core(eth_sender_core).x), - uint32_t(device->ethernet_core_from_logical_core(eth_sender_core).y), - uint32_t( - output_buffers[i]->buffer_type() == tt_metal::BufferType::DRAM)}}); // probably want to use NOC_1 here - - tt_metal::SetRuntimeArgs( - program, - eth_receiver_kernel, - eth_receiver_core, - {(uint32_t)dst_eth_l1_byte_address, - (uint32_t)cfg.size_bytes + 32, // + 32 for idx - (uint32_t)sender_receivers.size() - 1, - (uint32_t)output_buffers[i]->address(), - (uint32_t)cfg.num_pages, - (uint32_t)cfg.page_size_bytes, - (uint32_t)sem_l1_byte_address}); - } - - //////////////////////////////////////////////////////////////////////////// - // Compile and Execute Application - //////////////////////////////////////////////////////////////////////////// - - std::vector> cqs; - for (uint32_t i = 0; i < sender_receivers.size(); ++i) { - const auto& device = std::get<0>(sender_receivers[i]); - tt::tt_metal::detail::CompileProgram(device, programs.at(device->id())); - auto& cq = device->command_queue(); - - EnqueueProgram(cq, programs.at(device->id()), false); - cqs.emplace_back(cq); - } - for (auto& cq : cqs) { - Finish(cq); - } - - for (uint32_t i = 0; i < sender_receivers.size(); ++i) { - const auto& device = std::get<0>(sender_receivers[i]); - const auto& core = std::get<2>(sender_receivers[i]); - std::vector readback_vec; - tt_metal::detail::ReadFromBuffer(output_buffers[i], readback_vec); - auto a = std::mismatch(full_input.begin(), full_input.end(), readback_vec.begin()); - bool p = (a.first == full_input.end()); - pass &= p; - if 
(not p) { - log_error(tt::LogTest, "Mismatch on Device {} at Core: {}", device->id(), core.str()); - log_error( - tt::LogTest, "Position: {} Expected: {} Read: {}", a.first - full_input.begin(), *a.first, *a.second); - } - } - - return pass; -} -} // namespace fd_unit_tests::erisc::kernels - -TEST_F(CommandQueueMultiDeviceFixture, EthKernelsDirectRingGatherAllChips) { - using namespace CMAKE_UNIQUE_NAMESPACE; - if (num_devices_ < 4) { - GTEST_SKIP(); - } - const size_t src_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE + 32; - const size_t dst_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE + 32; - const size_t sem_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; - const auto& device_ring = get_device_ring(devices_); - if (device_ring.empty()) { - GTEST_SKIP(); - } - ASSERT_TRUE(fd_unit_tests::erisc::kernels::eth_direct_ring_gather_sender_receiver_kernels( - device_ring, WORD_SIZE, src_eth_l1_byte_address, dst_eth_l1_byte_address, sem_l1_byte_address)); -} - -TEST_F(CommandQueueMultiDeviceFixture, EthKernelsInterleavedRingGatherAllChips) { - using namespace CMAKE_UNIQUE_NAMESPACE; - if (num_devices_ < 4) { - GTEST_SKIP(); - } - const size_t src_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE + 32; - const size_t dst_eth_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE + 32; - const size_t sem_l1_byte_address = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; - BankedConfig test_config = - BankedConfig{.num_pages = 10, .size_bytes = 10 * 2 * 32 * 32, .page_size_bytes = 2 * 32 * 32}; - const auto& device_ring = get_device_ring(devices_); - if (device_ring.empty()) { - GTEST_SKIP(); - } - ASSERT_TRUE(fd_unit_tests::erisc::kernels::eth_interleaved_ring_gather_sender_receiver_kernels( - device_ring, test_config, src_eth_l1_byte_address, dst_eth_l1_byte_address, sem_l1_byte_address)); -} diff --git 
a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/sub_device/test_sub_device.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/sub_device/test_sub_device.cpp deleted file mode 100644 index 83ef6759c4d..00000000000 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/sub_device/test_sub_device.cpp +++ /dev/null @@ -1,585 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -#include -#include -#include -#include -#include - -#include "command_queue_fixture.hpp" -#include "gtest/gtest.h" -#include "tt_metal/common/core_coord.hpp" -#include "tt_metal/impl/buffers/global_semaphore.hpp" -#include "tt_metal/impl/device/device.hpp" -#include "tt_metal/impl/event/event.hpp" -#include "tt_metal/impl/sub_device/sub_device.hpp" -#include "tests/tt_metal/test_utils/stimulus.hpp" -#include "tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/command_queue_test_utils.hpp" - -using namespace tt::tt_metal; - -namespace basic_tests { - -std::tuple> create_single_sync_program(Device *device, SubDevice sub_device) { - auto syncer_coord = sub_device.cores(HalProgrammableCoreType::TENSIX).ranges().at(0).start_coord; - auto syncer_core = CoreRangeSet(CoreRange(syncer_coord, syncer_coord)); - auto global_sem = CreateGlobalSemaphore(device, sub_device.cores(HalProgrammableCoreType::TENSIX), INVALID); - - Program syncer_program = CreateProgram(); - auto syncer_kernel = CreateKernel( - syncer_program, - "tests/tt_metal/tt_metal/unit_tests_fast_dispatch/sub_device/kernels/syncer.cpp", - syncer_core, - DataMovementConfig{ - .processor = DataMovementProcessor::RISCV_0, - .noc = NOC::RISCV_0_default}); - std::array syncer_rt_args = {global_sem->address()}; - SetRuntimeArgs(syncer_program, syncer_kernel, syncer_core, syncer_rt_args); - return {std::move(syncer_program), std::move(syncer_coord), std::move(global_sem)}; -} - -std::tuple> create_basic_sync_program(Device *device, const SubDevice& sub_device_1, const SubDevice& 
sub_device_2) { - auto waiter_coord = sub_device_2.cores(HalProgrammableCoreType::TENSIX).ranges().at(0).start_coord; - auto waiter_core = CoreRangeSet(CoreRange(waiter_coord, waiter_coord)); - auto waiter_core_physical = device->worker_core_from_logical_core(waiter_coord); - auto incrementer_cores = sub_device_1.cores(HalProgrammableCoreType::TENSIX); - auto syncer_coord = incrementer_cores.ranges().back().end_coord; - auto syncer_core = CoreRangeSet(CoreRange(syncer_coord, syncer_coord)); - auto syncer_core_physical = device->worker_core_from_logical_core(syncer_coord); - auto all_cores = waiter_core.merge(incrementer_cores).merge(syncer_core); - auto global_sem = CreateGlobalSemaphore(device, all_cores, INVALID); - - Program waiter_program = CreateProgram(); - auto waiter_kernel = CreateKernel( - waiter_program, - "tests/tt_metal/tt_metal/unit_tests_fast_dispatch/sub_device/kernels/persistent_waiter.cpp", - waiter_core, - DataMovementConfig{ - .processor = DataMovementProcessor::RISCV_0, - .noc = NOC::RISCV_0_default}); - std::array waiter_rt_args = {global_sem->address(), incrementer_cores.num_cores(), syncer_core_physical.x, syncer_core_physical.y}; - SetRuntimeArgs(waiter_program, waiter_kernel, waiter_core, waiter_rt_args); - - Program syncer_program = CreateProgram(); - auto syncer_kernel = CreateKernel( - syncer_program, - "tests/tt_metal/tt_metal/unit_tests_fast_dispatch/sub_device/kernels/syncer.cpp", - syncer_core, - DataMovementConfig{ - .processor = DataMovementProcessor::RISCV_0, - .noc = NOC::RISCV_0_default}); - std::array syncer_rt_args = {global_sem->address()}; - SetRuntimeArgs(syncer_program, syncer_kernel, syncer_core, syncer_rt_args); - - Program incrementer_program = CreateProgram(); - auto incrementer_kernel = CreateKernel( - incrementer_program, - "tests/tt_metal/tt_metal/unit_tests_fast_dispatch/sub_device/kernels/incrementer.cpp", - incrementer_cores, - DataMovementConfig{ - .processor = DataMovementProcessor::RISCV_1, - .noc = 
NOC::RISCV_1_default}); - std::array incrementer_rt_args = {global_sem->address(), waiter_core_physical.x, waiter_core_physical.y}; - SetRuntimeArgs(incrementer_program, incrementer_kernel, incrementer_cores, incrementer_rt_args); - return {std::move(waiter_program), std::move(syncer_program), std::move(incrementer_program), std::move(global_sem)}; -} - -std::tuple> create_basic_eth_sync_program(Device *device, const SubDevice& sub_device_1, const SubDevice& sub_device_2) { - auto waiter_coord = sub_device_2.cores(HalProgrammableCoreType::ACTIVE_ETH).ranges().at(0).start_coord; - auto waiter_core = CoreRangeSet(CoreRange(waiter_coord, waiter_coord)); - auto waiter_core_physical = device->ethernet_core_from_logical_core(waiter_coord); - auto tensix_waiter_coord = sub_device_2.cores(HalProgrammableCoreType::TENSIX).ranges().at(0).start_coord; - auto tensix_waiter_core = CoreRangeSet(CoreRange(tensix_waiter_coord, tensix_waiter_coord)); - auto tensix_waiter_core_physical = device->worker_core_from_logical_core(tensix_waiter_coord); - auto incrementer_cores = sub_device_1.cores(HalProgrammableCoreType::TENSIX); - auto syncer_coord = incrementer_cores.ranges().back().end_coord; - auto syncer_core = CoreRangeSet(CoreRange(syncer_coord, syncer_coord)); - auto syncer_core_physical = device->worker_core_from_logical_core(syncer_coord); - auto all_cores = tensix_waiter_core.merge(incrementer_cores).merge(syncer_core); - auto global_sem = CreateGlobalSemaphore(device, all_cores, INVALID); - - Program waiter_program = CreateProgram(); - auto waiter_kernel = CreateKernel( - waiter_program, - "tests/tt_metal/tt_metal/unit_tests_fast_dispatch/sub_device/kernels/persistent_remote_waiter.cpp", - waiter_core, - EthernetConfig{ - .noc = NOC::RISCV_0_default, - .processor = DataMovementProcessor::RISCV_0}); - std::array waiter_rt_args = {global_sem->address(), incrementer_cores.num_cores(), syncer_core_physical.x, syncer_core_physical.y, tensix_waiter_core_physical.x, 
tensix_waiter_core_physical.y, eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE}; - SetRuntimeArgs(waiter_program, waiter_kernel, waiter_core, waiter_rt_args); - - Program syncer_program = CreateProgram(); - auto syncer_kernel = CreateKernel( - syncer_program, - "tests/tt_metal/tt_metal/unit_tests_fast_dispatch/sub_device/kernels/syncer.cpp", - syncer_core, - DataMovementConfig{ - .processor = DataMovementProcessor::RISCV_0, - .noc = NOC::RISCV_0_default}); - std::array syncer_rt_args = {global_sem->address()}; - SetRuntimeArgs(syncer_program, syncer_kernel, syncer_core, syncer_rt_args); - - Program incrementer_program = CreateProgram(); - auto incrementer_kernel = CreateKernel( - incrementer_program, - "tests/tt_metal/tt_metal/unit_tests_fast_dispatch/sub_device/kernels/incrementer.cpp", - incrementer_cores, - DataMovementConfig{ - .processor = DataMovementProcessor::RISCV_1, - .noc = NOC::RISCV_1_default}); - std::array incrementer_rt_args = {global_sem->address(), tensix_waiter_core_physical.x, tensix_waiter_core_physical.y}; - SetRuntimeArgs(incrementer_program, incrementer_kernel, incrementer_cores, incrementer_rt_args); - return {std::move(waiter_program), std::move(syncer_program), std::move(incrementer_program), std::move(global_sem)}; -} - -TEST_F(CommandQueueSingleCardFixture, TestSubDeviceAllocations) { - uint32_t local_l1_size = 3200; - SubDevice sub_device_1(std::array{CoreRangeSet(CoreRange({0, 0}, {2, 2}))}); - SubDevice sub_device_2(std::array{CoreRangeSet(std::vector{CoreRange({3, 3}, {3, 3}), CoreRange({4, 4}, {4, 4})})}); - CoreRangeSet sharded_cores_1 = CoreRange({0, 0}, {2, 2}); - CoreRangeSet sharded_cores_2 = CoreRange({4, 4}, {4, 4}); - - auto sharded_cores_1_vec = corerange_to_cores(sharded_cores_1, std::nullopt, true); - auto sharded_cores_2_vec = corerange_to_cores(sharded_cores_2, std::nullopt, true); - - ShardSpecBuffer shard_spec_buffer_1 = ShardSpecBuffer(sharded_cores_1, {1, 1}, ShardOrientation::ROW_MAJOR, false, {1, 1}, 
{sharded_cores_1.num_cores(), 1}); - uint32_t page_size_1 = 32; - ShardedBufferConfig shard_config_1 = {nullptr, sharded_cores_1.num_cores() * page_size_1, page_size_1, BufferType::L1, TensorMemoryLayout::HEIGHT_SHARDED, shard_spec_buffer_1}; - auto input_1 = tt::test_utils::generate_uniform_random_vector(0, 100, shard_config_1.size / sizeof(uint32_t)); - - ShardSpecBuffer shard_spec_buffer_2 = ShardSpecBuffer(sharded_cores_2, {1, 1}, ShardOrientation::ROW_MAJOR, false, {1, 1}, {sharded_cores_2.num_cores(), 1}); - uint32_t page_size_2 = 64; - ShardedBufferConfig shard_config_2 = {nullptr, sharded_cores_2.num_cores() * page_size_2, page_size_2, BufferType::L1, TensorMemoryLayout::HEIGHT_SHARDED, shard_spec_buffer_2}; - auto input_2 = tt::test_utils::generate_uniform_random_vector(0, 100, shard_config_2.size / sizeof(uint32_t)); - - uint32_t page_size_3 = 1024; - InterleavedBufferConfig interleaved_config = {nullptr, page_size_3, page_size_3, BufferType::L1, TensorMemoryLayout::INTERLEAVED}; - auto input_3 = tt::test_utils::generate_uniform_random_vector(0, 100, interleaved_config.size / sizeof(uint32_t)); - - for (Device *device : devices_) { - auto sub_device_manager_1 = device->create_sub_device_manager({sub_device_1}, local_l1_size); - auto sub_device_manager_2 = device->create_sub_device_manager({sub_device_1, sub_device_2}, local_l1_size); - DeviceAddr l1_unreserved_base = device->get_base_allocator_addr(HalMemType::L1); - DeviceAddr max_addr = l1_unreserved_base + local_l1_size; - - shard_config_1.device = device; - shard_config_2.device = device; - interleaved_config.device = device; - - std::vector physical_cores_1; - physical_cores_1.reserve(sharded_cores_1_vec.size()); - for (const auto& core : sharded_cores_1_vec) { - physical_cores_1.push_back(device->worker_core_from_logical_core(core)); - } - - std::vector physical_cores_2; - physical_cores_2.reserve(sharded_cores_2_vec.size()); - for (const auto& core : sharded_cores_2_vec) { - 
physical_cores_2.push_back(device->worker_core_from_logical_core(core)); - } - - device->load_sub_device_manager(sub_device_manager_1); - - auto buffer_1 = CreateBuffer(shard_config_1, SubDeviceId{0}); - EXPECT_EQ(buffer_1->address(), max_addr - buffer_1->aligned_page_size()); - EnqueueWriteBuffer(device->command_queue(), buffer_1, input_1, false); - std::vector output_1; - EnqueueReadBuffer(device->command_queue(), buffer_1, output_1, true); - EXPECT_EQ(input_1, output_1); - auto input_1_it = input_1.begin(); - for (const auto& physical_core : physical_cores_1) { - auto readback = tt::llrt::read_hex_vec_from_core( - device->id(), physical_core, buffer_1->address(), page_size_1); - EXPECT_TRUE(std::equal(input_1_it, input_1_it + page_size_1 / sizeof(uint32_t), readback.begin())); - input_1_it += page_size_1 / sizeof(uint32_t); - } - - auto buffer_2 = CreateBuffer(interleaved_config); - EXPECT_THROW(CreateBuffer(shard_config_1, SubDeviceId{1}), std::exception); - EXPECT_THROW(device->clear_loaded_sub_device_manager(), std::exception); - EXPECT_THROW(device->load_sub_device_manager(sub_device_manager_2), std::exception); - DeallocateBuffer(*buffer_1); - device->clear_loaded_sub_device_manager(); - device->load_sub_device_manager(sub_device_manager_2); - - auto buffer_3 = CreateBuffer(shard_config_2, SubDeviceId{1}); - EXPECT_EQ(buffer_3->address(), max_addr - buffer_3->aligned_page_size()); - EnqueueWriteBuffer(device->command_queue(), buffer_3, input_2, false); - std::vector output_2; - EnqueueReadBuffer(device->command_queue(), buffer_3, output_2, true); - EXPECT_EQ(input_2, output_2); - auto input_2_it = input_2.begin(); - for (const auto& physical_core : physical_cores_2) { - auto readback = tt::llrt::read_hex_vec_from_core( - device->id(), physical_core, buffer_3->address(), page_size_2); - EXPECT_TRUE(std::equal(input_2_it, input_2_it + page_size_2 / sizeof(uint32_t), readback.begin())); - input_2_it += page_size_2 / sizeof(uint32_t); - } - - auto buffer_4 = 
CreateBuffer(shard_config_1, SubDeviceId{0}); - EXPECT_EQ(buffer_4->address(), max_addr - buffer_4->aligned_page_size()); - EXPECT_THROW(CreateBuffer(interleaved_config, SubDeviceId{0}), std::exception); - } -} - -TEST_F(CommandQueueSingleCardFixture, TestSubDeviceSynchronization) { - uint32_t local_l1_size = 3200; - SubDevice sub_device_1(std::array{CoreRangeSet(CoreRange({0, 0}, {2, 2}))}); - SubDevice sub_device_2(std::array{CoreRangeSet(std::vector{CoreRange({3, 3}, {3, 3}), CoreRange({4, 4}, {4, 4})})}); - CoreRangeSet sharded_cores_1 = CoreRange({0, 0}, {2, 2}); - - auto sharded_cores_1_vec = corerange_to_cores(sharded_cores_1, std::nullopt, true); - - ShardSpecBuffer shard_spec_buffer_1 = ShardSpecBuffer(sharded_cores_1, {1, 1}, ShardOrientation::ROW_MAJOR, false, {1, 1}, {sharded_cores_1.num_cores(), 1}); - uint32_t page_size_1 = 32; - ShardedBufferConfig shard_config_1 = {nullptr, sharded_cores_1.num_cores() * page_size_1, page_size_1, BufferType::L1, TensorMemoryLayout::HEIGHT_SHARDED, shard_spec_buffer_1}; - auto input_1 = tt::test_utils::generate_uniform_random_vector(0, 100, shard_config_1.size / sizeof(uint32_t)); - - std::array sub_device_ids_to_block = {SubDeviceId{0}}; - for (Device *device : devices_) { - auto sub_device_manager = device->create_sub_device_manager({sub_device_1, sub_device_2}, local_l1_size); - - shard_config_1.device = device; - - std::vector physical_cores_1; - physical_cores_1.reserve(sharded_cores_1_vec.size()); - for (const auto& core : sharded_cores_1_vec) { - physical_cores_1.push_back(device->worker_core_from_logical_core(core)); - } - - device->load_sub_device_manager(sub_device_manager); - - auto [program, syncer_core, global_semaphore] = create_single_sync_program(device, sub_device_2); - EnqueueProgram(device->command_queue(), program, false); - - auto buffer_1 = CreateBuffer(shard_config_1, sub_device_ids_to_block[0]); - - // Test blocking synchronize doesn't stall - Synchronize(device, 0, sub_device_ids_to_block); - 
- // Test blocking write buffer doesn't stall - EnqueueWriteBuffer(device->command_queue(), buffer_1, input_1, true, sub_device_ids_to_block); - - // Test record event won't cause a stall - auto event = std::make_shared(); - EnqueueRecordEvent(device->command_queue(), event, sub_device_ids_to_block); - Synchronize(device, 0, sub_device_ids_to_block); - - // Test blocking read buffer doesn't stall - std::vector output_1; - EnqueueReadBuffer(device->command_queue(), buffer_1, output_1, true, sub_device_ids_to_block); - EXPECT_EQ(input_1, output_1); - auto input_1_it = input_1.begin(); - for (const auto& physical_core : physical_cores_1) { - auto readback = tt::llrt::read_hex_vec_from_core( - device->id(), physical_core, buffer_1->address(), page_size_1); - EXPECT_TRUE(std::equal(input_1_it, input_1_it + page_size_1 / sizeof(uint32_t), readback.begin())); - input_1_it += page_size_1 / sizeof(uint32_t); - } - auto sem_addr = global_semaphore->address(); - auto physical_syncer_core = device->worker_core_from_logical_core(syncer_core); - tt::llrt::write_hex_vec_to_core(device->id(), physical_syncer_core, std::vector{1}, sem_addr); - - // Full synchronization - Synchronize(device); - } -} - -TEST_F(CommandQueueSingleCardFixture, TestSubDeviceBasicPrograms) { - SubDevice sub_device_1(std::array{CoreRangeSet(CoreRange({0, 0}, {2, 2}))}); - SubDevice sub_device_2(std::array{CoreRangeSet(std::vector{CoreRange({3, 3}, {3, 3}), CoreRange({4, 4}, {4, 4})})}); - uint32_t num_iters = 5; - for (Device *device : devices_) { - auto sub_device_manager = device->create_sub_device_manager({sub_device_1, sub_device_2}, 3200); - device->load_sub_device_manager(sub_device_manager); - - auto [waiter_program, syncer_program, incrementer_program, global_sem] = create_basic_sync_program(device, sub_device_1, sub_device_2); - - for (uint32_t i = 0; i < num_iters; i++) { - EnqueueProgram(device->command_queue(), waiter_program, false); - // Test blocking on one sub-device - 
EnqueueProgram(device->command_queue(), syncer_program, true); - EnqueueProgram(device->command_queue(), incrementer_program, false); - } - Synchronize(device); - } -} - -TEST_F(CommandQueueSingleCardFixture, TestSubDeviceBasicEthPrograms) { - SubDevice sub_device_1(std::array{CoreRangeSet(CoreRange({0, 0}, {2, 2}))}); - uint32_t num_iters = 5; - for (Device *device : devices_) { - if (!does_device_have_active_eth_cores(device)) { - GTEST_SKIP() << "Skipping test because device " << device->id() << " does not have any active ethernet cores"; - } - auto eth_core = *device->get_active_ethernet_cores(true).begin(); - SubDevice sub_device_2(std::array{CoreRangeSet(std::vector{CoreRange({3, 3}, {3, 3}), CoreRange({4, 4}, {4, 4})}), CoreRangeSet(CoreRange(eth_core, eth_core))}); - auto sub_device_manager = device->create_sub_device_manager({sub_device_1, sub_device_2}, 3200); - device->load_sub_device_manager(sub_device_manager); - - auto [waiter_program, syncer_program, incrementer_program, global_sem] = create_basic_eth_sync_program(device, sub_device_1, sub_device_2); - - for (uint32_t i = 0; i < num_iters; i++) { - EnqueueProgram(device->command_queue(), waiter_program, false); - // Test blocking on one sub-device - EnqueueProgram(device->command_queue(), syncer_program, true); - EnqueueProgram(device->command_queue(), incrementer_program, false); - } - Synchronize(device); - } -} - -TEST_F(CommandQueueSingleCardTraceFixture, TestSubDeviceTraceBasicPrograms) { - SubDevice sub_device_1(std::array{CoreRangeSet(CoreRange({0, 0}, {2, 2}))}); - SubDevice sub_device_2(std::array{CoreRangeSet(std::vector{CoreRange({3, 3}, {3, 3}), CoreRange({4, 4}, {4, 4})})}); - uint32_t num_iters = 5; - for (Device *device : devices_) { - auto sub_device_manager = device->create_sub_device_manager({sub_device_1, sub_device_2}, 3200); - device->load_sub_device_manager(sub_device_manager); - - auto [waiter_program, syncer_program, incrementer_program, global_sem] = 
create_basic_sync_program(device, sub_device_1, sub_device_2); - - // Compile the programs - EnqueueProgram(device->command_queue(), waiter_program, false); - // Test blocking on one sub-device - EnqueueProgram(device->command_queue(), syncer_program, true); - EnqueueProgram(device->command_queue(), incrementer_program, false); - Synchronize(device); - - // Capture the trace - auto tid_1 = BeginTraceCapture(device, device->command_queue().id()); - EnqueueProgram(device->command_queue(), waiter_program, false); - EnqueueProgram(device->command_queue(), syncer_program, false); - EnqueueProgram(device->command_queue(), incrementer_program, false); - EndTraceCapture(device, device->command_queue().id(), tid_1); - - auto tid_2 = BeginTraceCapture(device, device->command_queue().id()); - EnqueueProgram(device->command_queue(), syncer_program, false); - EnqueueProgram(device->command_queue(), incrementer_program, false); - EndTraceCapture(device, device->command_queue().id(), tid_2); - - for (uint32_t i = 0; i < num_iters; i++) { - // Regular program execution - EnqueueProgram(device->command_queue(), waiter_program, false); - // Test blocking on one sub-device - EnqueueProgram(device->command_queue(), syncer_program, true); - EnqueueProgram(device->command_queue(), incrementer_program, false); - - // Full trace execution - ReplayTrace(device, device->command_queue().id(), tid_1, false); - - // Partial trace execution - EnqueueProgram(device->command_queue(), waiter_program, false); - ReplayTrace(device, device->command_queue().id(), tid_2, false); - } - Synchronize(device); - } -} - -TEST_F(CommandQueueSingleCardTraceFixture, TestSubDeviceTraceBasicEthPrograms) { - SubDevice sub_device_1(std::array{CoreRangeSet(CoreRange({0, 0}, {2, 2}))}); - uint32_t num_iters = 5; - for (Device *device : devices_) { - if (!does_device_have_active_eth_cores(device)) { - GTEST_SKIP() << "Skipping test because device " << device->id() << " does not have any active ethernet cores"; - } - 
auto eth_core = *device->get_active_ethernet_cores(true).begin(); - SubDevice sub_device_2(std::array{CoreRangeSet(std::vector{CoreRange({3, 3}, {3, 3}), CoreRange({4, 4}, {4, 4})}), CoreRangeSet(CoreRange(eth_core, eth_core))}); - auto sub_device_manager = device->create_sub_device_manager({sub_device_1, sub_device_2}, 3200); - device->load_sub_device_manager(sub_device_manager); - - auto [waiter_program, syncer_program, incrementer_program, global_sem] = create_basic_eth_sync_program(device, sub_device_1, sub_device_2); - - // Compile the programs - EnqueueProgram(device->command_queue(), waiter_program, false); - // Test blocking on one sub-device - EnqueueProgram(device->command_queue(), syncer_program, true); - EnqueueProgram(device->command_queue(), incrementer_program, false); - Synchronize(device); - - // Capture the trace - auto tid_1 = BeginTraceCapture(device, device->command_queue().id()); - EnqueueProgram(device->command_queue(), waiter_program, false); - EnqueueProgram(device->command_queue(), syncer_program, false); - EnqueueProgram(device->command_queue(), incrementer_program, false); - EndTraceCapture(device, device->command_queue().id(), tid_1); - - auto tid_2 = BeginTraceCapture(device, device->command_queue().id()); - EnqueueProgram(device->command_queue(), syncer_program, false); - EnqueueProgram(device->command_queue(), incrementer_program, false); - EndTraceCapture(device, device->command_queue().id(), tid_2); - - for (uint32_t i = 0; i < num_iters; i++) { - // Regular program execution - EnqueueProgram(device->command_queue(), waiter_program, false); - // Test blocking on one sub-device - EnqueueProgram(device->command_queue(), syncer_program, true); - EnqueueProgram(device->command_queue(), incrementer_program, false); - - // Full trace execution - ReplayTrace(device, device->command_queue().id(), tid_1, false); - - // Partial trace execution - EnqueueProgram(device->command_queue(), waiter_program, false); - ReplayTrace(device, 
device->command_queue().id(), tid_2, false); - } - Synchronize(device); - } -} - -TEST_F(CommandQueueSingleCardTraceFixture, TestSubDeviceTraceProgramsReconfigureSubDevices) { - SubDevice sub_device_1(std::array{CoreRangeSet(CoreRange({0, 0}, {2, 2}))}); - SubDevice sub_device_2(std::array{CoreRangeSet(std::array{CoreRange({3, 3}, {3, 3}), CoreRange({4, 4}, {4, 4})})}); - SubDevice sub_device_3(std::array{CoreRangeSet(std::array{CoreRange({2, 4}, {3, 4}), CoreRange({5, 1}, {6, 3})})}); - uint32_t num_iters = 5; - for (Device *device : devices_) { - if (!does_device_have_active_eth_cores(device)) { - GTEST_SKIP() << "Skipping test because device " << device->id() << " does not have any active ethernet cores"; - } - auto eth_core = *device->get_active_ethernet_cores(true).begin(); - SubDevice sub_device_4(std::array{CoreRangeSet(std::array{CoreRange({2, 1}, {2, 2}), CoreRange({1, 5}, {5, 5})}), CoreRangeSet(CoreRange(eth_core, eth_core))}); - - auto sub_device_manager_1 = device->create_sub_device_manager({sub_device_1, sub_device_2}, 3200); - auto sub_device_manager_2 = device->create_sub_device_manager({sub_device_3, sub_device_4}, 3200); - - device->load_sub_device_manager(sub_device_manager_1); - - auto [waiter_program_1, syncer_program_1, incrementer_program_1, global_sem_1] = create_basic_sync_program(device, sub_device_1, sub_device_2); - - // Compile the programs - EnqueueProgram(device->command_queue(), waiter_program_1, false); - EnqueueProgram(device->command_queue(), syncer_program_1, false); - EnqueueProgram(device->command_queue(), incrementer_program_1, false); - Synchronize(device); - - // Capture the trace - auto tid_1 = BeginTraceCapture(device, device->command_queue().id()); - EnqueueProgram(device->command_queue(), waiter_program_1, false); - EnqueueProgram(device->command_queue(), syncer_program_1, false); - EnqueueProgram(device->command_queue(), incrementer_program_1, false); - EndTraceCapture(device, device->command_queue().id(), tid_1); - - 
auto tid_2 = BeginTraceCapture(device, device->command_queue().id()); - EnqueueProgram(device->command_queue(), syncer_program_1, false); - EnqueueProgram(device->command_queue(), incrementer_program_1, false); - EndTraceCapture(device, device->command_queue().id(), tid_2); - - device->load_sub_device_manager(sub_device_manager_2); - - auto [waiter_program_2, syncer_program_2, incrementer_program_2, global_sem_2] = create_basic_eth_sync_program(device, sub_device_3, sub_device_4); - - // Compile the programs - EnqueueProgram(device->command_queue(), waiter_program_2, false); - EnqueueProgram(device->command_queue(), syncer_program_2, false); - EnqueueProgram(device->command_queue(), incrementer_program_2, false); - Synchronize(device); - - // Capture the trace - auto tid_3 = BeginTraceCapture(device, device->command_queue().id()); - EnqueueProgram(device->command_queue(), waiter_program_2, false); - EnqueueProgram(device->command_queue(), syncer_program_2, false); - EnqueueProgram(device->command_queue(), incrementer_program_2, false); - EndTraceCapture(device, device->command_queue().id(), tid_3); - - auto tid_4 = BeginTraceCapture(device, device->command_queue().id()); - EnqueueProgram(device->command_queue(), syncer_program_2, false); - EnqueueProgram(device->command_queue(), incrementer_program_2, false); - EndTraceCapture(device, device->command_queue().id(), tid_4); - - for (uint32_t i = 0; i < num_iters; i++) { - device->load_sub_device_manager(sub_device_manager_1); - // Regular program execution - EnqueueProgram(device->command_queue(), waiter_program_1, false); - // Test blocking on one sub-device - EnqueueProgram(device->command_queue(), syncer_program_1, false); - EnqueueProgram(device->command_queue(), incrementer_program_1, false); - - // Full trace execution - ReplayTrace(device, device->command_queue().id(), tid_1, false); - - // Partial trace execution - EnqueueProgram(device->command_queue(), waiter_program_1, false); - ReplayTrace(device, 
device->command_queue().id(), tid_2, false); - - device->load_sub_device_manager(sub_device_manager_2); - // Regular program execution - EnqueueProgram(device->command_queue(), waiter_program_2, false); - // Test blocking on one sub-device - EnqueueProgram(device->command_queue(), syncer_program_2, false); - EnqueueProgram(device->command_queue(), incrementer_program_2, false); - - // Full trace execution - ReplayTrace(device, device->command_queue().id(), tid_3, false); - - // Partial trace execution - EnqueueProgram(device->command_queue(), waiter_program_2, false); - ReplayTrace(device, device->command_queue().id(), tid_4, false); - } - Synchronize(device); - } -} - -TEST_F(CommandQueueSingleCardTraceFixture, TestSubDeviceIllegalOperations) { - SubDevice sub_device_1(std::array{CoreRangeSet(CoreRange({0, 0}, {2, 2}))}); - SubDevice sub_device_2(std::array{CoreRangeSet(std::vector{CoreRange({3, 3}, {3, 3}), CoreRange({4, 4}, {4, 4})})}); - - // Assert no idle eth cores specified - EXPECT_THROW(SubDevice sub_device_3(std::array{CoreRangeSet(CoreRange({3, 3}, {3, 3})), CoreRangeSet(CoreRange({4, 4}, {4, 4})), CoreRangeSet(CoreRange({5, 5}, {5, 5}))}), std::exception); - for (Device *device : devices_) { - auto sub_device_manager_1 = device->create_sub_device_manager({sub_device_1, sub_device_2}, 3200); - auto sub_device_manager_2 = device->create_sub_device_manager({sub_device_2, sub_device_1}, 3200); - device->load_sub_device_manager(sub_device_manager_1); - - auto [waiter_program_1, syncer_program_1, incrementer_program_1, global_sem_1] = create_basic_sync_program(device, sub_device_1, sub_device_2); - - // Compile the programs - EnqueueProgram(device->command_queue(), waiter_program_1, false); - // Test blocking on one sub-device - EnqueueProgram(device->command_queue(), syncer_program_1, false); - EnqueueProgram(device->command_queue(), incrementer_program_1, false); - Synchronize(device); - - // Capture the trace - auto tid_1 = BeginTraceCapture(device, 
device->command_queue().id()); - // Can not load a sub-device manager while tracing - EXPECT_THROW(device->load_sub_device_manager(sub_device_manager_2), std::exception); - EnqueueProgram(device->command_queue(), waiter_program_1, false); - EnqueueProgram(device->command_queue(), syncer_program_1, false); - EnqueueProgram(device->command_queue(), incrementer_program_1, false); - EndTraceCapture(device, device->command_queue().id(), tid_1); - - device->load_sub_device_manager(sub_device_manager_2); - auto [waiter_program_2, syncer_program_2, incrementer_program_2, global_sem_2] = create_basic_sync_program(device, sub_device_2, sub_device_1); - - EnqueueProgram(device->command_queue(), waiter_program_2, false); - EnqueueProgram(device->command_queue(), syncer_program_2, false); - EnqueueProgram(device->command_queue(), incrementer_program_2, false); - Synchronize(device); - - auto tid_2 = BeginTraceCapture(device, device->command_queue().id()); - EnqueueProgram(device->command_queue(), waiter_program_2, false); - EnqueueProgram(device->command_queue(), syncer_program_2, false); - EnqueueProgram(device->command_queue(), incrementer_program_2, false); - EndTraceCapture(device, device->command_queue().id(), tid_2); - - // Regular program execution - // Can not run a program on a different sub-device manager - EXPECT_THROW(EnqueueProgram(device->command_queue(), waiter_program_1, false), std::exception); - - // Full trace execution - ReplayTrace(device, device->command_queue().id(), tid_2, false); - - // Can not replay a trace on a different sub-device manager - EXPECT_THROW(ReplayTrace(device, device->command_queue().id(), tid_1, false), std::exception); - - Synchronize(device); - - device->remove_sub_device_manager(sub_device_manager_1); - EXPECT_THROW(device->load_sub_device_manager(sub_device_manager_1), std::exception); - } -} - -} // namespace basic_tests diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/tests_main.cpp 
b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/tests_main.cpp deleted file mode 100644 index 1e42f41a46c..00000000000 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/tests_main.cpp +++ /dev/null @@ -1,5 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -#include "gtest/gtest.h" diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/CMakeLists.txt b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/CMakeLists.txt deleted file mode 100644 index 00e5b547319..00000000000 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/CMakeLists.txt +++ /dev/null @@ -1,26 +0,0 @@ -set(UNIT_TESTS_FD_SINGLEC_MULTIQ_SRCS - ${CMAKE_CURRENT_SOURCE_DIR}/command_queue/test_EnqueueProgram.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/command_queue/test_EnqueueTrace.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/command_queue/test_EnqueueWaitForEvent.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp -) - -add_executable(unit_tests_fast_dispatch_single_chip_multi_queue ${UNIT_TESTS_FD_SINGLEC_MULTIQ_SRCS}) -TT_ENABLE_UNITY_BUILD(unit_tests_fast_dispatch_single_chip_multi_queue) - -target_link_libraries(unit_tests_fast_dispatch_single_chip_multi_queue PUBLIC test_metal_common_libs) -target_include_directories( - unit_tests_fast_dispatch_single_chip_multi_queue - PRIVATE - ${PROJECT_SOURCE_DIR} - ${PROJECT_SOURCE_DIR}/tt_metal - ${PROJECT_SOURCE_DIR}/tests - ${CMAKE_CURRENT_SOURCE_DIR} - ${CMAKE_CURRENT_SOURCE_DIR}/common -) -set_target_properties( - unit_tests_fast_dispatch_single_chip_multi_queue - PROPERTIES - RUNTIME_OUTPUT_DIRECTORY - ${PROJECT_BINARY_DIR}/test/tt_metal -) diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueProgram.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueProgram.cpp deleted file 
mode 100644 index 4e407df6d4e..00000000000 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueProgram.cpp +++ /dev/null @@ -1,273 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -#include -#include "command_queue_fixture.hpp" -#include "command_queue_test_utils.hpp" -#include "gtest/gtest.h" -#include "impl/buffers/buffer.hpp" -#include "impl/device/device.hpp" -#include "tt_metal/common/bfloat16.hpp" -#include "tt_metal/common/scoped_timer.hpp" -#include "tt_metal/host_api.hpp" -#include "tt_metal/detail/tt_metal.hpp" - -using std::vector; -using namespace tt::tt_metal; - -struct CBConfig { - uint32_t cb_id; - uint32_t num_pages; - uint32_t page_size; - tt::DataFormat data_format; -}; - -struct DummyProgramConfig { - CoreRangeSet cr_set; - CBConfig cb_config; - uint32_t num_cbs; - uint32_t num_sems; -}; - -struct DummyProgramMultiCBConfig { - CoreRangeSet cr_set; - std::vector cb_config_vector; - uint32_t num_sems; -}; - - -namespace local_test_functions { - -// Create randomly sized pair of unique and common runtime args vectors, with careful not to exceed max between the two. -// Optionally force the max size for one of the vectors. -std::pair, std::vector> create_runtime_args(bool force_max_size = false, uint32_t unique_base = 0, uint32_t common_base = 100){ - - constexpr uint32_t MAX_RUNTIME_ARGS = 255; - - // Generate Unique Runtime Args. Common RT args starting address must be L1 Aligned, so account for that here via padding - uint32_t num_rt_args_unique = num_rt_args_unique = rand() % (MAX_RUNTIME_ARGS + 1); - uint32_t num_rt_args_unique_padded = align(num_rt_args_unique, hal.get_alignment(HalMemType::L1) / sizeof(uint32_t)); - uint32_t num_rt_args_common = num_rt_args_unique_padded < MAX_RUNTIME_ARGS ? 
rand() % (MAX_RUNTIME_ARGS - num_rt_args_unique_padded + 1) : 0; - - if (force_max_size) { - if (rand() % 2) { - num_rt_args_unique = MAX_RUNTIME_ARGS; - num_rt_args_common = 0; - } else { - num_rt_args_common = MAX_RUNTIME_ARGS; - num_rt_args_unique = 0; - } - } - - vector rt_args_common; - for (uint32_t i = 0; i < num_rt_args_common; i++) { - rt_args_common.push_back(common_base + i); - } - - vector rt_args_unique; - for (uint32_t i = 0; i < num_rt_args_unique; i++) { - rt_args_unique.push_back(unique_base + i); - } - - log_trace(tt::LogTest, "{} - num_rt_args_unique: {} num_rt_args_common: {} force_max_size: {}", __FUNCTION__, num_rt_args_unique, num_rt_args_common, force_max_size); - return std::make_pair(rt_args_unique, rt_args_common); -} - - -} // namespace local_test_functions - -namespace stress_tests { - -TEST_F(MultiCommandQueueSingleDeviceFixture, TestRandomizedProgram) { - uint32_t NUM_PROGRAMS = 100; - uint32_t MAX_LOOP = 100; - uint32_t page_size = 1024; - - if (this->arch_ == tt::ARCH::BLACKHOLE) { - GTEST_SKIP(); // Running on second CQ is hanging on CI - } - - // Make random - auto random_seed = 0; // (unsigned int)time(NULL); - uint32_t seed = tt::parse_env("SEED", random_seed); - log_info(tt::LogTest, "Using Test Seed: {}", seed); - srand(seed); - - CoreCoord worker_grid_size = this->device_->compute_with_storage_grid_size(); - CoreRange cr({0, 0}, {worker_grid_size.x - 1, worker_grid_size.y - 1}); - CoreRangeSet cr_set({cr}); - - log_info(tt::LogTest, "Starting compile of {} programs now.", NUM_PROGRAMS); - - vector programs; - for (uint32_t i = 0; i < NUM_PROGRAMS; i++) { - programs.push_back(Program()); - Program& program = programs.back(); - - std::map data_movement_defines = {{"DATA_MOVEMENT", "1"}}; - std::map compute_defines = {{"COMPUTE", "1"}}; - - // brisc - uint32_t BRISC_OUTER_LOOP, BRISC_MIDDLE_LOOP, BRISC_INNER_LOOP, NUM_CBS, NUM_SEMS; - bool USE_MAX_RT_ARGS; - - if (i == 0) { - // Ensures that we get at least one compilation with 
the max amount to - // ensure it compiles and runs - BRISC_OUTER_LOOP = MAX_LOOP; - BRISC_MIDDLE_LOOP = MAX_LOOP; - BRISC_INNER_LOOP = MAX_LOOP; - NUM_CBS = NUM_CIRCULAR_BUFFERS; - NUM_SEMS = NUM_SEMAPHORES; - USE_MAX_RT_ARGS = true; - } else { - BRISC_OUTER_LOOP = rand() % (MAX_LOOP) + 1; - BRISC_MIDDLE_LOOP = rand() % (MAX_LOOP) + 1; - BRISC_INNER_LOOP = rand() % (MAX_LOOP) + 1; - NUM_CBS = rand() % (NUM_CIRCULAR_BUFFERS) + 1; - NUM_SEMS = rand() % (NUM_SEMAPHORES) + 1; - USE_MAX_RT_ARGS = false; - } - - log_debug(tt::LogTest, "Compiling program {}/{} w/ BRISC_OUTER_LOOP: {} BRISC_MIDDLE_LOOP: {} BRISC_INNER_LOOP: {} NUM_CBS: {} NUM_SEMS: {} USE_MAX_RT_ARGS: {}", - i+1, NUM_PROGRAMS, BRISC_OUTER_LOOP, BRISC_MIDDLE_LOOP, BRISC_INNER_LOOP, NUM_CBS, NUM_SEMS, USE_MAX_RT_ARGS); - - for (uint32_t j = 0; j < NUM_CBS; j++) { - CircularBufferConfig cb_config = CircularBufferConfig(page_size * (j + 1), {{j, tt::DataFormat::Float16_b}}).set_page_size(j, page_size * (j + 1)); - auto cb = CreateCircularBuffer(program, cr_set, cb_config); - } - - for (uint32_t j = 0; j < NUM_SEMS; j++) { - CreateSemaphore(program, cr_set, j + 1); - } - - auto [brisc_unique_rtargs, brisc_common_rtargs] = local_test_functions::create_runtime_args(USE_MAX_RT_ARGS); - uint32_t num_brisc_unique_rtargs = brisc_unique_rtargs.size(); - uint32_t num_brisc_common_rtargs = brisc_common_rtargs.size(); - vector brisc_compile_args = {BRISC_OUTER_LOOP, BRISC_MIDDLE_LOOP, BRISC_INNER_LOOP, NUM_CBS, NUM_SEMS, num_brisc_unique_rtargs, num_brisc_common_rtargs, page_size}; - - // ncrisc - uint32_t NCRISC_OUTER_LOOP, NCRISC_MIDDLE_LOOP, NCRISC_INNER_LOOP; - if (i == 0) { - NCRISC_OUTER_LOOP = MAX_LOOP; - NCRISC_MIDDLE_LOOP = MAX_LOOP; - NCRISC_INNER_LOOP = MAX_LOOP; - } else { - NCRISC_OUTER_LOOP = rand() % (MAX_LOOP) + 1; - NCRISC_MIDDLE_LOOP = rand() % (MAX_LOOP) + 1; - NCRISC_INNER_LOOP = rand() % (MAX_LOOP) + 1; - } - - auto [ncrisc_unique_rtargs, ncrisc_common_rtargs] = 
local_test_functions::create_runtime_args(USE_MAX_RT_ARGS); - uint32_t num_ncrisc_unique_rtargs = ncrisc_unique_rtargs.size(); - uint32_t num_ncrisc_common_rtargs = ncrisc_common_rtargs.size(); - vector ncrisc_compile_args = {NCRISC_OUTER_LOOP, NCRISC_MIDDLE_LOOP, NCRISC_INNER_LOOP, NUM_CBS, NUM_SEMS, num_ncrisc_unique_rtargs, num_ncrisc_common_rtargs, page_size}; - - // trisc - uint32_t TRISC_OUTER_LOOP, TRISC_MIDDLE_LOOP, TRISC_INNER_LOOP; - if (i == 0) { - TRISC_OUTER_LOOP = MAX_LOOP; - TRISC_MIDDLE_LOOP = MAX_LOOP; - TRISC_INNER_LOOP = MAX_LOOP; - } else { - TRISC_OUTER_LOOP = rand() % (MAX_LOOP) + 1; - TRISC_MIDDLE_LOOP = rand() % (MAX_LOOP) + 1; - TRISC_INNER_LOOP = rand() % (MAX_LOOP) + 1; - } - - auto [trisc_unique_rtargs, trisc_common_rtargs] = local_test_functions::create_runtime_args(USE_MAX_RT_ARGS); - uint32_t num_trisc_unique_rtargs = trisc_unique_rtargs.size(); - uint32_t num_trisc_common_rtargs = trisc_common_rtargs.size(); - vector trisc_compile_args = {TRISC_OUTER_LOOP, TRISC_MIDDLE_LOOP, TRISC_INNER_LOOP, NUM_CBS, NUM_SEMS, num_trisc_unique_rtargs, num_trisc_common_rtargs, page_size}; - - bool at_least_one_kernel = false; - if (i == 0 or ((rand() % 2) == 0)) { - auto dummy_brisc_kernel = CreateKernel( - program, "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp", cr_set, DataMovementConfig{ - .processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default, .compile_args = brisc_compile_args, .defines = data_movement_defines}); - SetRuntimeArgs(program, dummy_brisc_kernel, cr_set, brisc_unique_rtargs); - SetCommonRuntimeArgs(program, dummy_brisc_kernel, brisc_common_rtargs); - at_least_one_kernel = true; - } - - if (i == 0 or ((rand() % 2) == 0)) { - auto dummy_ncrisc_kernel = CreateKernel( - program, "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp", cr_set, DataMovementConfig{ - .processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default, 
.compile_args = ncrisc_compile_args, .defines = data_movement_defines}); - SetRuntimeArgs(program, dummy_ncrisc_kernel, cr_set, ncrisc_unique_rtargs); - SetCommonRuntimeArgs(program, dummy_ncrisc_kernel, ncrisc_common_rtargs); - at_least_one_kernel = true; - } - - if (i == 0 or ((rand() % 2) == 0)) { - auto dummy_trisc_kernel = CreateKernel( - program, "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp", cr_set, ComputeConfig{ - .math_approx_mode = false, - .compile_args = trisc_compile_args, - .defines = compute_defines - }); - SetRuntimeArgs(program, dummy_trisc_kernel, cr_set, trisc_unique_rtargs); - SetCommonRuntimeArgs(program, dummy_trisc_kernel, trisc_common_rtargs); - at_least_one_kernel = true; - } - - if (not at_least_one_kernel) { - uint32_t random_risc = rand() % 3 + 1; - if (random_risc == 1) { - auto dummy_brisc_kernel = CreateKernel( - program, "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp", cr_set, DataMovementConfig{ - .processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default, .compile_args = brisc_compile_args, .defines = data_movement_defines}); - SetRuntimeArgs(program, dummy_brisc_kernel, cr_set, brisc_unique_rtargs); - SetCommonRuntimeArgs(program, dummy_brisc_kernel, brisc_common_rtargs); - } else if (random_risc == 2) { - auto dummy_ncrisc_kernel = CreateKernel( - program, "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp", cr_set, DataMovementConfig{ - .processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default, .compile_args = ncrisc_compile_args, .defines = data_movement_defines}); - SetRuntimeArgs(program, dummy_ncrisc_kernel, cr_set, ncrisc_unique_rtargs); - SetCommonRuntimeArgs(program, dummy_ncrisc_kernel, ncrisc_common_rtargs); - } else if (random_risc == 3) { - auto dummy_trisc_kernel = CreateKernel( - program, 
"tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp", cr_set, ComputeConfig{ - .math_approx_mode = false, - .compile_args = trisc_compile_args, - .defines = compute_defines - }); - SetRuntimeArgs(program, dummy_trisc_kernel, cr_set, trisc_unique_rtargs); - SetCommonRuntimeArgs(program, dummy_trisc_kernel, trisc_common_rtargs); - } else { - TT_THROW("Invalid"); - } - } - - tt::tt_metal::detail::CompileProgram(this->device_, program); - } - - for (uint8_t cq_id = 0; cq_id < this->device_->num_hw_cqs(); ++cq_id) { - log_info(tt::LogTest, "Running {} programs on cq {} for cache warmup.", programs.size(), (uint32_t)cq_id); - // This loop caches program and runs - for (Program& program: programs) { - EnqueueProgram(this->device_->command_queue(cq_id), program, false); - } - - // This loops assumes already cached - uint32_t NUM_ITERATIONS = 500; // TODO(agrebenisan): Bump this to 5000, saw hangs for very large number of iterations, need to come back to that - - log_info(tt::LogTest, "Running {} programs on cq {} for {} iterations now.", programs.size(), (uint32_t)cq_id, NUM_ITERATIONS); - for (uint32_t i = 0; i < NUM_ITERATIONS; i++) { - auto rng = std::default_random_engine {}; - std::shuffle(std::begin(programs), std::end(programs), rng); - if (i % 10 == 0) { - log_debug(tt::LogTest, "Enqueueing {} programs on cq {} for iter: {}/{} now.", programs.size(), (uint32_t)cq_id, i+1, NUM_ITERATIONS); - } - for (Program& program: programs) { - EnqueueProgram(this->device_->command_queue(cq_id), program, false); - } - } - - log_info(tt::LogTest, "Calling Finish."); - Finish(this->device_->command_queue(cq_id)); - } -} - -} // namespace stress_tests diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueTrace.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueTrace.cpp index 0f9c35adb96..eb2894cdf75 100644 --- 
a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueTrace.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueTrace.cpp @@ -38,8 +38,8 @@ Program create_simple_unary_program(const Buffer& input, const Buffer& output) { .compile_args = {1, 1}, .defines = {{"SFPU_OP_EXP_INCLUDE", "1"}, {"SFPU_OP_CHAIN_0", "exp_tile_init(); exp_tile(0);"}}}); - CircularBufferConfig input_cb_config = CircularBufferConfig(2048, {{0, tt::DataFormat::Float16_b}}) - .set_page_size(0, 2048); + CircularBufferConfig input_cb_config = CircularBufferConfig(2048, {{tt::CBIndex::c_0, tt::DataFormat::Float16_b}}) + .set_page_size(tt::CBIndex::c_0, 2048); CoreRange core_range({0, 0}); CreateCircularBuffer(program, core_range, input_cb_config); @@ -51,8 +51,8 @@ Program create_simple_unary_program(const Buffer& input, const Buffer& output) { }; SetRuntimeArgs(program, writer_kernel, worker, writer_rt_args); - CircularBufferConfig output_cb_config = CircularBufferConfig(2048, {{16, tt::DataFormat::Float16_b}}) - .set_page_size(16, 2048); + CircularBufferConfig output_cb_config = CircularBufferConfig(2048, {{tt::CBIndex::c_16, tt::DataFormat::Float16_b}}) + .set_page_size(tt::CBIndex::c_16, 2048); CreateCircularBuffer(program, core_range, output_cb_config); vector reader_rt_args = { diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp deleted file mode 100644 index 6932ab11955..00000000000 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp +++ /dev/null @@ -1,355 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#include - -#include "command_queue_fixture.hpp" -#include "command_queue_test_utils.hpp" -#include "gtest/gtest.h" -#include "tt_metal/host_api.hpp" -#include "tt_metal/test_utils/env_vars.hpp" -#include "tt_metal/impl/device/device.hpp" - -using std::vector; -using namespace tt::tt_metal; - - -namespace local_test_functions { - -bool test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(Device* device, vector>& cqs, const TestBufferConfig& config) { - bool pass = true; - for (const bool use_void_star_api: {true, false}) { - - size_t buf_size = config.num_pages * config.page_size; - std::vector> buffers; - std::vector> srcs; - for (uint i = 0; i < cqs.size(); i++) { - buffers.push_back(Buffer::create(device, buf_size, config.page_size, config.buftype)); - srcs.push_back(generate_arange_vector(buffers[i]->size())); - if (use_void_star_api) { - EnqueueWriteBuffer(cqs[i], *buffers[i], srcs[i].data(), false); - } else { - EnqueueWriteBuffer(cqs[i], *buffers[i], srcs[i], false); - } - } - - for (uint i = 0; i < cqs.size(); i++) { - std::vector result; - if (use_void_star_api) { - result.resize(buf_size / sizeof(uint32_t)); - EnqueueReadBuffer(cqs[i], *buffers[i], result.data(), true); - } else { - EnqueueReadBuffer(cqs[i], *buffers[i], result, true); - } - bool local_pass = (srcs[i] == result); - pass &= local_pass; - } - } - - return pass; -} -} - - -namespace basic_tests { -namespace dram_tests { - -TEST_F(MultiCommandQueueMultiDeviceFixture, WriteOneTileToDramBank0) { - TestBufferConfig config = {.num_pages = 1, .page_size = 2048, .buftype = BufferType::DRAM}; - for (Device *device : devices_) { - tt::log_info("Running On Device {}", device->id()); - CommandQueue& a = device->command_queue(0); - CommandQueue& b = device->command_queue(1); - vector> cqs = {a, b}; - EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(device, cqs, config)); - } - -} - 
-TEST_F(MultiCommandQueueMultiDeviceFixture, WriteOneTileToAllDramBanks) { - for (Device *device : devices_) { - tt::log_info("Running On Device {}", device->id()); - TestBufferConfig config = { - .num_pages = uint32_t(device->num_banks(BufferType::DRAM)), - .page_size = 2048, - .buftype = BufferType::DRAM}; - - CommandQueue& a = device->command_queue(0); - CommandQueue& b = device->command_queue(1); - vector> cqs = {a, b}; - EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(device, cqs, config)); - } -} - -TEST_F(MultiCommandQueueMultiDeviceFixture, WriteOneTileAcrossAllDramBanksTwiceRoundRobin) { - constexpr uint32_t num_round_robins = 2; - for (Device *device : devices_) { - tt::log_info("Running On Device {}", device->id()); - TestBufferConfig config = { - .num_pages = num_round_robins * (device->num_banks(BufferType::DRAM)), - .page_size = 2048, - .buftype = BufferType::DRAM}; - - CommandQueue& a = device->command_queue(0); - CommandQueue& b = device->command_queue(1); - vector> cqs = {a, b}; - EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(device, cqs, config)); - } -} - -TEST_F(MultiCommandQueueMultiDeviceFixture, Sending131072Pages) { - // Was a failing case where we used to accidentally program cb num pages to be total - // pages instead of cb num pages. 
- TestBufferConfig config = { - .num_pages = 131072, - .page_size = 128, - .buftype = BufferType::DRAM}; - for (Device *device : devices_) { - tt::log_info("Running On Device {}", device->id()); - CommandQueue& a = device->command_queue(0); - CommandQueue& b = device->command_queue(1); - vector> cqs = {a, b}; - EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(device, cqs, config)); - } -} - -TEST_F(MultiCommandQueueMultiDeviceFixture, TestNon32BAlignedPageSizeForDram) { - for (Device *device : devices_) { - tt::log_info("Running On Device {}", device->id()); - TestBufferConfig config = {.num_pages = 1250, .page_size = 200, .buftype = BufferType::DRAM}; - - CommandQueue& a = device->command_queue(0); - CommandQueue& b = device->command_queue(1); - vector> cqs = {a, b}; - EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(device, cqs, config)); - } -} - -TEST_F(MultiCommandQueueMultiDeviceFixture, TestNon32BAlignedPageSizeForDram2) { - for (Device *device : devices_) { - tt::log_info("Running On Device {}", device->id()); - // From stable diffusion read buffer - TestBufferConfig config = {.num_pages = 8 * 1024, .page_size = 80, .buftype = BufferType::DRAM}; - - CommandQueue& a = device->command_queue(0); - CommandQueue& b = device->command_queue(1); - vector> cqs = {a, b}; - EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(device, cqs, config)); - } -} - -TEST_F(MultiCommandQueueMultiDeviceFixture, TestIssueMultipleReadWriteCommandsForOneBuffer) { - for (Device *device : devices_) { - tt::log_info("Running On Device {}", device->id()); - uint32_t page_size = 2048; - uint32_t command_queue_size = device->sysmem_manager().get_cq_size(); - uint32_t num_pages = command_queue_size / page_size; - - TestBufferConfig config = {.num_pages = num_pages, .page_size = page_size, .buftype = BufferType::DRAM}; - - CommandQueue& a = device->command_queue(0); 
- CommandQueue& b = device->command_queue(1); - vector> cqs = {a, b}; - EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(device, cqs, config)); - } -} - -TEST_F(MultiCommandQueueSingleDeviceFixture, WriteOneTileToDramBank0) { - TestBufferConfig config = {.num_pages = 1, .page_size = 2048, .buftype = BufferType::DRAM}; - CommandQueue& a = this->device_->command_queue(0); - CommandQueue& b = this->device_->command_queue(1); - vector> cqs = {a, b}; - EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config)); -} - -TEST_F(MultiCommandQueueSingleDeviceFixture, WriteOneTileToAllDramBanks) { - TestBufferConfig config = { - .num_pages = uint32_t(this->device_->num_banks(BufferType::DRAM)), - .page_size = 2048, - .buftype = BufferType::DRAM}; - - CommandQueue& a = this->device_->command_queue(0); - CommandQueue& b = this->device_->command_queue(1); - vector> cqs = {a, b}; - EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config)); -} - -TEST_F(MultiCommandQueueSingleDeviceFixture, WriteOneTileAcrossAllDramBanksTwiceRoundRobin) { - constexpr uint32_t num_round_robins = 2; - TestBufferConfig config = { - .num_pages = num_round_robins * (this->device_->num_banks(BufferType::DRAM)), - .page_size = 2048, - .buftype = BufferType::DRAM}; - - CommandQueue& a = this->device_->command_queue(0); - CommandQueue& b = this->device_->command_queue(1); - vector> cqs = {a, b}; - EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config)); -} - -TEST_F(MultiCommandQueueSingleDeviceFixture, Sending131072Pages) { - // Was a failing case where we used to accidentally program cb num pages to be total - // pages instead of cb num pages. 
- TestBufferConfig config = { - .num_pages = 131072, - .page_size = 128, - .buftype = BufferType::DRAM}; - - CommandQueue& a = this->device_->command_queue(0); - CommandQueue& b = this->device_->command_queue(1); - vector> cqs = {a, b}; - EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config)); -} - -TEST_F(MultiCommandQueueSingleDeviceFixture, TestNon32BAlignedPageSizeForDram) { - TestBufferConfig config = {.num_pages = 1250, .page_size = 200, .buftype = BufferType::DRAM}; - - CommandQueue& a = this->device_->command_queue(0); - CommandQueue& b = this->device_->command_queue(1); - vector> cqs = {a, b}; - EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config)); -} - -TEST_F(MultiCommandQueueSingleDeviceFixture, TestNon32BAlignedPageSizeForDram2) { - // From stable diffusion read buffer - TestBufferConfig config = {.num_pages = 8 * 1024, .page_size = 80, .buftype = BufferType::DRAM}; - - CommandQueue& a = this->device_->command_queue(0); - CommandQueue& b = this->device_->command_queue(1); - vector> cqs = {a, b}; - EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config)); -} - -TEST_F(MultiCommandQueueSingleDeviceFixture, TestPageSizeTooLarge) { - if (this->arch_ == tt::ARCH::WORMHOLE_B0) { - GTEST_SKIP(); // This test hanging on wormhole b0 - } - // Should throw a host error due to the page size not fitting in the consumer CB - TestBufferConfig config = {.num_pages = 1024, .page_size = 250880 * 2, .buftype = BufferType::DRAM}; - - CommandQueue& a = this->device_->command_queue(0); - CommandQueue& b = this->device_->command_queue(1); - vector> cqs = {a, b}; - EXPECT_ANY_THROW(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config)); -} - -TEST_F(MultiCommandQueueSingleDeviceFixture, 
TestIssueMultipleReadWriteCommandsForOneBuffer) { - uint32_t page_size = 2048; - uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(this->device_->id()); - uint32_t command_queue_size = tt::Cluster::instance().get_host_channel_size(this->device_->id(), channel); - uint32_t num_pages = command_queue_size / page_size; - - TestBufferConfig config = {.num_pages = num_pages, .page_size = page_size, .buftype = BufferType::DRAM}; - - CommandQueue& a = this->device_->command_queue(0); - CommandQueue& b = this->device_->command_queue(1); - vector> cqs = {a, b}; - EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config)); -} - - -} // end namespace dram_tests - -namespace l1_tests { - -TEST_F(MultiCommandQueueSingleDeviceFixture, WriteOneTileToL1Bank0) { - TestBufferConfig config = {.num_pages = 1, .page_size = 2048, .buftype = BufferType::L1}; - CommandQueue& a = this->device_->command_queue(0); - CommandQueue& b = this->device_->command_queue(1); - vector> cqs = {a, b}; - EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config)); -} - -TEST_F(MultiCommandQueueSingleDeviceFixture, WriteOneTileToAllL1Banks) { - auto compute_with_storage_grid = this->device_->compute_with_storage_grid_size(); - TestBufferConfig config = { - .num_pages = uint32_t(compute_with_storage_grid.x * compute_with_storage_grid.y), - .page_size = 2048, - .buftype = BufferType::L1}; - - CommandQueue& a = this->device_->command_queue(0); - CommandQueue& b = this->device_->command_queue(1); - vector> cqs = {a, b}; - EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config)); -} - -TEST_F(MultiCommandQueueSingleDeviceFixture, WriteOneTileToAllL1BanksTwiceRoundRobin) { - auto compute_with_storage_grid = this->device_->compute_with_storage_grid_size(); - TestBufferConfig config = { - .num_pages = 2 * 
uint32_t(compute_with_storage_grid.x * compute_with_storage_grid.y), - .page_size = 2048, - .buftype = BufferType::L1}; - - CommandQueue& a = this->device_->command_queue(0); - CommandQueue& b = this->device_->command_queue(1); - vector> cqs = {a, b}; - EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config)); -} - -TEST_F(MultiCommandQueueSingleDeviceFixture, TestNon32BAlignedPageSizeForL1) { - TestBufferConfig config = {.num_pages = 1250, .page_size = 200, .buftype = BufferType::L1}; - - CommandQueue& a = this->device_->command_queue(0); - CommandQueue& b = this->device_->command_queue(1); - vector> cqs = {a, b}; - EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(this->device_, cqs, config)); -} - -TEST_F(MultiCommandQueueMultiDeviceFixture, WriteOneTileToL1Bank0) { - for (Device *device : devices_) { - tt::log_info("Running On Device {}", device->id()); - TestBufferConfig config = {.num_pages = 1, .page_size = 2048, .buftype = BufferType::L1}; - CommandQueue& a = device->command_queue(0); - CommandQueue& b = device->command_queue(1); - vector> cqs = {a, b}; - EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(device, cqs, config)); - } -} - -TEST_F(MultiCommandQueueMultiDeviceFixture, WriteOneTileToAllL1Banks) { - for (Device *device : devices_) { - tt::log_info("Running On Device {}", device->id()); - auto compute_with_storage_grid = device->compute_with_storage_grid_size(); - TestBufferConfig config = { - .num_pages = uint32_t(compute_with_storage_grid.x * compute_with_storage_grid.y), - .page_size = 2048, - .buftype = BufferType::L1}; - - CommandQueue& a = device->command_queue(0); - CommandQueue& b = device->command_queue(1); - vector> cqs = {a, b}; - EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(device, cqs, config)); - } -} - 
-TEST_F(MultiCommandQueueMultiDeviceFixture, WriteOneTileToAllL1BanksTwiceRoundRobin) { - for (Device *device : devices_) { - tt::log_info("Running On Device {}", device->id()); - auto compute_with_storage_grid = device->compute_with_storage_grid_size(); - TestBufferConfig config = { - .num_pages = 2 * uint32_t(compute_with_storage_grid.x * compute_with_storage_grid.y), - .page_size = 2048, - .buftype = BufferType::L1}; - - CommandQueue& a = device->command_queue(0); - CommandQueue& b = device->command_queue(1); - vector> cqs = {a, b}; - EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(device, cqs, config)); - } -} - -TEST_F(MultiCommandQueueMultiDeviceFixture, TestNon32BAlignedPageSizeForL1) { - for (Device *device : devices_) { - tt::log_info("Running On Device {}", device->id()); - TestBufferConfig config = {.num_pages = 1250, .page_size = 200, .buftype = BufferType::L1}; - - CommandQueue& a = device->command_queue(0); - CommandQueue& b = device->command_queue(1); - vector> cqs = {a, b}; - EXPECT_TRUE(local_test_functions::test_EnqueueWriteBuffer_and_EnqueueReadBuffer_multi_queue(device, cqs, config)); - } -} - -} // end namespace l1_tests -} // end namespace basic_tests diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/common/command_queue_fixture.hpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/common/command_queue_fixture.hpp deleted file mode 100644 index b3efb0e4f16..00000000000 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/common/command_queue_fixture.hpp +++ /dev/null @@ -1,120 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "gtest/gtest.h" -#include "tt_metal/host_api.hpp" -#include "tt_metal/detail/tt_metal.hpp" -#include "tt_metal/test_utils/env_vars.hpp" -#include "tt_metal/impl/dispatch/command_queue.hpp" -#include "tt_metal/llrt/rtoptions.hpp" - -using namespace tt::tt_metal; - -class MultiCommandQueueSingleDeviceFixture : public ::testing::Test { - protected: - void SetUp() override { - auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); - if (slow_dispatch) { - TT_THROW("This suite can only be run with fast dispatch or TT_METAL_SLOW_DISPATCH_MODE unset"); - GTEST_SKIP(); - } - auto num_cqs = tt::llrt::OptionsG.get_num_hw_cqs(); - if (num_cqs != 2) { - TT_THROW("This suite must be run with TT_METAL_GTEST_NUM_HW_CQS=2"); - GTEST_SKIP(); - } - arch_ = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); - DispatchCoreType dispatch_core_type = DispatchCoreType::WORKER; - if (arch_ == tt::ARCH::WORMHOLE_B0 and tt::tt_metal::GetNumAvailableDevices() != 1) { - if (!tt::tt_metal::IsGalaxyCluster()) { - tt::log_warning(tt::LogTest, "Ethernet Dispatch not being explicitly used. 
Set this configuration in Setup()"); - dispatch_core_type = DispatchCoreType::ETH; - } - } - device_ = tt::tt_metal::CreateDevice(0, num_cqs, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, dispatch_core_type); - } - - void TearDown() override { - tt::tt_metal::CloseDevice(device_); - } - - tt::tt_metal::Device* device_; - tt::ARCH arch_; -}; - -class MultiCommandQueueMultiDeviceFixture : public ::testing::Test { - protected: - void SetUp() override { - auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); - if (slow_dispatch) { - TT_THROW("This suite can only be run with fast dispatch or TT_METAL_SLOW_DISPATCH_MODE unset"); - GTEST_SKIP(); - } - auto num_cqs = tt::llrt::OptionsG.get_num_hw_cqs(); - if (num_cqs != 2) { - TT_THROW("This suite must be run with TT_METAL_GTEST_NUM_HW_CQS=2"); - GTEST_SKIP(); - } - arch_ = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); - - - DispatchCoreType dispatch_core_type = DispatchCoreType::WORKER; - if (arch_ == tt::ARCH::WORMHOLE_B0 and tt::tt_metal::GetNumAvailableDevices() != 1) { - if (!tt::tt_metal::IsGalaxyCluster()) { - tt::log_warning(tt::LogTest, "Ethernet Dispatch not being explicitly used. 
Set this configuration in Setup()"); - dispatch_core_type = DispatchCoreType::ETH; - } - } - - const chip_id_t mmio_device_id = 0; - reserved_devices_ = tt::tt_metal::detail::CreateDevices({mmio_device_id}, num_cqs, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, dispatch_core_type); - for (const auto &[id, device] : reserved_devices_) { - devices_.push_back(device); - } - - num_devices_ = reserved_devices_.size(); - } - - void TearDown() override { tt::tt_metal::detail::CloseDevices(reserved_devices_); } - - std::vector devices_; - std::map reserved_devices_; - size_t num_devices_; - tt::ARCH arch_; -}; - - -class SingleDeviceTraceFixture: public ::testing::Test { -protected: - Device* device_; - tt::ARCH arch_; - - void Setup(const size_t buffer_size, const uint8_t num_hw_cqs = 1) { - auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); - if (slow_dispatch) { - tt::log_info(tt::LogTest, "This suite can only be run with fast dispatch or TT_METAL_SLOW_DISPATCH_MODE unset"); - GTEST_SKIP(); - } - if (num_hw_cqs > 1) { - // Running multi-CQ test. User must set this explicitly. 
- auto num_cqs = getenv("TT_METAL_GTEST_NUM_HW_CQS"); - if (num_cqs == nullptr or strcmp(num_cqs, "2")) { - TT_THROW("This suite must be run with TT_METAL_GTEST_NUM_HW_CQS=2"); - GTEST_SKIP(); - } - } - this->arch_ = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); - const int device_id = 0; - this->device_ = tt::tt_metal::CreateDevice(device_id, num_hw_cqs, 0, buffer_size);; - } - - void TearDown() override { - if (!getenv("TT_METAL_SLOW_DISPATCH_MODE")) { - tt::tt_metal::CloseDevice(this->device_); - } - } - -}; diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/common/command_queue_test_utils.hpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/common/command_queue_test_utils.hpp deleted file mode 100644 index e1e02ae6e16..00000000000 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/common/command_queue_test_utils.hpp +++ /dev/null @@ -1,38 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "tt_metal/host_api.hpp" -#include "tt_metal/common/bfloat16.hpp" - -struct TestBufferConfig { - uint32_t num_pages; - uint32_t page_size; - BufferType buftype; -}; - -struct BufferStressTestConfig { - // Used for normal write/read tests - uint32_t seed; - uint32_t num_pages_total; - - uint32_t page_size; - uint32_t max_num_pages_per_buffer; - - // Used for wrap test - uint32_t num_iterations; - uint32_t num_unique_vectors; -}; - - -inline std::vector generate_arange_vector(uint32_t size_bytes, uint32_t start = 0) { - TT_FATAL(size_bytes % sizeof(uint32_t) == 0, "Error"); - std::vector src(size_bytes / sizeof(uint32_t), 0); - - for (uint32_t i = 0; i < src.size(); i++) { - src.at(i) = start + i; - } - return src; -} diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/tests_main.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/tests_main.cpp deleted file mode 100644 index 1e42f41a46c..00000000000 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/tests_main.cpp +++ /dev/null @@ -1,5 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#include "gtest/gtest.h" diff --git a/tests/tt_metal/tt_metal/unit_tests_frequent/CMakeLists.txt b/tests/tt_metal/tt_metal/unit_tests_frequent/CMakeLists.txt deleted file mode 100644 index 8cdca979930..00000000000 --- a/tests/tt_metal/tt_metal/unit_tests_frequent/CMakeLists.txt +++ /dev/null @@ -1,26 +0,0 @@ -set(UNIT_TESTS_FREQUENT_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/tests/run_many_times.cpp) - -add_executable(unit_tests_frequent ${UNIT_TESTS_FREQUENT_SRCS}) - -target_link_libraries( - unit_tests_frequent - PUBLIC - test_metal_common_libs - gtest - gtest_main -) -target_include_directories( - unit_tests_frequent - PRIVATE - ${PROJECT_SOURCE_DIR} - ${PROJECT_SOURCE_DIR}/tt_metal - ${PROJECT_SOURCE_DIR}/tests - ${CMAKE_CURRENT_SOURCE_DIR} - ${CMAKE_CURRENT_SOURCE_DIR}/common -) -set_target_properties( - unit_tests_frequent - PROPERTIES - RUNTIME_OUTPUT_DIRECTORY - ${PROJECT_BINARY_DIR}/test/tt_metal -) diff --git a/tests/ttnn/unit_tests/gtests/ccl/kernels/erisc_datamover_receiver_worker_reader.cpp b/tests/ttnn/unit_tests/gtests/ccl/kernels/erisc_datamover_receiver_worker_reader.cpp index 481457ac8cd..14c111fdf10 100644 --- a/tests/ttnn/unit_tests/gtests/ccl/kernels/erisc_datamover_receiver_worker_reader.cpp +++ b/tests/ttnn/unit_tests/gtests/ccl/kernels/erisc_datamover_receiver_worker_reader.cpp @@ -31,7 +31,7 @@ void kernel_main() { num_pages_per_read_chunk * page_size, receiver_read_sem_addr); - constexpr uint32_t cb_id_in0 = tt::CB::c_in0; + constexpr uint32_t cb_id_in0 = tt::CBIndex::c_0; for (uint32_t i = 0; i < total_pages_to_read; i += num_pages_per_read_chunk) { bool last_message = (i + num_pages_per_read_chunk) >= total_pages_to_read; diff --git a/tests/ttnn/unit_tests/gtests/ccl/kernels/erisc_datamover_receiver_worker_sender.cpp b/tests/ttnn/unit_tests/gtests/ccl/kernels/erisc_datamover_receiver_worker_sender.cpp index fecb458407b..90b0164421a 100644 --- 
a/tests/ttnn/unit_tests/gtests/ccl/kernels/erisc_datamover_receiver_worker_sender.cpp +++ b/tests/ttnn/unit_tests/gtests/ccl/kernels/erisc_datamover_receiver_worker_sender.cpp @@ -12,7 +12,7 @@ void kernel_main() { constexpr uint32_t page_size = get_compile_time_arg_val(2); constexpr uint32_t pages_per_edm_buffer = get_compile_time_arg_val(3); - constexpr uint32_t cb_id_in0 = tt::CB::c_in0; + constexpr uint32_t cb_id_in0 = tt::CBIndex::c_0; InterleavedAddrGen dest_addr_generator = { .bank_base_address = dst_addr, .page_size = page_size}; diff --git a/tests/ttnn/unit_tests/gtests/ccl/kernels/erisc_datamover_sender_worker_reader.cpp b/tests/ttnn/unit_tests/gtests/ccl/kernels/erisc_datamover_sender_worker_reader.cpp index 66662d02630..76bb4668df5 100644 --- a/tests/ttnn/unit_tests/gtests/ccl/kernels/erisc_datamover_sender_worker_reader.cpp +++ b/tests/ttnn/unit_tests/gtests/ccl/kernels/erisc_datamover_sender_worker_reader.cpp @@ -11,7 +11,7 @@ void kernel_main() { constexpr uint32_t num_pages_to_read_total = get_compile_time_arg_val(1); constexpr uint32_t page_size = get_compile_time_arg_val(2); constexpr uint32_t pages_per_edm_buffer = get_compile_time_arg_val(3); - constexpr uint32_t cb_id_in0 = tt::CB::c_in0; + constexpr uint32_t cb_id_in0 = tt::CBIndex::c_0; const uint32_t src_addr = get_arg_val(0); diff --git a/tests/ttnn/unit_tests/gtests/ccl/kernels/erisc_datamover_sender_worker_sender.cpp b/tests/ttnn/unit_tests/gtests/ccl/kernels/erisc_datamover_sender_worker_sender.cpp index 4cff4c2ec51..73d1e1dc326 100644 --- a/tests/ttnn/unit_tests/gtests/ccl/kernels/erisc_datamover_sender_worker_sender.cpp +++ b/tests/ttnn/unit_tests/gtests/ccl/kernels/erisc_datamover_sender_worker_sender.cpp @@ -45,7 +45,7 @@ void kernel_main() { } - constexpr uint32_t cb_id_in0 = tt::CB::c_in0; + constexpr uint32_t cb_id_in0 = tt::CBIndex::c_0; uint32_t buffer_index = 0; diff --git a/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_reader.cpp 
b/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_reader.cpp index 3437c819346..ebd0324a656 100644 --- a/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_reader.cpp +++ b/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_reader.cpp @@ -12,7 +12,7 @@ void kernel_main() { constexpr uint32_t num_pages_to_read_total = get_compile_time_arg_val(1); constexpr uint32_t page_size = get_compile_time_arg_val(2); constexpr uint32_t pages_per_edm_buffer = 1; - constexpr uint32_t cb_id_in0 = tt::CB::c_in0; + constexpr uint32_t cb_id_in0 = tt::CBIndex::c_0; const uint32_t src_addr = get_arg_val(0); diff --git a/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_sender.cpp b/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_sender.cpp index babcd41c992..8d5d354ef45 100644 --- a/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_sender.cpp +++ b/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_sender.cpp @@ -97,7 +97,7 @@ void kernel_main() { sender.open(); - constexpr uint32_t cb_id_in0 = tt::CB::c_in0; + constexpr uint32_t cb_id_in0 = tt::CBIndex::c_0; // We need to normalize all noc addresses to be for a consistent noc ID // so the remote sender core can correctly send the packet. 
In the future diff --git a/tests/ttnn/unit_tests/gtests/ccl/test_erisc_data_mover_with_workers.cpp b/tests/ttnn/unit_tests/gtests/ccl/test_erisc_data_mover_with_workers.cpp index 93ffa0d2fcf..88b1be39922 100644 --- a/tests/ttnn/unit_tests/gtests/ccl/test_erisc_data_mover_with_workers.cpp +++ b/tests/ttnn/unit_tests/gtests/ccl/test_erisc_data_mover_with_workers.cpp @@ -126,7 +126,7 @@ void generate_receiver_worker_kernels( ttnn::ccl::EriscDataMoverTerminationMode edm_termination_mode ) { // Just want a dummy DF - uint32_t src0_cb_index = CB::c_in0; + uint32_t src0_cb_index = CBIndex::c_0; tt::DataFormat df = page_size == 1024 ? tt::DataFormat::Bfp8 : page_size == 2048 ? tt::DataFormat::Float16 : tt::DataFormat::Float32; @@ -248,7 +248,7 @@ void generate_sender_worker_kernels( (uint32_t)device->ethernet_core_from_logical_core(edm_core).y, num_buffers_per_edm_channel }; - uint32_t src0_cb_index = CB::c_in0; + uint32_t src0_cb_index = CBIndex::c_0; log_info(tt::LogTest, "\tSenderWriter CT Args"); for (auto const& arg : sender_worker_writer_compile_args) { log_info(tt::LogTest, "\t\t{}", arg); diff --git a/tests/ttnn/unit_tests/gtests/ccl/test_fabric_erisc_data_mover_loopback_with_workers.cpp b/tests/ttnn/unit_tests/gtests/ccl/test_fabric_erisc_data_mover_loopback_with_workers.cpp index 4b87efc5182..42fe6db7f17 100644 --- a/tests/ttnn/unit_tests/gtests/ccl/test_fabric_erisc_data_mover_loopback_with_workers.cpp +++ b/tests/ttnn/unit_tests/gtests/ccl/test_fabric_erisc_data_mover_loopback_with_workers.cpp @@ -281,7 +281,7 @@ void generate_sender_worker_kernels( info.termination_addr); } - uint32_t src0_cb_index = CB::c_in0; + uint32_t src0_cb_index = CBIndex::c_0; log_trace(tt::LogTest, "\tSenderWriter CT Args"); for (auto const& arg : sender_worker_writer_compile_args) { log_trace(tt::LogTest, "\t\t{}", arg); diff --git a/tests/ttnn/unit_tests/gtests/test_add.cpp b/tests/ttnn/unit_tests/gtests/test_add.cpp index c1be54118a6..7b0be8728ac 100644 --- 
a/tests/ttnn/unit_tests/gtests/test_add.cpp +++ b/tests/ttnn/unit_tests/gtests/test_add.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "tests/tt_metal/tt_metal/unit_tests_common/common/common_fixture.hpp" +#include "tests/tt_metal/tt_metal/common/dispatch_fixture.hpp" #include "ttnn/device.hpp" #include "ttnn/operations/eltwise/binary/binary.hpp" #include "ttnn/operations/core/core.hpp" diff --git a/tests/ttnn/unit_tests/gtests/test_graph_add.cpp b/tests/ttnn/unit_tests/gtests/test_graph_add.cpp index 311639585bc..a03ed11549e 100644 --- a/tests/ttnn/unit_tests/gtests/test_graph_add.cpp +++ b/tests/ttnn/unit_tests/gtests/test_graph_add.cpp @@ -4,7 +4,7 @@ #include "gtest/gtest.h" #include "tt_metal/common/logger.hpp" -#include "tests/tt_metal/tt_metal/unit_tests_common/common/common_fixture.hpp" +#include "tests/tt_metal/tt_metal/common/dispatch_fixture.hpp" #include "ttnn/device.hpp" #include "ttnn/operations/eltwise/binary/binary.hpp" #include "ttnn/operations/core/core.hpp" diff --git a/tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py b/tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py index 1429eb0fce1..800d25befb8 100644 --- a/tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py +++ b/tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py @@ -141,7 +141,7 @@ def test_all_gather_on_t3000( ], ) @pytest.mark.parametrize( - "per_chip_output_shape, scatter_dim, layout", + "per_chip_output_shape, dim, layout", [ ([1, 8, 1024, 1024], 3, ttnn.TILE_LAYOUT), ([1, 4, 1024, 1024], 3, ttnn.TILE_LAYOUT), @@ -171,7 +171,7 @@ def test_reduce_scatter_on_t3000( t3k_mesh_device, num_devices, per_chip_output_shape, - scatter_dim, + dim, num_links, math_op, input_dtype, @@ -187,7 +187,7 @@ def test_reduce_scatter_on_t3000( t3k_mesh_device, num_devices, per_chip_output_shape, - scatter_dim, + dim, num_links, math_op, input_dtype, @@ -210,7 +210,7 @@ def test_reduce_scatter_on_t3000( ], ) @pytest.mark.parametrize( - 
"per_chip_output_shape, scatter_dim, layout", + "per_chip_output_shape, dim, layout", [ ([1, 1, 32, 4096], 3, ttnn.TILE_LAYOUT), ([1, 1, 32, 2048], 3, ttnn.TILE_LAYOUT), @@ -239,7 +239,7 @@ def test_reduce_scatter_on_n300( n300_mesh_device, num_devices, per_chip_output_shape, - scatter_dim, + dim, num_links, math_op, input_dtype, @@ -254,7 +254,7 @@ def test_reduce_scatter_on_n300( n300_mesh_device, num_devices, per_chip_output_shape, - scatter_dim, + dim, num_links, math_op, input_dtype, diff --git a/tests/ttnn/unit_tests/operations/ccl/test_all_gather_nightly.py b/tests/ttnn/unit_tests/operations/ccl/test_all_gather_nightly.py index ad1d7a63abe..3313d73880c 100644 --- a/tests/ttnn/unit_tests/operations/ccl/test_all_gather_nightly.py +++ b/tests/ttnn/unit_tests/operations/ccl/test_all_gather_nightly.py @@ -22,15 +22,15 @@ [ (4, 1, [4, 1, 33, 256], 0, ttnn.ROW_MAJOR_LAYOUT), (8, 1, [8, 1, 33, 256], 0, ttnn.ROW_MAJOR_LAYOUT), - (8, 1, [8, 1, 256, 32], 0, ttnn.TILE_LAYOUT), + (8, 1, [8, 1, 256, 32], -4, ttnn.TILE_LAYOUT), (8, 1, [8, 8, 256, 384], 1, ttnn.ROW_MAJOR_LAYOUT), # (4, 2, [8, 8, 256, 384], 1, ttnn.TILE_LAYOUT), (8, 1, [8, 8, 256, 384], 1, ttnn.TILE_LAYOUT), - (4, 1, [8, 5, 13, 384], 3, ttnn.ROW_MAJOR_LAYOUT), - (8, 1, [8, 5, 13, 512], 3, ttnn.ROW_MAJOR_LAYOUT), + (4, 1, [8, 5, 13, 384], -1, ttnn.ROW_MAJOR_LAYOUT), + (8, 1, [8, 5, 13, 512], -1, ttnn.ROW_MAJOR_LAYOUT), (4, 1, [8, 5, 32, 384], 3, ttnn.TILE_LAYOUT), (8, 1, [8, 5, 32, 512], 3, ttnn.TILE_LAYOUT), - (4, 1, [1, 1, 32, 16384], 3, ttnn.TILE_LAYOUT), + (4, 1, [1, 1, 32, 16384], -1, ttnn.TILE_LAYOUT), ], ) @pytest.mark.parametrize( diff --git a/tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_N300_post_commit.py b/tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_N300_post_commit.py index c34c4fd6191..086efb1d534 100644 --- a/tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_N300_post_commit.py +++ b/tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_N300_post_commit.py @@ 
-20,7 +20,7 @@ ], ) @pytest.mark.parametrize( - "per_chip_output_shape, scatter_dim, layout", + "per_chip_output_shape, dim, layout", [ ([1, 1, 32, 4096], 3, ttnn.TILE_LAYOUT), ([1, 1, 32, 2048], 3, ttnn.TILE_LAYOUT), @@ -50,7 +50,7 @@ def test_ring_reduce_scatter_n300_post_commit( n300_mesh_device, num_devices, per_chip_output_shape, - scatter_dim, + dim, num_links, math_op, input_dtype, @@ -65,7 +65,7 @@ def test_ring_reduce_scatter_n300_post_commit( n300_mesh_device, num_devices, per_chip_output_shape, - scatter_dim, + dim, num_links, math_op, input_dtype, diff --git a/tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_TG_nightly.py b/tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_TG_nightly.py index 9e9fbf479f5..1b5bfe8f672 100644 --- a/tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_TG_nightly.py +++ b/tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_TG_nightly.py @@ -145,7 +145,7 @@ def run_line_reduce_scatter_on_TG_with_mesh_tensor_along_rows( # ttnn.visualize_mesh_device(mesh_device, tensor=ttnn_tensor) ttnn_tensor_out = ttnn.reduce_scatter( ttnn_tensor, - scatter_dim=dim, + dim=dim, cluster_axis=cluster_axis, mesh_device=mesh_device, math_op=math_op, @@ -158,7 +158,7 @@ def run_line_reduce_scatter_on_TG_with_mesh_tensor_along_rows( for _ in range(num_iters): ttnn_tensor_out = ttnn.reduce_scatter( ttnn_tensor, - scatter_dim=dim, + dim=dim, cluster_axis=cluster_axis, mesh_device=mesh_device, math_op=math_op, diff --git a/tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_nightly.py b/tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_nightly.py index 17eee107972..aaf8e21fc10 100644 --- a/tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_nightly.py +++ b/tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_nightly.py @@ -19,23 +19,23 @@ ], ) @pytest.mark.parametrize( - "per_chip_output_shape, scatter_dim, layout", + "per_chip_output_shape, dim, layout", [ ([1, 8, 1024, 1024], 3, ttnn.TILE_LAYOUT), ([1, 
4, 1024, 1024], 3, ttnn.TILE_LAYOUT), ([1, 4, 2048, 1024], 3, ttnn.TILE_LAYOUT), - ([1, 1, 32, 32], 3, ttnn.TILE_LAYOUT), - ([1, 1, 32, 64], 3, ttnn.TILE_LAYOUT), + ([1, 1, 32, 32], -1, ttnn.TILE_LAYOUT), + ([1, 1, 32, 64], -1, ttnn.TILE_LAYOUT), ([1, 1, 64, 64], 3, ttnn.TILE_LAYOUT), - ([1, 1, 32, 128], 3, ttnn.TILE_LAYOUT), + ([1, 1, 32, 128], -1, ttnn.TILE_LAYOUT), ([1, 1, 32, 256], 3, ttnn.TILE_LAYOUT), ([1, 1, 32, 512], 3, ttnn.TILE_LAYOUT), ([1, 1, 32, 1024], 3, ttnn.TILE_LAYOUT), ([1, 1, 32, 2048], 3, ttnn.TILE_LAYOUT), - ([1, 1, 128, 1024], 3, ttnn.TILE_LAYOUT), + ([1, 1, 128, 1024], -1, ttnn.TILE_LAYOUT), ([1, 1, 128, 8192], 3, ttnn.TILE_LAYOUT), ([1, 1, 2048, 1024], 3, ttnn.TILE_LAYOUT), - ([1, 1, 2048, 8192], 3, ttnn.TILE_LAYOUT), + ([1, 1, 2048, 8192], -1, ttnn.TILE_LAYOUT), ], ) @pytest.mark.parametrize( @@ -58,7 +58,7 @@ def test_reduce_scatter_t3k_8chip_nightly( t3k_mesh_device, num_devices, per_chip_output_shape, - scatter_dim, + dim, num_links, math_op, input_dtype, @@ -73,7 +73,7 @@ def test_reduce_scatter_t3k_8chip_nightly( t3k_mesh_device, num_devices, per_chip_output_shape, - scatter_dim, + dim, num_links, math_op, input_dtype, @@ -95,16 +95,16 @@ def test_reduce_scatter_t3k_8chip_nightly( ], ) @pytest.mark.parametrize( - "per_chip_output_shape, scatter_dim, layout", + "per_chip_output_shape, dim, layout", [ ([1, 8, 1024, 1024], 3, ttnn.TILE_LAYOUT), ([1, 4, 1024, 1024], 3, ttnn.TILE_LAYOUT), - ([1, 4, 2048, 1024], 3, ttnn.TILE_LAYOUT), + ([1, 4, 2048, 1024], -1, ttnn.TILE_LAYOUT), ([1, 1, 32, 512], 3, ttnn.TILE_LAYOUT), ([1, 1, 32, 1024], 3, ttnn.TILE_LAYOUT), - ([1, 1, 32, 2048], 3, ttnn.TILE_LAYOUT), + ([1, 1, 32, 2048], -1, ttnn.TILE_LAYOUT), ([1, 1, 128, 1024], 3, ttnn.TILE_LAYOUT), - ([1, 1, 128, 8192], 3, ttnn.TILE_LAYOUT), + ([1, 1, 128, 8192], -1, ttnn.TILE_LAYOUT), ([1, 1, 2048, 1024], 3, ttnn.TILE_LAYOUT), ([1, 1, 2048, 8192], 3, ttnn.TILE_LAYOUT), # These shapes result in some workers with no work, which is currently @@ -136,7 
+136,7 @@ def test_reduce_scatter_t3k_4chip_nightly( pcie_mesh_device, num_devices, per_chip_output_shape, - scatter_dim, + dim, num_links, math_op, input_dtype, @@ -151,7 +151,7 @@ def test_reduce_scatter_t3k_4chip_nightly( pcie_mesh_device, num_devices, per_chip_output_shape, - scatter_dim, + dim, num_links, math_op, input_dtype, diff --git a/tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_post_commit.py b/tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_post_commit.py index 916682dd84e..4efe5152448 100644 --- a/tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_post_commit.py +++ b/tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_post_commit.py @@ -10,7 +10,7 @@ from models.utility_functions import skip_for_grayskull -def is_unsupported_case(input_shape, scatter_dim, math_op, mem_config, num_devices, num_links, input_dtype, layout): +def is_unsupported_case(input_shape, dim, math_op, mem_config, num_devices, num_links, input_dtype, layout): elem_size = 2 if input_dtype == ttnn.bfloat16 else 1 tensor_size_bytes = elem_size for i in input_shape: @@ -19,7 +19,7 @@ def is_unsupported_case(input_shape, scatter_dim, math_op, mem_config, num_devic if mem_config.buffer_type == ttnn.BufferType.L1 and tensor_size_bytes > num_l1_banks * 50 * 1024: return True, "L1 buffer can't support large tensor sizes" - # if input_dtype == ttnn.bfloat8_b and tuple(input_shape) == (1, 1, 2048, 1024) and scatter_dim == 3: + # if input_dtype == ttnn.bfloat8_b and tuple(input_shape) == (1, 1, 2048, 1024) and dim == 3: # return True, "Known failure with bfp8_b data format" return False, "" @@ -28,7 +28,7 @@ def is_unsupported_case(input_shape, scatter_dim, math_op, mem_config, num_devic def run_with_trace( t3k_mesh_device, input_tensor_mesh, - scatter_dim, + dim, num_links, math_op, output_mem_config, @@ -41,7 +41,7 @@ def run_with_trace( logger.info("Compiling model") output_tensor_mesh = ttnn.reduce_scatter( input_tensor_mesh, - scatter_dim=scatter_dim, + 
dim=dim, math_op=math_op, num_links=num_links, memory_config=output_mem_config, @@ -58,7 +58,7 @@ def run_with_trace( for i in range(num_iters): output_tensor_mesh = ttnn.reduce_scatter( input_tensor_mesh, - scatter_dim=scatter_dim, + dim=dim, math_op=math_op, num_links=num_links, memory_config=output_mem_config, @@ -84,7 +84,7 @@ def run_reduce_scatter_test( mesh_device, num_devices, per_chip_output_shape, - scatter_dim, + dim, num_links, math_op, input_dtype, @@ -105,7 +105,7 @@ def run_reduce_scatter_test( debug = False (is_known_failure, message) = is_unsupported_case( - per_chip_output_shape, scatter_dim, math_op, mem_config, num_devices, num_links, input_dtype, layout + per_chip_output_shape, dim, math_op, mem_config, num_devices, num_links, input_dtype, layout ) if is_known_failure: pytest.skip(f"Skipping unsupported case {message}.") @@ -114,11 +114,11 @@ def run_reduce_scatter_test( if enable_async: logger.info(f"Using Async Mode for Reduce Scatter Op Dispatch") - logger.info(f"Per chip output shape: {per_chip_output_shape}, devices: {num_devices}, scatter_dim: {scatter_dim}") + logger.info(f"Per chip output shape: {per_chip_output_shape}, devices: {num_devices}, dim: {dim}") # Generate input tensors canonical_input_shape = per_chip_output_shape.copy() - canonical_input_shape[scatter_dim] *= num_devices + canonical_input_shape[dim] *= num_devices tt_input_tensors = [] numel = canonical_input_shape[0] * canonical_input_shape[1] * canonical_input_shape[2] * canonical_input_shape[3] @@ -143,7 +143,7 @@ def run_reduce_scatter_test( output_tensor_mesh = run_with_trace( mesh_device, input_tensor_mesh, - scatter_dim, + dim, num_links, math_op, mem_config, @@ -154,7 +154,7 @@ def run_reduce_scatter_test( for i in range(num_iters): output_tensor_mesh = ttnn.reduce_scatter( input_tensor_mesh, - scatter_dim=scatter_dim, + dim=dim, math_op=math_op, num_links=num_links, memory_config=mem_config, @@ -172,7 +172,7 @@ def run_reduce_scatter_test( for i, t in 
enumerate(input_tensors): golden_canonical_out_tensor = torch.add(golden_canonical_out_tensor, t).bfloat16() - golden_output_tensors = torch.chunk(golden_canonical_out_tensor, num_devices, scatter_dim) + golden_output_tensors = torch.chunk(golden_canonical_out_tensor, num_devices, dim) tt_out_tensors = ttnn.get_device_tensors(output_tensor_mesh) logger.info(f"Compare") @@ -211,7 +211,7 @@ def run_reduce_scatter_test( ], ) @pytest.mark.parametrize( - "per_chip_output_shape, scatter_dim, layout", + "per_chip_output_shape, dim, layout", [ ([1, 2, 256, 32 * 8], 3, ttnn.TILE_LAYOUT), # Input tensor is (16*32) x (64*32) = 8 * input tensor shape ([1, 1, 32, 32 * 8], 3, ttnn.TILE_LAYOUT), @@ -241,7 +241,7 @@ def test_ring_reduce_scatter_post_commit( t3k_mesh_device, num_devices, per_chip_output_shape, - scatter_dim, + dim, num_links, math_op, input_dtype, @@ -256,7 +256,7 @@ def test_ring_reduce_scatter_post_commit( t3k_mesh_device, num_devices, per_chip_output_shape, - scatter_dim, + dim, num_links, math_op, input_dtype, @@ -279,7 +279,7 @@ def test_ring_reduce_scatter_post_commit( ], ) @pytest.mark.parametrize( - "per_chip_output_shape, scatter_dim, layout", + "per_chip_output_shape, dim, layout", [ ([1, 1, 32, 32 * 8], 3, ttnn.TILE_LAYOUT), ([1, 2, 224, 32 * 8], 3, ttnn.TILE_LAYOUT), @@ -306,7 +306,7 @@ def test_line_reduce_scatter_post_commit( t3k_mesh_device, num_devices, per_chip_output_shape, - scatter_dim, + dim, num_links, math_op, input_dtype, @@ -321,7 +321,7 @@ def test_line_reduce_scatter_post_commit( t3k_mesh_device, num_devices, per_chip_output_shape, - scatter_dim, + dim, num_links, math_op, input_dtype, @@ -345,7 +345,7 @@ def test_line_reduce_scatter_post_commit( ], ) @pytest.mark.parametrize( - "per_chip_output_shape, scatter_dim, layout", + "per_chip_output_shape, dim, layout", [ ([1, 1, 32, 1280], 1, ttnn.TILE_LAYOUT), ([1, 1, 32, 1024], 1, ttnn.TILE_LAYOUT), @@ -369,7 +369,7 @@ def test_line_reduce_scatter_post_commit_4chip( pcie_mesh_device, 
num_devices, per_chip_output_shape, - scatter_dim, + dim, num_links, math_op, input_dtype, @@ -384,7 +384,7 @@ def test_line_reduce_scatter_post_commit_4chip( pcie_mesh_device, num_devices, per_chip_output_shape, - scatter_dim, + dim, num_links, math_op, input_dtype, @@ -403,7 +403,7 @@ def run_reduce_scatter_sharded_test( num_devices, per_chip_output_shape, output_shard_shape, - scatter_dim, + dim, num_links, math_op, shard_grid, @@ -427,7 +427,7 @@ def run_reduce_scatter_sharded_test( f"Not enough devices on machine to implement test case. Wanted {num_devices} but found {len(t3k_mesh_device.get_device_ids())}" ) - logger.info(f"Per chip output shape: {per_chip_output_shape}, devices: {num_devices}, scatter_dim: {scatter_dim}") + logger.info(f"Per chip output shape: {per_chip_output_shape}, devices: {num_devices}, dim: {dim}") debug = False @@ -438,7 +438,7 @@ def run_reduce_scatter_sharded_test( assert in_shard_override is None in_shard_grid = shard_grid input_shard_shape = list(output_shard_shape) - if scatter_dim == 3: + if dim == 3: input_shard_shape[1] *= num_devices else: input_shard_shape[0] *= num_devices @@ -468,7 +468,7 @@ def run_reduce_scatter_sharded_test( ) canonical_input_shape = list(per_chip_output_shape) - canonical_input_shape[scatter_dim] *= num_devices + canonical_input_shape[dim] *= num_devices numel = canonical_input_shape[0] * canonical_input_shape[1] * canonical_input_shape[2] * canonical_input_shape[3] input_tensors = [ @@ -492,7 +492,7 @@ def run_reduce_scatter_sharded_test( output_tensor_mesh = run_with_trace( t3k_mesh_device, input_tensor_mesh, - scatter_dim, + dim, num_links, math_op, output_mem_config, @@ -504,7 +504,7 @@ def run_reduce_scatter_sharded_test( for i in range(num_iters): output_tensor_mesh = ttnn.reduce_scatter( input_tensor_mesh, - scatter_dim=scatter_dim, + dim=dim, math_op=math_op, num_links=num_links, memory_config=output_mem_config, @@ -521,7 +521,7 @@ def run_reduce_scatter_sharded_test( for i, t in 
enumerate(input_tensors): golden_canonical_out_tensor = torch.add(golden_canonical_out_tensor, t).bfloat16() - golden_output_tensors = torch.chunk(golden_canonical_out_tensor, num_devices, scatter_dim) + golden_output_tensors = torch.chunk(golden_canonical_out_tensor, num_devices, dim) tt_out_tensors = ttnn.get_device_tensors(output_tensor_mesh) logger.info(f"Compare") diff --git a/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_rdiv.py b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_rdiv.py index e00b9da6eed..dc059035382 100644 --- a/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_rdiv.py +++ b/tests/ttnn/unit_tests/operations/eltwise/backward/test_backward_rdiv.py @@ -19,7 +19,7 @@ @pytest.mark.parametrize( "round_mode", ( - "None", + None, "trunc", "floor", ), @@ -31,10 +31,6 @@ def test_bw_rdiv(input_shapes, scalar, round_mode, device): tt_output_tensor_on_device = ttnn.rdiv_bw(grad_tensor, input_tensor, scalar, round_mode=round_mode) - in_data.retain_grad() - - if round_mode == "None": - round_mode = None golden_function = ttnn.get_golden_function(ttnn.rdiv_bw) golden_tensor = golden_function(grad_data, in_data, scalar, round_mode) diff --git a/tests/ttnn/unit_tests/operations/eltwise/test_binary_composite.py b/tests/ttnn/unit_tests/operations/eltwise/test_binary_composite.py index ac1f2f1775f..831944b77a0 100644 --- a/tests/ttnn/unit_tests/operations/eltwise/test_binary_composite.py +++ b/tests/ttnn/unit_tests/operations/eltwise/test_binary_composite.py @@ -1053,3 +1053,40 @@ def test_binary_prelu_scalar_ttnn(input_shapes, scalar, device): golden_tensor = golden_function(in_data1, scalar) assert_with_pcc(golden_tensor, output_tensor, 0.999) + + +@pytest.mark.parametrize( + "input_shapes", + ( + (torch.Size([1, 2, 32, 64, 64])), + (torch.Size([1, 3, 7, 29, 127])), + (torch.Size([1, 3, 2, 32])), + (torch.Size([1, 6, 49, 97])), + (torch.Size([1, 7, 320])), + (torch.Size([1, 49, 321])), + (torch.Size([4, 32])), + 
(torch.Size([49, 321])), + ), +) +@pytest.mark.parametrize( + "weight", + [ + [-0.25], + [-2.7], + [0.45], + [6.4], + [2], + [-1], + ], +) +@skip_for_grayskull() +def test_binary_prelu_1D_weight(input_shapes, weight, device): + in_data1 = torch.rand(input_shapes, dtype=torch.bfloat16) * 200 - 100 + input_tensor1 = ttnn.from_torch(in_data1, layout=ttnn.TILE_LAYOUT, device=device) + + output_tensor = ttnn.prelu(input_tensor1, weight) + output_tensor = ttnn.to_torch(output_tensor) + golden_function = ttnn.get_golden_function(ttnn.prelu) + golden_tensor = golden_function(in_data1, weight) + + assert_with_pcc(golden_tensor, output_tensor, 0.999) diff --git a/tests/ttnn/unit_tests/operations/eltwise/test_mul.py b/tests/ttnn/unit_tests/operations/eltwise/test_mul.py index 2227226f1c4..e82560cd941 100644 --- a/tests/ttnn/unit_tests/operations/eltwise/test_mul.py +++ b/tests/ttnn/unit_tests/operations/eltwise/test_mul.py @@ -97,3 +97,22 @@ def test_multiply_int32_with_scalar(device, input_a, scalar): output = ttnn.to_torch(output) assert_with_pcc(torch_output_tensor, output, 0.9999) + + +# #14840: use DRAM config +@pytest.mark.parametrize("output_memory_config", [ttnn.DRAM_MEMORY_CONFIG]) +@pytest.mark.parametrize("scalar", [0.125]) +@pytest.mark.parametrize("batch_size", [6, 7, 8]) +def test_multiply_with_scalar_sharded(device, scalar, batch_size, output_memory_config): + torch.manual_seed(0) + torch_input_tensor_a = torch.rand((batch_size, 16, 384, 384), dtype=torch.float32) + torch_output_tensor = scalar * torch_input_tensor_a + + # GS has smaller L1 than WH + input_tensor_a = ttnn.from_torch( + torch_input_tensor_a, layout=ttnn.TILE_LAYOUT, memory_config=ttnn.DRAM_MEMORY_CONFIG, device=device + ) + output = ttnn.mul(input_tensor_a, scalar, memory_config=output_memory_config) + output = ttnn.to_torch(output) + + assert_with_pcc(torch_output_tensor, output, 0.9999) diff --git a/tests/ttnn/unit_tests/operations/test_matmul.py 
b/tests/ttnn/unit_tests/operations/test_matmul.py index b85e5319e23..6f56e7fb6be 100644 --- a/tests/ttnn/unit_tests/operations/test_matmul.py +++ b/tests/ttnn/unit_tests/operations/test_matmul.py @@ -81,6 +81,18 @@ def test_tiny_tiles(device, n, c, h, w, tile_h, tile_w): assert_with_pcc(torch_input_tensor, output_tensor, 1) +@pytest.mark.parametrize("m, k, n", [(784, 192, 576), (576, 192, 784), (486, 792, 352), (966, 123, 561)]) +def test_pytorch_2_0_failed_cases(device, m, k, n): + x = torch.ones((m, k), dtype=torch.float32) + y = torch.ones((k, n), dtype=torch.float32) + x_tt = ttnn.from_torch(x, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device) + y_tt = ttnn.from_torch(y, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device) + z_tt = ttnn.matmul(x_tt, y_tt) + z = ttnn.to_torch(z_tt) + z_t = torch.matmul(x, y) + assert_with_pcc(z_t, z) + + @run_for_wormhole_b0() @pytest.mark.parametrize("b", [2]) @pytest.mark.parametrize("h", [3]) diff --git a/tests/ttnn/unit_tests/operations/test_moreh_norm.py b/tests/ttnn/unit_tests/operations/test_moreh_norm.py index df2038ed375..8526d30d5d0 100644 --- a/tests/ttnn/unit_tests/operations/test_moreh_norm.py +++ b/tests/ttnn/unit_tests/operations/test_moreh_norm.py @@ -38,7 +38,9 @@ def make_torch_tensors(input_shape, dim, keepdim=False, *, dtype=torch.float32): return torch_input, torch_output_grad -def torch_norm(torch_input, torch_output_grad, *, p=2.0, dim=None, keepdim=False, do_backward=False): +def torch_norm( + torch_input, torch_output_grad, *, p=2.0, dim=None, keepdim=False, is_linalg_vector_norm=False, do_backward=False +): """ Computes the norm of a tensor using torch and optionally performs backpropagation. @@ -55,7 +57,10 @@ def torch_norm(torch_input, torch_output_grad, *, p=2.0, dim=None, keepdim=False - `torch_output`: The result of the norm operation. - `torch_input_grad`: The gradient of the input tensor (if `do_backward=True`), otherwise None. 
""" - torch_output = torch.norm(torch_input, p=p, dim=dim, keepdim=keepdim) + if is_linalg_vector_norm: + torch_output = torch.linalg.vector_norm(torch_input, ord=p, dim=dim, keepdim=keepdim) + else: + torch_output = torch.norm(torch_input, p=p, dim=dim, keepdim=keepdim) torch_input_grad = None if do_backward: torch_output.backward(torch_output_grad) @@ -74,6 +79,7 @@ def ttnn_norm( do_backward=False, device=None, dtype=ttnn.bfloat16, + is_linalg_vector_norm=False, ): """ Computes the norm of a tensor using ttnn's custom backend and optionally performs backpropagation. @@ -96,7 +102,10 @@ def ttnn_norm( _, ttnn_output_shape = compute_output_shape(torch_input.shape, dim, keepdim=keepdim) ttnn_input = create_ttnn_tilized_tensor(torch_input, device, dtype) if do_backward: - torch_output = torch.norm(torch_input, p=p, dim=dim, keepdim=keepdim) + if is_linalg_vector_norm: + torch_output = torch.linalg.vector_norm(torch_input, ord=p, dim=dim, keepdim=keepdim) + else: + torch_output = torch.norm(torch_input, p=p, dim=dim, keepdim=keepdim) ttnn_output = create_ttnn_tilized_tensor(torch_output, device, dtype) else: ttnn_output = create_ttnn_tilized_tensor(torch.empty(ttnn_output_shape), device, dtype) @@ -139,6 +148,7 @@ def run_moreh_norm( compute_kernel_options=None, torch_dtype=torch.float32, ttnn_dtype=ttnn.bfloat16, + is_linalg_vector_norm=False, ): """ Runs the norm operation using both torch and ttnn's implementation and compares the outputs. 
@@ -161,7 +171,15 @@ def run_moreh_norm( pytest.skip(f"bfloat8_b is not supported in the kernel") check_dim(input_shape, dim, keepdim) torch_input, torch_output_grad = make_torch_tensors(input_shape, dim, keepdim=keepdim, dtype=torch_dtype) - expected_output, _ = torch_norm(torch_input, torch_output_grad, p=p, dim=dim, keepdim=keepdim, do_backward=False) + expected_output, _ = torch_norm( + torch_input, + torch_output_grad, + p=p, + dim=dim, + keepdim=keepdim, + is_linalg_vector_norm=is_linalg_vector_norm, + do_backward=False, + ) actual_output, _ = ttnn_norm( torch_input, torch_output_grad, @@ -172,6 +190,7 @@ def run_moreh_norm( device=device, do_backward=False, dtype=ttnn_dtype, + is_linalg_vector_norm=is_linalg_vector_norm, ) passing, out = comp_allclose(expected_output, actual_output, rtol=rtol, atol=atol) logger.info(f"output's {out}") @@ -189,6 +208,7 @@ def run_moreh_norm_backward( compute_kernel_options=None, torch_dtype=torch.float32, ttnn_dtype=ttnn.bfloat16, + is_linalg_vector_norm=False, ): """ Runs the norm operation with backpropagation using both torch and ttnn's custom implementation and compares the gradients. 
@@ -211,7 +231,15 @@ def run_moreh_norm_backward( pytest.skip(f"bfloat8_b is not supported in the kernel") check_dim(input_shape, dim, keepdim) torch_input, torch_output_grad = make_torch_tensors(input_shape, dim, keepdim=keepdim, dtype=torch_dtype) - _, expected_input_grad = torch_norm(torch_input, torch_output_grad, p=p, dim=dim, keepdim=keepdim, do_backward=True) + _, expected_input_grad = torch_norm( + torch_input, + torch_output_grad, + p=p, + dim=dim, + keepdim=keepdim, + is_linalg_vector_norm=is_linalg_vector_norm, + do_backward=True, + ) _, actual_input_grad = ttnn_norm( torch_input, torch_output_grad, @@ -222,31 +250,32 @@ def run_moreh_norm_backward( device=device, do_backward=True, dtype=ttnn_dtype, + is_linalg_vector_norm=is_linalg_vector_norm, ) passing, out = comp_allclose(expected_input_grad, actual_input_grad, rtol=rtol, atol=atol) logger.info(f"input_grad's {out}") assert passing -@pytest.mark.parametrize("p", [2.0, 2.5, -2.5]) +@pytest.mark.parametrize("p", [2.0, 2.5, -2.5, 0.0, float("inf"), float("-inf")]) @pytest.mark.parametrize( "dim_rtol_atol", [ - [[], 0.2, 0.2], - [None, 0.2, 0.2], - [0, 0.1, 0.1], - [1, 0.1, 0.1], - [2, 0.1, 0.1], - [3, 0.1, 0.1], - [[0, 1], 0.1, 0.1], - [[0, 1, 2], 0.15, 0.15], - [[0, 1, 2, 3], 0.2, 0.2], - [[0, 1, 3], 0.15, 0.15], - [[0, 2, 3], 0.15, 0.15], - [[1, 2], 0.1, 0.1], - [[1, 2, 3], 0.15, 0.15], - [[1, 3], 0.1, 0.1], - [[2, 3], 0.1, 0.1], + [[], 0.06, 0.06], + [None, 0.06, 0.06], + [0, 0.06, 0.06], + [1, 0.06, 0.06], + [2, 0.06, 0.06], + [3, 0.06, 0.06], + [[0, 1], 0.06, 0.06], + [[0, 1, 2], 0.06, 0.06], + [[0, 1, 2, 3], 0.06, 0.06], + [[0, 1, 3], 0.06, 0.06], + [[0, 2, 3], 0.06, 0.06], + [[1, 2], 0.06, 0.06], + [[1, 2, 3], 0.06, 0.06], + [[1, 3], 0.06, 0.06], + [[2, 3], 0.06, 0.06], ], ids=[ "global_norm(dim=[])", @@ -275,13 +304,24 @@ def run_moreh_norm_backward( ) @pytest.mark.parametrize("keepdim", [True, False]) @pytest.mark.parametrize("ttnn_dtype", [ttnn.bfloat16, ttnn.bfloat8_b]) -def 
test_moreh_norm(input_shape, p, dim_rtol_atol, keepdim, ttnn_dtype, device): +@pytest.mark.parametrize("is_linalg_vector_norm", [False, True]) +def test_moreh_norm(input_shape, p, dim_rtol_atol, keepdim, ttnn_dtype, device, is_linalg_vector_norm): """ Parametrized test for ttnn's norm operation. Compares the output of ttnn's norm with torch's norm. """ torch.manual_seed(2024) dim, rtol, atol = dim_rtol_atol - run_moreh_norm(input_shape, p, dim, rtol, atol, device, keepdim=keepdim, ttnn_dtype=ttnn_dtype) + run_moreh_norm( + input_shape, + p, + dim, + rtol, + atol, + device, + keepdim=keepdim, + ttnn_dtype=ttnn_dtype, + is_linalg_vector_norm=is_linalg_vector_norm, + ) @pytest.mark.parametrize("p", [2.0, 2.5, -2.5]) @@ -300,7 +340,10 @@ def test_moreh_norm(input_shape, p, dim_rtol_atol, keepdim, ttnn_dtype, device): ], ) @pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids) -def test_moreh_norm_compute_kernel_options(input_shape, p, dim_rtol_atol, compute_kernel_options, device): +@pytest.mark.parametrize("is_linalg_vector_norm", [False, True]) +def test_moreh_norm_compute_kernel_options( + input_shape, p, dim_rtol_atol, compute_kernel_options, device, is_linalg_vector_norm +): """ Parametrized test for ttnn's norm operation. Compares the output of ttnn's norm with torch's norm. 
""" @@ -314,6 +357,7 @@ def test_moreh_norm_compute_kernel_options(input_shape, p, dim_rtol_atol, comput atol, device, compute_kernel_options=compute_kernel_options, + is_linalg_vector_norm=is_linalg_vector_norm, ) @@ -329,7 +373,8 @@ def test_moreh_norm_compute_kernel_options(input_shape, p, dim_rtol_atol, comput ids=["CW", "N", "C", "H", "W"], ) @pytest.mark.parametrize("keepdim", [True, False]) -def test_moreh_norm_callback(dim_rtol_atol, keepdim, device, use_program_cache): +@pytest.mark.parametrize("is_linalg_vector_norm", [False, True]) +def test_moreh_norm_callback(dim_rtol_atol, keepdim, device, is_linalg_vector_norm, use_program_cache): """ Parametrized test for ttnn's norm operation. Compares the output of ttnn's norm with torch's norm. """ @@ -337,7 +382,9 @@ def test_moreh_norm_callback(dim_rtol_atol, keepdim, device, use_program_cache): dim, rtol, atol = dim_rtol_atol num_program_cache_entries_list = [] for i in range(2): - run_moreh_norm([5, 8, 78, 77], 2.0, dim, rtol, atol, device, keepdim=keepdim) + run_moreh_norm( + [5, 8, 78, 77], 2.0, dim, rtol, atol, device, keepdim=keepdim, is_linalg_vector_norm=is_linalg_vector_norm + ) torch_dummy = torch.randn([32, 32]) ttnn_dummy = ttnn.from_torch(torch_dummy, device=device) num_program_cache_entries_list.append(device.num_program_cache_entries()) @@ -350,21 +397,21 @@ def test_moreh_norm_callback(dim_rtol_atol, keepdim, device, use_program_cache): @pytest.mark.parametrize( "dim_rtol_atol", [ - [[], 0.2, 0.2], - [None, 0.2, 0.2], - [0, 0.1, 0.1], - [1, 0.1, 0.1], - [2, 0.1, 0.1], - [3, 0.1, 0.1], - [[0, 1], 0.1, 0.1], - [[0, 1, 2], 0.15, 0.15], - [[0, 1, 2, 3], 0.2, 0.2], - [[0, 1, 3], 0.15, 0.15], - [[0, 2, 3], 0.15, 0.15], - [[1, 2], 0.1, 0.1], - [[1, 2, 3], 0.15, 0.15], - [[1, 3], 0.1, 0.1], - [[2, 3], 0.1, 0.1], + [[], 0.06, 0.06], + [None, 0.06, 0.06], + [0, 0.06, 0.06], + [1, 0.06, 0.06], + [2, 0.06, 0.06], + [3, 0.06, 0.06], + [[0, 1], 0.06, 0.06], + [[0, 1, 2], 0.06, 0.06], + [[0, 1, 2, 3], 0.06, 
0.06], + [[0, 1, 3], 0.06, 0.06], + [[0, 2, 3], 0.06, 0.06], + [[1, 2], 0.06, 0.06], + [[1, 2, 3], 0.06, 0.06], + [[1, 3], 0.06, 0.06], + [[2, 3], 0.06, 0.06], ], ids=[ "global_norm(dim=[])", @@ -393,13 +440,24 @@ def test_moreh_norm_callback(dim_rtol_atol, keepdim, device, use_program_cache): ) @pytest.mark.parametrize("keepdim", [True, False]) @pytest.mark.parametrize("ttnn_dtype", [ttnn.bfloat16, ttnn.bfloat8_b]) -def test_moreh_norm_backward(input_shape, p, dim_rtol_atol, keepdim, ttnn_dtype, device): +@pytest.mark.parametrize("is_linalg_vector_norm", [False, True]) +def test_moreh_norm_backward(input_shape, p, dim_rtol_atol, keepdim, ttnn_dtype, device, is_linalg_vector_norm): """ Parametrized test for ttnn's norm backward operation. Compares the output of ttnn's norm backward with torch's norm backward. """ torch.manual_seed(2024) dim, rtol, atol = dim_rtol_atol - run_moreh_norm_backward(input_shape, p, dim, rtol, atol, device, keepdim=keepdim, ttnn_dtype=ttnn_dtype) + run_moreh_norm_backward( + input_shape, + p, + dim, + rtol, + atol, + device, + keepdim=keepdim, + ttnn_dtype=ttnn_dtype, + is_linalg_vector_norm=is_linalg_vector_norm, + ) @pytest.mark.parametrize("p", [2.0, 2.5, -2.5]) @@ -418,7 +476,10 @@ def test_moreh_norm_backward(input_shape, p, dim_rtol_atol, keepdim, ttnn_dtype, ], ) @pytest.mark.parametrize("compute_kernel_options", compute_kernel_options, ids=compute_kernel_ids) -def test_moreh_norm_backward_compute_kernel_options(input_shape, p, dim_rtol_atol, compute_kernel_options, device): +@pytest.mark.parametrize("is_linalg_vector_norm", [False, True]) +def test_moreh_norm_backward_compute_kernel_options( + input_shape, p, dim_rtol_atol, compute_kernel_options, device, is_linalg_vector_norm +): """ Parametrized test for ttnn's norm backward operation. Compares the output of ttnn's norm backward with torch's norm backward. 
""" @@ -432,6 +493,7 @@ def test_moreh_norm_backward_compute_kernel_options(input_shape, p, dim_rtol_ato atol, device, compute_kernel_options=compute_kernel_options, + is_linalg_vector_norm=is_linalg_vector_norm, ) @@ -447,7 +509,8 @@ def test_moreh_norm_backward_compute_kernel_options(input_shape, p, dim_rtol_ato ids=["CW", "N", "C", "H", "W"], ) @pytest.mark.parametrize("keepdim", [True, False]) -def test_moreh_norm_backward_callback(dim_rtol_atol, keepdim, device, use_program_cache): +@pytest.mark.parametrize("is_linalg_vector_norm", [False, True]) +def test_moreh_norm_backward_callback(dim_rtol_atol, keepdim, device, is_linalg_vector_norm, use_program_cache): """ Parametrized test for ttnn's norm backward operation. Compares the output of ttnn's norm backward with torch's norm backward. """ @@ -455,7 +518,9 @@ def test_moreh_norm_backward_callback(dim_rtol_atol, keepdim, device, use_progra dim, rtol, atol = dim_rtol_atol num_program_cache_entries_list = [] for i in range(2): - run_moreh_norm_backward([5, 8, 78, 77], 2.0, dim, rtol, atol, device, keepdim=keepdim) + run_moreh_norm_backward( + [5, 8, 78, 77], 2.0, dim, rtol, atol, device, keepdim=keepdim, is_linalg_vector_norm=is_linalg_vector_norm + ) torch_dummy = torch.randn([32, 32]) ttnn_dummy = ttnn.from_torch(torch_dummy, device=device) num_program_cache_entries_list.append(device.num_program_cache_entries()) diff --git a/tests/ttnn/unit_tests/operations/test_slice.py b/tests/ttnn/unit_tests/operations/test_slice.py index a85b33ada9a..9facb46a90c 100644 --- a/tests/ttnn/unit_tests/operations/test_slice.py +++ b/tests/ttnn/unit_tests/operations/test_slice.py @@ -752,8 +752,37 @@ def test_slice_adversarial_fixed(input_shape, dim, start, end, step, layout, dev @pytest.mark.parametrize( "input_shape, dim, start, end, step, layout", ( - ([8732, 4], 1, 0, -1, 4, ttnn.TILE_LAYOUT), # Need tensor for this or a padding aware tiled kernel ([1, 7], 0, 0, -1, 1, ttnn.ROW_MAJOR_LAYOUT), # page size must equal buffer 
size + ([1, 8, 2, 2], 2, -1, -1, 1, ttnn.TILE_LAYOUT), # Buffer size and page size should be larger than 0 bytes + ([3], 0, 0, -1, 1, ttnn.TILE_LAYOUT), # Difference in expected shape as it's a 1D tensor + ), +) +def test_slice_adversarial(input_shape, dim, start, end, step, layout, device): + pytest.skip("These tests are known to fail") + torch_input = torch.randn(input_shape, dtype=torch.bfloat16) + + slice_obj = slice(start, end, step) + + # Prepare indices for slicing in the specified dimension + indices = [slice(None)] * len(input_shape) # By default, select all elements along every dimension + indices[dim] = slice_obj # Apply slicing to the target dimension + indices = tuple(indices) + + # Apply slicing to the input_tensor + torch_output_tensor = torch_input[indices] + + ttnn_tensor = ttnn.from_torch(torch_input, device=device, layout=layout, dtype=ttnn.bfloat16) + ttnn_output = ttnn_tensor[indices] + + ttnn_output_tensor = ttnn.to_torch(ttnn_output) + + assert_with_pcc(torch_output_tensor, ttnn_output_tensor, 0.999) + + +@pytest.mark.parametrize( + "input_shape, dim, start, end, step, layout", + ( + ([8732, 4], 1, 0, -1, 4, ttnn.TILE_LAYOUT), # Need tensor for this or a padding aware tiled kernel ( [1, 7, 71, 64], 3, @@ -762,12 +791,9 @@ def test_slice_adversarial_fixed(input_shape, dim, start, end, step, layout, dev 1, ttnn.ROW_MAJOR_LAYOUT, ), # An unpadding slice operations for a RowMajor layout on the output tensor requires the last dimension to be on a 32 bit boundary - ([1, 8, 2, 2], 2, -1, -1, 1, ttnn.TILE_LAYOUT), # Buffer size and page size should be larger than 0 bytes - ([3], 0, 0, -1, 1, ttnn.TILE_LAYOUT), # Difference in expected shape as it's a 1D tensor ), ) -def test_slice_adversarial(input_shape, dim, start, end, step, layout, device): - pytest.skip("These tests are expected to fail at the moment") +def test_slice_adversarial_fixed(input_shape, dim, start, end, step, layout, device): torch_input = torch.randn(input_shape, 
dtype=torch.bfloat16) slice_obj = slice(start, end, step) diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_tensor_alignment.py b/tests/ttnn/unit_tests/tensor/test_tensor_alignment.py similarity index 100% rename from tests/tt_eager/python_api_testing/unit_testing/misc/test_tensor_alignment.py rename to tests/ttnn/unit_tests/tensor/test_tensor_alignment.py diff --git a/tests/ttnn/unit_tests/tensor/test_tensor_conversion.py b/tests/ttnn/unit_tests/tensor/test_tensor_conversion.py new file mode 100644 index 00000000000..6e00f178fc0 --- /dev/null +++ b/tests/ttnn/unit_tests/tensor/test_tensor_conversion.py @@ -0,0 +1,191 @@ +# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. + +# SPDX-License-Identifier: Apache-2.0 + +import pytest + +import os +import pathlib + +import torch +import numpy as np + +import ttnn + +tt_dtype_to_torch_dtype = { + ttnn.uint8: torch.uint8, + ttnn.uint16: torch.int16, + ttnn.uint32: torch.int32, + ttnn.int32: torch.int32, + ttnn.float32: torch.float, + ttnn.bfloat16: torch.bfloat16, + ttnn.bfloat8_b: torch.float, + ttnn.bfloat4_b: torch.float, +} + +tt_dtype_to_np_dtype = { + ttnn.uint8: np.ubyte, + ttnn.uint16: np.int16, + ttnn.uint32: np.int32, + ttnn.int32: np.int32, + ttnn.float32: np.float32, + ttnn.bfloat8_b: np.float32, + ttnn.bfloat4_b: np.float32, +} + + +@pytest.mark.parametrize( + "tt_dtype", + [ + ttnn.uint8, + ttnn.uint16, + ttnn.uint32, + ttnn.int32, + ttnn.float32, + ttnn.bfloat16, + ttnn.bfloat8_b, + ttnn.bfloat4_b, + ], +) +@pytest.mark.parametrize("shape", [(2, 3, 64, 96)]) +@pytest.mark.parametrize("python_lib", [torch, np]) +def test_tensor_conversion_with_tt_dtype(python_lib, shape, tt_dtype, device): + torch.manual_seed(0) + + if python_lib == torch: + dtype = tt_dtype_to_torch_dtype[tt_dtype] + + if dtype in {torch.uint8, torch.int16, torch.int32}: + py_tensor = torch.randint(torch.iinfo(dtype).min, torch.iinfo(dtype).max, shape, dtype=dtype) + else: + py_tensor = torch.rand(shape, dtype=dtype) + + 
from torch import allclose + + elif python_lib == np: + if tt_dtype == ttnn.bfloat16: + pytest.skip("ttnn.bloat16 dtype is not supported yet for numpy tensors!") + dtype = tt_dtype_to_np_dtype[tt_dtype] + + if dtype in {np.ubyte, np.int16, np.int32}: + py_tensor = np.random.randint(np.iinfo(dtype).min, np.iinfo(dtype).max, shape, dtype=dtype) + else: + py_tensor = np.random.random(shape).astype(dtype=dtype) + + from numpy import allclose + + tt_tensor = ttnn.Tensor(py_tensor, tt_dtype) + if tt_dtype in {ttnn.bfloat8_b, ttnn.bfloat4_b}: + assert tt_tensor.storage_type() == ttnn.StorageType.OWNED + tt_tensor = tt_tensor.to(ttnn.TILE_LAYOUT) + else: + assert tt_tensor.storage_type() == ttnn.StorageType.BORROWED + + tt_tensor = tt_tensor.to(device) + tt_tensor = tt_tensor.cpu() + + if tt_dtype in {ttnn.bfloat8_b, ttnn.bfloat4_b}: + tt_tensor = tt_tensor.to(ttnn.ROW_MAJOR_LAYOUT) + + if python_lib == torch: + py_tensor_after_round_trip = tt_tensor.to_torch() + elif python_lib == np: + py_tensor_after_round_trip = tt_tensor.to_numpy() + + assert py_tensor.dtype == py_tensor_after_round_trip.dtype + assert py_tensor.shape == py_tensor_after_round_trip.shape + + allclose_kwargs = {} + if tt_dtype == ttnn.bfloat8_b: + allclose_kwargs = dict(atol=1e-2) + elif tt_dtype == ttnn.bfloat4_b: + allclose_kwargs = dict(atol=0.2) + + passing = allclose(py_tensor, py_tensor_after_round_trip, **allclose_kwargs) + assert passing + + +string_to_torch_dtype = { + "uint8": torch.uint8, + "int16": torch.int16, + "int32": torch.int32, + "int64": torch.int64, + "bfloat16": torch.bfloat16, + "float16": torch.float16, + "float32": torch.float, +} + +string_to_np_dtype = { + "uint8": np.ubyte, + "int16": np.int16, + "int32": np.int32, + "int64": np.int64, + "float16": np.float16, + "float32": np.float32, +} + + +@pytest.mark.parametrize( + "python_dtype_str", + [ + "uint8", + "int16", + "int32", + "int64", + "bfloat16", + "float16", + "float32", + ], +) +@pytest.mark.parametrize("shape", [(2, 3, 
64, 96)]) +@pytest.mark.parametrize("python_lib", [torch, np]) +def test_tensor_conversion_with_python_dtype(python_lib, shape, python_dtype_str, device): + torch.manual_seed(0) + + if python_lib == torch: + dtype = string_to_torch_dtype[python_dtype_str] + + if dtype in {torch.uint8, torch.int16, torch.int32, torch.int64}: + py_tensor = torch.randint(torch.iinfo(dtype).min, torch.iinfo(dtype).max, shape, dtype=dtype) + else: + py_tensor = torch.rand(shape, dtype=dtype) + + from torch import allclose + + elif python_lib == np: + if python_dtype_str in ("bfloat16", "float16"): + pytest.skip("{} dtype is not supported yet for numpy tensors!".format(python_dtype_str)) + dtype = string_to_np_dtype[python_dtype_str] + + if dtype in {np.ubyte, np.int16, np.int32, np.int64}: + py_tensor = np.random.randint(np.iinfo(dtype).min, np.iinfo(dtype).max, shape, dtype=dtype) + else: + py_tensor = np.random.random(shape).astype(dtype=dtype) + + from numpy import allclose + + tt_tensor = ttnn.Tensor(py_tensor) + assert tt_tensor.storage_type() == ttnn.StorageType.BORROWED + + tt_tensor = tt_tensor.to(device) + tt_tensor = tt_tensor.cpu() + + if python_lib == torch: + py_tensor_after_round_trip = tt_tensor.to_torch() + elif python_lib == np: + py_tensor_after_round_trip = tt_tensor.to_numpy() + + if python_dtype_str in ("int64", "float16"): + pytest.xfail( + "{} dtype is incorrectly handled in ttnn tensors, so roundtrip tests are not working!".format( + python_dtype_str + ) + ) + + assert py_tensor.dtype == py_tensor_after_round_trip.dtype + assert py_tensor.shape == py_tensor_after_round_trip.shape + + allclose_kwargs = {} + + passing = allclose(py_tensor, py_tensor_after_round_trip, **allclose_kwargs) + assert passing diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_tensor_prealloc_and_write.py b/tests/ttnn/unit_tests/tensor/test_tensor_prealloc_and_write.py similarity index 100% rename from 
tests/tt_eager/python_api_testing/unit_testing/misc/test_tensor_prealloc_and_write.py rename to tests/ttnn/unit_tests/tensor/test_tensor_prealloc_and_write.py diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_tensor_ranks.py b/tests/ttnn/unit_tests/tensor/test_tensor_ranks.py similarity index 100% rename from tests/tt_eager/python_api_testing/unit_testing/misc/test_tensor_ranks.py rename to tests/ttnn/unit_tests/tensor/test_tensor_ranks.py diff --git a/tests/ttnn/unit_tests/tensor/test_tensor_serialization.py b/tests/ttnn/unit_tests/tensor/test_tensor_serialization.py new file mode 100644 index 00000000000..1db497c0843 --- /dev/null +++ b/tests/ttnn/unit_tests/tensor/test_tensor_serialization.py @@ -0,0 +1,63 @@ +# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. + +# SPDX-License-Identifier: Apache-2.0 + +import pytest + +import os +import pathlib + +import torch +import numpy as np + +import ttnn + +tt_dtype_to_torch_dtype = { + ttnn.uint16: torch.int16, + ttnn.uint32: torch.int32, + ttnn.float32: torch.float, + ttnn.bfloat16: torch.bfloat16, + ttnn.bfloat8_b: torch.float, + ttnn.bfloat4_b: torch.float, +} + + +@pytest.mark.parametrize("shape", [(2, 3, 64, 96)]) +@pytest.mark.parametrize( + "tt_dtype", + [ + ttnn.uint16, + ttnn.uint32, + ttnn.float32, + ttnn.bfloat16, + ttnn.bfloat8_b, + ttnn.bfloat4_b, + ], +) +def test_serialization(tmp_path, shape, tt_dtype): + torch.manual_seed(0) + + dtype = tt_dtype_to_torch_dtype[tt_dtype] + + if dtype in {torch.int16, torch.int32}: + torch_tensor = torch.randint(0, 1024, shape, dtype=dtype) + else: + torch_tensor = torch.rand(shape, dtype=dtype) + + tt_tensor = ttnn.Tensor(torch_tensor, tt_dtype) + + file_name = tmp_path / pathlib.Path("tensor.bin") + ttnn.dump_tensor(str(file_name), tt_tensor) + torch_tensor_from_file = ttnn.load_tensor(str(file_name)).to_torch() + + assert torch_tensor.dtype == torch_tensor_from_file.dtype + assert torch_tensor.shape == torch_tensor_from_file.shape + + allclose_kwargs 
= {} + if tt_dtype == ttnn.bfloat8_b: + allclose_kwargs = dict(atol=1e-2) + elif tt_dtype == ttnn.bfloat4_b: + allclose_kwargs = dict(atol=0.2) + + passing = torch.allclose(torch_tensor, torch_tensor_from_file, **allclose_kwargs) + assert passing diff --git a/tests/ttnn/unit_tests/test_to_and_from_torch.py b/tests/ttnn/unit_tests/test_to_and_from_torch.py index 8bd62c6a5fd..4b84f8ea120 100644 --- a/tests/ttnn/unit_tests/test_to_and_from_torch.py +++ b/tests/ttnn/unit_tests/test_to_and_from_torch.py @@ -76,3 +76,10 @@ def test_to_and_from_2D(height, width, dtype, layout): if dtype == ttnn.bfloat8_b: allclose_kwargs["atol"] = 1e-2 assert torch.allclose(torch_input_tensor, torch_output_tensor, **allclose_kwargs) + + +def test_from_torch_large(device): + torch_x = torch.rand((2048, 1024, 32, 32), dtype=torch.bfloat16) + x_tensor = ttnn.from_torch(torch_x, layout=ttnn.TILE_LAYOUT) + x_tensor = ttnn.to_torch(x_tensor) + assert torch.allclose(torch_x, x_tensor) diff --git a/tests/ttnn/unit_tests/test_to_layout.py b/tests/ttnn/unit_tests/test_to_layout.py index fafab9674a1..b84a8f4c5fc 100644 --- a/tests/ttnn/unit_tests/test_to_layout.py +++ b/tests/ttnn/unit_tests/test_to_layout.py @@ -10,6 +10,7 @@ import ttnn from tests.ttnn.utils_for_testing import assert_with_pcc, check_with_pcc_without_tensor_printout +from models.utility_functions import is_grayskull, is_blackhole, torch_random, skip_for_grayskull @pytest.mark.parametrize("height", [32, 30]) @@ -125,3 +126,17 @@ def test_untilize_with_unpadding_W_16(device, in_dtype, use_multicore, use_pack_ passing, pcc_msg = check_with_pcc_without_tensor_printout(torch_input, output_torch) logger.info(pcc_msg) assert passing + + +@pytest.mark.parametrize("h", [1, 18, 65]) +@pytest.mark.parametrize("w", [1, 15, 17, 29, 33, 49, 63, 65]) +@pytest.mark.parametrize("input_layout", [ttnn.ROW_MAJOR_LAYOUT, ttnn.TILE_LAYOUT]) +@pytest.mark.parametrize("output_layout", [ttnn.ROW_MAJOR_LAYOUT, ttnn.TILE_LAYOUT]) +def 
test_to_layout_device(device, h, w, input_layout, output_layout): + torch.manual_seed(2005) + torch_input_tensor = torch_random((h, w), -0.1, 0.1, dtype=torch.bfloat16) + input_tensor = ttnn.from_torch(torch_input_tensor, device=device, dtype=ttnn.bfloat16, layout=input_layout) + new_layout_tensor = ttnn.to_layout(input_tensor, layout=output_layout) + torch_brought_back = ttnn.to_torch(new_layout_tensor) + + assert_with_pcc(torch_input_tensor, torch_brought_back) diff --git a/tt_metal/common/test_tiles.hpp b/tt_metal/common/test_tiles.hpp index 8fac4cef897..50674abc39d 100644 --- a/tt_metal/common/test_tiles.hpp +++ b/tt_metal/common/test_tiles.hpp @@ -207,18 +207,18 @@ inline std::vector untilize_nchw(const BufferType& in, tt::stl::Span tilize_nchw(const BufferType& in_rowmajor, tt::stl::Spa return tilized_result; } - int H = shape[shape.size() - 2], W = shape[shape.size() - 1]; - auto batch_size = 1; - for (int i = 0; i < shape.size() - 2; i++) { + uint32_t H = shape[shape.size() - 2], W = shape[shape.size() - 1]; + uint64_t batch_size = 1; + for (uint32_t i = 0; i < shape.size() - 2; i++) { batch_size *= shape[i]; } - int input_volume = batch_size * H * W; + uint64_t input_volume = batch_size * H * W; auto tile_H = tile_shape.has_value() ? tile_shape.value()[0] : tt::constants::TILE_HEIGHT; auto tile_W = tile_shape.has_value() ? 
tile_shape.value()[1] : tt::constants::TILE_WIDTH; - int OH = round_up_to_tile(H, tile_H); - int OW = round_up_to_tile(W, tile_W); + uint32_t OH = round_up_to_tile(H, tile_H); + uint32_t OW = round_up_to_tile(W, tile_W); tilized_result.resize(batch_size * OH * OW); std::fill(tilized_result.begin(), tilized_result.end(), 0); - int out_index = 0; + uint64_t out_index = 0; for (auto batch_index = 0; batch_index < batch_size; batch_index++) { - for (int hs = 0; hs < H; hs += tile_H) { - for (int ws = 0; ws < W; ws += tile_W) { - for (int ht = 0; ht < tile_H; ht++) { - for (int wt = 0; wt < tile_W; wt++) { + for (auto hs = 0; hs < H; hs += tile_H) { + for (auto ws = 0; ws < W; ws += tile_W) { + for (auto ht = 0; ht < tile_H; ht++) { + for (auto wt = 0; wt < tile_W; wt++) { auto w = wt + ws; auto h = ht + hs; auto in_offs = w + h * W + batch_index * H * W; auto val = (w >= W || h >= H || in_offs >= input_volume) ? 0 : in_rowmajor[in_offs]; - int out_w = (out_index % OW); - int out_h = (out_index / OW) % OH; + auto out_w = (out_index % OW); + auto out_h = (out_index / OW) % OH; TT_ASSERT(w < OW); TT_ASSERT(h < OH); - int out_offs = out_w + out_h * OW + batch_index * OH * OW; + auto out_offs = out_w + out_h * OW + batch_index * OH * OW; tilized_result[out_offs] = val; out_index++; } diff --git a/tt_metal/hostdevcommon/kernel_structs.h b/tt_metal/hostdevcommon/kernel_structs.h index bd2e66914e9..9dc16eb1367 100644 --- a/tt_metal/hostdevcommon/kernel_structs.h +++ b/tt_metal/hostdevcommon/kernel_structs.h @@ -7,8 +7,44 @@ namespace tt { -// All CBs can used for dataflow in/out -// Certain CBs are specifically designed to handle compute input, output, and intermediates. 
+enum CBIndex : std::uint8_t +{ + c_0 = 0, + c_1 = 1, + c_2 = 2, + c_3 = 3, + c_4 = 4, + c_5 = 5, + c_6 = 6, + c_7 = 7, + c_8 = 8, + c_9 = 9, + c_10 = 10, + c_11 = 11, + c_12 = 12, + c_13 = 13, + c_14 = 14, + c_15 = 15, + c_16 = 16, + c_17 = 17, + c_18 = 18, + c_19 = 19, + c_20 = 20, + c_21 = 21, + c_22 = 22, + c_23 = 23, + c_24 = 24, + c_25 = 25, + c_26 = 26, + c_27 = 27, + c_28 = 28, + c_29 = 29, + c_30 = 30, + c_31 = 31, + SIZE = 32 +}; + +// Deprecated and to be deleted. enum CB : std::uint8_t { // Designed to be used as compute inputs, or dataflow in/out @@ -51,6 +87,7 @@ enum CB : std::uint8_t c_intermed6 = 30, c_intermed7 = 31, }; + ///////////////////////////// // end of user facing APIs // ///////////////////////////// diff --git a/tt_metal/hw/CMakeLists.txt b/tt_metal/hw/CMakeLists.txt index 17c5370a24c..c4208b98b85 100644 --- a/tt_metal/hw/CMakeLists.txt +++ b/tt_metal/hw/CMakeLists.txt @@ -107,6 +107,7 @@ set(GPP_FLAGS_common -fno-exceptions -Wall -Werror + -Wno-deprecated-declarations -Wno-unknown-pragmas -Wno-error=multistatement-macros -Wno-error=parentheses diff --git a/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h b/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h index 5b314ad8d9a..47343d42b48 100644 --- a/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h +++ b/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h @@ -20,23 +20,32 @@ inline void calculate_mask() { #pragma GCC unroll 8 for (int d = 0; d < ITERATIONS; d++) { vFloat mask = dst_reg[mask_val_idx]; - v_if(_sfpu_is_fp16_zero_(mask, exponent_size_8)) { - dst_reg[0] = vConst0; - } + v_if(_sfpu_is_fp16_zero_(mask, exponent_size_8)) { dst_reg[0] = vConst0; } v_endif; dst_reg++; } } -template +template inline void calculate_int_mask() { const int mask_idx = 32; - #pragma GCC unroll 8 +#pragma GCC unroll 8 for (int d = 0; d < ITERATIONS; d++) { vInt mask = dst_reg[mask_idx]; - v_if (mask == 0) { - 
dst_reg[0] = vConst0; - } + v_if(mask == 0) { dst_reg[0] = vConst0; } + v_endif; + dst_reg++; + } +} + +template +inline void calculate_mask_posinf() { + const bool exponent_size_8 = true; + const int mask_val_idx = 32; +#pragma GCC unroll 8 + for (int d = 0; d < ITERATIONS; d++) { + vFloat mask = dst_reg[mask_val_idx]; + v_if(_sfpu_is_fp16_zero_(mask, exponent_size_8)) { dst_reg[0] = std::numeric_limits::infinity(); } v_endif; dst_reg++; } diff --git a/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_mask.h b/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_mask.h index 172b87b9dc8..c5dfca8bac9 100644 --- a/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_mask.h +++ b/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_mask.h @@ -4,9 +4,9 @@ #pragma once +#include "ckernel_sfpu_mask.h" #include "llk_math_eltwise_unary_sfpu_init.h" #include "llk_math_eltwise_unary_sfpu_params.h" -#include "ckernel_sfpu_mask.h" namespace ckernel { @@ -18,7 +18,14 @@ inline void llk_math_eltwise_unary_sfpu_mask_init() { } template -inline void llk_math_eltwise_unary_sfpu_mask(uint dst_index, DataFormat data_format, int vector_mode = (int)VectorMode::RC) { +inline void llk_math_eltwise_unary_sfpu_mask_posinf(uint dst_index, int vector_mode = (int)VectorMode::RC) { + llk_math_eltwise_unary_sfpu_params( + ckernel::sfpu::calculate_mask_posinf, dst_index, vector_mode); +} + +template +inline void llk_math_eltwise_unary_sfpu_mask( + uint dst_index, DataFormat data_format, int vector_mode = (int)VectorMode::RC) { if (data_format == DataFormat::Float16_b || data_format == DataFormat::Float16) { llk_math_eltwise_unary_sfpu_params( ckernel::sfpu::calculate_mask, dst_index, vector_mode); @@ -28,4 +35,4 @@ inline void llk_math_eltwise_unary_sfpu_mask(uint dst_index, DataFormat data_for } } -} +} // namespace ckernel diff --git 
a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h index acaba06d284..8f1c90e63cc 100644 --- a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h @@ -7,7 +7,6 @@ #include "ckernel.h" #include "ckernel_defs.h" #include "noc_nonblocking_api.h" - #include "sfpi.h" using namespace sfpi; @@ -15,18 +14,27 @@ using namespace sfpi; namespace ckernel { namespace sfpu { -template -inline void calculate_mask() -{ +template +inline void calculate_mask() { + const bool exponent_size_8 = true; + const int mask_val_idx = 16; +#pragma GCC unroll 4 + for (int d = 0; d < ITERATIONS; d++) { + vFloat mask = dst_reg[mask_val_idx]; + v_if(_sfpu_is_fp16_zero_(mask, exponent_size_8)) { dst_reg[0] = 0; } + v_endif; + dst_reg++; + } +} + +template +inline void calculate_mask_posinf() { const bool exponent_size_8 = true; const int mask_val_idx = 16; - #pragma GCC unroll 4 - for (int d = 0; d < ITERATIONS; d++) - { +#pragma GCC unroll 8 + for (int d = 0; d < ITERATIONS; d++) { vFloat mask = dst_reg[mask_val_idx]; - v_if(_sfpu_is_fp16_zero_(mask, exponent_size_8)) { - dst_reg[0] = 0; - } + v_if(_sfpu_is_fp16_zero_(mask, exponent_size_8)) { dst_reg[0] = std::numeric_limits::infinity(); } v_endif; dst_reg++; } diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_mask.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_mask.h index 82831f2a995..e717341255d 100644 --- a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_mask.h +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_mask.h @@ -4,9 +4,9 @@ #pragma once +#include "ckernel_sfpu_mask.h" #include "llk_math_eltwise_unary_sfpu_init.h" #include "llk_math_eltwise_unary_sfpu_params.h" -#include 
"ckernel_sfpu_mask.h" namespace ckernel { @@ -18,11 +18,18 @@ inline void llk_math_eltwise_unary_sfpu_mask_init() { } template -inline void llk_math_eltwise_unary_sfpu_mask(uint dst_index, DataFormat data_format, int vector_mode = (int)VectorMode::RC) { +inline void llk_math_eltwise_unary_sfpu_mask_posinf(uint dst_index, int vector_mode = (int)VectorMode::RC) { + llk_math_eltwise_unary_sfpu_params( + ckernel::sfpu::calculate_mask_posinf, dst_index, vector_mode); +} + +template +inline void llk_math_eltwise_unary_sfpu_mask( + uint dst_index, DataFormat data_format, int vector_mode = (int)VectorMode::RC) { if (data_format == DataFormat::Float16_b || data_format == DataFormat::Float16) { llk_math_eltwise_unary_sfpu_params( ckernel::sfpu::calculate_mask, dst_index, vector_mode); } } -} +} // namespace ckernel diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h index 934dfbf22a6..05a72184064 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_mask.h @@ -7,7 +7,6 @@ #include "ckernel.h" #include "ckernel_defs.h" #include "noc_nonblocking_api.h" - #include "sfpi.h" using namespace sfpi; @@ -15,35 +14,39 @@ using namespace sfpi; namespace ckernel { namespace sfpu { - -template -inline void calculate_mask() -{ +template +inline void calculate_mask() { const bool exponent_size_8 = true; const int mask_val_idx = 32; - #pragma GCC unroll 8 - for (int d = 0; d < ITERATIONS; d++) - { +#pragma GCC unroll 8 + for (int d = 0; d < ITERATIONS; d++) { vFloat mask = dst_reg[mask_val_idx]; - v_if(_sfpu_is_fp16_zero_(mask, exponent_size_8)) { - dst_reg[0] = vConst0; - } + v_if(_sfpu_is_fp16_zero_(mask, exponent_size_8)) { dst_reg[0] = vConst0; } v_endif; dst_reg++; } } -template -inline void calculate_int_mask() -{ +template +inline void calculate_int_mask() { 
const int mask_idx = 32; - #pragma GCC unroll 8 - for (int d = 0; d < ITERATIONS; d++) - { +#pragma GCC unroll 8 + for (int d = 0; d < ITERATIONS; d++) { vInt mask = dst_reg[mask_idx]; - v_if (mask == 0) { - dst_reg[0] = vConst0; - } + v_if(mask == 0) { dst_reg[0] = vConst0; } + v_endif; + dst_reg++; + } +} + +template +inline void calculate_mask_posinf() { + const bool exponent_size_8 = true; + const int mask_val_idx = 32; +#pragma GCC unroll 8 + for (int d = 0; d < ITERATIONS; d++) { + vFloat mask = dst_reg[mask_val_idx]; + v_if(_sfpu_is_fp16_zero_(mask, exponent_size_8)) { dst_reg[0] = std::numeric_limits::infinity(); } v_endif; dst_reg++; } diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_mask.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_mask.h index 172b87b9dc8..c5dfca8bac9 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_mask.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_unary_sfpu_mask.h @@ -4,9 +4,9 @@ #pragma once +#include "ckernel_sfpu_mask.h" #include "llk_math_eltwise_unary_sfpu_init.h" #include "llk_math_eltwise_unary_sfpu_params.h" -#include "ckernel_sfpu_mask.h" namespace ckernel { @@ -18,7 +18,14 @@ inline void llk_math_eltwise_unary_sfpu_mask_init() { } template -inline void llk_math_eltwise_unary_sfpu_mask(uint dst_index, DataFormat data_format, int vector_mode = (int)VectorMode::RC) { +inline void llk_math_eltwise_unary_sfpu_mask_posinf(uint dst_index, int vector_mode = (int)VectorMode::RC) { + llk_math_eltwise_unary_sfpu_params( + ckernel::sfpu::calculate_mask_posinf, dst_index, vector_mode); +} + +template +inline void llk_math_eltwise_unary_sfpu_mask( + uint dst_index, DataFormat data_format, int vector_mode = (int)VectorMode::RC) { if (data_format == DataFormat::Float16_b || data_format == DataFormat::Float16) { llk_math_eltwise_unary_sfpu_params( 
ckernel::sfpu::calculate_mask, dst_index, vector_mode); @@ -28,4 +35,4 @@ inline void llk_math_eltwise_unary_sfpu_mask(uint dst_index, DataFormat data_for } } -} +} // namespace ckernel diff --git a/tt_metal/hw/inc/debug/dprint_test_common.h b/tt_metal/hw/inc/debug/dprint_test_common.h index a1ff26e9df1..5b7be4193a0 100644 --- a/tt_metal/hw/inc/debug/dprint_test_common.h +++ b/tt_metal/hw/inc/debug/dprint_test_common.h @@ -30,17 +30,17 @@ inline void print_test_data() { #if !defined(COMPILE_FOR_ERISC) && !defined(COMPILE_FOR_IDLE_ERISC) // Eth cores don't have CBs, so don't try TSLICE printing. DPRINT << "SLICE:\n"; - cb_wait_front(tt::CB::c_in0, 1); + cb_wait_front(tt::CBIndex::c_0, 1); #if defined(COMPILE_FOR_BRISC) || defined(COMPILE_FOR_NCRISC) // Since brisc is writing to the CB before printing, should look at read pointer - DPRINT << TSLICE(tt::CB::c_in0, 0, SliceRange::hw0_32_8(), TSLICE_INPUT_CB, TSLICE_RD_PTR); - DPRINT << TSLICE(tt::CB::c_in0, 0, SliceRange::hw0_32_4(), TSLICE_INPUT_CB, TSLICE_RD_PTR); + DPRINT << TSLICE(tt::CBIndex::c_0, 0, SliceRange::hw0_32_8(), TSLICE_INPUT_CB, TSLICE_RD_PTR); + DPRINT << TSLICE(tt::CBIndex::c_0, 0, SliceRange::hw0_32_4(), TSLICE_INPUT_CB, TSLICE_RD_PTR); // This one has an unsupported data type, should show a warning instead of data - DPRINT << TSLICE(tt::CB::c_in1, 0, SliceRange::hw0_32_4(), TSLICE_INPUT_CB, TSLICE_RD_PTR); + DPRINT << TSLICE(tt::CBIndex::c_1, 0, SliceRange::hw0_32_4(), TSLICE_INPUT_CB, TSLICE_RD_PTR); #else - DPRINT << TSLICE(tt::CB::c_in0, 0, SliceRange::hw0_32_8()); - DPRINT << TSLICE(tt::CB::c_in0, 0, SliceRange::hw0_32_4()); - DPRINT << TSLICE(tt::CB::c_in1, 0, SliceRange::hw0_32_4()); + DPRINT << TSLICE(tt::CBIndex::c_0, 0, SliceRange::hw0_32_8()); + DPRINT << TSLICE(tt::CBIndex::c_0, 0, SliceRange::hw0_32_4()); + DPRINT << TSLICE(tt::CBIndex::c_1, 0, SliceRange::hw0_32_4()); #endif #endif } diff --git a/tt_metal/impl/CMakeLists.txt b/tt_metal/impl/CMakeLists.txt index bd156e29c36..fa597b634a6 
100644 --- a/tt_metal/impl/CMakeLists.txt +++ b/tt_metal/impl/CMakeLists.txt @@ -6,6 +6,7 @@ set(IMPL_SRC ${CMAKE_CURRENT_SOURCE_DIR}/device/device_pool.cpp ${CMAKE_CURRENT_SOURCE_DIR}/buffers/buffer.cpp ${CMAKE_CURRENT_SOURCE_DIR}/buffers/circular_buffer.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/buffers/circular_buffer_types.cpp ${CMAKE_CURRENT_SOURCE_DIR}/buffers/global_semaphore.cpp ${CMAKE_CURRENT_SOURCE_DIR}/buffers/semaphore.cpp ${CMAKE_CURRENT_SOURCE_DIR}/kernels/kernel.cpp @@ -24,6 +25,7 @@ set(IMPL_SRC ${CMAKE_CURRENT_SOURCE_DIR}/debug/watcher_device_reader.cpp ${CMAKE_CURRENT_SOURCE_DIR}/trace/trace.cpp ${CMAKE_CURRENT_SOURCE_DIR}/trace/trace_buffer.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/event/event.cpp ) add_library(impl OBJECT ${IMPL_SRC}) diff --git a/tt_metal/impl/buffers/buffer.cpp b/tt_metal/impl/buffers/buffer.cpp index ac492ccd2c1..1517507ba2d 100644 --- a/tt_metal/impl/buffers/buffer.cpp +++ b/tt_metal/impl/buffers/buffer.cpp @@ -4,15 +4,16 @@ #include "tt_metal/impl/buffers/buffer.hpp" +#include "tt_metal/buffer.hpp" #include "tt_metal/common/assert.hpp" #include "tt_metal/common/math.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/impl/allocator/allocator.hpp" #include "tt_metal/impl/device/device.hpp" +#include "tt_metal/types.hpp" #include #include -#include #include #include "tt_metal/common/base.hpp" #include "tt_metal/impl/buffers/buffer_constants.hpp" @@ -514,6 +515,22 @@ DeviceAddr ShardSpecBuffer::size() const { return shape_in_pages_[0] * shape_in_pages_[1]; } +v1::BufferHandle v1::CreateBuffer(InterleavedBufferConfig config) { return v1::BufferHandle{v0::CreateBuffer(config)}; } + +void v1::DeallocateBuffer(BufferHandle buffer) { v0::DeallocateBuffer(*buffer); } + +void v1::WriteToBuffer(BufferHandle buffer, stl::Span host_buffer) { + detail::WriteToBuffer(*buffer, stl::Span{reinterpret_cast(host_buffer.data()), host_buffer.size()}); +} + +void v1::ReadFromBuffer(BufferHandle buffer, stl::Span host_buffer, bool shard_order) { + 
detail::ReadFromBuffer(*buffer, reinterpret_cast(host_buffer.data()), shard_order); +} + +void v1::ReadFromShard(BufferHandle buffer, stl::Span host_buffer, std::uint32_t core_id) { + detail::ReadShard(*buffer, reinterpret_cast(host_buffer.data()), core_id); +} + } // namespace tt_metal } // namespace tt diff --git a/tt_metal/impl/buffers/circular_buffer_types.cpp b/tt_metal/impl/buffers/circular_buffer_types.cpp new file mode 100644 index 00000000000..874338430f6 --- /dev/null +++ b/tt_metal/impl/buffers/circular_buffer_types.cpp @@ -0,0 +1,182 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include "circular_buffer_types.hpp" + +namespace tt::tt_metal { +inline namespace v0 { + +// Static circular buffer spec +CircularBufferConfig::CircularBufferConfig( + uint32_t total_size, const std::map &data_format_spec) : + total_size_(total_size), globally_allocated_address_(std::nullopt), dynamic_cb_(false) { + this->set_config(data_format_spec); +} + +// User is expected to use the builder here. 
+CircularBufferConfig::CircularBufferConfig(uint32_t total_size) : + total_size_(total_size), globally_allocated_address_(std::nullopt), dynamic_cb_(false) {} + +// Dynamic circular buffer spec +CircularBufferConfig::CircularBufferConfig( + uint32_t total_size, const std::map &data_format_spec, const Buffer &buffer) : + total_size_(total_size), dynamic_cb_(true), max_size_(buffer.size()) { + if (not buffer.is_l1()) { + TT_THROW("Only L1 buffers can have an associated circular buffer!"); + } + if (total_size > buffer.size()) { + TT_THROW( + "Requested {} B but dynamic circular buffer cannot be larger than allocated L1 buffer of {} B", + total_size, + buffer.size()); + } + this->set_globally_allocated_address(buffer); + this->set_config(data_format_spec); +} + +CircularBufferConfig& CircularBufferConfig::set_page_size(uint8_t buffer_index, uint32_t page_size) { + if (buffer_index > NUM_CIRCULAR_BUFFERS - 1) { + TT_THROW( + "Buffer index ({}) exceeds max number of circular buffers per core ({})", + buffer_index, + NUM_CIRCULAR_BUFFERS); + } + if (this->buffer_indices_.find(buffer_index) == this->buffer_indices_.end()) { + TT_THROW( + "Illegal circular buffer index {}. Page size can only be specified for buffer indices configured " + "during config creation", + buffer_index); + } + if (this->total_size_ % page_size != 0) { + TT_THROW("Total circular buffer size {} B must be divisible by page size {} B", this->total_size_, page_size); + } + if (page_size % sizeof(uint32_t) != 0) { + TT_THROW("Page size must be divisible by sizeof(uint32_t) because buffers holds uint32_t values"); + } + + this->page_sizes_[buffer_index] = page_size; + return *this; +} + +CircularBufferConfig& CircularBufferConfig::set_total_size(uint32_t total_size) { + if (dynamic_cb_ and total_size > this->max_size_.value()) { + TT_THROW( + "Cannot grow circular buffer to {} B. 
This is larger than associated dynamically allocated L1 buffer " + "of {} B", + total_size, + this->max_size_.value()); + } + if (total_size == 0) { + TT_THROW("Total size for circular buffer must be non-zero!"); + } + this->total_size_ = total_size; + return *this; +} + +CircularBufferConfig& CircularBufferConfig::set_globally_allocated_address(const Buffer &buffer) { + if (not buffer.is_l1()) { + TT_THROW("Only L1 buffers can have an associated circular buffer!"); + } + this->globally_allocated_address_ = buffer.address(); + this->dynamic_cb_ = true; + this->max_size_ = buffer.size(); + this->shadow_global_buffer = &buffer; + return *this; +} + +CircularBufferConfig& CircularBufferConfig::set_tile_dims(uint8_t buffer_index, const Tile &tile) { + this->tiles_[buffer_index] = tile; + return *this; +} + +const std::array, NUM_CIRCULAR_BUFFERS> &CircularBufferConfig::tiles() const { + return this->tiles_; +} + +uint32_t CircularBufferConfig::total_size() const { return this->total_size_; } + +std::optional CircularBufferConfig::globally_allocated_address() const { + return this->globally_allocated_address_; +} + +const std::array, NUM_CIRCULAR_BUFFERS> &CircularBufferConfig::data_formats() const { + return this->data_formats_; +} + +const std::array, NUM_CIRCULAR_BUFFERS> &CircularBufferConfig::page_sizes() const { + return this->page_sizes_; +} + +CircularBufferConfig::Builder::Builder(CircularBufferConfig &parent, uint8_t buffer_index) : + parent_(parent), buffer_index_(buffer_index) { + if (buffer_index > NUM_CIRCULAR_BUFFERS - 1) { + TT_THROW( + "Buffer index ({}) exceeds max number of circular buffers per core ({})", + buffer_index, + NUM_CIRCULAR_BUFFERS); + } + parent_.buffer_indices_.insert(buffer_index_); +} + +const CircularBufferConfig::Builder &CircularBufferConfig::Builder::set_data_format(tt::DataFormat data_format) const { + parent_.data_formats_[buffer_index_] = data_format; + return *this; +} + +const CircularBufferConfig::Builder 
&CircularBufferConfig::Builder::add_size(uint32_t size) const{ + parent_.total_size_ += size; + return *this; +} + +const CircularBufferConfig::Builder &CircularBufferConfig::Builder::set_page_size(uint32_t page_size) const{ + if (parent_.total_size_ % page_size != 0) { + TT_THROW("Total circular buffer size {} B must be divisible by page size {} B", parent_.total_size_, page_size); + } + if (page_size % sizeof(uint32_t) != 0) { + TT_THROW("Page size must be divisible by sizeof(uint32_t) because buffers hold uint32_t values"); + } + parent_.page_sizes_[buffer_index_] = page_size; + return *this; +} + +const CircularBufferConfig::Builder &CircularBufferConfig::Builder::set_tile_dims(const Tile &tile) const{ + parent_.tiles_[buffer_index_] = tile; + return *this; +} + +CircularBufferConfig::Builder CircularBufferConfig::index(uint8_t buffer_index) { return Builder(*this, buffer_index); } + +void CircularBufferConfig::set_config(const std::map &data_format_spec){ + if (data_format_spec.size() > NUM_CIRCULAR_BUFFERS) { + TT_THROW( + "Only {} circular buffer slots are available but data formats are specified for {} indices", + NUM_CIRCULAR_BUFFERS, + data_format_spec.size()); + } + + for (const auto &[buffer_index, data_format] : data_format_spec) { + if (buffer_index > NUM_CIRCULAR_BUFFERS - 1) { + TT_THROW( + "Buffer index ({}) exceeds max number of circular buffers per core ({})", + buffer_index, + NUM_CIRCULAR_BUFFERS); + } + this->data_formats_[buffer_index] = data_format; + this->buffer_indices_.insert(buffer_index); + } +} + +bool operator==(const CircularBufferConfig &lhs, const CircularBufferConfig &rhs) { + return lhs.total_size() == rhs.total_size() && + lhs.globally_allocated_address() == rhs.globally_allocated_address() && + lhs.data_formats() == rhs.data_formats() && + lhs.page_sizes() == rhs.page_sizes() && + lhs.tiles() == rhs.tiles() && + lhs.shadow_global_buffer == rhs.shadow_global_buffer; +} + +bool operator!=(const CircularBufferConfig &lhs, const 
CircularBufferConfig &rhs) { return !(lhs == rhs); } + +} // namespace v0 +} // namespace tt::tt_metal diff --git a/tt_metal/impl/buffers/circular_buffer_types.hpp b/tt_metal/impl/buffers/circular_buffer_types.hpp index 512876f7091..b6cf247b4e7 100644 --- a/tt_metal/impl/buffers/circular_buffer_types.hpp +++ b/tt_metal/impl/buffers/circular_buffer_types.hpp @@ -1,10 +1,11 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. // // SPDX-License-Identifier: Apache-2.0 #pragma once #include +#include #include #include #include @@ -22,125 +23,63 @@ inline namespace v0 { using CBHandle = uintptr_t; + class CircularBufferConfig { public: // Static circular buffer spec - CircularBufferConfig(uint32_t total_size, const std::map &data_format_spec) : - total_size_(total_size), globally_allocated_address_(std::nullopt), dynamic_cb_(false) { - this->set_config(data_format_spec); - } + CircularBufferConfig(uint32_t total_size, const std::map &data_format_spec); + + // User is expected to use the builder here. 
+ CircularBufferConfig(uint32_t total_size); // Dynamic circular buffer spec CircularBufferConfig( - uint32_t total_size, const std::map &data_format_spec, const Buffer &buffer) : - total_size_(total_size), - dynamic_cb_(true), - max_size_(buffer.size()) { - if (not buffer.is_l1()) { - TT_THROW("Only L1 buffers can have an associated circular buffer!"); - } - if (total_size > buffer.size()) { - TT_THROW( - "Requested {} B but dynamic circular buffer cannot be larger than allocated L1 buffer of {} B", - total_size, - buffer.size()); - } - this->set_globally_allocated_address(buffer); - this->set_config(data_format_spec); - } - - CircularBufferConfig set_page_size(uint8_t buffer_index, uint32_t page_size) { - if (buffer_index > NUM_CIRCULAR_BUFFERS - 1) { - TT_THROW( - "Buffer index ({}) exceeds max number of circular buffers per core ({})", - buffer_index, - NUM_CIRCULAR_BUFFERS); - } - if (this->buffer_indices_.find(buffer_index) == this->buffer_indices_.end()) { - TT_THROW( - "Illegal circular buffer index {}. Page size can only be specified for buffer indices configured " - "during config creation", - buffer_index); - } - if (this->total_size_ % page_size != 0) { - TT_THROW( - "Total circular buffer size {} B must be divisible by page size {} B", - this->total_size_, - page_size); - } - if (page_size % sizeof(uint32_t) != 0) { - TT_THROW("Page size must be divisible by sizeof(uint32_t) because buffers holds uint32_t values"); - } - - this->page_sizes_[buffer_index] = page_size; - return *this; - } - - CircularBufferConfig set_total_size(uint32_t total_size) { - if (dynamic_cb_ and total_size > this->max_size_.value()) { - TT_THROW( - "Cannot grow circular buffer to {} B. 
This is larger than associated dynamically allocated L1 buffer " - "of {} B", - total_size, - this->max_size_.value()); - } - if (total_size == 0) { - TT_THROW("Total size for circular buffer must be non-zero!"); - } - this->total_size_ = total_size; - return *this; - } - - CircularBufferConfig set_globally_allocated_address(const Buffer &buffer) { - if (not buffer.is_l1()) { - TT_THROW("Only L1 buffers can have an associated circular buffer!"); - } - this->globally_allocated_address_ = buffer.address(); - this->dynamic_cb_ = true; - this->max_size_ = buffer.size(); - this->shadow_global_buffer = &buffer; - return *this; - } - - CircularBufferConfig set_tile_dims(uint8_t buffer_index, const Tile& tile) { - this->tiles_[buffer_index] = tile; - return *this; - } - - const std::array, NUM_CIRCULAR_BUFFERS> &tiles() const { - return this->tiles_; - } - - uint32_t total_size() const { return this->total_size_; } - - std::optional globally_allocated_address() const { return this->globally_allocated_address_; } - - const std::array, NUM_CIRCULAR_BUFFERS> &data_formats() const { - return this->data_formats_; - } - - const std::array, NUM_CIRCULAR_BUFFERS> &page_sizes() const { return this->page_sizes_; } - const Buffer* shadow_global_buffer; + uint32_t total_size, const std::map &data_format_spec, const Buffer &buffer); + + CircularBufferConfig& set_page_size(uint8_t buffer_index, uint32_t page_size); + + CircularBufferConfig& set_total_size(uint32_t total_size); + + CircularBufferConfig& set_globally_allocated_address(const Buffer &buffer); + + CircularBufferConfig& set_tile_dims(uint8_t buffer_index, const Tile& tile); + + const std::array, NUM_CIRCULAR_BUFFERS> &tiles() const; + + uint32_t total_size() const; + + std::optional globally_allocated_address() const; + + const std::array, NUM_CIRCULAR_BUFFERS> &data_formats() const; + + const std::array, NUM_CIRCULAR_BUFFERS> &page_sizes() const; + const Buffer* shadow_global_buffer{nullptr}; + + class Builder { + public: + 
Builder(CircularBufferConfig &parent, uint8_t buffer_index); + + const Builder &set_data_format(tt::DataFormat data_format) const; + + const Builder &add_size(uint32_t size) const; + + const Builder &set_page_size(uint32_t page_size) const; + + const Builder &set_tile_dims(const Tile &tile) const; + + private: + CircularBufferConfig &parent_; + uint8_t buffer_index_; + }; + + Builder index(uint8_t buffer_index); + + friend bool operator==(const CircularBufferConfig& lhs, const CircularBufferConfig& rhs); + friend bool operator!=(const CircularBufferConfig& lhs, const CircularBufferConfig& rhs); + + private: - void set_config(const std::map &data_format_spec) { - if (data_format_spec.size() > NUM_CIRCULAR_BUFFERS) { - TT_THROW( - "Only {} circular buffer slots are available but data formats are specified for {} indices", - NUM_CIRCULAR_BUFFERS, - data_format_spec.size()); - } - - for (const auto &[buffer_index, data_format] : data_format_spec) { - if (buffer_index > NUM_CIRCULAR_BUFFERS - 1) { - TT_THROW( - "Buffer index ({}) exceeds max number of circular buffers per core ({})", - buffer_index, - NUM_CIRCULAR_BUFFERS); - } - this->data_formats_[buffer_index] = data_format; - this->buffer_indices_.insert(buffer_index); - } - } + void set_config(const std::map &data_format_spec); uint32_t total_size_ = 0; std::optional globally_allocated_address_ = std::nullopt; @@ -153,5 +92,8 @@ class CircularBufferConfig { std::optional max_size_ = std::nullopt; }; +bool operator==(const CircularBufferConfig& lhs, const CircularBufferConfig& rhs); +bool operator!=(const CircularBufferConfig& lhs, const CircularBufferConfig& rhs); + } // namespace v0 } // namespace tt::tt_metal diff --git a/tt_metal/impl/debug/dprint_server.cpp b/tt_metal/impl/debug/dprint_server.cpp index 9fe120696b8..391b63dd0c8 100644 --- a/tt_metal/impl/debug/dprint_server.cpp +++ b/tt_metal/impl/debug/dprint_server.cpp @@ -197,7 +197,7 @@ static void PrintTileSlice(ostream& stream, uint8_t* ptr, int hart_id) { 
uint8_t* data = ptr + offsetof(TileSliceHostDev<0>, data); // Read any error codes and handle accordingly - enum CB cb = static_cast(ts->cb_id); + enum CBIndex cb = static_cast(ts->cb_id); switch (ts->return_code) { case DPrintOK: break; // Continue to print the tile slice diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp index 3b5ab120d0f..4b42ae7fa1a 100644 --- a/tt_metal/impl/device/device.cpp +++ b/tt_metal/impl/device/device.cpp @@ -3,8 +3,9 @@ // SPDX-License-Identifier: Apache-2.0 #include -#include -#include +#include +#include "tt_metal/device.hpp" +#include "common/core_coord.hpp" #include "tt_metal/host_api.hpp" #include "tt_metal/jit_build/genfiles.hpp" #include "tt_metal/impl/device/device.hpp" @@ -14,7 +15,6 @@ #include "tt_metal/detail/tt_metal.hpp" #include "impl/debug/dprint_server.hpp" #include "impl/debug/watcher_server.hpp" -#include "common/env_lib.hpp" #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" #include "common/utils.hpp" #include "llrt/llrt.hpp" @@ -28,13 +28,14 @@ #include "tt_metal/impl/sub_device/sub_device_manager.hpp" #include "tt_metal/impl/sub_device/sub_device_types.hpp" #include "tt_metal/tt_stl/span.hpp" +#include "tt_metal/types.hpp" namespace tt { namespace tt_metal { Device::Device( - chip_id_t device_id, const uint8_t num_hw_cqs, size_t l1_small_size, size_t trace_region_size, const std::vector &l1_bank_remap, bool minimal, uint32_t worker_core, uint32_t completion_queue_reader_core) : + chip_id_t device_id, const uint8_t num_hw_cqs, size_t l1_small_size, size_t trace_region_size, tt::stl::Span l1_bank_remap, bool minimal, uint32_t worker_core, uint32_t completion_queue_reader_core) : id_(device_id), worker_thread_core(worker_core), completion_queue_reader_core(completion_queue_reader_core), work_executor(worker_core, device_id) { ZoneScoped; tunnel_device_dispatch_workers_ = {}; @@ -208,7 +209,7 @@ void Device::initialize_cluster() { log_info(tt::LogMetal, "AI CLK for device {} 
is: {} MHz", this->id_, ai_clk); } -void Device::initialize_default_sub_device_state(size_t l1_small_size, size_t trace_region_size, const std::vector &l1_bank_remap) { +void Device::initialize_default_sub_device_state(size_t l1_small_size, size_t trace_region_size, tt::stl::Span l1_bank_remap) { // Create the default sub-device manager representing the entire chip this->next_sub_device_manager_id_ = {0}; auto [sub_device_manager, _] = this->sub_device_managers_.insert_or_assign(this->get_next_sub_device_manager_id(), std::make_unique(this, this->initialize_allocator(l1_small_size, trace_region_size, l1_bank_remap))); @@ -220,7 +221,7 @@ void Device::initialize_default_sub_device_state(size_t l1_small_size, size_t tr } -std::unique_ptr Device::initialize_allocator(size_t l1_small_size, size_t trace_region_size, const std::vector &l1_bank_remap) { +std::unique_ptr Device::initialize_allocator(size_t l1_small_size, size_t trace_region_size, tt::stl::Span l1_bank_remap) { ZoneScoped; const metal_SocDescriptor &soc_desc = tt::Cluster::instance().get_soc_desc(this->id_); CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(this->id_); @@ -246,7 +247,7 @@ std::unique_ptr Device::initialize_allocator(size_t l1_small_size, si .core_type_from_noc_coord_table = {}, // Populated later .worker_log_to_physical_routing_x = soc_desc.worker_log_to_physical_routing_x, .worker_log_to_physical_routing_y = soc_desc.worker_log_to_physical_routing_y, - .l1_bank_remap = l1_bank_remap, + .l1_bank_remap = {l1_bank_remap.begin(), l1_bank_remap.end()}, .compute_grid = CoreRangeSet(CoreRange(CoreCoord(0, 0), CoreCoord(compute_size.x - 1, compute_size.y - 1))), .alignment = std::max(hal.get_alignment(HalMemType::DRAM), hal.get_alignment(HalMemType::L1)), .disable_interleaved = false}); @@ -2930,7 +2931,7 @@ void Device::initialize_synchronous_sw_cmd_queue() { } } -bool Device::initialize(const uint8_t num_hw_cqs, size_t l1_small_size, size_t trace_region_size, 
const std::vector &l1_bank_remap, bool minimal) { +bool Device::initialize(const uint8_t num_hw_cqs, size_t l1_small_size, size_t trace_region_size, tt::stl::Span l1_bank_remap, bool minimal) { ZoneScoped; log_info(tt::LogMetal, "Initializing device {}. Program cache is {}enabled", this->id_, this->program_cache.is_enabled() ? "": "NOT "); log_debug(tt::LogMetal, "Running with {} cqs ", num_hw_cqs); @@ -3665,6 +3666,82 @@ const std::vector &Device::get_sub_device_ids() const { return this->active_sub_device_manager_->get_sub_device_ids(); } +size_t v1::GetNumAvailableDevices() { return tt::Cluster::instance().number_of_user_devices(); } + +size_t v1::GetNumPCIeDevices() { return tt::Cluster::instance().number_of_pci_devices(); } + +chip_id_t v1::GetPCIeDeviceID(chip_id_t device_id) { + return tt::Cluster::instance().get_associated_mmio_device(device_id); +} + +v1::DeviceHandle v1::CreateDevice(chip_id_t device_id, CreateDeviceOptions options) { + ZoneScoped; + + tt::DevicePool::initialize( + {device_id}, + options.num_hw_cqs, + options.l1_small_size, + options.trace_region_size, + options.dispatch_core_type, + options.l1_bank_remap); + + return tt::DevicePool::instance().get_active_device(device_id); +} + +bool v1::CloseDevice(DeviceHandle device) { return v0::CloseDevice(device); } + +void v1::DeallocateBuffers(DeviceHandle device) { device->deallocate_buffers(); } + +void v1::DumpDeviceProfileResults(DeviceHandle device, const CoreRangeSet &worker_cores, bool last_dump) { + auto worker_cores_vec = corerange_to_cores(worker_cores); + detail::DumpDeviceProfileResults(device, worker_cores_vec, last_dump); +} + +ARCH v1::GetArch(DeviceHandle device) { return device->arch(); } + +chip_id_t v1::GetId(DeviceHandle device) { return device->id(); } + +int v1::GetNumDramChannels(DeviceHandle device) { return device->num_dram_channels(); } + +std::uint32_t v1::GetL1SizePerCore(DeviceHandle device) { return device->l1_size_per_core(); } + +CoreCoord 
v1::GetComputeWithStorageGridSize(DeviceHandle device) { return device->compute_with_storage_grid_size(); } + +CoreCoord v1::GetDramGridSize(DeviceHandle device) { return device->dram_grid_size(); } + +void v1::EnableProgramCache(DeviceHandle device) { device->enable_program_cache(); } + +void v1::DisableAndClearProgramCache(DeviceHandle device) { device->disable_and_clear_program_cache(); } + +void v1::PushWork(DeviceHandle device, std::function work, bool blocking) { + device->push_work(std::move(work), blocking); +} + +void v1::Synchronize(DeviceHandle device) { device->synchronize(); } + +std::vector v1::GetEthernetSockets(DeviceHandle device, chip_id_t connected_chip_id) { + return device->get_ethernet_sockets(connected_chip_id); +} + +std::uint32_t v1::GetNumBanks(DeviceHandle device, BufferType buffer_type) { return device->num_banks(buffer_type); } + +std::int32_t v1::GetBankOffset(DeviceHandle device, BufferType buffer_type, std::uint32_t bank_id) { + return device->bank_offset(buffer_type, bank_id); +} + +tt::stl::Span v1::BankIdsFromLogicalCore( + DeviceHandle device, BufferType buffer_type, CoreCoord logical_core) { + return device->bank_ids_from_logical_core(buffer_type, logical_core); +} + +float v1::GetSfpuEps(DeviceHandle device) { return device->sfpu_eps(); } + +float v1::GetSfpuNan(DeviceHandle device) { return device->sfpu_nan(); } + +float v1::GetSfpuInf(DeviceHandle device) { return device->sfpu_inf(); } + +std::size_t v1::GetNumProgramCacheEntries(DeviceHandle device) { return device->num_program_cache_entries(); } + } // namespace tt_metal } // namespace tt diff --git a/tt_metal/impl/device/device.hpp b/tt_metal/impl/device/device.hpp index add6a9535f2..3f48d284858 100644 --- a/tt_metal/impl/device/device.hpp +++ b/tt_metal/impl/device/device.hpp @@ -72,7 +72,7 @@ class Device { const uint8_t num_hw_cqs, std::size_t l1_small_size, std::size_t trace_region_size, - const std::vector &l1_bank_remap = {}, + tt::stl::Span l1_bank_remap = {}, bool 
minimal = false, uint32_t worker_core = 0, uint32_t completion_queue_reader_core = 0); @@ -253,9 +253,9 @@ class Device { // Checks that the given arch is on the given pci_slot and that it's responding // Puts device into reset - bool initialize(const uint8_t num_hw_cqs, size_t l1_small_size, size_t trace_region_size, const std::vector &l1_bank_remap = {}, bool minimal = false); + bool initialize(const uint8_t num_hw_cqs, size_t l1_small_size, size_t trace_region_size, tt::stl::Span l1_bank_remap = {}, bool minimal = false); void initialize_cluster(); - std::unique_ptr initialize_allocator(size_t l1_small_size, size_t trace_region_size, const std::vector &l1_bank_remap = {}); + std::unique_ptr initialize_allocator(size_t l1_small_size, size_t trace_region_size, tt::stl::Span l1_bank_remap = {}); void initialize_build(); void initialize_device_kernel_defines(); void build_firmware(); @@ -383,7 +383,7 @@ class Device { void remove_sub_device_manager(SubDeviceManagerId sub_device_manager_id); const std::vector &get_sub_device_ids() const; private: - void initialize_default_sub_device_state(size_t l1_small_size, size_t trace_region_size, const std::vector &l1_bank_remap); + void initialize_default_sub_device_state(size_t l1_small_size, size_t trace_region_size, tt::stl::Span l1_bank_remap); SubDeviceManagerId get_next_sub_device_manager_id(); void reset_sub_devices_state(const std::unique_ptr& sub_device_manager); void MarkAllocationsUnsafe(); diff --git a/tt_metal/impl/device/device_pool.cpp b/tt_metal/impl/device/device_pool.cpp index 81121723693..bdbc4a80f1f 100644 --- a/tt_metal/impl/device/device_pool.cpp +++ b/tt_metal/impl/device/device_pool.cpp @@ -7,6 +7,8 @@ #include #include "tt_metal/detail/tt_metal.hpp" +#include "tt_metal/impl/debug/noc_logging.hpp" +#include "tt_metal/impl/debug/watcher_server.hpp" #include "tt_metal/impl/device/device_handle.hpp" using namespace tt::tt_metal; @@ -179,19 +181,19 @@ void DevicePool::initialize( size_t l1_small_size, 
size_t trace_region_size, DispatchCoreType dispatch_core_type, - const std::vector &l1_bank_remap) noexcept { + tt::stl::Span l1_bank_remap) noexcept { ZoneScoped; log_debug(tt::LogMetal, "DevicePool initialize"); tt::tt_metal::dispatch_core_manager::initialize(dispatch_core_type, num_hw_cqs); if (_inst == nullptr) { - static DevicePool device_pool(device_ids, num_hw_cqs, l1_small_size, trace_region_size, l1_bank_remap); + static DevicePool device_pool{}; _inst = &device_pool; } _inst->l1_small_size = l1_small_size; _inst->trace_region_size = trace_region_size; _inst->num_hw_cqs = num_hw_cqs; - _inst->l1_bank_remap = l1_bank_remap; + _inst->l1_bank_remap.assign(l1_bank_remap.begin(), l1_bank_remap.end()); // Track the thread where the Device Pool was created. Certain functions // modifying the state of this instance, for example those responsible for // (un)registering worker threads, can only be called in the creation thread @@ -388,12 +390,7 @@ void DevicePool::init_firmware_on_active_devices() const { } } -DevicePool::DevicePool( - std::vector device_ids, - const uint8_t num_hw_cqs, - size_t l1_small_size, - size_t trace_region_size, - const std::vector& l1_bank_remap) { +DevicePool::DevicePool() { ZoneScoped; log_debug(tt::LogMetal, "DevicePool constructor"); bool use_numa_node_based_thread_binding = parse_env("TT_METAL_NUMA_BASED_AFFINITY", false); diff --git a/tt_metal/impl/device/device_pool.hpp b/tt_metal/impl/device/device_pool.hpp index 87b928d784e..e57e06ab839 100644 --- a/tt_metal/impl/device/device_pool.hpp +++ b/tt_metal/impl/device/device_pool.hpp @@ -4,13 +4,11 @@ #pragma once +#include "tt_cluster_descriptor_types.h" #include "tt_metal/host_api.hpp" #include "impl/debug/dprint_server.hpp" -#include "impl/debug/noc_logging.hpp" -#include "impl/debug/watcher_server.hpp" #include "tt_metal/impl/device/device.hpp" #include "tt_metal/impl/device/device_handle.hpp" -#include "tt_metal/third_party/umd/device/tt_cluster_descriptor.h" namespace tt { 
namespace tt_metal::detail { @@ -41,7 +39,7 @@ class DevicePool { size_t l1_small_size, size_t trace_region_size, tt_metal::DispatchCoreType dispatch_core_type, - const std::vector &l1_bank_remap = {}) noexcept; + tt::stl::Span l1_bank_remap = {}) noexcept; tt_metal::v1::DeviceHandle get_active_device(chip_id_t device_id) const; std::vector get_all_active_devices() const; @@ -53,12 +51,7 @@ class DevicePool { const std::unordered_set& get_worker_thread_ids() const; private: ~DevicePool(); - DevicePool( - std::vector device_ids, - const uint8_t num_hw_cqs, - size_t l1_small_size, - size_t trace_region_size, - const std::vector &l1_bank_remap); + DevicePool(); uint8_t num_hw_cqs; size_t l1_small_size; size_t trace_region_size; diff --git a/tt_metal/impl/dispatch/command_queue.cpp b/tt_metal/impl/dispatch/command_queue.cpp index fc977b661de..925b0279341 100644 --- a/tt_metal/impl/dispatch/command_queue.cpp +++ b/tt_metal/impl/dispatch/command_queue.cpp @@ -16,8 +16,10 @@ #include "allocator/allocator.hpp" #include "debug_tools.hpp" #include "dev_msgs.h" +#include "device/device_handle.hpp" #include "llrt/hal.hpp" #include "noc/noc_parameters.h" +#include "tt_metal/command_queue.hpp" #include "tt_metal/common/assert.hpp" #include "tt_metal/common/logger.hpp" #include "tt_metal/detail/tt_metal.hpp" @@ -3482,6 +3484,40 @@ void CommandQueue::run_command_impl(const CommandInterface& command) { log_trace(LogDispatch, "{} running {} complete", this->name(), command.type); } +v1::CommandQueueHandle v1::GetCommandQueue(DeviceHandle device, std::uint8_t cq_id) { + return v1::CommandQueueHandle{device, cq_id}; +} + +v1::CommandQueueHandle v1::GetDefaultCommandQueue(DeviceHandle device) { return GetCommandQueue(device, 0); } + +void v1::EnqueueReadBuffer(CommandQueueHandle cq, BufferHandle buffer, std::byte *dst, bool blocking) { + v0::EnqueueReadBuffer(GetDevice(cq)->command_queue(GetId(cq)), *buffer, dst, blocking); +} + +void v1::EnqueueWriteBuffer(CommandQueueHandle cq, 
BufferHandle buffer, const std::byte *src, bool blocking) { + v0::EnqueueWriteBuffer(GetDevice(cq)->command_queue(GetId(cq)), *buffer, src, blocking); +} + +void v1::EnqueueProgram(CommandQueueHandle cq, ProgramHandle &program, bool blocking) { + v0::EnqueueProgram(GetDevice(cq)->command_queue(GetId(cq)), program, blocking); +} + +void v1::Finish(CommandQueueHandle cq, tt::stl::Span sub_device_ids) { + v0::Finish(GetDevice(cq)->command_queue(GetId(cq))); +} + +void v1::SetLazyCommandQueueMode(bool lazy) { + detail::SetLazyCommandQueueMode(lazy); +} + +v1::DeviceHandle v1::GetDevice(CommandQueueHandle cq) { + return cq.device; +} + +std::uint8_t v1::GetId(CommandQueueHandle cq) { + return cq.id; +} + } // namespace tt::tt_metal std::ostream& operator<<(std::ostream& os, EnqueueCommandType const& type) { diff --git a/tt_metal/impl/event/event.cpp b/tt_metal/impl/event/event.cpp new file mode 100644 index 00000000000..17a877e3708 --- /dev/null +++ b/tt_metal/impl/event/event.cpp @@ -0,0 +1,57 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include "tt_metal/impl/event/event.hpp" + +#include + +#include "tt_metal/common/assert.hpp" +#include "tt_metal/common/logger.hpp" +#include "tt_metal/event.hpp" + +namespace tt::tt_metal { + +void Event::wait_until_ready() { + while (!ready) { + std::this_thread::sleep_for(std::chrono::microseconds(10)); + log_trace( + tt::LogMetal, + "Waiting for Event to be ready. 
(ready: {} cq_id: {} event_id: {})", + bool(ready), + cq_id, + event_id); + } + + TT_ASSERT(device != nullptr, "Event must have initialized device ptr"); + TT_ASSERT(event_id != -1, "Event must have initialized event_id"); + TT_ASSERT(cq_id != -1, "Event must have initialized cq_id"); +} + +v1::EventHandle::EventHandle() : EventHandle(std::make_shared()) {} + +v1::EventHandle v1::EnqueueRecordEvent(CommandQueueHandle cq) { + EventHandle event{}; + v0::EnqueueRecordEvent( + GetDevice(cq)->command_queue(GetId(cq)), static_cast &>(event)); + return event; +} + +void v1::EnqueueWaitForEvent(CommandQueueHandle cq, EventHandle event) { + v0::EnqueueWaitForEvent( + GetDevice(cq)->command_queue(GetId(cq)), static_cast &>(event)); +} + +void v1::EventSynchronize(EventHandle event) { + v0::EventSynchronize(static_cast &>(event)); +} + +bool v1::EventQuery(EventHandle event) { + return v0::EventQuery(static_cast &>(event)); +} + +void v1::DeviceSynchronize(DeviceHandle device) { v0::Synchronize(device); } + +void v1::CommandQueueSynchronize(CommandQueueHandle cq) { v0::Synchronize(GetDevice(cq), GetId(cq)); } + +} // namespace tt::tt_metal diff --git a/tt_metal/impl/event/event.hpp b/tt_metal/impl/event/event.hpp index 17503153a30..80c21f85544 100644 --- a/tt_metal/impl/event/event.hpp +++ b/tt_metal/impl/event/event.hpp @@ -4,32 +4,24 @@ #pragma once +#include #include -#include -#include "tt_metal/common/assert.hpp" -#include "tt_metal/common/logger.hpp" + namespace tt::tt_metal { inline namespace v0 { - class Device; - struct Event - { - Device * device = nullptr; - uint32_t cq_id = -1; - uint32_t event_id = -1; - std::atomic ready = false; // Event is ready for use. - // With async CQ, must wait until event is populated by child thread before using. - // Opened #5988 to track removing this, and finding different solution. 
- void wait_until_ready() { - while (!ready) { - std::this_thread::sleep_for(std::chrono::microseconds(10)); - log_trace(tt::LogMetal, "Waiting for Event to be ready. (ready: {} cq_id: {} event_id: {})", bool(ready), cq_id, event_id); - } +class Device; + +struct Event { + Device *device = nullptr; + uint32_t cq_id = -1; + uint32_t event_id = -1; + std::atomic ready = false; // Event is ready for use. + + // With async CQ, must wait until event is populated by child thread before using. + // Opened #5988 to track removing this, and finding different solution. + void wait_until_ready(); +}; - TT_ASSERT(device != nullptr, "Event must have initialized device ptr"); - TT_ASSERT(event_id != -1, "Event must have initialized event_id"); - TT_ASSERT(cq_id != -1, "Event must have initialized cq_id"); - } - }; } // namespace v0 } // namespace tt::tt_metal diff --git a/tt_metal/impl/kernels/kernel.cpp b/tt_metal/impl/kernels/kernel.cpp index df5c6478a75..02f2ecf12e1 100644 --- a/tt_metal/impl/kernels/kernel.cpp +++ b/tt_metal/impl/kernels/kernel.cpp @@ -13,7 +13,7 @@ #include "llrt/llrt.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/impl/debug/watcher_server.hpp" -#include "tt_metal/third_party/tracy/public/tracy/Tracy.hpp" +#include "tt_metal/kernel.hpp" #include "tt_metal/common/utils.hpp" #include "tt_metal/common/core_coord.hpp" #include "tt_metal/jit_build/genfiles.hpp" @@ -502,6 +502,39 @@ std::ostream &operator<<(std::ostream &os, const DataMovementProcessor &processo return os; } +void v1::SetRuntimeArgs( + ProgramHandle &program, KernelHandle kernel, const CoreRangeSet &core_spec, RuntimeArgs runtime_args) { + if (runtime_args.empty()) { + return; + } + + const auto kernel_ptr = detail::GetKernel(program, static_cast(kernel)); + + for (const auto &core_range : core_spec.ranges()) { + for (auto x = core_range.start_coord.x; x <= core_range.end_coord.x; ++x) { + for (auto y = core_range.start_coord.y; y <= core_range.end_coord.y; ++y) { + 
kernel_ptr->set_runtime_args(CoreCoord(x, y), runtime_args); + } + } + } +} + +void v1::SetCommonRuntimeArgs(ProgramHandle &program, KernelHandle kernel, RuntimeArgs runtime_args) { + if (runtime_args.empty()) { + return; + } + + const auto kernel_ptr = detail::GetKernel(program, static_cast(kernel)); + + kernel_ptr->set_common_runtime_args(runtime_args); +} + +v1::RuntimeArgs v1::GetRuntimeArgs(ProgramHandle &program, KernelHandle kernel, CoreCoord logical_core) { + const auto kernel_ptr = detail::GetKernel(program, static_cast(kernel)); + + return kernel_ptr->runtime_args(logical_core); +} + } // namespace tt_metal } // namespace tt diff --git a/tt_metal/impl/program/program.cpp b/tt_metal/impl/program/program.cpp index c5cdacadfd4..d61f9c5db9d 100644 --- a/tt_metal/impl/program/program.cpp +++ b/tt_metal/impl/program/program.cpp @@ -842,7 +842,7 @@ void detail::Program_::set_cb_data_fmt(Device *device, const std::vectorbuffer_indices()) { build_options.set_cb_dataformat_all_cores( - static_cast(buffer_index), circular_buffer->data_format(buffer_index)); + static_cast(buffer_index), circular_buffer->data_format(buffer_index)); } } } @@ -857,7 +857,7 @@ void detail::Program_::set_cb_tile_dims(Device *device, const std::vectortile(buffer_index); if (tile.has_value()) { build_options.set_cb_tile_dims_all_cores( - static_cast(buffer_index), + static_cast(buffer_index), tile->get_num_faces(), tile->get_partial_face(), tile->get_face_shape()[0], @@ -865,12 +865,12 @@ void detail::Program_::set_cb_tile_dims(Device *device, const std::vectorget_tile_shape()[0], tile->get_tile_shape()[1]); build_options.set_cb_tile_size_all_cores( - static_cast(buffer_index), + static_cast(buffer_index), tile->get_tile_size(circular_buffer->data_format(buffer_index))); } else { Tile t; build_options.set_cb_tile_size_all_cores( - static_cast(buffer_index), + static_cast(buffer_index), t.get_tile_size(circular_buffer->data_format(buffer_index))); } diff --git 
a/tt_metal/impl/program/program.hpp b/tt_metal/impl/program/program.hpp index 53fe92d38bc..63b842c1a00 100644 --- a/tt_metal/impl/program/program.hpp +++ b/tt_metal/impl/program/program.hpp @@ -108,6 +108,8 @@ class Program { const std::vector> &circular_buffers() const; + const std::size_t num_circular_buffers() const { return circular_buffers().size();}; + const std::vector< Semaphore > & semaphores() const; KernelGroup * kernels_on_core(const CoreCoord &core, uint32_t programmable_core_type_index); diff --git a/tt_metal/impl/trace/trace.cpp b/tt_metal/impl/trace/trace.cpp index 3ed36350c2d..d08826d30f7 100644 --- a/tt_metal/impl/trace/trace.cpp +++ b/tt_metal/impl/trace/trace.cpp @@ -5,15 +5,14 @@ #include "impl/trace/trace.hpp" #include -#include -#include "dispatch/device_command.hpp" #include "tt_metal/common/logger.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/host_api.hpp" #include "tt_metal/impl/device/device.hpp" #include "tt_metal/impl/dispatch/command_queue.hpp" #include "tt_metal/impl/trace/trace.hpp" +#include "tt_metal/trace.hpp" namespace { // Labels to make the code more readable @@ -108,4 +107,31 @@ void Trace::validate_instance(const TraceBuffer& trace_buffer) { // add more checks } +v1::CommandQueueHandle v1::GetCommandQueue(TraceHandle trace) { return trace.cq; } + +v1::TraceHandle v1::BeginTraceCapture(CommandQueueHandle cq) { + const auto tid = v0::BeginTraceCapture(GetDevice(cq), GetId(cq)); + return v1::TraceHandle{cq, tid}; +} + +void v1::EndTraceCapture(TraceHandle trace) { + const auto cq = GetCommandQueue(trace); + v0::EndTraceCapture(GetDevice(cq), GetId(cq), static_cast(trace)); +} + +void v1::ReplayTrace(TraceHandle trace, bool blocking) { + const auto cq = GetCommandQueue(trace); + v0::ReplayTrace(GetDevice(cq), GetId(cq), static_cast(trace), blocking); +} + +void v1::ReleaseTrace(TraceHandle trace) { + const auto cq = GetCommandQueue(trace); + v0::ReleaseTrace(GetDevice(cq), static_cast(trace)); +} + +void 
v1::EnqueueTrace(TraceHandle trace, bool blocking) { + const auto cq = GetCommandQueue(trace); + v0::EnqueueTrace(GetDevice(cq)->command_queue(GetId(cq)), static_cast(trace), blocking); +} + } // namespace tt::tt_metal diff --git a/tt_metal/include/compute_kernel_api/eltwise_unary/eltwise_unary.h b/tt_metal/include/compute_kernel_api/eltwise_unary/eltwise_unary.h index 7c47797c6b6..4777532819c 100644 --- a/tt_metal/include/compute_kernel_api/eltwise_unary/eltwise_unary.h +++ b/tt_metal/include/compute_kernel_api/eltwise_unary/eltwise_unary.h @@ -31,8 +31,8 @@ ALWI void unary_op_init_common(uint32_t icb, uint32_t ocb = 16) MATH(( llk_math_hw_configure_disaggregated(icb, icb) )); } -ALWI void init_sfpu(uint32_t icb) { - unary_op_init_common(icb); +ALWI void init_sfpu(uint32_t icb, uint32_t ocb) { + unary_op_init_common(icb, ocb); } } // namespace ckernel diff --git a/tt_metal/include/compute_kernel_api/mask.h b/tt_metal/include/compute_kernel_api/mask.h index cdd8505c5ea..4ea9c8071df 100644 --- a/tt_metal/include/compute_kernel_api/mask.h +++ b/tt_metal/include/compute_kernel_api/mask.h @@ -6,7 +6,6 @@ #pragma once - #include "compute_kernel_api/common_globals.h" #ifdef TRISC_MATH #include "llk_math_eltwise_unary_sfpu_mask.h" @@ -16,19 +15,18 @@ #define MATH(x) #endif - - namespace ckernel { ALWI void mask_tile_init() { - MATH(( llk_math_eltwise_unary_sfpu_mask_init() )); // TODO(AP): move out init + MATH((llk_math_eltwise_unary_sfpu_mask_init())); // TODO(AP): move out init } /** * Performs element-wise computation of mask on each element of a tile - * in data and mask DST register. The DST register buffer must be in - * acquired state via *acquire_dst* call. This call is blocking and is only - * available on the compute engine. + * in data and mask DST register. *mask_tile* will mask each element with 0, + * *mask_posinf_tile* will mask each element with *float(inf)*. + * The DST register buffer must be in acquired state via *acquire_dst* call. 
+ * This call is blocking and is only available on the compute engine. * * * TODO: fix idst2_mask. @@ -37,14 +35,21 @@ ALWI void mask_tile_init() { * * Return value: None * - * | Argument | Description | Type | Valid Range | Required | + * | Argument | Description | Type | Valid + * Range | Required | * |----------------|----------------------------------------------------------------------------|----------|-------------------------------------------------------|----------| - * | dst_data_index | The index of the tile in DST REG for the data and result | uint32_t | Must be less than the acquired size of DST REG | True | - * | dst_mask_index | The index of the tile in DST REG for the mask | uint32_t | Must be less than the acquired size of DST REG | True | - * | data_format | The format of the data and mask (supports Float16, Float16_b, and Int32) | DataFormat | Must be a valid data format | False | + * | dst_data_index | The index of the tile in DST REG for the data and result | uint32_t | Must be + * less than the acquired size of DST REG | True | | dst_mask_index | The index of the tile in DST REG for + * the mask | uint32_t | Must be less than the acquired size of DST REG | True | + * | data_format | The format of the data and mask (supports Float16, Float16_b, and Int32) | DataFormat | Must be + * a valid data format | False | */ ALWI void mask_tile(uint32_t idst_data, uint32_t idst2_mask, DataFormat data_format = DataFormat::Float16_b) { - MATH(( llk_math_eltwise_unary_sfpu_mask(idst_data, data_format) )); + MATH((llk_math_eltwise_unary_sfpu_mask(idst_data, data_format))); +} + +ALWI void mask_posinf_tile(uint32_t idst_data, uint32_t idst2_mask) { + MATH((llk_math_eltwise_unary_sfpu_mask_posinf(idst_data))); } -} // namespace ckernel +} // namespace ckernel diff --git a/tt_metal/include/compute_kernel_api/reg_api.h b/tt_metal/include/compute_kernel_api/reg_api.h index 7d431ca2ce3..0406990d25c 100644 --- a/tt_metal/include/compute_kernel_api/reg_api.h +++ 
b/tt_metal/include/compute_kernel_api/reg_api.h @@ -9,6 +9,9 @@ namespace ckernel { /** + * @deprecated This function is deprecated, please use `tile_regs_acquire()` instead. + * See https://github.com/tenstorrent/tt-metal/issues/5868#issuecomment-2101726935 + * * Acquires an exclusive lock on the internal DST register for the current * Tensix core. * @@ -21,6 +24,7 @@ namespace ckernel { * * How the destination register will be shared and synchronized between TRISC threads will depend on the compute kernel configuration. */ +[[deprecated("Use tile_regs_acquire() instead")]] ALWI void acquire_dst() { MATH(( llk_math_wait_for_dest_available() )); @@ -48,6 +52,9 @@ ALWI void tile_regs_wait() { } /** + * @deprecated This function is deprecated, please use `tile_regs_release()` instead. + * See https://github.com/tenstorrent/tt-metal/issues/5868#issuecomment-2101726935 + * * Releases the exclusive lock on the internal DST register for the current * Tensix core. This lock had to be previously acquired with acquire_dst. This * call is blocking and is only available on the compute engine. @@ -56,6 +63,7 @@ ALWI void tile_regs_wait() { * * How the destination register will be shared and synchronized between TRISC threads will depend on the compute kernel configuration. 
*/ +[[deprecated("Use tile_regs_release() instead")]] ALWI void release_dst() { MATH(( llk_math_dest_section_done() )); diff --git a/tt_metal/include/tt_metal/buffer.hpp b/tt_metal/include/tt_metal/buffer.hpp index efb44bb7f8c..04a2d93b819 100644 --- a/tt_metal/include/tt_metal/buffer.hpp +++ b/tt_metal/include/tt_metal/buffer.hpp @@ -3,38 +3,30 @@ // SPDX-License-Identifier: Apache-2.0 #pragma once -#include "types.hpp" + #include "tt_metal/impl/buffers/buffer.hpp" +#include "tt_metal/types.hpp" //================================================== // BUFFER HANDLING //================================================== -namespace tt::tt_metal{ +namespace tt::tt_metal { namespace v1 { - /** * @brief Allocates an interleaved DRAM or L1 buffer on the device. * * @param config Configuration for the buffer. * @return Buffer handle to the allocated buffer. */ -Buffer CreateBuffer(const InterleavedBufferConfig &config); - -/** - * @brief Allocates a buffer on the device. - * - * @param buffer The buffer to allocate. - * @param bottom_up If true, allocates buffer from the bottom up. - */ -void AllocateBuffer(Buffer buffer, bool bottom_up); +BufferHandle CreateBuffer(InterleavedBufferConfig config); /** * @brief Deallocates a buffer from the device. * * @param buffer The buffer to deallocate. */ -void DeallocateBuffer(Buffer buffer); +void DeallocateBuffer(BufferHandle buffer); /** * @brief Copies data from a host buffer into the specified device buffer. @@ -42,7 +34,7 @@ void DeallocateBuffer(Buffer buffer); * @param buffer Buffer to write data into. * @param host_buffer Host buffer containing data to copy. */ -void WriteToBuffer(Buffer buffer, const std::vector &host_buffer); +void WriteToBuffer(BufferHandle buffer, stl::Span host_buffer); /** * @brief Copies data from a device buffer into a host buffer. @@ -51,7 +43,7 @@ void WriteToBuffer(Buffer buffer, const std::vector &host_buffer); * @param host_buffer Host buffer to copy data into. 
* @param shard_order If true, reads data in shard order. */ -void ReadFromBuffer(Buffer buffer, std::vector &host_buffer, bool shard_order = false); +void ReadFromBuffer(BufferHandle buffer, stl::Span host_buffer, bool shard_order = false); /** * @brief Copies data from a specific shard of a device buffer into a host buffer. @@ -60,8 +52,7 @@ void ReadFromBuffer(Buffer buffer, std::vector &host_buffer, bool shar * @param host_buffer Host buffer to copy data into. * @param core_id ID of the core shard to read. */ -void ReadFromShard(Buffer buffer, std::vector &host_buffer, const uint32_t &core_id); - +void ReadFromShard(BufferHandle buffer, stl::Span host_buffer, std::uint32_t core_id); -} // namespace v1 -} // namespace tt::tt_metal +} // namespace v1 +} // namespace tt::tt_metal diff --git a/tt_metal/include/tt_metal/command_queue.hpp b/tt_metal/include/tt_metal/command_queue.hpp index e0d9f24149a..9ae1c53af50 100644 --- a/tt_metal/include/tt_metal/command_queue.hpp +++ b/tt_metal/include/tt_metal/command_queue.hpp @@ -6,8 +6,6 @@ #include "types.hpp" -#include "tt_metal/impl/buffers/buffer.hpp" - //================================================== // COMMAND QUEUE OPERATIONS //================================================== @@ -16,6 +14,23 @@ namespace tt::tt_metal{ namespace v1 { +/** + * @brief Retrieves a command queue from the device for a given queue ID. + * + * @param device The device to query. + * @param cq_id The command queue ID. + * @return CommandQueue handle. + */ +CommandQueueHandle GetCommandQueue(DeviceHandle device, std::uint8_t cq_id); + +/** + * @brief Retrieves the default command queue for the given device. + * + * @param device The device to query. + * @return CommandQueue handle. + */ +CommandQueueHandle GetDefaultCommandQueue(DeviceHandle device); + /** * @brief Reads a buffer from the device. * @@ -25,8 +40,8 @@ namespace v1 { * @param blocking Indicates whether the operation is blocking. 
*/ void EnqueueReadBuffer( - CommandQueue cq, - Buffer buffer, + CommandQueueHandle cq, + BufferHandle buffer, std::byte *dst, bool blocking); @@ -39,8 +54,8 @@ void EnqueueReadBuffer( * @param blocking Indicates whether the operation is blocking. */ void EnqueueWriteBuffer( - CommandQueue cq, - Buffer buffer, + CommandQueueHandle cq, + BufferHandle buffer, const std::byte *src, bool blocking); @@ -52,14 +67,15 @@ void EnqueueWriteBuffer( * @param program The program to execute on the device. * @param blocking Indicates whether the operation is blocking. */ -void EnqueueProgram(CommandQueue cq, Program program, bool blocking); +void EnqueueProgram(CommandQueueHandle cq, ProgramHandle &program, bool blocking); /** * @brief Blocks until all previously dispatched commands on the device have completed. * * @param cq The command queue to wait on. + * @param sub_device_ids The sub-device ids to wait for completion on. If empty, waits for all sub-devices. */ -void Finish(CommandQueue cq); +void Finish(CommandQueueHandle cq, tt::stl::Span sub_device_ids = {}); /** @@ -76,7 +92,7 @@ void SetLazyCommandQueueMode(bool lazy); * @param cq The command queue to query. * @return Device handle associated with the command queue. */ -Device GetDevice(class CommandQueue cq); +DeviceHandle GetDevice(CommandQueueHandle cq); /** * @brief Retrieves the ID of the command queue. @@ -84,7 +100,7 @@ Device GetDevice(class CommandQueue cq); * @param cq The command queue to query. * @return ID of the command queue. 
*/ -uint32_t GetId(class CommandQueue cq); +std::uint8_t GetId(CommandQueueHandle cq); } // namespace v1 } // namespace tt::tt_metal diff --git a/tt_metal/include/tt_metal/device.hpp b/tt_metal/include/tt_metal/device.hpp index 90b9f77bada..c07dcd7e961 100644 --- a/tt_metal/include/tt_metal/device.hpp +++ b/tt_metal/include/tt_metal/device.hpp @@ -5,16 +5,16 @@ #pragma once #include -#include "types.hpp" + #include "tt_metal/impl/buffers/buffer_constants.hpp" -#include "tt_metal/impl/buffers/buffer.hpp" #include "tt_metal/impl/dispatch/work_executor.hpp" +#include "tt_metal/types.hpp" //================================================== // DEVICE MANAGEMENT //================================================== -namespace tt::tt_metal{ +namespace tt::tt_metal { namespace v1 { /** @@ -22,14 +22,14 @@ namespace v1 { * * @return Size_t representing the number of available devices. */ -size_t GetNumAvailableDevices(); +std::size_t GetNumAvailableDevices(); /** * @brief Returns the number of Tenstorrent devices connected via PCIe. * * @return Size_t representing the number of PCIe devices. */ -size_t GetNumPCIeDevices(); +std::size_t GetNumPCIeDevices(); /** * @brief Retrieves the PCIe device ID for a given device ID. @@ -39,22 +39,39 @@ size_t GetNumPCIeDevices(); */ chip_id_t GetPCIeDeviceID(chip_id_t device_id); +/** + * Configuration options for CreateDevice + */ +struct CreateDeviceOptions { + /** + * Number of hardware command queues (default: 1, valid range: 1 to 2). + */ + uint8_t num_hw_cqs = 1; + /** + * L1 small space to reserve (default: DEFAULT_L1_SMALL_SIZE). + */ + std::size_t l1_small_size = DEFAULT_L1_SMALL_SIZE; + /** + * Trace region size to reserve (default: DEFAULT_TRACE_REGION_SIZE). + */ + std::size_t trace_region_size = DEFAULT_TRACE_REGION_SIZE; + /** + * Dispatch core type to use (default: DispatchCoreType::WORKER). 
+ */ + DispatchCoreType dispatch_core_type = DispatchCoreType::WORKER; + /** + * For shuffling bank id offsets + */ + stl::Span l1_bank_remap = {}; +}; + /** * @brief Instantiates a Device object. * - * @param device_id ID of the device to target (0 to GetNumAvailableDevices() - 1). - * @param num_hw_cqs Number of hardware command queues (default: 1, valid range: 1 to 2). - * @param l1_small_size L1 small space to reserve (default: DEFAULT_L1_SMALL_SIZE). - * @param trace_region_size Trace region size to reserve (default: DEFAULT_TRACE_REGION_SIZE). - * @param dispatch_core_type Dispatch core type to use (default: DispatchCoreType::WORKER). + * @param options Configuration parameter for requested device * @return Device handle to the created device. */ -Device CreateDevice( - chip_id_t device_id, - uint8_t num_hw_cqs = 1, - size_t l1_small_size = DEFAULT_L1_SMALL_SIZE, - size_t trace_region_size = DEFAULT_TRACE_REGION_SIZE, - DispatchCoreType dispatch_core_type = DispatchCoreType::WORKER); +DeviceHandle CreateDevice(chip_id_t device_id, CreateDeviceOptions options = {}); /** * @brief Resets and closes the device. @@ -62,14 +79,12 @@ Device CreateDevice( * @param device Handle to the device to close. * @return True if the device was successfully closed; otherwise, false. */ -bool CloseDevice(Device device); - +bool CloseDevice(DeviceHandle device); /** * @brief Deallocates all buffers on the device. */ -void DeallocateBuffers(Device device); - +void DeallocateBuffers(DeviceHandle device); /** * @brief Dumps device-side profiler data to a CSV log. @@ -78,8 +93,7 @@ void DeallocateBuffers(Device device); * @param worker_cores CoreRangeSet of worker cores being profiled. * @param last_dump If true, indicates the last dump before process termination. 
*/ -void DumpDeviceProfileResults(Device device, const CoreRangeSet &worker_cores, bool last_dump = false); - +void DumpDeviceProfileResults(DeviceHandle device, const CoreRangeSet &worker_cores, bool last_dump = false); /** * @brief Retrieves the architecture of the device. @@ -87,7 +101,7 @@ void DumpDeviceProfileResults(Device device, const CoreRangeSet &worker_cores, b * @param device The device to query. * @return ARCH representing the device architecture. */ -ARCH GetArch(Device device); +ARCH GetArch(DeviceHandle device); /** * @brief Retrieves the ID of the device. @@ -95,7 +109,7 @@ ARCH GetArch(Device device); * @param device The device to query. * @return Chip ID of the device. */ -chip_id_t GetId(Device device); +chip_id_t GetId(DeviceHandle device); /** * @brief Retrieves the number of DRAM channels on the device. @@ -103,7 +117,7 @@ chip_id_t GetId(Device device); * @param device The device to query. * @return Number of DRAM channels. */ -int GetNumDramChannels(Device device); +int GetNumDramChannels(DeviceHandle device); /** * @brief Retrieves the available L1 size per worker core on the device. @@ -111,7 +125,7 @@ int GetNumDramChannels(Device device); * @param device The device to query. * @return L1 size per core in bytes. */ -uint32_t GetL1SizePerCore(Device device); +std::uint32_t GetL1SizePerCore(DeviceHandle device); /** * @brief Computes the storage grid size for the device. @@ -119,7 +133,7 @@ uint32_t GetL1SizePerCore(Device device); * @param device The device to query. * @return CoreCoord representing the storage grid size. */ -CoreCoord GetComputeWithStorageGridSize(Device device); +CoreCoord GetComputeWithStorageGridSize(DeviceHandle device); /** * @brief Retrieves the DRAM grid size for the device. @@ -127,49 +141,21 @@ CoreCoord GetComputeWithStorageGridSize(Device device); * @param device The device to query. * @return CoreCoord representing the DRAM grid size. 
*/ -CoreCoord GetDramGridSize(Device device); - -/** - * @brief Converts a logical core coordinate to a physical core coordinate. - * - * @param device The device to query. - * @param logical_core The logical core coordinate. - * @param core_type The type of the core. - * @return CoreCoord representing the physical core coordinate. - */ -CoreCoord PhysicalCoreFromLogical(Device device, const CoreCoord &logical_core, const CoreType &core_type); - -/** - * @brief Retrieves the worker core coordinate corresponding to a logical core. - * - * @param device The device to query. - * @param logical_core The logical core coordinate. - * @return CoreCoord representing the worker core coordinate. - */ -CoreCoord WorkerCoreFromLogical(Device device, const CoreCoord &logical_core); - -/** - * @brief Retrieves the Ethernet core coordinate corresponding to a logical core. - * - * @param device The device to query. - * @param logical_core The logical core coordinate. - * @return CoreCoord representing the Ethernet core coordinate. - */ -CoreCoord EthernetCoreFromLogical(Device device, const CoreCoord &logical_core); +CoreCoord GetDramGridSize(DeviceHandle device); /** * @brief Enables the program cache on the device. * * @param device The device to modify. */ -void EnableProgramCache(Device device); +void EnableProgramCache(DeviceHandle device); /** * @brief Disables and clears the program cache on the device. * * @param device The device to modify. */ -void DisableAndClearProgramCache(Device device); +void DisableAndClearProgramCache(DeviceHandle device); /** * @brief Pushes a work function onto the device's work queue. @@ -178,23 +164,14 @@ void DisableAndClearProgramCache(Device device); * @param work The work function to execute. * @param blocking Indicates whether the operation should be blocking (default: false). */ -void PushWork(Device device, std::function &&work, bool blocking = false); - -/** - * @brief Pushes a shared work function onto the device's work queue. 
- * - * @param device The device to which the work will be pushed. - * @param work Shared pointer to the work function to execute. - * @param blocking Indicates whether the operation should be blocking (default: false). - */ -void PushWork(Device device, std::function work, bool blocking = false); +void PushWork(DeviceHandle device, std::function work, bool blocking = false); /** * @brief Synchronizes operations on the given device. * * @param device The device to synchronize. */ -void Synchronize(Device device); +void Synchronize(DeviceHandle device); /** * @brief Retrieves a list of Ethernet socket coordinates connected to a specific chip ID. @@ -203,7 +180,7 @@ void Synchronize(Device device); * @param connected_chip_id The connected chip ID. * @return Vector of CoreCoord representing Ethernet socket coordinates. */ -std::vector GetEthernetSockets(Device device, chip_id_t connected_chip_id); +std::vector GetEthernetSockets(DeviceHandle device, chip_id_t connected_chip_id); /** * @brief Returns the number of banks for a specific buffer type on the device. @@ -212,7 +189,7 @@ std::vector GetEthernetSockets(Device device, chip_id_t connected_chi * @param buffer_type The type of buffer. * @return Number of banks. */ -uint32_t GetNumBanks(Device device, const BufferType &buffer_type); +std::uint32_t GetNumBanks(DeviceHandle device, BufferType buffer_type); /** * @brief Computes the offset of a specific bank for a buffer type on the device. @@ -222,7 +199,7 @@ uint32_t GetNumBanks(Device device, const BufferType &buffer_type); * @param bank_id The ID of the bank. * @return Offset of the bank. */ -int32_t GetBankOffset(Device device, BufferType buffer_type, uint32_t bank_id); +std::int32_t GetBankOffset(DeviceHandle device, BufferType buffer_type, std::uint32_t bank_id); /** * @brief Retrieves bank IDs associated with a logical core for a given buffer type. 
@@ -230,10 +207,10 @@ int32_t GetBankOffset(Device device, BufferType buffer_type, uint32_t bank_id); * @param device The device to query. * @param buffer_type The type of buffer. * @param logical_core The logical core coordinate. - * @return Reference to a vector of bank IDs. + * @return span of const bank IDs. */ -const std::vector &BankIdsFromLogicalCore(Device device, BufferType buffer_type, const CoreCoord &logical_core); - +stl::Span BankIdsFromLogicalCore( + DeviceHandle device, BufferType buffer_type, CoreCoord logical_core); /** * @brief Retrieves the machine epsilon for the SFPU on the device. @@ -241,7 +218,7 @@ const std::vector &BankIdsFromLogicalCore(Device device, BufferType bu * @param device The device to query. * @return SFPU machine epsilon. */ -float GetSfpuEps(Device device); +float GetSfpuEps(DeviceHandle device); /** * @brief Retrieves the representation of NaN for the SFPU on the device. @@ -249,7 +226,7 @@ float GetSfpuEps(Device device); * @param device The device to query. * @return SFPU NaN value. */ -float GetSfpuNan(Device device); +float GetSfpuNan(DeviceHandle device); /** * @brief Retrieves the representation of infinity for the SFPU on the device. @@ -257,24 +234,7 @@ float GetSfpuNan(Device device); * @param device The device to query. * @return SFPU infinity value. */ -float GetSfpuInf(Device device); - -/** - * @brief Retrieves a command queue from the device for a given queue ID. - * - * @param device The device to query. - * @param cq_id The command queue ID. - * @return CommandQueue handle. - */ -CommandQueue GetCommandQueue(Device device, size_t cq_id); - -/** - * @brief Retrieves the default command queue for the given device. - * - * @param device The device to query. - * @return CommandQueue handle. - */ -CommandQueue GetDefaultCommandQueue(Device device); +float GetSfpuInf(DeviceHandle device); /** * @brief Retrieves the current worker mode of the device. 
@@ -282,7 +242,7 @@ CommandQueue GetDefaultCommandQueue(Device device); * @param device The device to query. * @return WorkExecutorMode representing the current worker mode. */ -WorkExecutorMode GetWorkerMode(Device device); +WorkExecutorMode GetWorkerMode(DeviceHandle device); /** * @brief Retrieves the number of entries in the program cache on the device. @@ -290,16 +250,7 @@ WorkExecutorMode GetWorkerMode(Device device); * @param device The device to query. * @return Number of program cache entries. */ -std::size_t GetNumProgramCacheEntries(Device device); - -/** - * @brief Checks if the current execution is in the main thread for the device. - * - * @param device The device to query. - * @return True if in the main thread; otherwise, false. - */ -bool InMainThread(Device device); - +std::size_t GetNumProgramCacheEntries(DeviceHandle device); -} // namespace v1 -} // namespace tt::tt_metal +} // namespace v1 +} // namespace tt::tt_metal diff --git a/tt_metal/include/tt_metal/event.hpp b/tt_metal/include/tt_metal/event.hpp index f73ac2a9ca3..a4200e2ff83 100644 --- a/tt_metal/include/tt_metal/event.hpp +++ b/tt_metal/include/tt_metal/event.hpp @@ -17,41 +17,48 @@ namespace v1 { * @brief Enqueues a command to record an event on the device. * * @param cq The command queue used to dispatch the command. - * @param event Shared pointer to the Event object to record. + * @return Handle to the recorded Event object. */ -void EnqueueRecordEvent(CommandQueue cq, const std::shared_ptr &event); +EventHandle EnqueueRecordEvent(CommandQueueHandle cq); /** * @brief Enqueues a command to wait for an event to complete on the device. * * @param cq The command queue that will wait for the event. - * @param event Shared pointer to the Event object to wait on. + * @param event Handle to the Event object to wait on. 
*/ -void EnqueueWaitForEvent(CommandQueue cq, const std::shared_ptr &event); +void EnqueueWaitForEvent(CommandQueueHandle cq, EventHandle event); /** * @brief Blocks the host until the specified event has completed on the device. * - * @param event Shared pointer to the Event object to synchronize. + * @param event Handle to the Event object to synchronize. */ -void EventSynchronize(const std::shared_ptr &event); +void EventSynchronize(EventHandle event); /** * @brief Queries the completion status of an event on the device. * - * @param event Shared pointer to the Event object to query. + * @param event Handle to the Event object to query. * @return True if the event is completed; otherwise, false. */ -bool EventQuery(const std::shared_ptr &event); +bool EventQuery(EventHandle event); /** * @brief Synchronizes the device with the host by waiting for all operations to complete. * - * @param device The device to synchronize. - * @param cq_id Optional command queue ID to synchronize. If not provided, all queues are synchronized. + * @param device device to synchronize. */ -void Synchronize(Device device, const std::optional cq_id = std::nullopt); +void DeviceSynchronize(DeviceHandle device); + + +/** + * @brief Synchronizes the command queue with the host by waiting for all operations to complete. + * + * @param cq command queue to synchronize. 
+ */ +void CommandQueueSynchronize(CommandQueueHandle cq); } // namespace v1 diff --git a/tt_metal/include/tt_metal/kernel.hpp b/tt_metal/include/tt_metal/kernel.hpp index 332e1cd6c13..4f6417ff70c 100644 --- a/tt_metal/include/tt_metal/kernel.hpp +++ b/tt_metal/include/tt_metal/kernel.hpp @@ -5,13 +5,12 @@ #pragma once #include "types.hpp" -#include "tt_metal/impl/kernels/kernel_types.hpp" //================================================== // KERNEL EXECUTION //================================================== -namespace tt::tt_metal{ +namespace tt::tt_metal { namespace v1 { /** @@ -23,44 +22,26 @@ namespace v1 { * @param runtime_args The runtime arguments to be set. */ void SetRuntimeArgs( - const Program program, - KernelHandle kernel, - const CoreRangeSet &core_spec, - const RuntimeArgs &runtime_args); - -/** - * @brief Sets multiple runtime arguments of a kernel at once. - * - * @param program The program containing the kernel. - * @param kernel KernelHandle representing the kernel ID. - * @param core_spec Vector of core coordinates where the runtime arguments will be set. - * @param runtime_args The runtime arguments to be set. - */ -void SetRuntimeArgs( - const Program program, - KernelHandle kernel, - const std::vector &core_spec, - const RuntimeArgs &runtime_args); + ProgramHandle &program, KernelHandle kernel, const CoreRangeSet &core_spec, RuntimeArgs runtime_args); /** * @brief Sets common runtime arguments for a kernel, shared by all cores. * * @param program The program containing the kernel. - * @param kernel_id KernelHandle representing the kernel ID. + * @param kernel KernelHandle representing the kernel ID. * @param runtime_args The runtime arguments to be set. */ -void SetCommonRuntimeArgs(const Program program, KernelHandle kernel_id, const RuntimeArgs &runtime_args); +void SetCommonRuntimeArgs(ProgramHandle &program, KernelHandle kernel, RuntimeArgs runtime_args); /** * @brief Gets the runtime arguments for a kernel. 
* * @param program The program containing the kernel. - * @param kernel_id KernelHandle representing the kernel ID. + * @param kernel KernelHandle representing the kernel ID. * @param logical_core The logical core coordinate. - * @return Reference to RuntimeArgsData. + * @return Span of runtime arguments. */ -RuntimeArgsData &GetRuntimeArgs(const Program program, KernelHandle kernel_id, const CoreCoord &logical_core); - +RuntimeArgs GetRuntimeArgs(ProgramHandle &program, KernelHandle kernel, CoreCoord logical_core); -} // namespace v1 -} // namespace tt::tt_metal +} // namespace v1 +} // namespace tt::tt_metal diff --git a/tt_metal/include/tt_metal/trace.hpp b/tt_metal/include/tt_metal/trace.hpp index 5131d8a978c..fb84ee296e1 100644 --- a/tt_metal/include/tt_metal/trace.hpp +++ b/tt_metal/include/tt_metal/trace.hpp @@ -13,50 +13,51 @@ namespace v1 { // TRACE OPERATIONS //================================================== +/** + * @brief Obtains the associated command queue from a given trace handle. + * + * @param trace The trace handle returned by BeginTraceCapture. + * @return Command queue handle. + */ +CommandQueueHandle GetCommandQueue(TraceHandle trace); + /** * @brief Begins capture on a trace. Captured commands will have their execution delayed until the trace is replayed. * - * @param device The device being traced. * @param cq The command queue associated with the trace. - * @return Trace ID. + * @return Trace handle. */ -uint32_t BeginTraceCapture(Device device, CommandQueue cq); +TraceHandle BeginTraceCapture(CommandQueueHandle cq); /** * @brief Ends capture on a trace. The trace can be replayed on the same device command queue. * - * @param device The device being traced. - * @param cq The command queue associated with the trace. - * @param tid The trace ID returned by BeginTraceCapture. + * @param trace The trace handle returned by BeginTraceCapture. 
*/ -void EndTraceCapture(Device device, CommandQueue cq, uint32_t tid); +void EndTraceCapture(TraceHandle trace); /** * @brief Replays a captured trace on the device. * - * @param device The device holding the trace. - * @param cq The command queue associated with the trace. - * @param trace The trace ID to replay. + * @param trace The trace to replay. * @param blocking Indicates whether the operation is blocking. */ -void ReplayTrace(Device device, CommandQueue cq, Trace trace, bool blocking); +void ReplayTrace(TraceHandle trace, bool blocking); /** * @brief Releases a previously captured trace, deallocating associated resources. * - * @param device The device holding the trace. * @param trace The trace to release. */ -void ReleaseTrace(Device device, Trace trace); +void ReleaseTrace(TraceHandle trace); /** * @brief Enqueues a trace for execution on the device. * - * @param cq The command queue used to dispatch the trace. * @param trace The Trace to enqueue. * @param blocking Indicates whether the operation is blocking. 
*/ -void EnqueueTrace(CommandQueue cq, Trace trace, bool blocking); +void EnqueueTrace(TraceHandle trace, bool blocking); } // namespace v1 diff --git a/tt_metal/include/tt_metal/types.hpp b/tt_metal/include/tt_metal/types.hpp index 2e6e78a5878..4252f764cbb 100644 --- a/tt_metal/include/tt_metal/types.hpp +++ b/tt_metal/include/tt_metal/types.hpp @@ -4,28 +4,49 @@ #pragma once -#include "device/tt_cluster_descriptor_types.h" -#include "hostdevcommon/common_values.hpp" #include "tt_metal/host_api.hpp" #include "tt_metal/impl/device/device_handle.hpp" -#include "tt_metal/impl/dispatch/dispatch_core_manager.hpp" namespace tt::tt_metal { namespace v1 { using ProgramHandle = v0::Program; class DeviceHandle; -class CommandQueue; -class Trace; + +class CommandQueueHandle { + private: + explicit constexpr CommandQueueHandle(DeviceHandle device, std::uint8_t id = 0) : device{device}, id{id} {} + + DeviceHandle device; + std::uint8_t id; + + friend CommandQueueHandle GetCommandQueue(DeviceHandle device, std::uint8_t id); + friend DeviceHandle GetDevice(CommandQueueHandle cq); + friend std::uint8_t GetId(CommandQueueHandle cq); +}; + +class TraceHandle { + public: + explicit constexpr operator std::uint32_t() const noexcept { return id; } + + private: + explicit constexpr TraceHandle(CommandQueueHandle cq, std::uint32_t id) noexcept : cq{cq}, id{id} {} + + CommandQueueHandle cq; + std::uint32_t id; + + friend TraceHandle BeginTraceCapture(CommandQueueHandle cq); + friend CommandQueueHandle GetCommandQueue(TraceHandle trace); +}; class KernelHandle { public: - explicit constexpr KernelHandle(tt_metal::KernelHandle kernel_id) noexcept : kernel_id{kernel_id} {} + explicit constexpr KernelHandle(tt_metal::KernelHandle id) noexcept : id{id} {} - explicit constexpr operator tt_metal::KernelHandle() const noexcept { return kernel_id; } + explicit constexpr operator tt_metal::KernelHandle() const noexcept { return id; } private: - tt_metal::KernelHandle kernel_id; + 
tt_metal::KernelHandle id; }; class CircularBufferHandle { @@ -53,11 +74,23 @@ class BufferHandle { std::shared_ptr buffer_ptr; }; -// Not likely going to be opaque, but pending review of -// completion of the prototype of the runtime args. -class Event; -class RuntimeArgs; -class RuntimeArgsData; +class EventHandle { + public: + explicit EventHandle(); + explicit EventHandle(const std::shared_ptr &event_ptr) noexcept : event_ptr{event_ptr} {} + explicit EventHandle(std::shared_ptr &&event_ptr) noexcept : + event_ptr{static_cast &&>(event_ptr)} {} + + explicit operator const std::shared_ptr &() const noexcept { return event_ptr; } + + v0::Event &operator*() const noexcept { return *event_ptr.get(); } + v0::Event *operator->() const noexcept { return event_ptr.get(); } + + private: + std::shared_ptr event_ptr; +}; + +using RuntimeArgs = tt::stl::Span; } // namespace v1 } // namespace tt::tt_metal diff --git a/tt_metal/jit_build/build.cpp b/tt_metal/jit_build/build.cpp index fcf06d594c0..d0984d7d048 100644 --- a/tt_metal/jit_build/build.cpp +++ b/tt_metal/jit_build/build.cpp @@ -73,6 +73,7 @@ void JitBuildEnv::init(uint32_t build_key, tt::ARCH arch, const std::mapcflags_ += "-fno-use-cxa-atexit -fno-exceptions " "-Wall -Werror -Wno-unknown-pragmas " + "-Wno-deprecated-declarations " "-Wno-error=multistatement-macros -Wno-error=parentheses " "-Wno-error=unused-but-set-variable -Wno-unused-variable " "-Wno-unused-function "; diff --git a/tt_metal/jit_build/data_format.cpp b/tt_metal/jit_build/data_format.cpp index 2bf3fa9fd0c..31d12b37f7a 100644 --- a/tt_metal/jit_build/data_format.cpp +++ b/tt_metal/jit_build/data_format.cpp @@ -74,15 +74,15 @@ ExpPrecision get_exp_precison(DataFormat data_format) { return (is_exp_b_format(data_format) ? 
ExpPrecision::B : ExpPrecision::A); } -void dump_data_formats(DataFormat data_format[NUM_OPERANDS]) { - for (int i = 0; i < NUM_OPERANDS; i++) { +void dump_data_formats(DataFormat data_format[NUM_CIRCULAR_BUFFERS]) { + for (int i = 0; i < NUM_CIRCULAR_BUFFERS; i++) { std::cout << "Operand idx " << i << ": " << data_format[i] << "," < get_unpack_src_formats(DataFormat input_formats[NUM_OPERANDS], DataFormat param_formats[NUM_OPERANDS], DataFormat intermed_formats[NUM_OPERANDS]) { +std::vector get_unpack_src_formats(DataFormat data_formats[NUM_CIRCULAR_BUFFERS]) { std::vector unpack_src_format; - for (int i=0 ; i get_unpack_src_formats(DataFormat input_formats[NUM_OPER } unpack_src_format.push_back(src_format); } - for (int i=0 ; i get_unpack_dst_formats( - DataFormat input_formats[NUM_OPERANDS], - DataFormat param_formats[NUM_OPERANDS], - DataFormat intermed_formats[NUM_OPERANDS], - DataFormat output_formats[NUM_OPERANDS], + DataFormat buf_formats[NUM_CIRCULAR_BUFFERS], DataFormat unpack_conditional_dst_format, bool fp32_dest_acc_en, std::vector unpack_to_dest_mode, @@ -255,15 +177,10 @@ std::vector get_unpack_dst_formats( TT_FATAL(unpack_to_dest_mode.size() == NUM_CIRCULAR_BUFFERS, "unpack_to_dest_mode vector must have 32 elements"); } - DataFormat pack_format = get_pack_data_format(output_formats, intermed_formats); - ExpPrecision input_precision = get_data_exp_precision(input_formats); - std::vector unpack_dst_format; - const bool en_unpack_tf32 = fp32_dest_acc_en && (tt::is_all_fp32_formats(input_formats) || (input_precision == ExpPrecision::B)); - DataFormat unpack_cond_dst_format = en_unpack_tf32 ? 
DataFormat::Tf32 : unpack_conditional_dst_format; - for (int i=0 ; i get_unpack_dst_formats( } else if (int_fpu_en) { unpack_dst_format.push_back(src_format); } else { - if (input_formats[i] == DataFormat::Float32 && !unpack_to_dest_mode.empty() && unpack_to_dest_mode[i] != UnpackToDestMode::Default) { - unpack_dst_format.push_back(get_single_unpack_dst_format(input_formats[i], pack_format, DataFormat::Float32)); + if (buf_formats[i] == DataFormat::Float32 && !unpack_to_dest_mode.empty() && unpack_to_dest_mode[i] != UnpackToDestMode::Default) { + unpack_dst_format.push_back(get_single_unpack_dst_format(src_format, DataFormat::Invalid, DataFormat::Float32)); } else { - unpack_dst_format.push_back(get_single_unpack_dst_format(input_formats[i], pack_format, unpack_cond_dst_format)); + unpack_dst_format.push_back(get_single_unpack_dst_format(src_format, DataFormat::Invalid, unpack_conditional_dst_format)); } - } - } - for (int i=0 ; i get_pack_src_formats( - DataFormat input_formats[NUM_OPERANDS], - DataFormat param_formats[NUM_OPERANDS], - DataFormat intermed_formats[NUM_OPERANDS], - DataFormat output_formats[NUM_OPERANDS], + DataFormat data_formats[NUM_CIRCULAR_BUFFERS], DataFormat unpack_conditional_dst_format, bool fp32_dest_acc_en, bool bfp8_pack_precise, bool int_fpu_en, tt::ARCH arch ) { - DataFormat pack_output_format = get_pack_data_format(output_formats, intermed_formats); - std::vector pack_src_formats; DataFormat pack_src_format; - for (int i = 0; i < NUM_OPERANDS; i++) { - pack_src_format = get_single_pack_src_format(input_formats[i], pack_output_format, unpack_conditional_dst_format, fp32_dest_acc_en, bfp8_pack_precise, int_fpu_en, arch); + for (int i = 0; i < NUM_CIRCULAR_BUFFERS; i++) { + pack_src_format = get_single_pack_src_format(data_formats[i], unpack_conditional_dst_format, fp32_dest_acc_en, bfp8_pack_precise, int_fpu_en, arch); pack_src_formats.push_back(pack_src_format); } - // Intermediates - for (int i = 0; i < NUM_OPERANDS; i++) { - 
//Intermediates can be inputs & outputs to same op, provide same format per operand id - pack_src_format = get_single_pack_src_format(intermed_formats[i], intermed_formats[i], unpack_conditional_dst_format, fp32_dest_acc_en, bfp8_pack_precise, int_fpu_en, arch); - pack_src_formats.push_back(pack_src_format); - } return pack_src_formats; } -std::vector get_pack_dst_formats(DataFormat input_formats[NUM_OPERANDS], DataFormat param_formats[NUM_OPERANDS], DataFormat intermed_formats[NUM_OPERANDS], DataFormat output_formats[NUM_OPERANDS]) { - DataFormat pack_format = get_pack_data_format(output_formats, intermed_formats); - +std::vector get_pack_dst_formats(DataFormat buf_formats[NUM_CIRCULAR_BUFFERS]) { std::vector pack_dst_format; - for (int i = 0; i < NUM_OPERANDS; i++) { - if (i == 0) { - pack_dst_format.push_back(pack_format); - } else { - pack_dst_format.push_back(output_formats[i]); + for (int i = 0; i < NUM_CIRCULAR_BUFFERS; i++) { + DataFormat dst_format = buf_formats[i]; + if (dst_format == DataFormat::RawUInt32 || dst_format == DataFormat::RawUInt16 || dst_format == DataFormat::RawUInt8) { + switch (dst_format) { + case DataFormat::RawUInt32: dst_format = DataFormat::Float32; break; + case DataFormat::RawUInt16: dst_format = DataFormat::Float16; break; + default: dst_format = DataFormat::Lf8; break; + } } - } - - // Intermediates - for (int i = 0; i < NUM_OPERANDS; i++) { - pack_dst_format.push_back(intermed_formats[i]); + pack_dst_format.push_back(dst_format); } return pack_dst_format; } diff --git a/tt_metal/jit_build/data_format.hpp b/tt_metal/jit_build/data_format.hpp index ed1e718c61b..0a7cd702a7e 100644 --- a/tt_metal/jit_build/data_format.hpp +++ b/tt_metal/jit_build/data_format.hpp @@ -7,12 +7,11 @@ #include #include "common/tt_backend_api_types.hpp" // for DataFormat #include "device/tt_arch_types.h" // for ARCH +#include "circular_buffer.h" // for NUM_CIRCULAR_BUFFERS enum class UnpackToDestMode : std::uint8_t; namespace tt { -static constexpr uint 
NUM_OPERANDS = 8; - enum class ExpPrecision : std::uint8_t { A = 0, @@ -22,47 +21,33 @@ enum class ExpPrecision : std::uint8_t bool is_valid_conversion(DataFormat input_format, DataFormat output_format); bool is_exp_b_format(DataFormat data_format); ExpPrecision get_exp_precison(DataFormat data_format); -void dump_data_formats(DataFormat data_format[NUM_OPERANDS]); +void dump_data_formats(DataFormat data_format[NUM_CIRCULAR_BUFFERS]); /* * Checks operand data formats for same exponent width format * Returns the last valid data-format between operand buffers. */ -DataFormat check_consistent_format_within_operand(DataFormat data_format[NUM_OPERANDS]); +DataFormat check_consistent_format_across_buffers(DataFormat data_format[NUM_CIRCULAR_BUFFERS]); /* * Checks operand data formats for same data format * Returns the last valid data-format in operand buffers. */ -DataFormat check_same_format_within_operand(DataFormat data_format[NUM_OPERANDS]); +DataFormat check_same_format_across_buffers(DataFormat data_format[NUM_CIRCULAR_BUFFERS]); -/* - * Checks consistency between input operand data-formats. - * Data-formats for all input operands must have the same exponent width precision type. 
- */ -void check_consistent_format_across_input_operands(DataFormat input_format[NUM_OPERANDS], DataFormat param_format[NUM_OPERANDS]); -DataFormat check_valid_formats_within_operand(DataFormat data_format[NUM_OPERANDS]); -DataFormat get_pack_data_format(DataFormat output_formats[NUM_OPERANDS], DataFormat intermed_formats[NUM_OPERANDS]); -ExpPrecision get_data_exp_precision(DataFormat data_formats[NUM_OPERANDS]); -ExpPrecision get_input_data_exp_precision(DataFormat input_formats[NUM_OPERANDS], DataFormat intermed_formats[NUM_OPERANDS]); + +DataFormat check_valid_formats_in_out_data_formats(DataFormat data_format[NUM_CIRCULAR_BUFFERS]); +ExpPrecision get_data_exp_precision(DataFormat data_formats[NUM_CIRCULAR_BUFFERS]); // Checks if all formats in format array are fp32/tf32/invalid, then data can be unpacked as tf32 for fp32 accumulation -bool is_all_fp32_formats(const DataFormat data_format[NUM_OPERANDS]); +bool is_all_fp32_formats(const DataFormat data_format[NUM_CIRCULAR_BUFFERS]); -/* -*This pass checks -* 1- The input buffers must all have the same exponent precision. -* 2- Intermediate buffers must also have same exponent precision as inputs. 
-* 3- Output buffers can have different exponent width formats -* 4- Check all buffers have valid supported formats -*/ -void check_valid_in_out_data_formats(DataFormat input_formats[NUM_OPERANDS], DataFormat output_formats[NUM_OPERANDS], DataFormat param_formats[NUM_OPERANDS], DataFormat intermed_formats[NUM_OPERANDS]); -const DataFormat get_single_pack_src_format(DataFormat input_format, DataFormat output_format, DataFormat unpack_conditional_dst_format, bool fp32_dest_acc_en, tt::ARCH arch); +const DataFormat get_single_pack_src_format(DataFormat input_format, DataFormat unpack_conditional_dst_format, bool fp32_dest_acc_en, tt::ARCH arch); -std::vector get_unpack_src_formats(DataFormat input_formats[NUM_OPERANDS], DataFormat param_formats[NUM_OPERANDS], DataFormat intermed_formats[NUM_OPERANDS]); -std::vector get_unpack_dst_formats(DataFormat input_formats[NUM_OPERANDS], DataFormat param_formats[NUM_OPERANDS], DataFormat intermed_formats[NUM_OPERANDS], DataFormat output_formats[NUM_OPERANDS], DataFormat unpack_conditional_dst_format, bool fp32_dest_acc_en, std::vector unpack_to_dest_mode, bool int_fpu_en = false); -std::vector get_pack_src_formats(DataFormat input_formats[NUM_OPERANDS], DataFormat param_formats[NUM_OPERANDS], DataFormat intermed_formats[NUM_OPERANDS], DataFormat output_formats[NUM_OPERANDS], DataFormat unpack_conditional_dst_format, bool fp32_dest_acc_en, bool bfp8_pack_precise, bool int_fpu_en = false, tt::ARCH arch = tt::ARCH::GRAYSKULL); -std::vector get_pack_dst_formats(DataFormat input_formats[NUM_OPERANDS], DataFormat param_formats[NUM_OPERANDS], DataFormat intermed_formats[NUM_OPERANDS], DataFormat output_formats[NUM_OPERANDS]); +std::vector get_unpack_src_formats(DataFormat buf_formats[NUM_CIRCULAR_BUFFERS]); +std::vector get_unpack_dst_formats(DataFormat buf_formats[NUM_CIRCULAR_BUFFERS], DataFormat unpack_conditional_dst_format, bool fp32_dest_acc_en, std::vector unpack_to_dest_mode, bool int_fpu_en = false); +std::vector 
get_pack_src_formats(DataFormat buf_formats[NUM_CIRCULAR_BUFFERS], DataFormat unpack_conditional_dst_format, bool fp32_dest_acc_en, bool bfp8_pack_precise, bool int_fpu_en = false, tt::ARCH arch = tt::ARCH::GRAYSKULL); +std::vector get_pack_dst_formats(DataFormat buf_formats[NUM_CIRCULAR_BUFFERS]); } diff --git a/tt_metal/jit_build/genfiles.cpp b/tt_metal/jit_build/genfiles.cpp index 3bb4fd1e6b4..994a6efb755 100644 --- a/tt_metal/jit_build/genfiles.cpp +++ b/tt_metal/jit_build/genfiles.cpp @@ -13,6 +13,7 @@ #include "common/utils.hpp" #include "hostdevcommon/common_values.hpp" #include "jit_build/build.hpp" +#include "jit_build/data_format.hpp" #include "jit_build/settings.hpp" #include "tt_metal/hw/inc/circular_buffer.h" @@ -151,59 +152,6 @@ void jit_build_genfiles_triscs_src( }); } -static std::pair, vector> extend_unpack_data_format_vectors_to_all_cbs( - const vector& src_formats, const vector& dst_formats) { - // for the purposes of consistency and brevity of the LLK code that uses these arrays, - // extend unpack data formats to all 32 CBs - // [out0...out7] is missing from the vector, insert invalid (not used by the unpacker) - - vector src_formats_all_cbs; - vector dst_formats_all_cbs; - - // copy inputs and params - for (int i = 0; i < 16; i++) { - src_formats_all_cbs.push_back(src_formats[i]); - dst_formats_all_cbs.push_back(dst_formats[i]); - } - - // insert invalid data format for output [out0...out7] - for (int i = 0; i < 8; i++) { - src_formats_all_cbs.push_back(DataFormat::Invalid); - dst_formats_all_cbs.push_back(DataFormat::Invalid); - } - - // copy intermediates - for (int i = 0; i < 8; i++) { - src_formats_all_cbs.push_back(src_formats[16 + i]); - dst_formats_all_cbs.push_back(dst_formats[16 + i]); - } - - return std::make_pair(src_formats_all_cbs, dst_formats_all_cbs); -} - -static std::pair, vector> extend_pack_data_format_vectors_to_all_cbs( - const vector& src_formats, const vector& dst_formats) { - // for the purposes of consistency and 
brevity of the LLK code that uses these arrays, - // extend pack data formats to all 32 CBs - // [in0...in7, param0...param7] are missing from the vector, insert invalid (not used by the unpacker) - - vector src_formats_all_cbs; - vector dst_formats_all_cbs; - - // insert invalid for inputs and params - for (int i = 0; i < 16; i++) { - src_formats_all_cbs.push_back(DataFormat::Invalid); - dst_formats_all_cbs.push_back(DataFormat::Invalid); - } - - // copy outputs and intermediates - for (int i = 0; i < 16; i++) { - src_formats_all_cbs.push_back(src_formats[i]); - dst_formats_all_cbs.push_back(dst_formats[i]); - } - - return std::make_pair(src_formats_all_cbs, dst_formats_all_cbs); -} static std::string data_format_vec_to_string(const vector formats) { std::string formats_string = ""; @@ -227,26 +175,15 @@ static std::string create_formats_array_string( static std::pair, std::vector> generate_unpack_data_formats(tt_hlk_desc& desc, DataFormat unpack_conditional_dst_format, bool fp32_dest_acc_en, std::vector unpack_to_dest_mode) { - vector src_formats = tt::get_unpack_src_formats( - desc.input_buf_dataformat_arr, desc.param_buf_dataformat_arr, desc.intermediate_buf_dataformat_arr); + vector src_formats = tt::get_unpack_src_formats(desc.buf_dataformat_arr); vector dst_formats = tt::get_unpack_dst_formats( - desc.input_buf_dataformat_arr, desc.param_buf_dataformat_arr, desc.intermediate_buf_dataformat_arr, - desc.output_buf_dataformat_arr, unpack_conditional_dst_format, fp32_dest_acc_en, unpack_to_dest_mode); - - TT_ASSERT( - src_formats.size() == 24 && dst_formats.size() == 24, - "There must be 8 unpack src/dst formats for each input, param, and intermediate operands."); + desc.buf_dataformat_arr, unpack_conditional_dst_format, fp32_dest_acc_en, unpack_to_dest_mode); - vector src_formats_all_cbs; - vector dst_formats_all_cbs; - tie(src_formats_all_cbs, dst_formats_all_cbs) = - extend_unpack_data_format_vectors_to_all_cbs(src_formats, dst_formats); + 
TT_ASSERT(src_formats.size() == NUM_CIRCULAR_BUFFERS); + TT_ASSERT(dst_formats.size() == NUM_CIRCULAR_BUFFERS); - TT_ASSERT(src_formats_all_cbs.size() == NUM_CIRCULAR_BUFFERS); - TT_ASSERT(dst_formats_all_cbs.size() == NUM_CIRCULAR_BUFFERS); - - return std::make_pair(src_formats_all_cbs, dst_formats_all_cbs); + return std::make_pair(src_formats, dst_formats); } static void emit_unpack_data_formats( @@ -273,10 +210,7 @@ static void emit_unpack_data_formats( static std::pair, std::vector> generate_pack_data_formats( tt_hlk_desc& desc, DataFormat unpack_conditional_dst_format, bool fp32_dest_acc_en, bool bfp8_pack_precise, const tt::ARCH arch) { vector src_formats = tt::get_pack_src_formats( - desc.input_buf_dataformat_arr, - desc.param_buf_dataformat_arr, - desc.intermediate_buf_dataformat_arr, - desc.output_buf_dataformat_arr, + desc.buf_dataformat_arr, unpack_conditional_dst_format, fp32_dest_acc_en, bfp8_pack_precise, @@ -284,24 +218,12 @@ static std::pair, std::vector> generate_pack arch); vector dst_formats = tt::get_pack_dst_formats( - desc.input_buf_dataformat_arr, - desc.param_buf_dataformat_arr, - desc.intermediate_buf_dataformat_arr, - desc.output_buf_dataformat_arr); - - TT_ASSERT( - src_formats.size() == 16 && dst_formats.size() == 16, - "There must be 8 pack src/dst formats for each output, and intermediate operands."); - - vector src_formats_all_cbs; - vector dst_formats_all_cbs; - tie(src_formats_all_cbs, dst_formats_all_cbs) = - extend_pack_data_format_vectors_to_all_cbs(src_formats, dst_formats); + desc.buf_dataformat_arr); - TT_ASSERT(src_formats_all_cbs.size() == NUM_CIRCULAR_BUFFERS); - TT_ASSERT(dst_formats_all_cbs.size() == NUM_CIRCULAR_BUFFERS); + TT_ASSERT(src_formats.size() == NUM_CIRCULAR_BUFFERS); + TT_ASSERT(dst_formats.size() == NUM_CIRCULAR_BUFFERS); - return std::make_pair(src_formats_all_cbs, dst_formats_all_cbs); + return std::make_pair(src_formats, dst_formats); } static void emit_pack_data_formats( @@ -368,31 +290,16 @@ static void 
generate_data_format_descriptors(JitBuildOptions& options, const tt: // assuming all cores within a op have the same desc tt_hlk_desc& desc = options.hlk_desc; - // Determine what the packformat should be - DataFormat pack_format = - tt::get_pack_data_format(desc.output_buf_dataformat_arr, desc.intermediate_buf_dataformat_arr); - // Determine dst format under ambiguous conditions (either or both l1 input & output formats are Float32) - DataFormat unpack_conditional_dst_format = DataFormat::Invalid; - if (pack_format == DataFormat::Float32) { - ExpPrecision unpack_exp_prec = tt::get_data_exp_precision(desc.input_buf_dataformat_arr); - unpack_conditional_dst_format = - (unpack_exp_prec == ExpPrecision::A) ? DataFormat::Float16 : DataFormat::Float16_b; - } else { - ExpPrecision pack_exp_prec = tt::get_data_exp_precision(desc.output_buf_dataformat_arr); - unpack_conditional_dst_format = - (pack_exp_prec == ExpPrecision::A) ? DataFormat::Float16 : DataFormat::Float16_b; - } + ExpPrecision exp_prec = tt::get_data_exp_precision(desc.buf_dataformat_arr); + DataFormat unpack_conditional_dst_format = (exp_prec == ExpPrecision::A) ? 
DataFormat::Float16 : DataFormat::Float16_b; - if (tt::is_all_fp32_formats(desc.input_buf_dataformat_arr) && options.fp32_dest_acc_en) { + if (options.fp32_dest_acc_en && (tt::is_all_fp32_formats(desc.buf_dataformat_arr) || (exp_prec == ExpPrecision::B))) { unpack_conditional_dst_format = DataFormat::Tf32; } - tt::check_valid_in_out_data_formats( - desc.input_buf_dataformat_arr, - desc.output_buf_dataformat_arr, - desc.param_buf_dataformat_arr, - desc.intermediate_buf_dataformat_arr); + tt::check_valid_formats_in_out_data_formats( + desc.buf_dataformat_arr); vector unpack_src_formats_all_cbs, unpack_dst_formats_all_cbs; tie(unpack_src_formats_all_cbs, unpack_dst_formats_all_cbs) = generate_unpack_data_formats(desc, unpack_conditional_dst_format, options.fp32_dest_acc_en, options.unpack_to_dest_mode); diff --git a/tt_metal/jit_build/hlk_desc.hpp b/tt_metal/jit_build/hlk_desc.hpp index cb933b8493a..395d533343c 100644 --- a/tt_metal/jit_build/hlk_desc.hpp +++ b/tt_metal/jit_build/hlk_desc.hpp @@ -6,6 +6,7 @@ #include +#include "circular_buffer.h" // for NUM_CIRCULAR_BUFFERS #include "hostdevcommon/kernel_structs.h" #include "tt_metal/common/assert.hpp" #include "tt_metal/common/base_types.hpp" @@ -29,17 +30,14 @@ class tt_hlk_desc size_t hlk_args_size; // size of hlk_args_t in bytes (result of sizeof()) public: - DataFormat input_buf_dataformat_arr[8]; - DataFormat param_buf_dataformat_arr[8]; - DataFormat output_buf_dataformat_arr[8]; - DataFormat intermediate_buf_dataformat_arr[8]; - uint32_t buf_num_faces_arr[32]; - uint32_t buf_partial_face_arr[32]; - uint32_t buf_face_r_dim_arr[32]; - uint32_t buf_narrow_tile_arr[32]; - uint32_t buf_tile_r_dim_arr[32]; - uint32_t buf_tile_c_dim_arr[32]; - uint32_t buf_tile_size_arr[32]; + DataFormat buf_dataformat_arr[NUM_CIRCULAR_BUFFERS]; + uint32_t buf_num_faces_arr[NUM_CIRCULAR_BUFFERS]; + uint32_t buf_partial_face_arr[NUM_CIRCULAR_BUFFERS]; + uint32_t buf_face_r_dim_arr[NUM_CIRCULAR_BUFFERS]; + uint32_t 
buf_narrow_tile_arr[NUM_CIRCULAR_BUFFERS]; + uint32_t buf_tile_r_dim_arr[NUM_CIRCULAR_BUFFERS]; + uint32_t buf_tile_c_dim_arr[NUM_CIRCULAR_BUFFERS]; + uint32_t buf_tile_size_arr[NUM_CIRCULAR_BUFFERS]; tt_hlk_desc() { math_fidelity = MathFidelity::Invalid; @@ -47,16 +45,9 @@ class tt_hlk_desc hlk_args_size = 0; approximation_mode = true; - for (int i = 0; i < 8; ++i) - { - input_buf_dataformat_arr[i] = DataFormat::Invalid; - param_buf_dataformat_arr[i] = DataFormat::Invalid; - output_buf_dataformat_arr[i] = DataFormat::Invalid; - intermediate_buf_dataformat_arr[i] = DataFormat::Invalid; - } - - for (int i = 0; i < 32; ++i) + for (int i = 0; i < NUM_CIRCULAR_BUFFERS; ++i) { + buf_dataformat_arr[i] = DataFormat::Invalid; buf_num_faces_arr[i] = constants::TILE_HW / constants::FACE_HW; buf_partial_face_arr[i] = 0; buf_face_r_dim_arr[i] = constants::FACE_HEIGHT; @@ -69,16 +60,9 @@ class tt_hlk_desc tt_hlk_desc(tt_hlk_desc &in) { - for(int i=0;i<8;++i) - { - input_buf_dataformat_arr[i] = in.input_buf_dataformat_arr[i] ; - param_buf_dataformat_arr[i] = in.param_buf_dataformat_arr[i] ; - output_buf_dataformat_arr[i] = in.output_buf_dataformat_arr[i]; - intermediate_buf_dataformat_arr[i] = in.intermediate_buf_dataformat_arr[i]; - } - - for (int i = 0; i < 32; ++i) + for(int i=0;i std::size_t operator()(tt::tt_hlk_desc const& obj) const noexcept { std::size_t hash_value = 0; - for (int i = 0; i < 8; i++) - { - tt::utils::hash_combine(hash_value, hash{}(obj.get_input_buf_dataformat(i))); - tt::utils::hash_combine(hash_value, hash{}(obj.get_param_buf_dataformat(i))); - tt::utils::hash_combine(hash_value, hash{}(obj.get_output_buf_dataformat(i))); - tt::utils::hash_combine(hash_value, hash{}(obj.get_intermediate_buf_dataformat(i))); - } - tt::utils::hash_combine(hash_value, hash{}(obj.get_hlk_math_fidelity())); - tt::utils::hash_combine(hash_value, hash{}(obj.get_hlk_math_approx_mode())); - for (int i = 0; i < 32; i++) + for (int i = 0; i < NUM_CIRCULAR_BUFFERS; i++) { + 
tt::utils::hash_combine(hash_value, hash{}(obj.get_buf_dataformat(i))); tt::utils::hash_combine(hash_value, hash{}(obj.get_buf_tile_r_dim(i))); tt::utils::hash_combine(hash_value, hash{}(obj.get_buf_tile_c_dim(i))); } + tt::utils::hash_combine(hash_value, hash{}(obj.get_hlk_math_fidelity())); + tt::utils::hash_combine(hash_value, hash{}(obj.get_hlk_math_approx_mode())); // Get hash for hlk_args here void *hlk_args = obj.get_hlk_args(); diff --git a/tt_metal/jit_build/settings.cpp b/tt_metal/jit_build/settings.cpp index d5f18dc2036..0112c1f7136 100644 --- a/tt_metal/jit_build/settings.cpp +++ b/tt_metal/jit_build/settings.cpp @@ -36,11 +36,11 @@ namespace tt::tt_metal hlk_desc.set_hlk_args(args, size); } - void JitBuildOptions::set_cb_dataformat_all_cores(CB cb_id, DataFormat data_format) { + void JitBuildOptions::set_cb_dataformat_all_cores(CBIndex cb_id, DataFormat data_format) { set_hlk_operand_dataformat_all_cores((HlkOperand)cb_id, data_format); } - void JitBuildOptions::set_cb_tile_dims_all_cores(CB cb_id, uint32_t num_faces, uint32_t partial_face, uint32_t face_r_dim, uint32_t narrow_tile, uint32_t tile_r_dim, uint32_t tile_c_dim) { + void JitBuildOptions::set_cb_tile_dims_all_cores(CBIndex cb_id, uint32_t num_faces, uint32_t partial_face, uint32_t face_r_dim, uint32_t narrow_tile, uint32_t tile_r_dim, uint32_t tile_c_dim) { hlk_desc.set_buf_num_faces((int)cb_id, num_faces); hlk_desc.set_buf_partial_face((int)cb_id, partial_face); hlk_desc.set_buf_face_r_dim((int)cb_id, face_r_dim); @@ -49,27 +49,13 @@ namespace tt::tt_metal hlk_desc.set_buf_tile_c_dim((int)cb_id, tile_c_dim); } - void JitBuildOptions::set_cb_tile_size_all_cores(CB cb_id, uint32_t tile_size) { + void JitBuildOptions::set_cb_tile_size_all_cores(CBIndex cb_id, uint32_t tile_size) { hlk_desc.set_buf_tile_size((int)cb_id, tile_size); } void JitBuildOptions::set_hlk_operand_dataformat_all_cores(HlkOperand op_id, DataFormat data_format) { - static_assert(HlkOperand::in7 == 
int(HlkOperand::param0)-1); - static_assert(HlkOperand::param7 == int(HlkOperand::out0)-1); - static_assert(HlkOperand::out7 == int(HlkOperand::intermed0)-1); - if (op_id <= HlkOperand::in7) { - hlk_desc.set_input_buf_dataformat((int)op_id, data_format); - } else if (op_id <= HlkOperand::param7) { - hlk_desc.set_param_buf_dataformat((int)op_id - ((int)HlkOperand::in7+1), data_format); - } else if (op_id <= HlkOperand::out7) { - hlk_desc.set_output_buf_dataformat((int)op_id - ((int)HlkOperand::param7+1), data_format); - } else if (op_id <= HlkOperand::intermed7) { - hlk_desc.set_intermediate_buf_dataformat((int)op_id - ((int)HlkOperand::out7+1), data_format); - } else { - std::cout << "Error: incorrect operand identifier" << std::endl; - TT_ASSERT(false); - } + hlk_desc.set_buf_dataformat((int)op_id, data_format); } } // end namespace tt diff --git a/tt_metal/jit_build/settings.hpp b/tt_metal/jit_build/settings.hpp index f5cac441b4f..25b7cd978b5 100644 --- a/tt_metal/jit_build/settings.hpp +++ b/tt_metal/jit_build/settings.hpp @@ -37,9 +37,9 @@ class JitBuildOptions { void set_hlk_math_approx_mode_all_cores(bool approx_mode); void set_hlk_args_all_cores(void* args, size_t size); - void set_cb_dataformat_all_cores(CB cb_id, DataFormat data_format); - void set_cb_tile_dims_all_cores(CB cb_id, uint32_t num_faces, uint32_t partial_face, uint32_t face_r_dim, uint32_t narrow_tile, uint32_t tile_r_dim, uint32_t tile_c_dim); - void set_cb_tile_size_all_cores(CB cb_id, uint32_t tile_size); + void set_cb_dataformat_all_cores(CBIndex cb_id, DataFormat data_format); + void set_cb_tile_dims_all_cores(CBIndex cb_id, uint32_t num_faces, uint32_t partial_face, uint32_t face_r_dim, uint32_t narrow_tile, uint32_t tile_r_dim, uint32_t tile_c_dim); + void set_cb_tile_size_all_cores(CBIndex cb_id, uint32_t tile_size); // old API name void set_hlk_operand_dataformat_all_cores(HlkOperand op_id, DataFormat data_format); }; diff --git a/tt_metal/kernels/compute/eltwise_binary.cpp 
b/tt_metal/kernels/compute/eltwise_binary.cpp index 3695359ef1d..3da06bb0d0b 100644 --- a/tt_metal/kernels/compute/eltwise_binary.cpp +++ b/tt_metal/kernels/compute/eltwise_binary.cpp @@ -14,13 +14,13 @@ void MAIN { uint32_t per_core_block_size = get_arg_val(1); uint32_t acc_to_dst = get_arg_val(2); - constexpr auto cb_in0 = tt::CB::c_in0; - constexpr auto cb_in1 = tt::CB::c_in1; + constexpr auto cb_in0 = tt::CBIndex::c_0; + constexpr auto cb_in1 = tt::CBIndex::c_1; constexpr auto cb_inp0 = cb_in0; constexpr auto cb_inp1 = cb_in1; - constexpr auto cb_out0 = tt::CB::c_out0; + constexpr auto cb_out0 = tt::CBIndex::c_16; - constexpr auto cb_in2 = tt::CB::c_in2; + constexpr auto cb_in2 = tt::CBIndex::c_2; binary_op_init_common(cb_inp0, cb_inp1, cb_out0); diff --git a/tt_metal/kernels/compute/eltwise_sfpu.cpp b/tt_metal/kernels/compute/eltwise_sfpu.cpp index 7edb1319438..93f88644e3c 100644 --- a/tt_metal/kernels/compute/eltwise_sfpu.cpp +++ b/tt_metal/kernels/compute/eltwise_sfpu.cpp @@ -13,28 +13,28 @@ void MAIN { uint32_t per_core_block_cnt = get_compile_time_arg_val(0); uint32_t per_core_block_dim = get_compile_time_arg_val(1); - init_sfpu(tt::CB::c_in0); + init_sfpu(tt::CBIndex::c_0, tt::CBIndex::c_16); for (uint32_t block_index = 0; block_index < per_core_block_cnt; block_index++) { - cb_reserve_back(tt::CB::c_out0, per_core_block_dim); + cb_reserve_back(tt::CBIndex::c_16, per_core_block_dim); for(uint32_t tile_index = 0; tile_index < per_core_block_dim; ++tile_index) { acquire_dst(); // Pop tile after tile, copy to DST and pack - cb_wait_front(tt::CB::c_in0, 1); + cb_wait_front(tt::CBIndex::c_0, 1); - copy_tile(tt::CB::c_in0, 0, 0); + copy_tile(tt::CBIndex::c_0, 0, 0); #ifdef SFPU_OP_CHAIN_0 SFPU_OP_CHAIN_0 #endif - pack_tile(0, tt::CB::c_out0); + pack_tile(0, tt::CBIndex::c_16); - cb_pop_front(tt::CB::c_in0, 1); + cb_pop_front(tt::CBIndex::c_0, 1); release_dst(); } - cb_push_back(tt::CB::c_out0, per_core_block_dim); + cb_push_back(tt::CBIndex::c_16, 
per_core_block_dim); } } diff --git a/tt_metal/kernels/dataflow/writer_unary.cpp b/tt_metal/kernels/dataflow/writer_unary.cpp index 3d6404e8c71..49a26473b65 100644 --- a/tt_metal/kernels/dataflow/writer_unary.cpp +++ b/tt_metal/kernels/dataflow/writer_unary.cpp @@ -10,7 +10,7 @@ void kernel_main() { uint32_t dst_noc_y = get_arg_val(2); uint32_t num_tiles = get_arg_val(3); - constexpr uint32_t cb_id_out0 = 16; + constexpr uint32_t cb_id_out0 = tt::CBIndex::c_16; // single-tile ublocks uint32_t ublock_size_bytes = get_tile_size(cb_id_out0); diff --git a/tt_metal/llrt/blackhole/bh_hal.cpp b/tt_metal/llrt/blackhole/bh_hal.cpp index c0bf3c3e4c8..ae0bbd76170 100644 --- a/tt_metal/llrt/blackhole/bh_hal.cpp +++ b/tt_metal/llrt/blackhole/bh_hal.cpp @@ -6,6 +6,7 @@ #include #include "core_config.h" // ProgrammableCoreType +#include "dev_mem_map.h" #include "noc/noc_parameters.h" #include "hal.hpp" @@ -43,6 +44,20 @@ void Hal::initialize_bh() { this->mem_alignments_[static_cast(HalMemType::L1)] = L1_ALIGNMENT; this->mem_alignments_[static_cast(HalMemType::DRAM)] = DRAM_ALIGNMENT; this->mem_alignments_[static_cast(HalMemType::HOST)] = PCIE_ALIGNMENT; + + this->relocate_func_ = [](uint64_t addr, uint64_t local_init_addr) { + if ((addr & MEM_LOCAL_BASE) == MEM_LOCAL_BASE) { + // Move addresses in the local memory range to l1 (copied by kernel) + return (addr & ~MEM_LOCAL_BASE) + local_init_addr; + } + + // Note: Blackhole does not have IRAM + + // No relocation needed + return addr; + }; + + } } // namespace tt_metal diff --git a/tt_metal/llrt/grayskull/gs_hal.cpp b/tt_metal/llrt/grayskull/gs_hal.cpp index 21ea11a40b8..302412be970 100644 --- a/tt_metal/llrt/grayskull/gs_hal.cpp +++ b/tt_metal/llrt/grayskull/gs_hal.cpp @@ -129,6 +129,20 @@ void Hal::initialize_gs() { this->mem_alignments_[static_cast(HalMemType::L1)] = L1_ALIGNMENT; this->mem_alignments_[static_cast(HalMemType::DRAM)] = DRAM_ALIGNMENT; this->mem_alignments_[static_cast(HalMemType::HOST)] = PCIE_ALIGNMENT; + + 
this->relocate_func_ = [](uint64_t addr, uint64_t local_init_addr) { + if ((addr & MEM_LOCAL_BASE) == MEM_LOCAL_BASE) { + // Move addresses in the local memory range to l1 (copied by kernel) + return (addr & ~MEM_LOCAL_BASE) + local_init_addr; + } + else if ((addr & MEM_NCRISC_IRAM_BASE) == MEM_NCRISC_IRAM_BASE) { + // Move addresses in the NCRISC memory range to l1 (copied by kernel) + return (addr & ~MEM_NCRISC_IRAM_BASE) + MEM_NCRISC_INIT_IRAM_L1_BASE; + } + + // No relocation needed + return addr; + }; } } // namespace tt_metal diff --git a/tt_metal/llrt/hal.hpp b/tt_metal/llrt/hal.hpp index 784790f5ba7..6b8155896eb 100644 --- a/tt_metal/llrt/hal.hpp +++ b/tt_metal/llrt/hal.hpp @@ -10,6 +10,7 @@ // #include +#include #include #include #include @@ -143,6 +144,10 @@ inline T HalCoreInfoType::get_binary_local_init_addr(uint32_t processor_class_id } class Hal { + + public: + using RelocateFunc = std::function; + private: tt::ARCH arch_; std::vector core_info_; @@ -154,6 +159,9 @@ class Hal { void initialize_wh(); void initialize_bh(); + // Functions where implementation varies by architecture + RelocateFunc relocate_func_; + public: Hal(); @@ -195,6 +203,11 @@ class Hal { T get_base_firmware_addr(uint32_t programmable_core_type_index, uint32_t processor_class_idx, uint32_t processor_type_idx) const; template T get_binary_local_init_addr(uint32_t programmable_core_type_index, uint32_t processor_class_idx, uint32_t processor_type_idx) const; + + uint64_t relocate_dev_addr(uint64_t addr, uint64_t local_init_addr = 0) { + return relocate_func_(addr, local_init_addr); + } + }; inline uint32_t Hal::get_programmable_core_type_count() const { diff --git a/tt_metal/llrt/llrt.cpp b/tt_metal/llrt/llrt.cpp index 15b70b1119f..8eab5caf742 100644 --- a/tt_metal/llrt/llrt.cpp +++ b/tt_metal/llrt/llrt.cpp @@ -138,7 +138,7 @@ ll_api::memory read_mem_from_core(chip_id_t chip, const CoreCoord &core, const l ll_api::memory read_mem; read_mem.fill_from_mem_template(mem, 
[&](std::vector::iterator mem_ptr, uint64_t addr, uint32_t len) { - uint64_t relo_addr = relocate_dev_addr(addr, local_init_addr); + uint64_t relo_addr = tt::tt_metal::hal.relocate_dev_addr(addr, local_init_addr); tt::Cluster::instance().read_core(&*mem_ptr, len * sizeof(uint32_t), tt_cxy_pair(chip, core), relo_addr); }); return read_mem; @@ -185,7 +185,7 @@ bool test_load_write_read_risc_binary( log_debug(tt::LogLLRuntime, "hex_vec size = {}, size_in_bytes = {}", mem.size(), mem.size()*sizeof(uint32_t)); mem.process_spans([&](std::vector::const_iterator mem_ptr, uint64_t addr, uint32_t len_words) { - uint64_t relo_addr = relocate_dev_addr(addr, local_init_addr); + uint64_t relo_addr = tt::tt_metal::hal.relocate_dev_addr(addr, local_init_addr); tt::Cluster::instance().write_core(&*mem_ptr, len_words * sizeof(uint32_t), tt_cxy_pair(chip_id, core), relo_addr); }); diff --git a/tt_metal/llrt/llrt.hpp b/tt_metal/llrt/llrt.hpp index 8560f515536..d2ce5bb2597 100644 --- a/tt_metal/llrt/llrt.hpp +++ b/tt_metal/llrt/llrt.hpp @@ -120,24 +120,6 @@ void wait_until_cores_done( } // namespace internal_ -inline uint64_t relocate_dev_addr(uint64_t addr, uint64_t local_init_addr = 0) { - uint64_t relo_addr; - if ((addr & MEM_LOCAL_BASE) == MEM_LOCAL_BASE) { - // Move addresses in the local memory range to l1 (copied by kernel) - relo_addr = (addr & ~MEM_LOCAL_BASE) + local_init_addr; - } -#ifdef NCRISC_HAS_IRAM - else if ((addr & MEM_NCRISC_IRAM_BASE) == MEM_NCRISC_IRAM_BASE) { - // Move addresses in the trisc memory range to l1 (copied by kernel) - relo_addr = (addr & ~MEM_NCRISC_IRAM_BASE) + MEM_NCRISC_INIT_IRAM_L1_BASE; - } -#endif - else { - relo_addr = addr; - } - return relo_addr; -} - } // namespace llrt } // namespace tt diff --git a/tt_metal/llrt/tt_cluster.cpp b/tt_metal/llrt/tt_cluster.cpp index c29d8d1d5b8..114ac7353b7 100644 --- a/tt_metal/llrt/tt_cluster.cpp +++ b/tt_metal/llrt/tt_cluster.cpp @@ -128,8 +128,16 @@ void Cluster::generate_cluster_descriptor() { // 
Cluster descriptor yaml not available for Blackhole bring up if (this->target_type_ == TargetDevice::Simulator) { - // Passing simulator reported physical devices as logical devices. - this->cluster_desc_ = tt_ClusterDescriptor::create_mock_cluster(tt_SimulationDevice::detect_available_device_ids(), this->arch_); + // Cannot use tt::umd::Cluster::detect_available_device_ids because that returns physical device IDs + std::vector physical_mmio_device_ids; + std::set logical_mmio_device_ids; + physical_mmio_device_ids = tt_SimulationDevice::detect_available_device_ids(); + for (chip_id_t logical_mmio_device_id = 0; logical_mmio_device_id < physical_mmio_device_ids.size(); + logical_mmio_device_id++) { + logical_mmio_device_ids.insert(logical_mmio_device_id); + } + this->cluster_desc_ = + tt_ClusterDescriptor::create_for_grayskull_cluster(logical_mmio_device_ids, physical_mmio_device_ids); } else { this->cluster_desc_ = tt_ClusterDescriptor::create_from_yaml(this->cluster_desc_path_); for (const auto &chip_id : this->cluster_desc_->get_all_chips()) { diff --git a/tt_metal/llrt/wormhole/wh_hal.cpp b/tt_metal/llrt/wormhole/wh_hal.cpp index 6f2edb3b069..6f2c449a061 100644 --- a/tt_metal/llrt/wormhole/wh_hal.cpp +++ b/tt_metal/llrt/wormhole/wh_hal.cpp @@ -6,6 +6,7 @@ #include #include "core_config.h" // ProgrammableCoreType +#include "dev_mem_map.h" // MEM_LOCAL_BASE #include "noc/noc_parameters.h" #include "hal.hpp" @@ -43,6 +44,21 @@ void Hal::initialize_wh() { this->mem_alignments_[static_cast(HalMemType::L1)] = L1_ALIGNMENT; this->mem_alignments_[static_cast(HalMemType::DRAM)] = DRAM_ALIGNMENT; this->mem_alignments_[static_cast(HalMemType::HOST)] = PCIE_ALIGNMENT; + + this->relocate_func_ = [](uint64_t addr, uint64_t local_init_addr) { + if ((addr & MEM_LOCAL_BASE) == MEM_LOCAL_BASE) { + // Move addresses in the local memory range to l1 (copied by kernel) + return (addr & ~MEM_LOCAL_BASE) + local_init_addr; + } + else if ((addr & MEM_NCRISC_IRAM_BASE) == 
MEM_NCRISC_IRAM_BASE) { + // Move addresses in the NCRISC memory range to l1 (copied by kernel) + return (addr & ~MEM_NCRISC_IRAM_BASE) + MEM_NCRISC_INIT_IRAM_L1_BASE; + } + + // No relocation needed + return addr; + }; + } } // namespace tt_metal diff --git a/tt_metal/programming_examples/add_2_integers_in_compute/add_2_integers_in_compute.cpp b/tt_metal/programming_examples/add_2_integers_in_compute/add_2_integers_in_compute.cpp index 80093cb45c1..0796ddb4ceb 100644 --- a/tt_metal/programming_examples/add_2_integers_in_compute/add_2_integers_in_compute.cpp +++ b/tt_metal/programming_examples/add_2_integers_in_compute/add_2_integers_in_compute.cpp @@ -41,16 +41,16 @@ int main(int argc, char **argv) { uint32_t dst_dram_noc_y = dst_dram_noc_coord.y; /* Use L1 circular buffers to set input and output buffers that the compute engine will use */ - constexpr uint32_t src0_cb_index = CB::c_in0; + constexpr uint32_t src0_cb_index = CBIndex::c_0; constexpr uint32_t num_input_tiles = 1; CircularBufferConfig cb_src0_config = CircularBufferConfig(num_input_tiles * single_tile_size, {{src0_cb_index, tt::DataFormat::Float16_b}}).set_page_size(src0_cb_index, single_tile_size); CBHandle cb_src0 = tt_metal::CreateCircularBuffer(program, core, cb_src0_config); - constexpr uint32_t src1_cb_index = CB::c_in1; + constexpr uint32_t src1_cb_index = CBIndex::c_1; CircularBufferConfig cb_src1_config = CircularBufferConfig(num_input_tiles * single_tile_size, {{src1_cb_index, tt::DataFormat::Float16_b}}).set_page_size(src1_cb_index, single_tile_size); CBHandle cb_src1 = tt_metal::CreateCircularBuffer(program, core, cb_src1_config); - constexpr uint32_t output_cb_index = CB::c_out0; + constexpr uint32_t output_cb_index = CBIndex::c_16; constexpr uint32_t num_output_tiles = 1; CircularBufferConfig cb_output_config = CircularBufferConfig(num_output_tiles * single_tile_size, {{output_cb_index, tt::DataFormat::Float16_b}}).set_page_size(output_cb_index, single_tile_size); CBHandle cb_output = 
tt_metal::CreateCircularBuffer(program, core, cb_output_config); diff --git a/tt_metal/programming_examples/add_2_integers_in_compute/kernels/compute/add_2_tiles.cpp b/tt_metal/programming_examples/add_2_integers_in_compute/kernels/compute/add_2_tiles.cpp index 285362edeab..3f8ed4ff3fe 100644 --- a/tt_metal/programming_examples/add_2_integers_in_compute/kernels/compute/add_2_tiles.cpp +++ b/tt_metal/programming_examples/add_2_integers_in_compute/kernels/compute/add_2_tiles.cpp @@ -8,9 +8,9 @@ namespace NAMESPACE { void MAIN { - constexpr auto cb_in0 = tt::CB::c_in0; - constexpr auto cb_in1 = tt::CB::c_in1; - constexpr auto cb_out0 = tt::CB::c_out0; + constexpr auto cb_in0 = tt::CBIndex::c_0; + constexpr auto cb_in1 = tt::CBIndex::c_1; + constexpr auto cb_out0 = tt::CBIndex::c_16; binary_op_init_common(cb_in0, cb_in1, cb_out0); add_tiles_init(); @@ -39,17 +39,17 @@ void MAIN { /* acquire_dst(); - cb_wait_front(tt::CB::c_in0, 1); - cb_wait_front(tt::CB::c_in1, 1); + cb_wait_front(tt::CBIndex::c_0, 1); + cb_wait_front(tt::CBIndex::c_1, 1); - add_tiles(tt::CB::c_in0, tt::CB::c_in1, 0, 0, 0); + add_tiles(tt::CBIndex::c_0, tt::CBIndex::c_1, 0, 0, 0); - cb_pop_front(tt::CB::c_in0, 1); - cb_pop_front(tt::CB::c_in1, 1); + cb_pop_front(tt::CBIndex::c_0, 1); + cb_pop_front(tt::CBIndex::c_1, 1); - cb_reserve_back(tt::CB::c_out0, 1); - pack_tile(0, tt::CB::c_out0); - cb_push_back(tt::CB::c_out0, 1); + cb_reserve_back(tt::CBIndex::c_16, 1); + pack_tile(0, tt::CBIndex::c_16); + cb_push_back(tt::CBIndex::c_16, 1); release_dst(); */ diff --git a/tt_metal/programming_examples/add_2_integers_in_compute/kernels/dataflow/reader_binary_1_tile.cpp b/tt_metal/programming_examples/add_2_integers_in_compute/kernels/dataflow/reader_binary_1_tile.cpp index bd3f9b6a703..a0b992403e5 100644 --- a/tt_metal/programming_examples/add_2_integers_in_compute/kernels/dataflow/reader_binary_1_tile.cpp +++ b/tt_metal/programming_examples/add_2_integers_in_compute/kernels/dataflow/reader_binary_1_tile.cpp 
@@ -16,8 +16,8 @@ void kernel_main() { uint64_t src0_noc_addr = get_noc_addr(src0_dram_noc_x, src0_dram_noc_y, src0_addr); uint64_t src1_noc_addr = get_noc_addr(src1_dram_noc_x, src1_dram_noc_y, src1_addr); - constexpr uint32_t cb_id_in0 = tt::CB::c_in0; - constexpr uint32_t cb_id_in1 = tt::CB::c_in1; + constexpr uint32_t cb_id_in0 = tt::CBIndex::c_0; + constexpr uint32_t cb_id_in1 = tt::CBIndex::c_1; // single-tile ublocks uint32_t ublock_size_bytes_0 = get_tile_size(cb_id_in0); diff --git a/tt_metal/programming_examples/add_2_integers_in_compute/kernels/dataflow/writer_1_tile.cpp b/tt_metal/programming_examples/add_2_integers_in_compute/kernels/dataflow/writer_1_tile.cpp index 920f757a197..f720b3d099e 100644 --- a/tt_metal/programming_examples/add_2_integers_in_compute/kernels/dataflow/writer_1_tile.cpp +++ b/tt_metal/programming_examples/add_2_integers_in_compute/kernels/dataflow/writer_1_tile.cpp @@ -11,7 +11,7 @@ void kernel_main() { uint64_t dst_noc_addr = get_noc_addr(dst_dram_noc_x, dst_dram_noc_y, dst_addr); - constexpr uint32_t cb_id_out0 = tt::CB::c_out0; + constexpr uint32_t cb_id_out0 = tt::CBIndex::c_16; uint32_t ublock_size_bytes = get_tile_size(cb_id_out0); uint32_t l1_read_addr = get_read_ptr(cb_id_out0); diff --git a/tt_metal/programming_examples/add_2_integers_in_riscv/add_2_integers_in_riscv.cpp b/tt_metal/programming_examples/add_2_integers_in_riscv/add_2_integers_in_riscv.cpp index 6c46decd81f..2c41cc6e728 100644 --- a/tt_metal/programming_examples/add_2_integers_in_riscv/add_2_integers_in_riscv.cpp +++ b/tt_metal/programming_examples/add_2_integers_in_riscv/add_2_integers_in_riscv.cpp @@ -48,11 +48,11 @@ int main(int argc, char **argv) { EnqueueWriteBuffer(cq, src1_dram_buffer, src1_vec, false); /* Use L1 circular buffers to set input buffers */ - constexpr uint32_t src0_cb_index = CB::c_in0; + constexpr uint32_t src0_cb_index = CBIndex::c_0; CircularBufferConfig cb_src0_config = CircularBufferConfig(single_tile_size, {{src0_cb_index, 
tt::DataFormat::Float16_b}}).set_page_size(src0_cb_index, single_tile_size); CBHandle cb_src0 = tt_metal::CreateCircularBuffer(program, core, cb_src0_config); - constexpr uint32_t src1_cb_index = CB::c_in1; + constexpr uint32_t src1_cb_index = CBIndex::c_1; CircularBufferConfig cb_src1_config = CircularBufferConfig(single_tile_size, {{src1_cb_index, tt::DataFormat::Float16_b}}).set_page_size(src1_cb_index, single_tile_size); CBHandle cb_src1 = tt_metal::CreateCircularBuffer(program, core, cb_src1_config); diff --git a/tt_metal/programming_examples/add_2_integers_in_riscv/kernels/reader_writer_add_in_riscv.cpp b/tt_metal/programming_examples/add_2_integers_in_riscv/kernels/reader_writer_add_in_riscv.cpp index 286c90bdb28..9fe88e16991 100644 --- a/tt_metal/programming_examples/add_2_integers_in_riscv/kernels/reader_writer_add_in_riscv.cpp +++ b/tt_metal/programming_examples/add_2_integers_in_riscv/kernels/reader_writer_add_in_riscv.cpp @@ -18,8 +18,8 @@ void kernel_main() { uint64_t src1_dram_noc_addr = get_noc_addr(src1_dram_noc_x, src1_dram_noc_y, src1_dram); uint64_t dst_dram_noc_addr = get_noc_addr(dst_dram_noc_x, dst_dram_noc_y, dst_dram); - constexpr uint32_t cb_id_in0 = tt::CB::c_in0; // index=0 - constexpr uint32_t cb_id_in1 = tt::CB::c_in1; // index=1 + constexpr uint32_t cb_id_in0 = tt::CBIndex::c_0; // index=0 + constexpr uint32_t cb_id_in1 = tt::CBIndex::c_1; // index=1 // single-tile ublocks uint32_t ublock_size_bytes_0 = get_tile_size(cb_id_in0); diff --git a/tt_metal/programming_examples/contributed/vecadd/kernels/add.cpp b/tt_metal/programming_examples/contributed/vecadd/kernels/add.cpp index 368259e96b1..856bd840345 100644 --- a/tt_metal/programming_examples/contributed/vecadd/kernels/add.cpp +++ b/tt_metal/programming_examples/contributed/vecadd/kernels/add.cpp @@ -13,10 +13,10 @@ void MAIN { uint32_t n_tiles = get_arg_val(0); // We are going to read from these two circular buffers - constexpr auto cb_in0 = tt::CB::c_in0; - constexpr auto cb_in1 = 
tt::CB::c_in1; + constexpr auto cb_in0 = tt::CBIndex::c_0; + constexpr auto cb_in1 = tt::CBIndex::c_1; // and write to the output circular buffer - constexpr auto cb_out0 = tt::CB::c_out0; + constexpr auto cb_out0 = tt::CBIndex::c_16; // The destination register. // Quote the doc: "This register is an array of 16 tiles of 32x32 elements each." // If you are fimilar with the concept of rotating register file from computer diff --git a/tt_metal/programming_examples/contributed/vecadd/kernels/interleaved_tile_read.cpp b/tt_metal/programming_examples/contributed/vecadd/kernels/interleaved_tile_read.cpp index 67a89d4c55a..b88bf7e7f05 100644 --- a/tt_metal/programming_examples/contributed/vecadd/kernels/interleaved_tile_read.cpp +++ b/tt_metal/programming_examples/contributed/vecadd/kernels/interleaved_tile_read.cpp @@ -12,8 +12,8 @@ void kernel_main() uint32_t n_tiles = get_arg_val(2); // The circular buffers to read the tiles into - constexpr uint32_t cb_in0 = tt::CB::c_in0; - constexpr uint32_t cb_in1 = tt::CB::c_in1; + constexpr uint32_t cb_in0 = tt::CBIndex::c_0; + constexpr uint32_t cb_in1 = tt::CBIndex::c_1; // Get the tile size used in the circular buffers. We assume the // circular buffers are created with the same tile size as the DRAM diff --git a/tt_metal/programming_examples/contributed/vecadd/kernels/tile_write.cpp b/tt_metal/programming_examples/contributed/vecadd/kernels/tile_write.cpp index e515e1fe88d..7e0d5028a6b 100644 --- a/tt_metal/programming_examples/contributed/vecadd/kernels/tile_write.cpp +++ b/tt_metal/programming_examples/contributed/vecadd/kernels/tile_write.cpp @@ -10,7 +10,7 @@ void kernel_main() uint32_t n_tiles = get_arg_val(1); // The circular buffer that we are going to read from and write to DRAM - constexpr uint32_t cb_out0 = tt::CB::c_out0; + constexpr uint32_t cb_out0 = tt::CBIndex::c_16; const uint32_t tile_size_bytes = get_tile_size(cb_out0); // Address generator for the output buffer. This is faster than doing plain DRAM writes. 
diff --git a/tt_metal/programming_examples/contributed/vecadd/vecadd.cpp b/tt_metal/programming_examples/contributed/vecadd/vecadd.cpp index 9e3fc07776a..463a702831b 100644 --- a/tt_metal/programming_examples/contributed/vecadd/vecadd.cpp +++ b/tt_metal/programming_examples/contributed/vecadd/vecadd.cpp @@ -45,7 +45,7 @@ std::shared_ptr MakeBufferBFP16(Device *device, uint32_t n_tiles, bool s return MakeBuffer(device, tile_size * n_tiles, page_tiles * tile_size, sram); } -CBHandle MakeCircularBuffer(Program& program, const CoreSpec& core, tt::CB cb, uint32_t size, uint32_t page_size, tt::DataFormat format) +CBHandle MakeCircularBuffer(Program& program, const CoreSpec& core, tt::CBIndex cb, uint32_t size, uint32_t page_size, tt::DataFormat format) { CircularBufferConfig cb_src0_config = CircularBufferConfig( size, @@ -64,7 +64,7 @@ CBHandle MakeCircularBuffer(Program& program, const CoreSpec& core, tt::CB cb, u // @param core: The core to create the circular buffer on. // @param cb: Which circular buffer to create (c_in0, c_in1, c_out0, c_out1, etc..). This is just an ID // @param n_tiles: The number of tiles the circular buffer can hold. -CBHandle MakeCircularBufferBFP16(Program& program, const CoreSpec& core, tt::CB cb, uint32_t n_tiles) +CBHandle MakeCircularBufferBFP16(Program& program, const CoreSpec& core, tt::CBIndex cb, uint32_t n_tiles) { constexpr uint32_t tile_size = sizeof(bfloat16) * TILE_WIDTH * TILE_HEIGHT; return MakeCircularBuffer(program, core, cb, n_tiles * tile_size, tile_size, tt::DataFormat::Float16_b); @@ -134,9 +134,9 @@ int main(int argc, char **argv) const uint32_t tiles_per_cb = 4; // Create 3 circular buffers. These will be used by the data movement kernels to stream data into the compute cores and for the compute cores to stream data out. 
- CBHandle cb_a = MakeCircularBufferBFP16(program, core, tt::CB::c_in0, tiles_per_cb); - CBHandle cb_b = MakeCircularBufferBFP16(program, core, tt::CB::c_in1, tiles_per_cb); - CBHandle cb_c = MakeCircularBufferBFP16(program, core, tt::CB::c_out0, tiles_per_cb); + CBHandle cb_a = MakeCircularBufferBFP16(program, core, tt::CBIndex::c_0, tiles_per_cb); + CBHandle cb_b = MakeCircularBufferBFP16(program, core, tt::CBIndex::c_1, tiles_per_cb); + CBHandle cb_c = MakeCircularBufferBFP16(program, core, tt::CBIndex::c_16, tiles_per_cb); EnqueueWriteBuffer(cq, a, a_data, false); EnqueueWriteBuffer(cq, b, b_data, false); diff --git a/tt_metal/programming_examples/eltwise_binary/eltwise_binary.cpp b/tt_metal/programming_examples/eltwise_binary/eltwise_binary.cpp index e4a31d64676..a9b18d096d2 100644 --- a/tt_metal/programming_examples/eltwise_binary/eltwise_binary.cpp +++ b/tt_metal/programming_examples/eltwise_binary/eltwise_binary.cpp @@ -94,16 +94,16 @@ int main(int argc, char **argv) { * Use circular buffers to set input and output buffers that the * compute engine will use. 
*/ - constexpr uint32_t src0_cb_index = CB::c_in0; + constexpr uint32_t src0_cb_index = tt::CBIndex::c_0; constexpr uint32_t num_input_tiles = 2; CircularBufferConfig cb_src0_config = CircularBufferConfig(num_input_tiles * single_tile_size, {{src0_cb_index, tt::DataFormat::Float16_b}}).set_page_size(src0_cb_index, single_tile_size); CBHandle cb_src0 = tt_metal::CreateCircularBuffer(program, core, cb_src0_config); - constexpr uint32_t src1_cb_index = CB::c_in1; + constexpr uint32_t src1_cb_index = tt::CBIndex::c_1; CircularBufferConfig cb_src1_config = CircularBufferConfig(num_input_tiles * single_tile_size, {{src1_cb_index, tt::DataFormat::Float16_b}}).set_page_size(src1_cb_index, single_tile_size); CBHandle cb_src1 = tt_metal::CreateCircularBuffer(program, core, cb_src1_config); - constexpr uint32_t output_cb_index = CB::c_out0; + constexpr uint32_t output_cb_index = tt::CBIndex::c_16; constexpr uint32_t num_output_tiles = 2; CircularBufferConfig cb_output_config = CircularBufferConfig(num_output_tiles * single_tile_size, {{output_cb_index, tt::DataFormat::Float16_b}}).set_page_size(output_cb_index, single_tile_size); CBHandle cb_output = tt_metal::CreateCircularBuffer(program, core, cb_output_config); diff --git a/tt_metal/programming_examples/eltwise_sfpu/eltwise_sfpu.cpp b/tt_metal/programming_examples/eltwise_sfpu/eltwise_sfpu.cpp index bc3e6593501..1896a75efc8 100644 --- a/tt_metal/programming_examples/eltwise_sfpu/eltwise_sfpu.cpp +++ b/tt_metal/programming_examples/eltwise_sfpu/eltwise_sfpu.cpp @@ -61,12 +61,12 @@ int main(int argc, char **argv) { * Use circular buffers to set input and output buffers that the * compute engine will use. 
*/ - constexpr uint32_t src0_cb_index = CB::c_in0; + constexpr uint32_t src0_cb_index = tt::CBIndex::c_0; constexpr uint32_t num_input_tiles = 2; CircularBufferConfig cb_src0_config = CircularBufferConfig(num_input_tiles * single_tile_size, {{src0_cb_index, tt::DataFormat::Float16_b}}).set_page_size(src0_cb_index, single_tile_size); CBHandle cb_src0 = tt_metal::CreateCircularBuffer(program, core, cb_src0_config); - constexpr uint32_t output_cb_index = CB::c_out0; + constexpr uint32_t output_cb_index = tt::CBIndex::c_16; constexpr uint32_t num_output_tiles = 2; CircularBufferConfig cb_output_config = CircularBufferConfig(num_output_tiles * single_tile_size, {{output_cb_index, tt::DataFormat::Float16_b}}).set_page_size(output_cb_index, single_tile_size); CBHandle cb_output = tt_metal::CreateCircularBuffer(program, core, cb_output_config); diff --git a/tt_metal/programming_examples/hello_world_datatypes_kernel/hello_world_datatypes_kernel.cpp b/tt_metal/programming_examples/hello_world_datatypes_kernel/hello_world_datatypes_kernel.cpp index 037e7390e0f..c7263ed39a8 100644 --- a/tt_metal/programming_examples/hello_world_datatypes_kernel/hello_world_datatypes_kernel.cpp +++ b/tt_metal/programming_examples/hello_world_datatypes_kernel/hello_world_datatypes_kernel.cpp @@ -31,7 +31,7 @@ int main(int argc, char **argv) { // Configure and Create Circular Buffer (to move data from DRAM to L1) - constexpr uint32_t src0_cb_index = CB::c_in0; + constexpr uint32_t src0_cb_index = CBIndex::c_0; CircularBufferConfig cb_src0_config = CircularBufferConfig(buffer_size, {{src0_cb_index, tt::DataFormat::Float16_b}}).set_page_size(src0_cb_index, buffer_size); CBHandle cb_src0 = tt_metal::CreateCircularBuffer(program, core, cb_src0_config); diff --git a/tt_metal/programming_examples/hello_world_datatypes_kernel/kernels/dataflow/float_dataflow_kernel.cpp b/tt_metal/programming_examples/hello_world_datatypes_kernel/kernels/dataflow/float_dataflow_kernel.cpp index 60348a9d931..d718af0f70e 
100644 --- a/tt_metal/programming_examples/hello_world_datatypes_kernel/kernels/dataflow/float_dataflow_kernel.cpp +++ b/tt_metal/programming_examples/hello_world_datatypes_kernel/kernels/dataflow/float_dataflow_kernel.cpp @@ -13,7 +13,7 @@ void kernel_main() { // Copy float from device DRAM into Core 0,0's L1 uint32_t dram_addr = get_arg_val(0); uint64_t noc_addr = get_noc_addr(1, 0, dram_addr); - constexpr uint32_t cb_id = tt::CB::c_in0; // index=0 + constexpr uint32_t cb_id = tt::CBIndex::c_0; // index=0 uint32_t size = get_tile_size(cb_id); uint32_t l1_addr= get_write_ptr(cb_id); cb_reserve_back(cb_id, 0); diff --git a/tt_metal/programming_examples/matmul_common/kernels/compute/bmm.cpp b/tt_metal/programming_examples/matmul_common/kernels/compute/bmm.cpp index d62a8e06e98..32bf8674741 100644 --- a/tt_metal/programming_examples/matmul_common/kernels/compute/bmm.cpp +++ b/tt_metal/programming_examples/matmul_common/kernels/compute/bmm.cpp @@ -33,18 +33,18 @@ void MAIN { { acquire_dst(); for (uint32_t kt = 0; kt < Kt; kt++) { - cb_wait_front(tt::CB::c_in0, onetile); - cb_wait_front(tt::CB::c_in1, onetile); + cb_wait_front(tt::CBIndex::c_0, onetile); + cb_wait_front(tt::CBIndex::c_1, onetile); - matmul_tiles(tt::CB::c_in0, tt::CB::c_in1, 0, 0, 0, false); + matmul_tiles(tt::CBIndex::c_0, tt::CBIndex::c_1, 0, 0, 0, false); - cb_pop_front(tt::CB::c_in0, onetile); - cb_pop_front(tt::CB::c_in1, onetile); + cb_pop_front(tt::CBIndex::c_0, onetile); + cb_pop_front(tt::CBIndex::c_1, onetile); } - cb_reserve_back(tt::CB::c_out0, onetile); - pack_tile(0, tt::CB::c_out0); - cb_push_back(tt::CB::c_out0, onetile); + cb_reserve_back(tt::CBIndex::c_16, onetile); + pack_tile(0, tt::CBIndex::c_16); + cb_push_back(tt::CBIndex::c_16, onetile); release_dst(); } diff --git a/tt_metal/programming_examples/matmul_common/kernels/compute/bmm_large_block_zm.cpp b/tt_metal/programming_examples/matmul_common/kernels/compute/bmm_large_block_zm.cpp index 2ab808f2f32..58db4b37817 100644 --- 
a/tt_metal/programming_examples/matmul_common/kernels/compute/bmm_large_block_zm.cpp +++ b/tt_metal/programming_examples/matmul_common/kernels/compute/bmm_large_block_zm.cpp @@ -34,8 +34,8 @@ void MAIN { { bool last_out = block == (num_blocks-1); - cb_wait_front(tt::CB::c_in0, in0_block_num_tiles); - cb_wait_front(tt::CB::c_in1, in1_block_num_tiles); + cb_wait_front(tt::CBIndex::c_0, in0_block_num_tiles); + cb_wait_front(tt::CBIndex::c_1, in1_block_num_tiles); int in0_index_subblock_offset = 0; for (uint32_t in0_subblock = 0; in0_subblock < in0_num_subblocks; in0_subblock++) { int in1_index_subblock_offset = 0; @@ -45,11 +45,11 @@ void MAIN { if (enable_reload) { copy_tile_to_dst_init_short(); - cb_wait_front(tt::CB::c_intermed0, out_subblock_num_tiles); + cb_wait_front(tt::CBIndex::c_24, out_subblock_num_tiles); for (uint32_t i = 0; i < out_subblock_num_tiles; i++) { - copy_tile(tt::CB::c_intermed0, i, i); + copy_tile(tt::CBIndex::c_24, i, i); } - cb_pop_front(tt::CB::c_intermed0, out_subblock_num_tiles); + cb_pop_front(tt::CBIndex::c_24, out_subblock_num_tiles); mm_init_short(); } @@ -62,7 +62,7 @@ void MAIN { for (uint32_t inner_dim = 0; inner_dim < in0_block_w; inner_dim++) { int in0_index = in0_index_subblock_offset + in0_index_h_offset + inner_dim; int in1_index = in1_index_subblock_offset + in1_index_inner_dim_offset + w; - matmul_tiles(tt::CB::c_in0, tt::CB::c_in1, in0_index, in1_index, dst_index, false /* transpose */); + matmul_tiles(tt::CBIndex::c_0, tt::CBIndex::c_1, in0_index, in1_index, dst_index, false /* transpose */); in1_index_inner_dim_offset += in1_per_core_w; } dst_index++; @@ -72,23 +72,23 @@ void MAIN { if (last_out) { // Pack out to output buffer - cb_reserve_back(tt::CB::c_out0, out_subblock_num_tiles); + cb_reserve_back(tt::CBIndex::c_16, out_subblock_num_tiles); for (uint32_t i = 0; i < out_subblock_num_tiles; i++) { - pack_tile(i, tt::CB::c_out0); + pack_tile(i, tt::CBIndex::c_16); } - cb_push_back(tt::CB::c_out0, 
out_subblock_num_tiles); + cb_push_back(tt::CBIndex::c_16, out_subblock_num_tiles); } else { // Wait for tiles in output buffer to be written out since interm and output share memory if (block == 0) { - cb_reserve_back(tt::CB::c_out0, out_num_tiles_to_wait); + cb_reserve_back(tt::CBIndex::c_16, out_num_tiles_to_wait); out_num_tiles_to_wait += out_subblock_num_tiles; } // Move partial result to interm buffer - cb_reserve_back(tt::CB::c_intermed0, out_subblock_num_tiles); + cb_reserve_back(tt::CBIndex::c_24, out_subblock_num_tiles); for (uint32_t i = 0; i < out_subblock_num_tiles; i++) { - pack_tile(i, tt::CB::c_intermed0); + pack_tile(i, tt::CBIndex::c_24); } - cb_push_back(tt::CB::c_intermed0, out_subblock_num_tiles); + cb_push_back(tt::CBIndex::c_24, out_subblock_num_tiles); } release_dst(); @@ -99,8 +99,8 @@ void MAIN { if (spill) enable_reload = true; - cb_pop_front(tt::CB::c_in0, in0_block_num_tiles); - cb_pop_front(tt::CB::c_in1, in1_block_num_tiles); + cb_pop_front(tt::CBIndex::c_0, in0_block_num_tiles); + cb_pop_front(tt::CBIndex::c_1, in1_block_num_tiles); } } diff --git a/tt_metal/programming_examples/matmul_multi_core/matmul_multi_core.cpp b/tt_metal/programming_examples/matmul_multi_core/matmul_multi_core.cpp index ab47971dbc6..b99b5a6247d 100644 --- a/tt_metal/programming_examples/matmul_multi_core/matmul_multi_core.cpp +++ b/tt_metal/programming_examples/matmul_multi_core/matmul_multi_core.cpp @@ -111,7 +111,7 @@ void matmul_multi_core(std::vector& a, std::vector& b, std:: tt_metal::InterleavedBufferConfig dram_config_C{ .device= device, - .size = dram_buffer_B_size, + .size = dram_buffer_C_size, .page_size = single_tile_size, .buffer_type = tt_metal::BufferType::DRAM }; @@ -127,18 +127,18 @@ void matmul_multi_core(std::vector& a, std::vector& b, std:: * Config of Circular Buffer in the device L1 * input tiles count is = 2 because it's single tile process, and double-buffer */ - uint32_t src0_cb_index = CB::c_in0; //0 + uint32_t src0_cb_index = 
CBIndex::c_0; //0 uint32_t num_input_tiles = 2; CircularBufferConfig cb_src0_config = CircularBufferConfig(num_input_tiles * single_tile_size, {{src0_cb_index, cb_data_format}}) .set_page_size(src0_cb_index, single_tile_size); auto cb_src0 = tt_metal::CreateCircularBuffer(program, all_cores, cb_src0_config); - uint32_t src1_cb_index = CB::c_in1; // 1 + uint32_t src1_cb_index = CBIndex::c_1; // 1 CircularBufferConfig cb_src1_config = CircularBufferConfig(num_input_tiles * single_tile_size, {{src1_cb_index, cb_data_format}}) .set_page_size(src1_cb_index, single_tile_size); auto cb_src1 = tt_metal::CreateCircularBuffer(program, all_cores, cb_src1_config); - uint32_t output_cb_index = CB::c_out0; // output operands start at index 16 + uint32_t output_cb_index = tt::CBIndex::c_16; uint32_t num_output_tiles = 2; CircularBufferConfig cb_output_config = CircularBufferConfig(num_output_tiles * single_tile_size, {{output_cb_index, cb_data_format}}) .set_page_size(output_cb_index, single_tile_size); diff --git a/tt_metal/programming_examples/matmul_multicore_reuse/matmul_multicore_reuse.cpp b/tt_metal/programming_examples/matmul_multicore_reuse/matmul_multicore_reuse.cpp index 41e3a1cd231..223c0d46779 100644 --- a/tt_metal/programming_examples/matmul_multicore_reuse/matmul_multicore_reuse.cpp +++ b/tt_metal/programming_examples/matmul_multicore_reuse/matmul_multicore_reuse.cpp @@ -179,7 +179,7 @@ void matmul_multicore_reuse(std::vector& a, std::vector& b, tt_metal::InterleavedBufferConfig dram_config_C{ .device= device, - .size = dram_buffer_B_size, + .size = dram_buffer_C_size, .page_size = single_tile_size, .buffer_type = tt_metal::BufferType::DRAM }; @@ -197,17 +197,17 @@ void matmul_multicore_reuse(std::vector& a, std::vector& b, * Config of Circular Buffer in the device L1 * input tiles count is = 2 because it's single tile process, and double-buffer */ - uint32_t src0_cb_index = CB::c_in0; //0 + uint32_t src0_cb_index = CBIndex::c_0; //0 CircularBufferConfig 
cb_src0_config = CircularBufferConfig(in0_CB_size, {{src0_cb_index, cb_data_format}}) .set_page_size(src0_cb_index, single_tile_size); auto cb_src0 = tt_metal::CreateCircularBuffer(program, all_cores, cb_src0_config); - uint32_t src1_cb_index = CB::c_in1; // 1 + uint32_t src1_cb_index = CBIndex::c_1; // 1 CircularBufferConfig cb_src1_config = CircularBufferConfig(in1_CB_size, {{src1_cb_index, cb_data_format}}) .set_page_size(src1_cb_index, single_tile_size); auto cb_src1 = tt_metal::CreateCircularBuffer(program, all_cores, cb_src1_config); - uint32_t output_cb_index = CB::c_out0; // output operands start at index 16 + uint32_t output_cb_index = tt::CBIndex::c_16; uint32_t interm0_cb_index = 24; std::map output_cb_data_format_spec { {output_cb_index, cb_data_format}, diff --git a/tt_metal/programming_examples/matmul_multicore_reuse_mcast/matmul_multicore_reuse_mcast.cpp b/tt_metal/programming_examples/matmul_multicore_reuse_mcast/matmul_multicore_reuse_mcast.cpp index b6beb079bea..bafed8ae0e1 100644 --- a/tt_metal/programming_examples/matmul_multicore_reuse_mcast/matmul_multicore_reuse_mcast.cpp +++ b/tt_metal/programming_examples/matmul_multicore_reuse_mcast/matmul_multicore_reuse_mcast.cpp @@ -209,7 +209,7 @@ void matmul_multicore_reuse_mcast(std::vector& a, std::vector& a, std::vector output_cb_data_format_spec { {output_cb_index, cb_data_format}, @@ -460,9 +460,9 @@ int main(int argc, char **argv) { // NOTE: Maximum number of tiles in output is 120 * 16^2 = 30,720 (eg. 
[1, 1, 5120, 6144]) /* Create source data */ - constexpr uint32_t M = 3200; // user-defined - constexpr uint32_t N = 3200; // user-defined - constexpr uint32_t K = 3200; // user-defined + constexpr uint32_t M = 3584; // user-defined + constexpr uint32_t N = 3072; // user-defined + constexpr uint32_t K = 768; // user-defined constexpr uint32_t B = 1; // user-defined uint32_t Mt = M / TILE_HEIGHT; diff --git a/tt_metal/programming_examples/matmul_single_core/matmul_single_core.cpp b/tt_metal/programming_examples/matmul_single_core/matmul_single_core.cpp index 6e95757ba1e..b355ad04927 100644 --- a/tt_metal/programming_examples/matmul_single_core/matmul_single_core.cpp +++ b/tt_metal/programming_examples/matmul_single_core/matmul_single_core.cpp @@ -96,7 +96,7 @@ void matmul_single_core(std::vector& a, std::vector& b, std: tt_metal::InterleavedBufferConfig dram_config_C{ .device= device, - .size = dram_buffer_B_size, + .size = dram_buffer_C_size, .page_size = single_tile_size, .buffer_type = tt_metal::BufferType::DRAM }; @@ -112,18 +112,18 @@ void matmul_single_core(std::vector& a, std::vector& b, std: * Config of Circular Buffer in the device L1 * input tiles count is = 2 because it's single tile process, and double-buffer */ - uint32_t src0_cb_index = CB::c_in0; //0 + uint32_t src0_cb_index = CBIndex::c_0; //0 uint32_t num_input_tiles = 2; CircularBufferConfig cb_src0_config = CircularBufferConfig(num_input_tiles * single_tile_size, {{src0_cb_index, cb_data_format}}) .set_page_size(src0_cb_index, single_tile_size); auto cb_src0 = tt_metal::CreateCircularBuffer(program, core, cb_src0_config); - uint32_t src1_cb_index = CB::c_in1; // 1 + uint32_t src1_cb_index = CBIndex::c_1; // 1 CircularBufferConfig cb_src1_config = CircularBufferConfig(num_input_tiles * single_tile_size, {{src1_cb_index, cb_data_format}}) .set_page_size(src1_cb_index, single_tile_size); auto cb_src1 = tt_metal::CreateCircularBuffer(program, core, cb_src1_config); - uint32_t output_cb_index = 
CB::c_out0; // output operands start at index 16 + uint32_t output_cb_index = tt::CBIndex::c_16; uint32_t num_output_tiles = 2; CircularBufferConfig cb_output_config = CircularBufferConfig(num_output_tiles * single_tile_size, {{output_cb_index, cb_data_format}}) .set_page_size(output_cb_index, single_tile_size); diff --git a/tt_metal/programming_examples/pad/kernels/pad_reader_dims_rm_interleaved.cpp b/tt_metal/programming_examples/pad/kernels/pad_reader_dims_rm_interleaved.cpp index 79a591d4dba..77e26a3f18e 100644 --- a/tt_metal/programming_examples/pad/kernels/pad_reader_dims_rm_interleaved.cpp +++ b/tt_metal/programming_examples/pad/kernels/pad_reader_dims_rm_interleaved.cpp @@ -17,7 +17,7 @@ void kernel_main() { constexpr bool src_is_dram = get_compile_time_arg_val(0) == 1; constexpr bool pad_is_dram = get_compile_time_arg_val(1) == 1; - constexpr uint32_t cb_id = tt::CB::c_in0; + constexpr uint32_t cb_id = tt::CBIndex::c_0; const InterleavedAddrGen s0 = { .bank_base_address = src_addr, diff --git a/tt_metal/programming_examples/pad/kernels/pad_writer_dims_rm_interleaved.cpp b/tt_metal/programming_examples/pad/kernels/pad_writer_dims_rm_interleaved.cpp index 5f5a5e080c3..6d041dd22e8 100644 --- a/tt_metal/programming_examples/pad/kernels/pad_writer_dims_rm_interleaved.cpp +++ b/tt_metal/programming_examples/pad/kernels/pad_writer_dims_rm_interleaved.cpp @@ -15,7 +15,7 @@ void kernel_main() { const uint32_t num_rows_per_core = get_arg_val(4); constexpr bool dst_is_dram = get_compile_time_arg_val(0) == 1; - constexpr uint32_t cb_id = tt::CB::c_in0; + constexpr uint32_t cb_id = tt::CBIndex::c_0; const InterleavedAddrGen s0 = { .bank_base_address = dst_addr, diff --git a/tt_metal/programming_examples/pad/pad_multi_core.cpp b/tt_metal/programming_examples/pad/pad_multi_core.cpp index 74aaf317141..44d8f483cd0 100644 --- a/tt_metal/programming_examples/pad/pad_multi_core.cpp +++ b/tt_metal/programming_examples/pad/pad_multi_core.cpp @@ -85,7 +85,7 @@ int main(int argc, 
char **argv) { uint32_t dst_addr = dst_buffer->address(); // configure and create circular buffer - uint32_t cb_id = CB::c_in0; + uint32_t cb_id = CBIndex::c_0; tt::DataFormat cb_data_format = tt::DataFormat::UInt32; CircularBufferConfig cb_config = tt::tt_metal::CircularBufferConfig(dst_N * packed_data_size * 2, {{cb_id, cb_data_format}}) .set_page_size(cb_id, packed_data_size); diff --git a/tt_metal/programming_examples/sharding/shard_data_rm.cpp b/tt_metal/programming_examples/sharding/shard_data_rm.cpp index 7528ce1d2ba..0c4c5c25300 100644 --- a/tt_metal/programming_examples/sharding/shard_data_rm.cpp +++ b/tt_metal/programming_examples/sharding/shard_data_rm.cpp @@ -60,7 +60,7 @@ int main(int argc, char **argv) { // configure and create circular buffers with the same address on each of the designated cores bool src_is_dram = src_buffer->buffer_type() == tt_metal::BufferType::DRAM ? 1 : 0; - uint32_t input_cb_index = CB::c_in0; + uint32_t input_cb_index = CBIndex::c_0; CircularBufferConfig input_cb_config = CircularBufferConfig(shard_size * input_unit_size, {{input_cb_index, cb_data_format}}) .set_page_size(input_cb_index, input_unit_size); auto cb_input = tt_metal::CreateCircularBuffer(program, cores, input_cb_config); diff --git a/tt_metal/third_party/umd b/tt_metal/third_party/umd index e9dc0d12fc2..8985708a924 160000 --- a/tt_metal/third_party/umd +++ b/tt_metal/third_party/umd @@ -1 +1 @@ -Subproject commit e9dc0d12fc24aba9196b05e6ba142be475447694 +Subproject commit 8985708a924ee48f202f79f31e05fc15a25a7a44 diff --git a/ttnn/CMakeLists.txt b/ttnn/CMakeLists.txt index 1bf43035a8b..6afcf05d46a 100644 --- a/ttnn/CMakeLists.txt +++ b/ttnn/CMakeLists.txt @@ -321,11 +321,11 @@ set(ALL_TTNN_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/normalization/softmax/softmax_pybind.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/normalization/softmax/device/multi_core/softmax_op_multi_core.cpp 
${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/normalization/softmax/device/softmax_op.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/pool/avgpool/avg_pool.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/pool/downsample/device/downsample_op.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/pool/downsample/device/downsample_program_factory.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/pool/downsample/downsample.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/pool/downsample/downsample_pybind.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/pool/global_avg_pool/global_avg_pool.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/pool/maxpool/device/max_pool2d_device_op.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/pool/maxpool/device/max_pool2d_multi_core_program_factory.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/pool/maxpool/max_pool2d.cpp @@ -400,6 +400,10 @@ set(ALL_TTNN_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/bernoulli/bernoulli_pybind.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/bernoulli/device/bernoulli_device_operation.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/bernoulli/device/bernoulli_program_factory.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/moreh/moreh_abs_pow/device/moreh_abs_pow_device_operation.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/moreh/moreh_abs_pow/device/moreh_abs_pow_program_factory.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/moreh/moreh_abs_pow/moreh_abs_pow_pybind.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/moreh/moreh_abs_pow/moreh_abs_pow.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/moreh/moreh_adam/device/moreh_adam_device_operation.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/moreh/moreh_adam/device/moreh_adam_program_factory.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/moreh/moreh_adam/moreh_adam_pybind.cpp @@ -499,9 +503,9 @@ set(ALL_TTNN_SRCS 
${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/moreh/moreh_norm_backward/moreh_norm_backward_pybind.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/moreh/moreh_norm_backward/moreh_norm_backward.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_device_operation.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_program_factory_h.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_program_factory_other.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_program_factory_w.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/moreh/moreh_norm/device/ord_other/moreh_norm_program_factory_h_other.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/moreh/moreh_norm/device/ord_other/moreh_norm_program_factory_nc_other.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/moreh/moreh_norm/device/ord_other/moreh_norm_program_factory_w_other.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/moreh/moreh_norm/moreh_norm_pybind.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/moreh/moreh_norm/moreh_norm.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/moreh/moreh_pybind.cpp diff --git a/ttnn/cpp/pybind11/device.cpp b/ttnn/cpp/pybind11/device.cpp index e6608073895..c866b12bcd3 100644 --- a/ttnn/cpp/pybind11/device.cpp +++ b/ttnn/cpp/pybind11/device.cpp @@ -15,6 +15,8 @@ #include "tt_metal/impl/trace/trace.hpp" #include "ttnn/operations/experimental/auto_format/auto_format.hpp" +using namespace tt::tt_metal; + namespace py = pybind11; namespace { diff --git a/ttnn/cpp/pybind11/events.cpp b/ttnn/cpp/pybind11/events.cpp index 59155a79d6a..fdb12668f63 100644 --- a/ttnn/cpp/pybind11/events.cpp +++ b/ttnn/cpp/pybind11/events.cpp @@ -7,6 +7,8 @@ #include "tt_metal/impl/event/event.hpp" #include "pybind11/pybind11.h" +using namespace tt::tt_metal; + namespace ttnn::events { void py_module_types(py::module& module) { diff --git 
a/ttnn/cpp/pybind11/operations/__init__.hpp b/ttnn/cpp/pybind11/operations/__init__.hpp index a7572cc45f4..c879b4e73b5 100644 --- a/ttnn/cpp/pybind11/operations/__init__.hpp +++ b/ttnn/cpp/pybind11/operations/__init__.hpp @@ -37,8 +37,8 @@ #include "ttnn/operations/matmul/matmul_pybind.hpp" #include "ttnn/operations/moreh/moreh_pybind.hpp" #include "ttnn/operations/normalization/normalization_pybind.hpp" -#include "ttnn/operations/pool/avgpool/avg_pool_pybind.hpp" #include "ttnn/operations/pool/downsample/downsample_pybind.hpp" +#include "ttnn/operations/pool/global_avg_pool/global_avg_pool_pybind.hpp" #include "ttnn/operations/pool/maxpool/max_pool2d_pybind.hpp" #include "ttnn/operations/pool/upsample/upsample_pybind.hpp" #include "ttnn/operations/reduction/reduction_pybind.hpp" diff --git a/ttnn/cpp/pybind11/pytensor.cpp b/ttnn/cpp/pybind11/pytensor.cpp index 2e526cc0556..f7827813222 100644 --- a/ttnn/cpp/pybind11/pytensor.cpp +++ b/ttnn/cpp/pybind11/pytensor.cpp @@ -19,6 +19,8 @@ #include "ttnn/tensor/tensor.hpp" #include "ttnn/tensor/tensor_impl.hpp" +using namespace tt::tt_metal; + namespace py = pybind11; namespace ttnn::tensor { @@ -76,97 +78,18 @@ Tensor create_owned_tensor( return Tensor(std::move(storage), shape, data_type, layout, optional_tile); } -Tensor convert_torch_tensor_to_tt_tensor( - const py::handle &torch_tensor, - std::optional optional_data_type = std::nullopt, - const std::optional &optional_tile = std::nullopt, - bool enable_borrow = true) { - py::object torch = py::module_::import("torch"); - if (not py::isinstance(torch_tensor, torch.attr("Tensor"))) { - TT_THROW("The argument must be of type torch.Tensor!"); - } - - auto torch_dtype = torch_tensor.attr("dtype"); - auto shape = py::cast>(torch_tensor.attr("shape")); - - auto contiguous_torch_tensor = torch_tensor.attr("contiguous")(); - - // Override the data type if there is an user-provided one - // Otherwise, figure it out from torch dtype - DataType data_type; - if 
(optional_data_type.has_value()) { - data_type = optional_data_type.value(); - } else if (torch_dtype.equal(torch.attr("float32"))) { - data_type = DataType::FLOAT32; - } else if (torch_dtype.equal(torch.attr("float16"))) { - contiguous_torch_tensor = contiguous_torch_tensor.attr("to")(torch.attr("bfloat16")); - // TODO(arakhmati): add DataType::FLOAT16? - data_type = DataType::BFLOAT16; - } else if (torch_dtype.equal(torch.attr("bfloat16"))) { - data_type = DataType::BFLOAT16; - } else if (torch_dtype.equal(torch.attr("int64"))) { - contiguous_torch_tensor = contiguous_torch_tensor.attr("to")(torch.attr("int32")); - // TODO(arakhmati): add DataType::INT64? - data_type = DataType::UINT32; - } else if (torch_dtype.equal(torch.attr("int32"))) { - data_type = DataType::INT32; - } else if (torch_dtype.equal(torch.attr("int16"))) { - // TODO(arakhmati): add DataType::INT16? - data_type = DataType::UINT16; - } else if (torch_dtype.equal(torch.attr("uint8"))) { - data_type = DataType::UINT8; - } else { - TT_THROW("Unsupported DataType: {}", std::string(py::repr(torch_dtype))); - } - +Tensor create_tt_tensor_from_py_data( + std::size_t num_elements, + std::size_t py_data_ptr, + const ttnn::SmallVector &shape, + const DataType data_type, + const std::optional &optional_tile, + bool enable_borrow, + std::function on_creation_callback = [] {}, + std::function on_destruction_callback = [] {}) { switch (data_type) { case DataType::UINT8: { - if (not torch_dtype.equal(torch.attr("uint8"))) { - contiguous_torch_tensor = contiguous_torch_tensor.attr("to")(torch.attr("uint8")); - } - break; - } - case DataType::UINT16: { - if (not torch_dtype.equal(torch.attr("int16"))) { - contiguous_torch_tensor = contiguous_torch_tensor.attr("to")(torch.attr("int16")); - } - break; - } - case DataType::INT32: - case DataType::UINT32: { - if (not torch_dtype.equal(torch.attr("int32"))) { - contiguous_torch_tensor = contiguous_torch_tensor.attr("to")(torch.attr("int32")); - } - break; - } - case 
DataType::BFLOAT4_B: - case DataType::BFLOAT8_B: - case DataType::FLOAT32: { - if (not torch_dtype.equal(torch.attr("float32"))) { - contiguous_torch_tensor = contiguous_torch_tensor.attr("to")(torch.attr("float32")); - } - break; - } - case DataType::BFLOAT16: { - if (not torch_dtype.equal(torch.attr("bfloat16"))) { - contiguous_torch_tensor = contiguous_torch_tensor.attr("to")(torch.attr("bfloat16")); - } - break; - } - default: { - TT_THROW("Unsupported DataType: {}", data_type); - break; - } - } - - auto on_creation_callback = [tensor = contiguous_torch_tensor] { tensor.inc_ref(); }; - auto on_destruction_callback = [tensor = contiguous_torch_tensor] { tensor.dec_ref(); }; - - auto num_elements = py::cast(contiguous_torch_tensor.attr("numel")()); - auto torch_data_ptr = py::cast(contiguous_torch_tensor.attr("data_ptr")()); - switch (data_type) { - case DataType::UINT8: { - auto data_ptr = reinterpret_cast(torch_data_ptr); + auto data_ptr = reinterpret_cast(py_data_ptr); if (enable_borrow) { auto storage = BorrowedStorage( borrowed_buffer::Buffer(data_ptr, num_elements), on_creation_callback, on_destruction_callback); @@ -176,7 +99,7 @@ Tensor convert_torch_tensor_to_tt_tensor( } } case DataType::UINT16: { - auto data_ptr = reinterpret_cast(torch_data_ptr); + auto data_ptr = reinterpret_cast(py_data_ptr); if (enable_borrow) { auto storage = BorrowedStorage( borrowed_buffer::Buffer(data_ptr, num_elements), on_creation_callback, on_destruction_callback); @@ -186,7 +109,7 @@ Tensor convert_torch_tensor_to_tt_tensor( } } case DataType::INT32: { - auto data_ptr = reinterpret_cast(torch_data_ptr); + auto data_ptr = reinterpret_cast(py_data_ptr); if (enable_borrow) { auto storage = BorrowedStorage( borrowed_buffer::Buffer(data_ptr, num_elements), on_creation_callback, on_destruction_callback); @@ -196,7 +119,7 @@ Tensor convert_torch_tensor_to_tt_tensor( } } case DataType::UINT32: { - auto data_ptr = reinterpret_cast(torch_data_ptr); + auto data_ptr = 
reinterpret_cast(py_data_ptr); if (enable_borrow) { auto storage = BorrowedStorage( borrowed_buffer::Buffer(data_ptr, num_elements), on_creation_callback, on_destruction_callback); @@ -206,7 +129,7 @@ Tensor convert_torch_tensor_to_tt_tensor( } } case DataType::FLOAT32: { - auto data_ptr = reinterpret_cast(torch_data_ptr); + auto data_ptr = reinterpret_cast(py_data_ptr); if (enable_borrow) { auto storage = BorrowedStorage( borrowed_buffer::Buffer(data_ptr, num_elements), on_creation_callback, on_destruction_callback); @@ -215,8 +138,9 @@ Tensor convert_torch_tensor_to_tt_tensor( return create_owned_tensor(data_ptr, num_elements, shape, data_type, Layout::ROW_MAJOR, optional_tile); } } + // TODO: This is not supported for numpy case DataType::BFLOAT16: { - auto data_ptr = reinterpret_cast<::bfloat16 *>(torch_data_ptr); + auto data_ptr = reinterpret_cast<::bfloat16 *>(py_data_ptr); if (enable_borrow) { auto storage = BorrowedStorage( borrowed_buffer::Buffer(data_ptr, num_elements), on_creation_callback, on_destruction_callback); @@ -226,7 +150,7 @@ Tensor convert_torch_tensor_to_tt_tensor( } } case DataType::BFLOAT8_B: { - auto data_ptr = reinterpret_cast(torch_data_ptr); + auto data_ptr = reinterpret_cast(py_data_ptr); auto data = std::vector(data_ptr, data_ptr + num_elements); auto buffer = owned_buffer::create(std::move(data)); auto tile = optional_tile.value_or(Tile()); @@ -240,7 +164,7 @@ Tensor convert_torch_tensor_to_tt_tensor( std::move(OwnedStorage{std::move(output_buffer)}), shape, data_type, Layout::TILE, tile); } case DataType::BFLOAT4_B: { - auto data_ptr = reinterpret_cast(torch_data_ptr); + auto data_ptr = reinterpret_cast(py_data_ptr); auto data = std::vector(data_ptr, data_ptr + num_elements); auto buffer = owned_buffer::create(std::move(data)); auto tile = optional_tile.value_or(Tile()); @@ -260,187 +184,166 @@ Tensor convert_torch_tensor_to_tt_tensor( } } -Tensor convert_numpy_tensor_to_tt_tensor( - const py::handle &np_tensor, +Tensor 
convert_python_tensor_to_tt_tensor( + const py::handle &py_tensor, std::optional optional_data_type = std::nullopt, - const std::optional &optional_tile = std::nullopt) { + const std::optional &optional_tile = std::nullopt, + bool enable_borrow = true) { + GraphTracker::instance().track_function_start( + "tt::tt_metal::detail::convert_python_tensor_to_tt_tensor", py_tensor, optional_data_type, enable_borrow); + py::object torch = py::module_::import("torch"); py::object np = py::module_::import("numpy"); - if (not py::isinstance(np_tensor, np.attr("ndarray"))) { - TT_THROW("The tensor must be of type numpy.ndarray!"); - } - - auto np_dtype = np_tensor.attr("dtype"); - auto shape = py::cast>(np_tensor.attr("shape")); - auto contiguous_np_tensor = np.attr("ascontiguousarray")(np_tensor); + auto py_dtype = py_tensor.attr("dtype"); + auto shape = py::cast>(py_tensor.attr("shape")); - // Override the data type if there is an user-provided one - // Otherwise, figure it out from numpy dtype DataType data_type; - if (optional_data_type.has_value()) { - data_type = optional_data_type.value(); - } else if (np_dtype.equal(np.attr("float32"))) { - data_type = DataType::FLOAT32; - } else if (np_dtype.equal(np.attr("float16"))) { - contiguous_np_tensor = contiguous_np_tensor.attr("astype")(np.attr("float32")); - // TODO(arakhmati): add DataType::FLOAT16? - data_type = DataType::BFLOAT16; - } else if (np_dtype.equal(np.attr("int64"))) { - contiguous_np_tensor = contiguous_np_tensor.attr("astype")(np.attr("int32")); - // TODO(arakhmati): add DataType::INT64? - data_type = DataType::UINT32; - } else if (np_dtype.equal(np.attr("int32"))) { - // TODO(arakhmati): add DataType::INT32? 
- data_type = DataType::UINT32; - } else if (np_dtype.equal(np.attr("ubyte"))) { - data_type = DataType::UINT8; - } else { - TT_THROW("Unsupported DataType: {}", std::string(py::repr(np_dtype))); - } - switch (data_type) { - case DataType::UINT8: { - if (not np_dtype.equal(np.attr("ubyte"))) { - contiguous_np_tensor = contiguous_np_tensor.attr("astype")(np.attr("ubyte")); - } - break; + py::object contiguous_py_tensor; + size_t num_elements = 0; + size_t py_data_ptr = 0; + if (py::isinstance(py_tensor, torch.attr("Tensor"))) { + contiguous_py_tensor = py_tensor.attr("contiguous")(); + + // Override the data type if there is a user-provided one + // Otherwise, figure it out from torch dtype + if (optional_data_type.has_value()) { + data_type = optional_data_type.value(); + } else if (py_dtype.equal(torch.attr("float32"))) { + data_type = DataType::FLOAT32; + } else if (py_dtype.equal(torch.attr("float16"))) { + data_type = DataType::BFLOAT16; + } else if (py_dtype.equal(torch.attr("bfloat16"))) { + data_type = DataType::BFLOAT16; + } else if (py_dtype.equal(torch.attr("int64"))) { + // TODO: add DataType::INT64? + data_type = DataType::UINT32; + } else if (py_dtype.equal(torch.attr("int32"))) { + data_type = DataType::INT32; + } else if (py_dtype.equal(torch.attr("int16"))) { + // TODO: add DataType::INT16? 
+ data_type = DataType::UINT16; + } else if (py_dtype.equal(torch.attr("uint8"))) { + data_type = DataType::UINT8; + } else { + TT_THROW("Unsupported DataType: {}", std::string(py::repr(py_dtype))); } - case DataType::UINT16: { - if (not np_dtype.equal(np.attr("int32"))) { - contiguous_np_tensor = contiguous_np_tensor.attr("astype")(np.attr("int16")); + + auto maybe_convert_pytorch_tensor = [&contiguous_py_tensor, &py_dtype, &torch](const char *target_py_dtype) { + if (not py_dtype.equal(torch.attr(target_py_dtype))) { + contiguous_py_tensor = contiguous_py_tensor.attr("to")(torch.attr(target_py_dtype)); } - break; - } - case DataType::INT32: - case DataType::UINT32: { - if (not np_dtype.equal(np.attr("int32"))) { - contiguous_np_tensor = contiguous_np_tensor.attr("astype")(np.attr("int32")); + }; + switch (data_type) { + case DataType::UINT8: { + maybe_convert_pytorch_tensor("uint8"); + break; } - break; - } - case DataType::BFLOAT4_B: - case DataType::BFLOAT8_B: - case DataType::FLOAT32: { - if (not np_dtype.equal(np.attr("float32"))) { - contiguous_np_tensor = contiguous_np_tensor.attr("astype")(np.attr("float32")); + case DataType::UINT16: { + maybe_convert_pytorch_tensor("int16"); + break; } - break; - } - /* - case DataType::BFLOAT16: { - if (not np_dtype.equal(np.attr("bfloat16"))) { - contiguous_np_tensor = contiguous_np_tensor.attr("to")(np.attr("bfloat16")); + case DataType::INT32: + case DataType::UINT32: { + maybe_convert_pytorch_tensor("int32"); + break; + } + case DataType::BFLOAT4_B: + case DataType::BFLOAT8_B: + case DataType::FLOAT32: { + maybe_convert_pytorch_tensor("float32"); + break; + } + case DataType::BFLOAT16: { + maybe_convert_pytorch_tensor("bfloat16"); + break; + } + default: { + TT_THROW("Unsupported DataType: {}", data_type); + break; } - break; - } - */ - default: { - TT_THROW("Unsupported DataType: {}", data_type); - break; } - } - - auto on_creation_callback = [tensor = contiguous_np_tensor] { tensor.inc_ref(); }; - auto 
on_destruction_callback = [tensor = contiguous_np_tensor] { tensor.dec_ref(); }; - auto num_elements = py::cast(contiguous_np_tensor.attr("size")); - auto np_data_ptr = py::cast( - py::cast(py::cast(contiguous_np_tensor.attr("__array_interface__"))[py::str("data")])[0]); - - switch (data_type) { - case DataType::UINT8: { - auto data_ptr = reinterpret_cast(np_data_ptr); - auto storage = BorrowedStorage( - borrowed_buffer::Buffer(data_ptr, num_elements), on_creation_callback, on_destruction_callback); - return Tensor(std::move(storage), shape, data_type, Layout::ROW_MAJOR, optional_tile); - } - case DataType::UINT16: { - auto data_ptr = reinterpret_cast(np_data_ptr); - auto storage = BorrowedStorage( - borrowed_buffer::Buffer(data_ptr, num_elements), on_creation_callback, on_destruction_callback); - return Tensor(std::move(storage), shape, data_type, Layout::ROW_MAJOR, optional_tile); - } - case DataType::INT32: { - auto data_ptr = reinterpret_cast(np_data_ptr); - auto storage = BorrowedStorage( - borrowed_buffer::Buffer(data_ptr, num_elements), on_creation_callback, on_destruction_callback); - return Tensor(std::move(storage), shape, data_type, Layout::ROW_MAJOR, optional_tile); - } - case DataType::UINT32: { - auto data_ptr = reinterpret_cast(np_data_ptr); - auto storage = BorrowedStorage( - borrowed_buffer::Buffer(data_ptr, num_elements), on_creation_callback, on_destruction_callback); - return Tensor(std::move(storage), shape, data_type, Layout::ROW_MAJOR, optional_tile); - } - case DataType::FLOAT32: { - auto data_ptr = reinterpret_cast(np_data_ptr); - auto storage = BorrowedStorage( - borrowed_buffer::Buffer(data_ptr, num_elements), on_creation_callback, on_destruction_callback); - return Tensor(std::move(storage), shape, data_type, Layout::ROW_MAJOR, optional_tile); - } - /* - case DataType::BFLOAT16: { - auto data_ptr = reinterpret_cast(np_data_ptr); - auto storage = BorrowedStorage( - borrowed_buffer::Buffer(data_ptr, num_elements), on_creation_callback, 
on_destruction_callback); - return Tensor(std::move(storage), shape, data_type, Layout::ROW_MAJOR); - } - */ - case DataType::BFLOAT8_B: { - auto data_ptr = reinterpret_cast(np_data_ptr); - auto data = std::vector(data_ptr, data_ptr + num_elements); - auto buffer = owned_buffer::create(std::move(data)); - auto tile = optional_tile.value_or(Tile()); - auto tensor = Tensor(OwnedStorage{buffer}, shape, DataType::FLOAT32, Layout::ROW_MAJOR, optional_tile) - .to(Layout::TILE); - auto output_float_data = owned_buffer::get_as(tensor).get(); - auto output_packed_data = pack_fp32_vec_as_bfp8_tiles( - output_float_data, /*row_major_input=*/false, /*is_exp_a=*/false, tile); - auto output_buffer = owned_buffer::create(std::move(output_packed_data)); - return Tensor(std::move(OwnedStorage{std::move(output_buffer)}), shape, data_type, Layout::TILE, tile); - } - case DataType::BFLOAT4_B: { - auto data_ptr = reinterpret_cast(np_data_ptr); - auto data = std::vector(data_ptr, data_ptr + num_elements); - auto buffer = owned_buffer::create(std::move(data)); - auto tile = optional_tile.value_or(Tile()); - auto tensor = Tensor(OwnedStorage{buffer}, shape, DataType::FLOAT32, Layout::ROW_MAJOR, optional_tile) - .to(Layout::TILE); - auto output_float_data = owned_buffer::get_as(tensor).get(); - auto output_packed_data = pack_fp32_vec_as_bfp4_tiles( - output_float_data, /*row_major_input=*/false, /*is_exp_a=*/false, tile); - auto output_buffer = owned_buffer::create(std::move(output_packed_data)); - return Tensor(std::move(OwnedStorage{std::move(output_buffer)}), shape, data_type, Layout::TILE, tile); + num_elements = py::cast(contiguous_py_tensor.attr("numel")()); + py_data_ptr = py::cast(contiguous_py_tensor.attr("data_ptr")()); + } else if (py::isinstance(py_tensor, np.attr("ndarray"))) { + TT_FATAL(enable_borrow, "Owned storage for numpy tensors is untested!"); + + contiguous_py_tensor = np.attr("ascontiguousarray")(py_tensor); + + // Override the data type if there is a user-provided 
one + // Otherwise, figure it out from numpy dtype + if (optional_data_type.has_value()) { + data_type = optional_data_type.value(); + } else if (py_dtype.equal(np.attr("float32"))) { + data_type = DataType::FLOAT32; + } else if (py_dtype.equal(np.attr("int64"))) { + // TODO: add DataType::INT64? + data_type = DataType::UINT32; + // TODO: add np.float16 support? + } else if (py_dtype.equal(np.attr("int32"))) { + data_type = DataType::INT32; + } else if (py_dtype.equal(np.attr("int16"))) { + // TODO: add DataType::INT16? + data_type = DataType::UINT16; + } else if (py_dtype.equal(np.attr("ubyte"))) { + data_type = DataType::UINT8; + } else { + TT_THROW("Unsupported DataType: {}", std::string(py::repr(py_dtype))); } - default: { - TT_THROW("Unsupported DataType: {}", data_type); - break; + + auto maybe_convert_numpy_tensor = [&contiguous_py_tensor, &py_dtype, &np](const char *target_py_dtype) { + if (not py_dtype.equal(np.attr(target_py_dtype))) { + contiguous_py_tensor = contiguous_py_tensor.attr("astype")(np.attr(target_py_dtype)); + } + }; + switch (data_type) { + case DataType::UINT8: { + maybe_convert_numpy_tensor("ubyte"); + break; + } + case DataType::UINT16: { + maybe_convert_numpy_tensor("int16"); + break; + } + case DataType::INT32: + case DataType::UINT32: { + maybe_convert_numpy_tensor("int32"); + break; + } + case DataType::BFLOAT4_B: + case DataType::BFLOAT8_B: + case DataType::FLOAT32: { + maybe_convert_numpy_tensor("float32"); + break; + } + default: { + TT_THROW("Unsupported DataType: {}", data_type); + break; + } } - } -} -Tensor convert_python_tensor_to_tt_tensor( - const py::handle &tensor, - std::optional optional_data_type = std::nullopt, - const std::optional &optional_tile = std::nullopt, - bool enable_borrow = true) { - GraphTracker::instance().track_function_start( - "tt::tt_metal::detail::convert_python_tensor_to_tt_tensor", tensor, optional_data_type, enable_borrow); - py::object torch = py::module_::import("torch"); - py::object np = 
py::module_::import("numpy"); - if (py::isinstance(tensor, torch.attr("Tensor"))) { - auto output = convert_torch_tensor_to_tt_tensor(tensor, optional_data_type, optional_tile, enable_borrow); - output = tt::tt_metal::set_tensor_id(output); - GraphTracker::instance().track_function_end(output); - return output; - } else if (py::isinstance(tensor, np.attr("ndarray"))) { - auto output = convert_numpy_tensor_to_tt_tensor(tensor, optional_data_type, optional_tile); - output = tt::tt_metal::set_tensor_id(output); - GraphTracker::instance().track_function_end(output); - return output; + num_elements = py::cast(contiguous_py_tensor.attr("size")); + py_data_ptr = py::cast(py::cast( + py::cast(contiguous_py_tensor.attr("__array_interface__"))[py::str("data")])[0]); } else { TT_THROW("The argument must be of type torch.Tensor or numpy.ndarray!"); } + + auto on_creation_callback = [tensor = contiguous_py_tensor] { tensor.inc_ref(); }; + auto on_destruction_callback = [tensor = contiguous_py_tensor] { tensor.dec_ref(); }; + auto output = create_tt_tensor_from_py_data( + num_elements, + py_data_ptr, + shape, + data_type, + optional_tile, + enable_borrow, + on_creation_callback, + on_destruction_callback); + output = tt::tt_metal::set_tensor_id(output); + GraphTracker::instance().track_function_end(output); + return output; } Tensor convert_python_tensors_to_tt_tensors( @@ -499,13 +402,9 @@ OwnedBuffer create_owned_buffer_from_vector_of_floats(std::vector &&data, } } -py::object convert_tt_tensor_to_torch_tensor(const Tensor &tt_tensor) { - GraphTracker::instance().track_function_start("tt::tt_metal::detail::convert_tt_tensor_to_torch_tensor", tt_tensor); +std::pair, DataType> get_buffer_and_dtype_from_tensor(const Tensor &tt_tensor) { TT_ASSERT(tt_tensor.storage_type() == StorageType::OWNED or tt_tensor.storage_type() == StorageType::BORROWED); - using namespace pybind11::literals; - py::object torch = py::module_::import("torch"); - auto frombuffer = torch.attr("frombuffer"); 
auto buffer = std::visit( [](auto &&storage) -> std::variant { using T = std::decay_t; @@ -570,6 +469,17 @@ py::object convert_tt_tensor_to_torch_tensor(const Tensor &tt_tensor) { tt_dtype = DataType::FLOAT32; } + return {buffer, tt_dtype}; +} + +py::object convert_tt_tensor_to_torch_tensor(const Tensor &tt_tensor) { + GraphTracker::instance().track_function_start("tt::tt_metal::detail::convert_tt_tensor_to_torch_tensor", tt_tensor); + + auto [buffer, buffer_dtype] = get_buffer_and_dtype_from_tensor(tt_tensor); + + py::object torch = py::module_::import("torch"); + auto frombuffer = torch.attr("frombuffer"); + const auto tt_dtype_to_torch_dtype = std::map{ {DataType::UINT8, torch.attr("uint8")}, {DataType::UINT16, torch.attr("int16")}, // TODO(arakhmati): add DataType::INT16 @@ -578,7 +488,7 @@ py::object convert_tt_tensor_to_torch_tensor(const Tensor &tt_tensor) { {DataType::FLOAT32, torch.attr("float32")}, {DataType::BFLOAT16, torch.attr("bfloat16")}, }; - auto torch_dtype = tt_dtype_to_torch_dtype.at(tt_dtype); + auto torch_dtype = tt_dtype_to_torch_dtype.at(buffer_dtype); auto shape = tt_tensor.get_legacy_shape(); auto torch_shape = std::vector(std::begin(shape), std::end(shape)); @@ -588,9 +498,9 @@ py::object convert_tt_tensor_to_torch_tensor(const Tensor &tt_tensor) { auto logical_shape = tt_tensor.get_logical_shape(); auto view = logical_shape.view(); std::vector shape_vector(view.begin(), view.end()); - return pytorch_empty(shape_vector, "dtype"_a = torch_dtype); + return pytorch_empty(shape_vector, py::arg("dtype") = torch_dtype); } - return frombuffer(buffer, "dtype"_a = torch_dtype); + return frombuffer(buffer, py::arg("dtype") = torch_dtype); }(); tensor = tensor.attr("reshape")(torch_shape); tensor = tensor.attr("contiguous")(); @@ -602,75 +512,13 @@ py::object convert_tt_tensor_to_torch_tensor(const Tensor &tt_tensor) { } py::object convert_tt_tensor_to_numpy_tensor(const Tensor &tt_tensor) { - 
GraphTracker::instance().track_function_start("tt::tt_metal::detail::convert_tt_tensor_to_torch_tensor", tt_tensor); - TT_ASSERT(tt_tensor.storage_type() == StorageType::OWNED or tt_tensor.storage_type() == StorageType::BORROWED); + GraphTracker::instance().track_function_start("tt::tt_metal::detail::convert_tt_tensor_to_numpy_tensor", tt_tensor); + + auto [buffer, buffer_dtype] = get_buffer_and_dtype_from_tensor(tt_tensor); - using namespace pybind11::literals; py::object np = py::module_::import("numpy"); auto frombuffer = np.attr("frombuffer"); - auto buffer = std::visit( - [](auto &&storage) -> std::variant { - using T = std::decay_t; - if constexpr (std::is_same_v) { - return storage.buffer; - } else if constexpr (std::is_same_v) { - TT_THROW("Device tensor cannot be converted to numpy"); - } else if constexpr (std::is_same_v) { - return storage.buffer; - } else if constexpr (std::is_same_v) { - TT_THROW("Device tensor cannot be converted to numpy"); - } else if constexpr (std::is_same_v) { - TT_THROW("Device tensor cannot be converted to torch"); - } else { - raise_unsupported_storage(); - } - }, - tt_tensor.get_storage()); - - const auto tile = tt_tensor.get_tensor_spec().tile(); - auto tt_dtype = tt_tensor.get_dtype(); - if (tt_dtype == DataType::BFLOAT8_B) { - TT_ASSERT( - std::holds_alternative(buffer), - "Unexpected type {}", - tt::stl::get_active_type_name_in_variant(buffer)); - auto uint32_data = std::get>(std::get(buffer)).get(); - auto float_unpacked_data = - unpack_bfp8_tiles_into_float_vec(uint32_data, /*row_major_output=*/false, /*is_exp_a=*/false, tile); - auto input_float_buffer = owned_buffer::create(std::move(float_unpacked_data)); - auto float_tensor = Tensor( - OwnedStorage{input_float_buffer}, - tt_tensor.get_shape(), - DataType::FLOAT32, - tt_tensor.get_layout(), - tile) - .to(Layout::ROW_MAJOR); - auto output_float_data = owned_buffer::get_as(float_tensor).get(); - buffer = owned_buffer::create(std::move(output_float_data)); - tt_dtype = 
DataType::FLOAT32; - } - if (tt_dtype == DataType::BFLOAT4_B) { - TT_ASSERT( - std::holds_alternative(buffer), - "Unexpected type {}", - tt::stl::get_active_type_name_in_variant(buffer)); - auto uint32_data = std::get>(std::get(buffer)).get(); - auto float_unpacked_data = - unpack_bfp4_tiles_into_float_vec(uint32_data, /*row_major_output=*/false, /*is_exp_a=*/false, tile); - auto input_float_buffer = owned_buffer::create(std::move(float_unpacked_data)); - auto float_tensor = Tensor( - OwnedStorage{input_float_buffer}, - tt_tensor.get_shape(), - DataType::FLOAT32, - tt_tensor.get_layout(), - tile) - .to(Layout::ROW_MAJOR); - auto output_float_data = owned_buffer::get_as(float_tensor).get(); - buffer = owned_buffer::create(std::move(output_float_data)); - tt_dtype = DataType::FLOAT32; - } - const auto tt_dtype_to_np_dtype = std::map{ {DataType::UINT8, np.attr("ubyte")}, {DataType::UINT16, np.attr("int16")}, // TODO(arakhmati): add DataType::INT16 @@ -678,11 +526,11 @@ py::object convert_tt_tensor_to_numpy_tensor(const Tensor &tt_tensor) { {DataType::UINT32, np.attr("int32")}, // TODO(arakhmati): add DataType::INT32 {DataType::FLOAT32, np.attr("float32")}, }; - auto np_dtype = tt_dtype_to_np_dtype.at(tt_dtype); + auto np_dtype = tt_dtype_to_np_dtype.at(buffer_dtype); auto shape = tt_tensor.get_legacy_shape(); auto np_shape = std::vector(std::begin(shape), std::end(shape)); - auto tensor = frombuffer(buffer, "dtype"_a = np_dtype); + auto tensor = frombuffer(buffer, py::arg("dtype") = np_dtype); tensor = tensor.attr("reshape")(np_shape); tensor = np.attr("ascontiguousarray")(tensor); GraphTracker::instance().track_function_end(tensor); diff --git a/ttnn/cpp/pybind11/tensor.cpp b/ttnn/cpp/pybind11/tensor.cpp index 2d00d4ee2e3..0ae8831e906 100644 --- a/ttnn/cpp/pybind11/tensor.cpp +++ b/ttnn/cpp/pybind11/tensor.cpp @@ -18,6 +18,7 @@ #include "ttnn/distributed/types.hpp" #include "tt_metal/host_api.hpp" +using namespace tt::tt_metal; namespace py = pybind11; diff --git 
a/ttnn/cpp/ttnn/async_runtime.cpp b/ttnn/cpp/ttnn/async_runtime.cpp index 814db64804b..b47fc7c2705 100644 --- a/ttnn/cpp/ttnn/async_runtime.cpp +++ b/ttnn/cpp/ttnn/async_runtime.cpp @@ -7,6 +7,8 @@ #include "ttnn/tensor/tensor_impl.hpp" #include "ttnn/tensor/tensor_impl_wrapper.hpp" +using namespace tt::tt_metal; + namespace ttnn { void write_buffer( diff --git a/ttnn/cpp/ttnn/decorators.hpp b/ttnn/cpp/ttnn/decorators.hpp index 66f344fa365..bc818cef28f 100644 --- a/ttnn/cpp/ttnn/decorators.hpp +++ b/ttnn/cpp/ttnn/decorators.hpp @@ -73,7 +73,7 @@ inline auto create_async_output_tensors( return operation_t::create_async_optional_output_tensors(std::forward(args)...); } else if constexpr (std::is_same_v, Tensor>) { return std::vector{ - Tensor(operation::get_workers_for_op_output(inputs, optional_inputs, enable_autoformat_device))}; + Tensor(tt::tt_metal::operation::get_workers_for_op_output(inputs, optional_inputs, enable_autoformat_device))}; } else if constexpr (detail::is_homogenous_tuple()) { Tensors output_tensors; diff --git a/ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/bmm_tilize_untilize.cpp b/ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/bmm_tilize_untilize.cpp index 92791a2832f..e4dd53d4eb0 100644 --- a/ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/bmm_tilize_untilize.cpp +++ b/ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/bmm_tilize_untilize.cpp @@ -141,18 +141,18 @@ void MAIN { bool spill = in0_num_blocks_w > 1; // CB indices - constexpr uint32_t in0_cb_id = tt::CB::c_in0; - constexpr uint32_t in1_cb_id = tt::CB::c_in1; - constexpr uint32_t matmul_partials_cb = tt::CB::c_intermed0; - constexpr uint32_t tilized_in0_cb_id = tt::CB::c_intermed1; - constexpr uint32_t untilize_mode_final_matmul_partials_cb = tt::CB::c_intermed2; - constexpr uint32_t untilize_mode_reblock_cb = tt::CB::c_intermed3; - constexpr uint32_t out_cb_id = tt::CB::c_out0; + constexpr uint32_t in0_cb_id = tt::CBIndex::c_0; + constexpr uint32_t in1_cb_id = tt::CBIndex::c_1; + 
constexpr uint32_t matmul_partials_cb = tt::CBIndex::c_24; + constexpr uint32_t tilized_in0_cb_id = tt::CBIndex::c_25; + constexpr uint32_t untilize_mode_final_matmul_partials_cb = tt::CBIndex::c_26; + constexpr uint32_t untilize_mode_reblock_cb = tt::CBIndex::c_27; + constexpr uint32_t out_cb_id = tt::CBIndex::c_16; #ifdef FUSE_BIAS uint32_t bias_ntiles_w = get_compile_time_arg_val(16); - constexpr uint32_t bias_cb_id = tt::CB::c_in2; - constexpr uint32_t out_for_bias_cb_id = tt::CB::c_intermed4; + constexpr uint32_t bias_cb_id = tt::CBIndex::c_2; + constexpr uint32_t out_for_bias_cb_id = tt::CBIndex::c_28; init_bcast(out_for_bias_cb_id, bias_cb_id, out_cb_id); #endif diff --git a/ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/eltwise_copy.cpp b/ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/eltwise_copy.cpp index 41e494d29b8..e8178160017 100644 --- a/ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/eltwise_copy.cpp +++ b/ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/eltwise_copy.cpp @@ -12,20 +12,20 @@ namespace NAMESPACE { void MAIN { uint32_t per_core_tile_cnt = get_compile_time_arg_val(0); - unary_op_init_common(tt::CB::c_in0); + unary_op_init_common(tt::CBIndex::c_0); for(uint32_t b=0;bcommand_queue(cq_id); @@ -276,8 +276,8 @@ void launch_on_worker_thread(auto cq_id, auto device_operation_id, const auto& o program.set_runtime_id(device_operation_id); - GraphTracker::instance().track_program(&program); - if(GraphTracker::instance().hook_program(&program)) { + tt::tt_metal::GraphTracker::instance().track_program(&program); + if(tt::tt_metal::GraphTracker::instance().hook_program(&program)) { return; } @@ -305,8 +305,8 @@ void launch_on_worker_thread(auto cq_id, auto device_operation_id, const auto& o program->set_runtime_id(device_operation_id); - GraphTracker::instance().track_program(program.get()); - if(GraphTracker::instance().hook_program(program.get())) { + tt::tt_metal::GraphTracker::instance().track_program(program.get()); + 
if(tt::tt_metal::GraphTracker::instance().hook_program(program.get())) { return; } @@ -456,7 +456,7 @@ typename device_operation_t::tensor_return_value_t invoke( ZoneScopedN("Run Device Operation"); // TODO: Add GraphTracker::instance().track_device_operation to track device operations specifically? - GraphTracker::instance().track_function_start(get_operation_name(operation_attributes), operation_attributes, tensor_args); + tt::tt_metal::GraphTracker::instance().track_function_start(get_operation_name(operation_attributes), operation_attributes, tensor_args); using tensor_return_value_t = typename device_operation_t::tensor_return_value_t; @@ -487,7 +487,7 @@ typename device_operation_t::tensor_return_value_t invoke( } */ - GraphTracker::instance().track_function_end(tensor_return_value); + tt::tt_metal::GraphTracker::instance().track_function_end(tensor_return_value); return tensor_return_value; } diff --git a/ttnn/cpp/ttnn/distributed/api.cpp b/ttnn/cpp/ttnn/distributed/api.cpp index 572c9171b85..04dd33cca17 100644 --- a/ttnn/cpp/ttnn/distributed/api.cpp +++ b/ttnn/cpp/ttnn/distributed/api.cpp @@ -10,6 +10,8 @@ #include "ttnn/tensor/tensor_utils.hpp" #include "tt_metal/distributed/mesh_device.hpp" +using namespace tt::tt_metal; + namespace ttnn::distributed::api { std::shared_ptr open_mesh_device(const MeshShape& mesh_shape, size_t l1_small_size, size_t trace_region_size, size_t num_command_queues, DispatchCoreType dispatch_core_type, MeshType mesh_type, const std::pair& offset, const std::vector& physical_device_ids) { diff --git a/ttnn/cpp/ttnn/distributed/api.hpp b/ttnn/cpp/ttnn/distributed/api.hpp index da333b5340c..b9911038145 100644 --- a/ttnn/cpp/ttnn/distributed/api.hpp +++ b/ttnn/cpp/ttnn/distributed/api.hpp @@ -16,7 +16,7 @@ std::shared_ptr open_mesh_device( size_t l1_small_size, size_t trace_region_size, size_t num_command_queues, - DispatchCoreType dispatch_core_type, + tt::tt_metal::DispatchCoreType dispatch_core_type, MeshType mesh_type = 
MeshType::RowMajor, const std::pair& offset = std::pair(0, 0), const std::vector& physical_device_ids = {}); @@ -33,10 +33,10 @@ std::vector get_t3k_physical_device_ids_ring(); std::vector distribute_tensor_to_mesh(const Tensor& tensor, MeshDevice& mesh_device); // Get the distributed tensor config from a tensor. -DistributedTensorConfig get_distributed_tensor_config_from_tensor(const Tensor& tensor); +tt::tt_metal::DistributedTensorConfig get_distributed_tensor_config_from_tensor(const Tensor& tensor); // Given a multi-device tensor and a device, returns the tensor on the given device. -Tensor get_device_tensor(const Tensor& multi_device_tensor, const Device* device); +Tensor get_device_tensor(const Tensor& multi_device_tensor, const tt::tt_metal::Device* device); Tensor get_device_tensor(const Tensor& multi_device_tensor, const int device_id); @@ -48,7 +48,7 @@ std::vector get_tensors_from_multi_device_storage(const Tensor& multi_de // Given a list of per-device shards, return a multi-device tensor Tensor create_multi_device_tensor( - const std::vector& tensors, StorageType storage_type, const DistributedTensorConfig& strategy); + const std::vector& tensors, tt::tt_metal::StorageType storage_type, const tt::tt_metal::DistributedTensorConfig& strategy); } // namespace ttnn::distributed::api diff --git a/ttnn/cpp/ttnn/distributed/distributed_pybind.cpp b/ttnn/cpp/ttnn/distributed/distributed_pybind.cpp index 9a43a044ff5..2e148f3a5db 100644 --- a/ttnn/cpp/ttnn/distributed/distributed_pybind.cpp +++ b/ttnn/cpp/ttnn/distributed/distributed_pybind.cpp @@ -11,6 +11,8 @@ #include "tt_metal/impl/dispatch/command_queue.hpp" #include "pybind11/stl.h" +using namespace tt::tt_metal; + namespace ttnn::distributed { namespace py = pybind11; diff --git a/ttnn/cpp/ttnn/events.cpp b/ttnn/cpp/ttnn/events.cpp index 08e609e495f..aad988ddee1 100644 --- a/ttnn/cpp/ttnn/events.cpp +++ b/ttnn/cpp/ttnn/events.cpp @@ -8,6 +8,8 @@ #include "tt_metal/impl/event/event.hpp" #include 
"ttnn/distributed/types.hpp" +using namespace tt::tt_metal; + namespace ttnn::events { MultiDeviceEvent::MultiDeviceEvent(MeshDevice* mesh_device) { diff --git a/ttnn/cpp/ttnn/graph/graph_processor.cpp b/ttnn/cpp/ttnn/graph/graph_processor.cpp index 3cb613b4401..f702edc1a26 100644 --- a/ttnn/cpp/ttnn/graph/graph_processor.cpp +++ b/ttnn/cpp/ttnn/graph/graph_processor.cpp @@ -15,6 +15,8 @@ #include #include "ttnn/core.hpp" +using namespace tt::tt_metal; + namespace { std::string demangle(const char* name) { diff --git a/ttnn/cpp/ttnn/operations/bernoulli/device/bernoulli_program_factory.cpp b/ttnn/cpp/ttnn/operations/bernoulli/device/bernoulli_program_factory.cpp index d1f31d36804..833c3a4b706 100644 --- a/ttnn/cpp/ttnn/operations/bernoulli/device/bernoulli_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/bernoulli/device/bernoulli_program_factory.cpp @@ -40,14 +40,14 @@ BernoulliDeviceOperation::ProgramFactory::cached_program_t BernoulliDeviceOperat constexpr uint32_t num_tiles = 2; auto in_data_format = datatype_to_dataformat_converter(input.dtype()); const uint32_t in_dtype_tile_size = tile_size(in_data_format); - constexpr uint32_t in_cb_id = CB::c_in0; + constexpr uint32_t in_cb_id = CBIndex::c_0; CircularBufferConfig cb_in_config = CircularBufferConfig(num_tiles * in_dtype_tile_size, {{in_cb_id, in_data_format}}) .set_page_size(in_cb_id, in_dtype_tile_size); CBHandle cb_input = tt_metal::CreateCircularBuffer(program, all_cores, cb_in_config); const uint32_t float32_tile_size = tile_size(tt::DataFormat::Float32); - constexpr uint32_t intermed_cb_id = CB::c_intermed0; + constexpr uint32_t intermed_cb_id = CBIndex::c_24; CircularBufferConfig cb_intermed_config = CircularBufferConfig(num_tiles * float32_tile_size, {{intermed_cb_id, tt::DataFormat::Float32}}) .set_page_size(intermed_cb_id, float32_tile_size); @@ -55,7 +55,7 @@ BernoulliDeviceOperation::ProgramFactory::cached_program_t BernoulliDeviceOperat auto out_data_format = 
datatype_to_dataformat_converter(output.dtype()); const uint32_t out_dtype_tile_size = tile_size(out_data_format); - constexpr uint32_t intermed1_cb_id = CB::c_intermed1; + constexpr uint32_t intermed1_cb_id = CBIndex::c_25; CircularBufferConfig cb_intermed1_config = CircularBufferConfig(1 * out_dtype_tile_size, {{intermed1_cb_id, out_data_format}}) .set_page_size(intermed1_cb_id, out_dtype_tile_size); diff --git a/ttnn/cpp/ttnn/operations/bernoulli/device/kernels/compute_bernoulli.cpp b/ttnn/cpp/ttnn/operations/bernoulli/device/kernels/compute_bernoulli.cpp index 816ad043d7e..3b95aff5a3c 100644 --- a/ttnn/cpp/ttnn/operations/bernoulli/device/kernels/compute_bernoulli.cpp +++ b/ttnn/cpp/ttnn/operations/bernoulli/device/kernels/compute_bernoulli.cpp @@ -16,7 +16,7 @@ void MAIN { const uint32_t num_tiles = get_arg_val(2); const uint32_t end_id = start_id + num_tiles; - init_sfpu(intermed_cb_id); + init_sfpu(intermed_cb_id, intermed_cb_id); union f2u { float f; diff --git a/ttnn/cpp/ttnn/operations/cb_utils.hpp b/ttnn/cpp/ttnn/operations/cb_utils.hpp index f86fe6a5e7a..907d98be021 100644 --- a/ttnn/cpp/ttnn/operations/cb_utils.hpp +++ b/ttnn/cpp/ttnn/operations/cb_utils.hpp @@ -9,8 +9,8 @@ namespace tt::tt_metal { template -std::tuple, CBHandle> create_cb( - const CB (&cbs)[N], +std::tuple, CBHandle> create_cb( + const uint32_t (&cbs)[N], Program &program, const std::variant &core_spec, uint32_t page_size, @@ -31,20 +31,20 @@ std::tuple, CBHandle> create_cb( cb_config.set_globally_allocated_address(*buffer); } - std::array cbs_out; + std::array cbs_out; std::copy(cbs, cbs + N, cbs_out.begin()); return std::make_tuple(cbs_out, tt_metal::CreateCircularBuffer(program, core_spec, cb_config)); } -inline std::tuple create_cb( - CB cb, +inline std::tuple create_cb( + uint32_t cb, Program &program, const std::variant &core_spec, uint32_t page_size, uint32_t num_pages, const tt::DataFormat data_format, Buffer *buffer = nullptr) { - CB cbs[] = {cb}; + uint32_t cbs[] = {cb}; 
auto [_, handle] = create_cb(cbs, program, core_spec, page_size, num_pages, data_format, buffer); return std::make_tuple(cb, handle); } diff --git a/ttnn/cpp/ttnn/operations/ccl/all_gather/all_gather.cpp b/ttnn/cpp/ttnn/operations/ccl/all_gather/all_gather.cpp index 63983bd9f01..34f067df23d 100644 --- a/ttnn/cpp/ttnn/operations/ccl/all_gather/all_gather.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/all_gather/all_gather.cpp @@ -9,7 +9,7 @@ namespace ttnn::operations::ccl { ttnn::Tensor ExecuteAllGather::invoke(const ttnn::Tensor& input_tensor, - const uint32_t dim, + const int32_t dim, const uint32_t num_links, const std::optional& memory_config, const std::optional num_workers, @@ -21,7 +21,7 @@ ttnn::Tensor ExecuteAllGather::invoke(const ttnn::Tensor& input_tensor, ttnn::Tensor ExecuteAllGather::invoke( const ttnn::Tensor& input_tensor, - const uint32_t dim, + const int32_t dim, const uint32_t cluster_axis, const MeshDevice& mesh_device, const uint32_t num_links, diff --git a/ttnn/cpp/ttnn/operations/ccl/all_gather/all_gather.hpp b/ttnn/cpp/ttnn/operations/ccl/all_gather/all_gather.hpp index 1816d4c083d..541335982fa 100644 --- a/ttnn/cpp/ttnn/operations/ccl/all_gather/all_gather.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/all_gather/all_gather.hpp @@ -14,7 +14,7 @@ namespace ccl { struct ExecuteAllGather { static ttnn::Tensor invoke( const ttnn::Tensor& input_tensor, - const uint32_t dim, + const int32_t dim, const uint32_t num_links = 1, const std::optional& memory_config = std::nullopt, const std::optional num_workers = std::nullopt, @@ -23,7 +23,7 @@ struct ExecuteAllGather { static ttnn::Tensor invoke( const ttnn::Tensor& input_tensor, - const uint32_t dim, + const int32_t dim, const uint32_t cluster_axis, const MeshDevice& mesh_device, const uint32_t num_links = 1, diff --git a/ttnn/cpp/ttnn/operations/ccl/all_gather/all_gather_pybind.cpp b/ttnn/cpp/ttnn/operations/ccl/all_gather/all_gather_pybind.cpp index 8937ced1230..19de7aa652b 100644 --- 
a/ttnn/cpp/ttnn/operations/ccl/all_gather/all_gather_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/all_gather/all_gather_pybind.cpp @@ -29,7 +29,7 @@ void bind_all_gather(pybind11::module& module, const ccl_operation_t& operation, ttnn::pybind_overload_t{ [](const ccl_operation_t& self, const ttnn::Tensor& input_tensor, - const uint32_t dim, + const int32_t dim, const uint32_t num_links, const std::optional& memory_config, const std::optional num_workers, @@ -49,7 +49,7 @@ void bind_all_gather(pybind11::module& module, const ccl_operation_t& operation, ttnn::pybind_overload_t{ [](const ccl_operation_t& self, const ttnn::Tensor& input_tensor, - const uint32_t dim, + const int32_t dim, const uint32_t cluster_axis, const MeshDevice& mesh_device, const uint32_t num_links, diff --git a/ttnn/cpp/ttnn/operations/ccl/all_gather/device/all_gather_op.cpp b/ttnn/cpp/ttnn/operations/ccl/all_gather/device/all_gather_op.cpp index 32fc7afb01a..81b295dd3fc 100644 --- a/ttnn/cpp/ttnn/operations/ccl/all_gather/device/all_gather_op.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/all_gather/device/all_gather_op.cpp @@ -175,7 +175,7 @@ namespace operations { namespace ccl { Tensor all_gather( - const Tensor& input_tensor, const uint32_t dim, const uint32_t num_links, const std::optional& memory_config, const std::optional user_defined_num_workers, const std::optional user_defined_num_buffers_per_channel, const ttnn::ccl::Topology topology) { + const Tensor& input_tensor, const int32_t dim, const uint32_t num_links, const std::optional& memory_config, const std::optional user_defined_num_workers, const std::optional user_defined_num_buffers_per_channel, const ttnn::ccl::Topology topology) { TT_FATAL(std::getenv("TT_METAL_SLOW_DISPATCH_MODE") == nullptr, "all_gather op is only supported for Fast Dispatch"); auto devices = input_tensor.get_workers(); @@ -186,9 +186,16 @@ Tensor all_gather( if (num_devices == 2){ ccl_topology = ttnn::ccl::Topology::Linear; } + + int32_t rank = 
input_tensor.get_logical_shape().rank(); + + int32_t gather_dim = (dim < 0) ? rank + dim : dim; + + TT_FATAL(gather_dim >= -rank && gather_dim <= rank - 1 , "Dimension input should be in between -{} and {}, but has {}", rank, rank - 1, dim); + std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input_tensor}))}; operation::launch_op( - [dim, num_links, memory_config, user_defined_num_workers, user_defined_num_buffers_per_channel, devices, ccl_topology]( + [gather_dim, num_links, memory_config, user_defined_num_workers, user_defined_num_buffers_per_channel, devices, ccl_topology]( const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { @@ -196,7 +203,7 @@ Tensor all_gather( const auto& input_tensor = input_tensors.at(0); return operation::run( - ttnn::ccl::all_gather_detail::create_all_gather_struct(input_tensor, dim, num_links, memory_config, user_defined_num_workers, user_defined_num_buffers_per_channel, devices, ccl_topology), + ttnn::ccl::all_gather_detail::create_all_gather_struct(input_tensor, gather_dim, num_links, memory_config, user_defined_num_workers, user_defined_num_buffers_per_channel, devices, ccl_topology), {input_tensor}); }, {input_tensor}, @@ -206,7 +213,7 @@ Tensor all_gather( Tensor all_gather( const Tensor& input_tensor, - const uint32_t dim, + const int32_t dim, const uint32_t cluster_axis, const MeshDevice& mesh_device, const uint32_t num_links, @@ -219,10 +226,16 @@ Tensor all_gather( const auto mesh_view = mesh_device.get_view(); std::size_t num_devices = (cluster_axis == 0) ? mesh_view->num_rows() : mesh_view->num_cols(); + int32_t rank = input_tensor.get_logical_shape().rank(); + + int32_t gather_dim = (dim < 0) ? 
rank + dim : dim; + + TT_FATAL(gather_dim >= -rank && gather_dim <= rank - 1 , "Dimension input should be in between -{} and {}, but has {}", rank, rank - 1, dim); + std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input_tensor}))}; operation::launch_op( - [dim, num_links, memory_config, mesh_view, cluster_axis, user_defined_num_workers, user_defined_num_buffers_per_channel, num_devices, topology]( + [gather_dim, num_links, memory_config, mesh_view, cluster_axis, user_defined_num_workers, user_defined_num_buffers_per_channel, num_devices, topology]( const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { @@ -250,7 +263,7 @@ Tensor all_gather( return operation::run( ttnn::AllGather{ - dim, num_links, num_devices, device_index, user_defined_num_workers, user_defined_num_buffers_per_channel, receiver_device_id, sender_device_id, memory_config.value_or(input_device_tensor.memory_config()), topology}, + gather_dim, num_links, num_devices, device_index, user_defined_num_workers, user_defined_num_buffers_per_channel, receiver_device_id, sender_device_id, memory_config.value_or(input_device_tensor.memory_config()), topology}, {input_device_tensor}); }, {input_tensor}, diff --git a/ttnn/cpp/ttnn/operations/ccl/all_gather/device/all_gather_op.hpp b/ttnn/cpp/ttnn/operations/ccl/all_gather/device/all_gather_op.hpp index b0a162f2a1f..abc697dfab5 100644 --- a/ttnn/cpp/ttnn/operations/ccl/all_gather/device/all_gather_op.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/all_gather/device/all_gather_op.hpp @@ -200,7 +200,7 @@ namespace ccl { Tensor all_gather( const Tensor& input_tensor, - const uint32_t dim, + const int32_t dim, const uint32_t num_links = 1, const std::optional& memory_config = std::nullopt, const std::optional user_defined_num_workers = std::nullopt, @@ -209,7 +209,7 @@ Tensor all_gather( Tensor all_gather( const Tensor& input_tensor, - const uint32_t dim, + 
const int32_t dim, const uint32_t cluster_axis, const MeshDevice& mesh_device, const uint32_t num_links = 1, diff --git a/ttnn/cpp/ttnn/operations/ccl/all_gather/device/kernels/dataflow/worker_interleaved_ring_gather_receive_reader.cpp b/ttnn/cpp/ttnn/operations/ccl/all_gather/device/kernels/dataflow/worker_interleaved_ring_gather_receive_reader.cpp index aee59fd6213..6a3cdd26e42 100644 --- a/ttnn/cpp/ttnn/operations/ccl/all_gather/device/kernels/dataflow/worker_interleaved_ring_gather_receive_reader.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/all_gather/device/kernels/dataflow/worker_interleaved_ring_gather_receive_reader.cpp @@ -26,7 +26,7 @@ void kernel_main() { ASSERT(half_cb_n_pages > rem_num_pages); - constexpr uint32_t cb_id_in0 = tt::CB::c_in0; + constexpr uint32_t cb_id_in0 = tt::CBIndex::c_0; ccl::edm::WorkerToEdmReader reader( ttnn::ccl::WorkerXY(eth_receiver_noc_x, eth_receiver_noc_y), diff --git a/ttnn/cpp/ttnn/operations/ccl/all_gather/device/kernels/dataflow/worker_interleaved_ring_gather_receive_writer.cpp b/ttnn/cpp/ttnn/operations/ccl/all_gather/device/kernels/dataflow/worker_interleaved_ring_gather_receive_writer.cpp index 8e10559a870..da76c6f3d65 100644 --- a/ttnn/cpp/ttnn/operations/ccl/all_gather/device/kernels/dataflow/worker_interleaved_ring_gather_receive_writer.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/all_gather/device/kernels/dataflow/worker_interleaved_ring_gather_receive_writer.cpp @@ -69,7 +69,7 @@ void kernel_main() { constexpr bool output_tensor_shard_grid_transposed = get_compile_time_arg_val(15) != 0; #endif - constexpr uint32_t cb_id_in0 = tt::CB::c_in0; + constexpr uint32_t cb_id_in0 = tt::CBIndex::c_0; #ifdef ROW_MAJOR_LAYOUT #ifdef INTERLEAVED_MEM_LAYOUT InterleavedAddrGen d = { diff --git a/ttnn/cpp/ttnn/operations/ccl/all_gather/device/kernels/dataflow/worker_interleaved_ring_gather_send_reader.cpp b/ttnn/cpp/ttnn/operations/ccl/all_gather/device/kernels/dataflow/worker_interleaved_ring_gather_send_reader.cpp index 
9d937029a5d..39c78e2d93f 100644 --- a/ttnn/cpp/ttnn/operations/ccl/all_gather/device/kernels/dataflow/worker_interleaved_ring_gather_send_reader.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/all_gather/device/kernels/dataflow/worker_interleaved_ring_gather_send_reader.cpp @@ -80,7 +80,7 @@ void kernel_main() { ASSERT(half_cb_n_pages > rem_num_pages); - constexpr uint32_t cb_id_in0 = tt::CB::c_in0; + constexpr uint32_t cb_id_in0 = tt::CBIndex::c_0; #ifdef ROW_MAJOR_LAYOUT #ifdef INTERLEAVED_MEM_LAYOUT diff --git a/ttnn/cpp/ttnn/operations/ccl/all_gather/device/kernels/dataflow/worker_interleaved_ring_gather_send_writer.cpp b/ttnn/cpp/ttnn/operations/ccl/all_gather/device/kernels/dataflow/worker_interleaved_ring_gather_send_writer.cpp index 96ea1aaecae..19505d9b09a 100644 --- a/ttnn/cpp/ttnn/operations/ccl/all_gather/device/kernels/dataflow/worker_interleaved_ring_gather_send_writer.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/all_gather/device/kernels/dataflow/worker_interleaved_ring_gather_send_writer.cpp @@ -70,7 +70,7 @@ void kernel_main() { constexpr bool output_tensor_shard_grid_transposed = get_compile_time_arg_val(15) != 0; #endif - constexpr uint32_t cb_id_in0 = tt::CB::c_in0; + constexpr uint32_t cb_id_in0 = tt::CBIndex::c_0; #ifdef ROW_MAJOR_LAYOUT #ifdef INTERLEAVED_MEM_LAYOUT InterleavedAddrGen d = { diff --git a/ttnn/cpp/ttnn/operations/ccl/all_gather/device/multi_core/all_gather_op_multi_core.cpp b/ttnn/cpp/ttnn/operations/ccl/all_gather/device/multi_core/all_gather_op_multi_core.cpp index 6d3fdbfa69d..92e5e2e979c 100644 --- a/ttnn/cpp/ttnn/operations/ccl/all_gather/device/multi_core/all_gather_op_multi_core.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/all_gather/device/multi_core/all_gather_op_multi_core.cpp @@ -367,7 +367,7 @@ operation::ProgramWithCallbacks all_gather_multi_core_with_workers_helper( // Circular Buffer Setup log_trace(tt::LogOp, "input_page_size: {}", input_page_size); - uint32_t src0_cb_index = tt::CB::c_in0; + uint32_t src0_cb_index = 
tt::CBIndex::c_0; const uint32_t cb_n_packets = 2; const uint32_t cb_size_in_pages = cb_n_packets * max_pages_per_chunk; const uint32_t CB_buffer_size = cb_n_packets * max_buffer_per_chunk; diff --git a/ttnn/cpp/ttnn/operations/ccl/barrier/device/host/barrier_full_worker_grid.cpp b/ttnn/cpp/ttnn/operations/ccl/barrier/device/host/barrier_full_worker_grid.cpp index b30dedf4caf..df3836c152c 100644 --- a/ttnn/cpp/ttnn/operations/ccl/barrier/device/host/barrier_full_worker_grid.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/barrier/device/host/barrier_full_worker_grid.cpp @@ -17,6 +17,8 @@ #include "ttnn/operations/eltwise/binary/common/binary_op_types.hpp" #include "ttnn/operations/eltwise/binary/common/binary_op_utils.hpp" +using namespace tt::tt_metal; + namespace ttnn::ccl::barrier::detail { diff --git a/ttnn/cpp/ttnn/operations/ccl/ccl_common.hpp b/ttnn/cpp/ttnn/operations/ccl/ccl_common.hpp index a5e6d49c184..f714f3e44ca 100644 --- a/ttnn/cpp/ttnn/operations/ccl/ccl_common.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/ccl_common.hpp @@ -29,7 +29,7 @@ std::tuple, std::optional> get_dev // Eventual home: ccl_topology_descriptors struct RingTopology { RingTopology( - Device const* device, + tt::tt_metal::Device const* device, Topology topology, std::optional sender_device_id, std::optional receiver_device_id, @@ -40,7 +40,7 @@ struct RingTopology { bool is_first_device_in_line(bool in_clockwise_direction) const; bool is_last_device_in_line(bool in_clockwise_direction) const; - const Device *device; + const tt::tt_metal::Device *device; std::vector eth_sender_cores; std::vector eth_receiver_cores; @@ -81,11 +81,11 @@ class CclOpShardedTensorConfig final : public virtual CclOpTensorConfig { public: CclOpShardedTensorConfig(Tensor const& tensor); - ShardSpec const& get_shard_spec() const; + tt::tt_metal::ShardSpec const& get_shard_spec() const; private: uint32_t page_size; - ShardSpec const shard_spec; + tt::tt_metal::ShardSpec const shard_spec; }; struct CclTensorSlicer { @@ -398,7 
+398,7 @@ class InterleavedRingAllGatherTensorSlicer : public LegacyCclTensorSlicer { InterleavedRingAllGatherTensorSlicer( Tensor const& input_tensor, Tensor const& output_tensor, int slice_dim, uint32_t slice_idx) : LegacyCclTensorSlicer() { - this->row_major = input_tensor.get_layout() == Layout::ROW_MAJOR; + this->row_major = input_tensor.get_layout() == tt::tt_metal::Layout::ROW_MAJOR; this->slice_dim_is_width = input_tensor.get_legacy_shape().rank() - 1 == slice_dim; this->is_sharded = input_tensor.is_sharded(); @@ -472,14 +472,14 @@ class InterleavedRingAllGatherTensorSlicer : public LegacyCclTensorSlicer { }; -KernelHandle generate_edm_kernel( +tt::tt_metal::KernelHandle generate_edm_kernel( tt::tt_metal::Program& program, - Device const* device, + tt::tt_metal::Device const* device, FabricEriscDatamoverBuilder const& edm_builder, CoreCoord const& eth_core, NOC noc_id); -KernelHandle generate_edm_kernel( +tt::tt_metal::KernelHandle generate_edm_kernel( tt::tt_metal::Program& program, Device const* device, EriscDatamoverBuilder const& edm_builder, diff --git a/ttnn/cpp/ttnn/operations/ccl/ccl_host_datastructures.cpp b/ttnn/cpp/ttnn/operations/ccl/ccl_host_datastructures.cpp index e20bc28435d..8609fd23ab4 100644 --- a/ttnn/cpp/ttnn/operations/ccl/ccl_host_datastructures.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/ccl_host_datastructures.cpp @@ -6,6 +6,8 @@ #include "ttnn/cpp/ttnn/tensor/tensor_impl.hpp" #include "ttnn/cpp/ttnn/operations/ccl/ccl_host_datastructures.hpp" +using namespace tt::tt_metal; + namespace ttnn { namespace ccl { diff --git a/ttnn/cpp/ttnn/operations/ccl/ccl_op_fusion.cpp b/ttnn/cpp/ttnn/operations/ccl/ccl_op_fusion.cpp index 29a6b9197ac..30dc04289dc 100644 --- a/ttnn/cpp/ttnn/operations/ccl/ccl_op_fusion.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/ccl_op_fusion.cpp @@ -6,6 +6,8 @@ #include "tt_metal/impl/program/program.hpp" #include "ttnn/operations/ccl/ccl_op_fusion.hpp" +using namespace tt::tt_metal; + namespace ttnn { namespace experimental { 
namespace ccl { diff --git a/ttnn/cpp/ttnn/operations/ccl/ccl_op_fusion.hpp b/ttnn/cpp/ttnn/operations/ccl/ccl_op_fusion.hpp index a8a3d73e9b0..d5dba799f90 100644 --- a/ttnn/cpp/ttnn/operations/ccl/ccl_op_fusion.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/ccl_op_fusion.hpp @@ -50,8 +50,8 @@ struct AllGatherFusedOpSignaler { ); void init_all_gather( - Program& program, - Device const* device, + tt::tt_metal::Program& program, + tt::tt_metal::Device const* device, CoreRangeSet const& all_gather_workers, std::vector& all_gather_worker_cores @@ -100,8 +100,8 @@ struct MatmulFusedOpSignaler { ); void init_fused_op( - Program& program, - Device const* device, + tt::tt_metal::Program& program, + tt::tt_metal::Device const* device, const std::variant& core_range_to_signal, FusedOpSignalerMode fused_op_signaler_mode = FusedOpSignalerMode::MULTI ); diff --git a/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send.cpp b/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send.cpp index f1028a69fbb..2a0629fab12 100644 --- a/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send.cpp @@ -223,7 +223,7 @@ void kernel_main() { // out when we start enabling other modes const uint32_t packet_size_in_pages = get_arg_val(arg_idx++); const uint32_t page_size = get_arg_val(arg_idx++); - auto tensor_addrgen = build_source_address_generator(arg_idx, tensor_address, page_size, tt::CB::c_in0); + auto tensor_addrgen = build_source_address_generator(arg_idx, tensor_address, page_size, tt::CBIndex::c_0); auto semaphore_id = get_arg_val(arg_idx++); volatile uint32_t* my_edm_worker_semaphore_ptr = reinterpret_cast(get_semaphore(semaphore_id)); diff --git a/ttnn/cpp/ttnn/operations/ccl/common/types/ccl_types_args_emitters.cpp b/ttnn/cpp/ttnn/operations/ccl/common/types/ccl_types_args_emitters.cpp index 546c65b6fc9..af4ebffc70f 100644 --- a/ttnn/cpp/ttnn/operations/ccl/common/types/ccl_types_args_emitters.cpp +++ 
b/ttnn/cpp/ttnn/operations/ccl/common/types/ccl_types_args_emitters.cpp @@ -8,6 +8,8 @@ #include "ttnn/tensor/types.hpp" #include "tt_metal/impl/device/device.hpp" +using namespace tt::tt_metal; + namespace ttnn { namespace ccl { diff --git a/ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.hpp b/ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.hpp index efe3ce45ad5..54fd31456d9 100644 --- a/ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.hpp @@ -107,8 +107,8 @@ class FabricEriscDatamoverBuilder { FabricEriscDatamoverConfig const& config); static FabricEriscDatamoverBuilder build( - Device* device, - Program& program, + tt::tt_metal::Device* device, + tt::tt_metal::Program& program, CoreCoord const& ethernet_core, chip_id_t local_chip_id, chip_id_t peer_chip_id, @@ -209,7 +209,7 @@ struct EdmLineFabricOpInterface { // Will create a connection adapter for a worker which can be used to pass args to the worker kernel talking to the // corresponding fabric endpoint. This interface will guarantee unique connections only so requesting more unique connections // than available will result in an error. 
- SenderWorkerAdapterSpec uniquely_connect_worker(Device* device, Direction direction); + SenderWorkerAdapterSpec uniquely_connect_worker(tt::tt_metal::Device* device, Direction direction); // builds the ethernet kernels for all EDMs in the "fabric" void build_kernels() const; diff --git a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/host/reduce_scatter_full_worker_grid.cpp b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/host/reduce_scatter_full_worker_grid.cpp index 55a52f80110..55417dc7653 100644 --- a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/host/reduce_scatter_full_worker_grid.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/host/reduce_scatter_full_worker_grid.cpp @@ -25,6 +25,7 @@ #include using namespace tt::constants; +using namespace tt::tt_metal; // Notes on abbreviations: // cw = clockwise @@ -518,7 +519,7 @@ create_worker_circular_buffers( uint32_t page_size_bytes = op_config.get_page_size(); // Input 0 CB - uint32_t src0_cb_index = tt::CB::c_in0; + uint32_t src0_cb_index = tt::CBIndex::c_0; tt::tt_metal::CircularBufferConfig cb_src0_config = tt::tt_metal::CircularBufferConfig(worker_pages_per_transfer * page_size_bytes, {{src0_cb_index, df}}) .set_page_size(src0_cb_index, page_size_bytes); @@ -529,7 +530,7 @@ create_worker_circular_buffers( } // Input 1 CB - uint32_t src1_cb_index = tt::CB::c_in1; + uint32_t src1_cb_index = tt::CBIndex::c_1; tt::tt_metal::CircularBufferConfig cb_src1_config = tt::tt_metal::CircularBufferConfig(worker_pages_per_transfer * page_size_bytes, {{src1_cb_index, df}}) .set_page_size(src1_cb_index, page_size_bytes); @@ -540,7 +541,7 @@ create_worker_circular_buffers( } // Dataflow Writer Kernel input CB - uint32_t cb_dst0_index = tt::CB::c_out0; + uint32_t cb_dst0_index = tt::CBIndex::c_2; tt::tt_metal::CircularBufferConfig cb_dst0_config = tt::tt_metal::CircularBufferConfig(worker_pages_per_transfer * page_size_bytes, {{cb_dst0_index, df}}) .set_page_size(cb_dst0_index, page_size_bytes); @@ 
-553,7 +554,7 @@ create_worker_circular_buffers( // From reader -> writer kernel (I think I need this because sharing the cb_dst0_sender_workers as output // of reader kernel (first output) and math kernel (all subsequent outputs) doesn't seem to work because // it seems like the math kernels hold some of the CB state in local variables) - uint32_t cb_short_circuit_index = tt::CB::c_out1; + uint32_t cb_short_circuit_index = tt::CBIndex::c_3; tt::tt_metal::CircularBufferConfig cb_short_circuit_config = tt::tt_metal::CircularBufferConfig( (worker_pages_per_transfer * page_size_bytes) * 2, {{cb_short_circuit_index, df}}) diff --git a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/kernels/worker_interleaved_ring_reduce_scatter_reader.cpp b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/kernels/worker_interleaved_ring_reduce_scatter_reader.cpp index 10254c8364d..82571bce907 100644 --- a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/kernels/worker_interleaved_ring_reduce_scatter_reader.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/kernels/worker_interleaved_ring_reduce_scatter_reader.cpp @@ -346,9 +346,9 @@ struct signal_receiver { void kernel_main() { std::size_t arg_idx = 0; - constexpr uint32_t to_dm_sender_short_circuit_cb = tt::CB::c_out1; - constexpr uint32_t cb_id_in0 = tt::CB::c_in0; - constexpr uint32_t cb_id_in1 = tt::CB::c_in1; + constexpr uint32_t to_dm_sender_short_circuit_cb = tt::CBIndex::c_3; + constexpr uint32_t cb_id_in0 = tt::CBIndex::c_0; + constexpr uint32_t cb_id_in1 = tt::CBIndex::c_1; auto args = reduce_scatter_reader_common_args_t(arg_idx, get_dataformat(cb_id_in0)); auto output_partial_signal_ready_receiver = signal_receiver::build(args.requires_last_input_from_other_sender, arg_idx); diff --git a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/kernels/worker_interleaved_ring_reduce_scatter_sender.cpp b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/kernels/worker_interleaved_ring_reduce_scatter_sender.cpp 
index 86e618e8c2a..7bb29a59c80 100644 --- a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/kernels/worker_interleaved_ring_reduce_scatter_sender.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/kernels/worker_interleaved_ring_reduce_scatter_sender.cpp @@ -112,8 +112,8 @@ void kernel_main() { ASSERT(page_size > 0); ASSERT(half_cb_n_pages > 0); - constexpr uint32_t cb_id_in0 = tt::CB::c_out0; - constexpr uint32_t cb_id_in_short_circuit = tt::CB::c_out1; + constexpr uint32_t cb_id_in0 = tt::CBIndex::c_2; + constexpr uint32_t cb_id_in_short_circuit = tt::CBIndex::c_3; const DataFormat in0_df = get_dataformat(cb_id_in0); #ifdef ROW_MAJOR_LAYOUT #ifdef INTERLEAVED_MEM_LAYOUT diff --git a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/reduce_scatter_op.cpp b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/reduce_scatter_op.cpp index 0924001d006..44ee7916127 100644 --- a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/reduce_scatter_op.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/reduce_scatter_op.cpp @@ -107,7 +107,7 @@ namespace operations{ namespace ccl{ Tensor reduce_scatter( const Tensor& input_tensor, - const uint32_t scatter_dim, + const int32_t dim, ttnn::operations::reduction::ReduceType math_op, const uint32_t num_links, const MemoryConfig& output_mem_config, @@ -126,6 +126,12 @@ Tensor reduce_scatter( ccl_topology = ttnn::ccl::Topology::Linear; } + int16_t rank = input_tensor.get_logical_shape().rank(); + + int16_t scatter_dim = (dim < 0) ? 
rank + dim : dim; + + TT_FATAL(scatter_dim >= -rank && scatter_dim <= rank - 1 , "Dimension input should be in between -{} and {}, but has {}", rank, rank - 1, dim); + std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input_tensor}))}; operation::launch_op( [binary_op_type, scatter_dim, num_links, output_mem_config, ccl_topology, devices, user_defined_num_workers, user_defined_num_buffers_per_channel]( @@ -158,7 +164,7 @@ Tensor reduce_scatter( Tensor reduce_scatter( const Tensor &input_tensor, - const uint32_t scatter_dim, + const int32_t dim, const uint32_t cluster_axis, const MeshDevice& mesh_device, ttnn::operations::reduction::ReduceType reduce_op, @@ -174,6 +180,12 @@ Tensor reduce_scatter( const auto mesh_view = mesh_device.get_view(); std::size_t num_devices = (cluster_axis == 0) ? mesh_view->num_rows() : mesh_view->num_cols(); + int16_t rank = input_tensor.get_logical_shape().rank(); + + int16_t scatter_dim = (dim < 0) ? rank + dim : dim; + + TT_FATAL(scatter_dim >= -rank && scatter_dim <= rank - 1 , "Dimension input should be in between -{} and {}, but has {}", rank, rank - 1, dim); + std::vector output_tensors = {Tensor(operation::get_workers_for_op_output({input_tensor}))}; operation::launch_op( diff --git a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/reduce_scatter_op.hpp b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/reduce_scatter_op.hpp index f26107cda30..57f5d055caa 100644 --- a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/reduce_scatter_op.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/device/reduce_scatter_op.hpp @@ -69,7 +69,7 @@ namespace operations{ namespace ccl{ Tensor reduce_scatter( const Tensor &input_tensor, - const uint32_t scatter_split_dim, + const int32_t dim, ttnn::operations::reduction::ReduceType reduce_op = ttnn::operations::reduction::ReduceType::Sum, const uint32_t num_links = 1, const MemoryConfig &output_mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG, @@ -79,7 +79,7 @@ 
Tensor reduce_scatter( Tensor reduce_scatter( const ttnn::Tensor &input_tensor, - const uint32_t scatter_dim, + const int32_t dim, const uint32_t cluster_axis, const MeshDevice& mesh_device, ttnn::operations::reduction::ReduceType reduce_op = ttnn::operations::reduction::ReduceType::Sum, diff --git a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/host/reduce_scatter_worker_builder.cpp b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/host/reduce_scatter_worker_builder.cpp index f67b9d14601..d0208e20fe2 100644 --- a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/host/reduce_scatter_worker_builder.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/host/reduce_scatter_worker_builder.cpp @@ -11,6 +11,8 @@ #include "ttnn/cpp/ttnn/operations/ccl/common/uops/ccl_command.hpp" #include "ttnn/operations/ccl/ccl_common.hpp" +using namespace tt::tt_metal; + namespace ttnn { namespace ccl { namespace reduce_scatter_detail { @@ -630,7 +632,7 @@ std::vector ReduceScatterWorkerArgBuilder::generate_line_start_sender_ static_cast(this->op_config.get_input_tensor(0).buffer()->buffer_type()), // buffer type static_cast(this->op_config.get_input_tensor(0).layout()), // page layout static_cast(this->edm_termination_mode), // (EDM) termination mode - static_cast(tt::CB::c_in0) // cb_id + static_cast(tt::CBIndex::c_0) // cb_id }; auto const& input_tensor = this->op_config.get_input_tensor(0); diff --git a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/reduce_scatter.cpp b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/reduce_scatter.cpp index ea28f4bd932..027b159d8f8 100644 --- a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/reduce_scatter.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/reduce_scatter.cpp @@ -10,7 +10,7 @@ namespace ttnn::operations::ccl { ttnn::Tensor ExecuteReduceScatter::invoke( const ttnn::Tensor& input_tensor, - const uint32_t scatter_dim, + const int32_t dim, ttnn::operations::reduction::ReduceType math_op, const uint32_t num_links, const std::optional& memory_config, @@ 
-19,11 +19,11 @@ ttnn::Tensor ExecuteReduceScatter::invoke( const std::optional num_buffers_per_channel) { MemoryConfig out_memory_config = memory_config.value_or(input_tensor.memory_config()); - return ttnn::operations::ccl::reduce_scatter(input_tensor, scatter_dim, math_op, num_links, out_memory_config, topology, num_workers, num_buffers_per_channel); + return ttnn::operations::ccl::reduce_scatter(input_tensor, dim, math_op, num_links, out_memory_config, topology, num_workers, num_buffers_per_channel); } ttnn::Tensor ExecuteReduceScatter::invoke( const ttnn::Tensor& input_tensor, - const uint32_t scatter_dim, + const int32_t dim, const uint32_t cluster_axis, const MeshDevice& mesh_device, ttnn::operations::reduction::ReduceType math_op, @@ -34,7 +34,7 @@ ttnn::Tensor ExecuteReduceScatter::invoke( const std::optional num_buffers_per_channel) { MemoryConfig out_memory_config = memory_config.value_or(input_tensor.memory_config()); - return ttnn::operations::ccl::reduce_scatter(input_tensor, scatter_dim, cluster_axis, mesh_device, math_op, num_links, out_memory_config, topology, num_workers, num_buffers_per_channel); + return ttnn::operations::ccl::reduce_scatter(input_tensor, dim, cluster_axis, mesh_device, math_op, num_links, out_memory_config, topology, num_workers, num_buffers_per_channel); } } // namespace ttnn::operations::ccl diff --git a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/reduce_scatter.hpp b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/reduce_scatter.hpp index b7acc80e794..044af18777c 100644 --- a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/reduce_scatter.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/reduce_scatter.hpp @@ -17,7 +17,7 @@ namespace ccl { struct ExecuteReduceScatter { static ttnn::Tensor invoke( const Tensor &input_tensor, - const uint32_t scatter_dim, + const int32_t dim, const uint32_t cluster_axis, const MeshDevice& mesh_device, ttnn::operations::reduction::ReduceType reduce_op = ttnn::operations::reduction::ReduceType::Sum, 
@@ -29,7 +29,7 @@ struct ExecuteReduceScatter { static ttnn::Tensor invoke( const ttnn::Tensor& input_tensor, - const uint32_t scatter_dim, + const int32_t dim, ttnn::operations::reduction::ReduceType math_op, const uint32_t num_links = 1, const std::optional& memory_config = std::nullopt, diff --git a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/reduce_scatter_pybind.cpp b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/reduce_scatter_pybind.cpp index bfac2f9a1d1..011c217ff5a 100644 --- a/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/reduce_scatter_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/reduce_scatter/reduce_scatter_pybind.cpp @@ -26,17 +26,17 @@ void bind_reduce_scatter(pybind11::module& module, const ccl_operation_t& operat ttnn::pybind_overload_t{ [](const ccl_operation_t& self, const ttnn::Tensor& input_tensor, - const uint32_t scatter_dim, + const int32_t dim, ttnn::operations::reduction::ReduceType math_op, const uint32_t num_links, const ttnn::MemoryConfig& memory_config, ttnn::ccl::Topology topology, const std::optional num_workers, const std::optional num_buffers_per_channel) -> ttnn::Tensor { - return self(input_tensor, scatter_dim, math_op, num_links, memory_config, topology, num_workers, num_buffers_per_channel); + return self(input_tensor, dim, math_op, num_links, memory_config, topology, num_workers, num_buffers_per_channel); }, py::arg("input_tensor"), - py::arg("scatter_dim"), + py::arg("dim"), py::arg("math_op"), py::kw_only(), py::arg("num_links") = 1, @@ -48,7 +48,7 @@ void bind_reduce_scatter(pybind11::module& module, const ccl_operation_t& operat ttnn::pybind_overload_t{ [](const ccl_operation_t& self, const ttnn::Tensor& input_tensor, - const uint32_t scatter_dim, + const int32_t dim, const uint32_t cluster_axis, const MeshDevice& mesh_device, ttnn::operations::reduction::ReduceType math_op, @@ -57,10 +57,10 @@ void bind_reduce_scatter(pybind11::module& module, const ccl_operation_t& operat const std::optional num_workers, const std::optional 
num_buffers_per_channel, const ttnn::ccl::Topology topology) -> ttnn::Tensor { - return self(input_tensor, scatter_dim, cluster_axis, mesh_device, math_op, num_links, output_mem_config, topology, num_workers, num_buffers_per_channel); + return self(input_tensor, dim, cluster_axis, mesh_device, math_op, num_links, output_mem_config, topology, num_workers, num_buffers_per_channel); }, py::arg("input_tensor"), - py::arg("scatter_dim"), + py::arg("dim"), py::arg("cluster_axis"), py::arg("mesh_device"), py::arg("math_op"), diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp index cffa1308549..d2334f83ac7 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp @@ -19,19 +19,19 @@ using namespace tt; namespace { namespace CMAKE_UNIQUE_NAMESPACE { -const uint32_t act_cb = CB::c_in0; -const uint32_t weight_cb = CB::c_in1; -const uint32_t bias_cb = CB::c_in2; -const uint32_t sharded_act_cb = CB::c_in3; -const uint32_t cb_for_reader_indices = CB::c_in4; -const uint32_t cb_for_l1_array = CB::c_in5; -const uint32_t act_cb_row_major_bfloat16 = CB::c_in6; -const uint32_t act_cb_second_reader = CB::c_in7; -const uint32_t matmul_partials_cb = CB::c_intermed0; -const uint32_t tilize_mode_tilized_act_cb = CB::c_intermed1; -const uint32_t untilize_mode_reblock_cb = CB::c_intermed2; -const uint32_t out0_cb = CB::c_out0; -const uint32_t temp_sum_cb = CB::c_intermed3; +const uint32_t act_cb = CBIndex::c_0; +const uint32_t weight_cb = CBIndex::c_1; +const uint32_t bias_cb = CBIndex::c_2; +const uint32_t sharded_act_cb = CBIndex::c_3; +const uint32_t cb_for_reader_indices = CBIndex::c_4; +const uint32_t cb_for_l1_array = CBIndex::c_5; +const uint32_t act_cb_row_major_bfloat16 = CBIndex::c_6; +const uint32_t act_cb_second_reader = 
CBIndex::c_7; +const uint32_t matmul_partials_cb = CBIndex::c_24; +const uint32_t tilize_mode_tilized_act_cb = CBIndex::c_25; +const uint32_t untilize_mode_reblock_cb = CBIndex::c_26; +const uint32_t out0_cb = CBIndex::c_16; +const uint32_t temp_sum_cb = CBIndex::c_27; } } diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_width_sharded_program_factory.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_width_sharded_program_factory.cpp index d74a1957d06..3435bfd197d 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_width_sharded_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_width_sharded_program_factory.cpp @@ -42,18 +42,18 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_width_sharded_v2_impl( bool enable_split_reader, bool enable_subblock_padding) { - const uint32_t act_cb = CB::c_in0; - const uint32_t weight_cb = CB::c_in1; - const uint32_t bias_cb = CB::c_in2; - const uint32_t sharded_act_cb = CB::c_in3; - const uint32_t cb_for_reader_indices = CB::c_in4; - const uint32_t cb_for_l1_array = CB::c_in5; - const uint32_t act_cb_row_major_bfloat16 = CB::c_in6; - const uint32_t act_cb_second_reader = CB::c_in7; - const uint32_t matmul_partials_cb = CB::c_intermed0; - const uint32_t tilize_mode_tilized_act_cb = CB::c_intermed1; - const uint32_t untilize_mode_reblock_cb = CB::c_intermed2; - const uint32_t out0_cb = CB::c_out0; + const uint32_t act_cb = CBIndex::c_0; + const uint32_t weight_cb = CBIndex::c_1; + const uint32_t bias_cb = CBIndex::c_2; + const uint32_t sharded_act_cb = CBIndex::c_3; + const uint32_t cb_for_reader_indices = CBIndex::c_4; + const uint32_t cb_for_l1_array = CBIndex::c_5; + const uint32_t act_cb_row_major_bfloat16 = CBIndex::c_6; + const uint32_t act_cb_second_reader = CBIndex::c_7; + const uint32_t matmul_partials_cb = CBIndex::c_24; + const uint32_t tilize_mode_tilized_act_cb = CBIndex::c_25; + const uint32_t untilize_mode_reblock_cb = CBIndex::c_26; + 
const uint32_t out0_cb = CBIndex::c_16; bool pass = true; enable_split_reader = false; diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/activation_reader_width_sharded.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/activation_reader_width_sharded.cpp index 81e5a8250f8..b3d9ff9e498 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/activation_reader_width_sharded.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/activation_reader_width_sharded.cpp @@ -91,18 +91,18 @@ void kernel_main() { //Equivalent to Core Index. uint32_t this_core_id = this_core_x + (num_cores_x * this_core_y) ; - constexpr uint32_t cb_id_act = tt::CB::c_in0; - constexpr uint32_t cb_id_weight = tt::CB::c_in1; + constexpr uint32_t cb_id_act = tt::CBIndex::c_0; + constexpr uint32_t cb_id_weight = tt::CBIndex::c_1; - constexpr uint32_t tilized_in0_cb_id = tt::CB::c_intermed1; - constexpr uint32_t cb_id_sharded_act = tt::CB::c_in3; - constexpr uint32_t cb_id_act_row_major_bfloat16 = tt::CB::c_in6; + constexpr uint32_t tilized_in0_cb_id = tt::CBIndex::c_25; + constexpr uint32_t cb_id_sharded_act = tt::CBIndex::c_3; + constexpr uint32_t cb_id_act_row_major_bfloat16 = tt::CBIndex::c_6; - constexpr uint32_t cb_reader_indices = tt::CB::c_in4; + constexpr uint32_t cb_reader_indices = tt::CBIndex::c_4; volatile tt_l1_ptr uint32_t* packed_reader_indices_ptr = reinterpret_cast(get_write_ptr(cb_reader_indices)); // L1 array - constexpr uint32_t cb_l1_array = tt::CB::c_in5; + constexpr uint32_t cb_l1_array = tt::CBIndex::c_5; volatile tt_l1_ptr uint32_t* act_mcast_sender_semaphore_valid_addr_ptr = reinterpret_cast(get_write_ptr(cb_l1_array)); // Set up local VALID value, to be mcasted to destinations flag address after the data has been mcasted diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/compute_depthwise_conv1d.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/compute_depthwise_conv1d.cpp index 7f2195ccb8b..ca55ac53340 100644 --- 
a/ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/compute_depthwise_conv1d.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/compute_depthwise_conv1d.cpp @@ -119,15 +119,15 @@ void MAIN { constexpr uint32_t out_block_w = in1_block_w; // CB indices - constexpr uint32_t in0_cb_id = tt::CB::c_in0; - constexpr uint32_t in1_cb_id = tt::CB::c_in1; - constexpr uint32_t in0_pretilize_cb_id = tt::CB::c_in6; - constexpr uint32_t in0_cb_second_reader_id = tt::CB::c_in7; - constexpr uint32_t eltwise_mul_partials_cb = tt::CB::c_intermed0; - constexpr uint32_t tilized_in0_cb_id = tt::CB::c_intermed1; - constexpr uint32_t temp_sum_cb = tt::CB::c_intermed3; - constexpr uint32_t prev_eltwise_cb = tt::CB::c_intermed5; - constexpr uint32_t out_cb_id = tt::CB::c_out0; + constexpr uint32_t in0_cb_id = tt::CBIndex::c_0; + constexpr uint32_t in1_cb_id = tt::CBIndex::c_1; + constexpr uint32_t in0_pretilize_cb_id = tt::CBIndex::c_6; + constexpr uint32_t in0_cb_second_reader_id = tt::CBIndex::c_7; + constexpr uint32_t eltwise_mul_partials_cb = tt::CBIndex::c_24; + constexpr uint32_t tilized_in0_cb_id = tt::CBIndex::c_25; + constexpr uint32_t temp_sum_cb = tt::CBIndex::c_27; + constexpr uint32_t prev_eltwise_cb = tt::CBIndex::c_29; + constexpr uint32_t out_cb_id = tt::CBIndex::c_16; constexpr uint32_t in0_num_subblocks_read = in0_num_subblocks; diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/conv_bmm_tilize_col_major_out_blocks.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/conv_bmm_tilize_col_major_out_blocks.cpp index 2a07817f5d8..7f6527c5280 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/conv_bmm_tilize_col_major_out_blocks.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/conv_bmm_tilize_col_major_out_blocks.cpp @@ -115,20 +115,20 @@ void MAIN { constexpr bool spill = in0_num_blocks_w > 1; // CB indices - constexpr uint32_t in0_cb_id = tt::CB::c_in0; - constexpr uint32_t in1_cb_id = tt::CB::c_in1; - constexpr 
uint32_t in0_pretilize_cb_id = tt::CB::c_in6; - constexpr uint32_t in0_cb_second_reader_id = tt::CB::c_in7; - constexpr uint32_t matmul_partials_cb = tt::CB::c_intermed0; - constexpr uint32_t tilized_in0_cb_id = tt::CB::c_intermed1; - //constexpr uint32_t untilize_mode_reblock_cb = tt::CB::c_intermed2; - constexpr uint32_t out_cb_id = tt::CB::c_out0; + constexpr uint32_t in0_cb_id = tt::CBIndex::c_0; + constexpr uint32_t in1_cb_id = tt::CBIndex::c_1; + constexpr uint32_t in0_pretilize_cb_id = tt::CBIndex::c_6; + constexpr uint32_t in0_cb_second_reader_id = tt::CBIndex::c_7; + constexpr uint32_t matmul_partials_cb = tt::CBIndex::c_24; + constexpr uint32_t tilized_in0_cb_id = tt::CBIndex::c_25; + //constexpr uint32_t untilize_mode_reblock_cb = tt::CBIndex::c_26; + constexpr uint32_t out_cb_id = tt::CBIndex::c_16; constexpr uint32_t untilize_mode_out_cb_id = untilize_out ? matmul_partials_cb : out_cb_id; #ifdef FUSE_BIAS constexpr uint32_t bias_ntiles_w = get_compile_time_arg_val(16); - constexpr uint32_t bias_cb_id = tt::CB::c_in2; + constexpr uint32_t bias_cb_id = tt::CBIndex::c_2; uint32_t bias_block_offset = 0; constexpr uint32_t mm_out_cb_id = matmul_partials_cb; #else diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/reader_conv_activations_2d_mcast_padded_with_halo_3x3_weights_v2.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/reader_conv_activations_2d_mcast_padded_with_halo_3x3_weights_v2.cpp index c37ee71bcd9..9edefcf2956 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/reader_conv_activations_2d_mcast_padded_with_halo_3x3_weights_v2.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/reader_conv_activations_2d_mcast_padded_with_halo_3x3_weights_v2.cpp @@ -100,16 +100,16 @@ void kernel_main() { tt_l1_ptr uint32_t *act_mcast_sender_noc_y = (tt_l1_ptr uint32_t*)(get_arg_addr(i)); - constexpr uint32_t cb_id_act = tt::CB::c_in0; - constexpr uint32_t tilized_in0_cb_id = tt::CB::c_intermed1; - constexpr uint32_t 
cb_id_sharded_act = tt::CB::c_in3; - constexpr uint32_t cb_id_act_row_major_bfloat16 = tt::CB::c_in6; + constexpr uint32_t cb_id_act = tt::CBIndex::c_0; + constexpr uint32_t tilized_in0_cb_id = tt::CBIndex::c_25; + constexpr uint32_t cb_id_sharded_act = tt::CBIndex::c_3; + constexpr uint32_t cb_id_act_row_major_bfloat16 = tt::CBIndex::c_6; - constexpr uint32_t cb_reader_indices = tt::CB::c_in4; + constexpr uint32_t cb_reader_indices = tt::CBIndex::c_4; volatile tt_l1_ptr uint32_t* packed_reader_indices_ptr = reinterpret_cast(get_write_ptr(cb_reader_indices)); // L1 array - constexpr uint32_t cb_l1_array = tt::CB::c_in5; + constexpr uint32_t cb_l1_array = tt::CBIndex::c_5; volatile tt_l1_ptr uint32_t* l1_array = reinterpret_cast(get_write_ptr(cb_l1_array)); // Set up local VALID value, to be mcasted to destinations flag address after the data has been mcasted volatile tt_l1_ptr uint32_t* act_mcast_sender_semaphore_valid_addr_ptr = &l1_array[0]; diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/reader_conv_activations_padded_with_halo_3x3_weights_v2.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/reader_conv_activations_padded_with_halo_3x3_weights_v2.cpp index c6b91278153..80e6b189feb 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/reader_conv_activations_padded_with_halo_3x3_weights_v2.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/reader_conv_activations_padded_with_halo_3x3_weights_v2.cpp @@ -65,7 +65,7 @@ void kernel_main() { constexpr uint32_t act_block_num_tiles_read = act_block_num_tiles; // LOOP TO FILL READER INDICES - constexpr uint32_t cb_reader_indices = tt::CB::c_in4; + constexpr uint32_t cb_reader_indices = tt::CBIndex::c_4; volatile tt_l1_ptr uint32_t* packed_reader_indices_ptr = reinterpret_cast(get_write_ptr(cb_reader_indices)); uint32_t reader_idx = 0; diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/reader_depthwise_conv1d.cpp 
b/ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/reader_depthwise_conv1d.cpp index fcb3aa958bf..6d54ab8372c 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/reader_depthwise_conv1d.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/reader_depthwise_conv1d.cpp @@ -68,7 +68,7 @@ void kernel_main() { #endif // LOOP TO FILL READER INDICES - constexpr uint32_t cb_reader_indices = tt::CB::c_in4; + constexpr uint32_t cb_reader_indices = tt::CBIndex::c_4; volatile tt_l1_ptr uint32_t* packed_reader_indices_ptr = reinterpret_cast(get_write_ptr(cb_reader_indices)); uint32_t reader_idx = 0; diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/reader_writer_tiled_out_1d_mcast_receiver_conv_weights_tiled_col_to_rm_blocks.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/reader_writer_tiled_out_1d_mcast_receiver_conv_weights_tiled_col_to_rm_blocks.cpp index 030eb482cb0..e7ef967748c 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/reader_writer_tiled_out_1d_mcast_receiver_conv_weights_tiled_col_to_rm_blocks.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/reader_writer_tiled_out_1d_mcast_receiver_conv_weights_tiled_col_to_rm_blocks.cpp @@ -91,7 +91,7 @@ void kernel_main() { constexpr uint32_t act_block_h_datums_read = act_block_h_datums / 2; // Extra /2 because of packed uint16 reads constexpr uint32_t act_block_num_tiles_read = act_block_num_tiles; - constexpr uint32_t cb_reader_indices = tt::CB::c_in4; + constexpr uint32_t cb_reader_indices = tt::CBIndex::c_4; volatile tt_l1_ptr uint32_t* packed_reader_indices_ptr = reinterpret_cast(get_write_ptr(cb_reader_indices)); uint32_t reader_idx = 0; diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/reader_writer_tiled_out_1d_mcast_sender_conv_weights_tiled_col_to_rm_blocks.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/reader_writer_tiled_out_1d_mcast_sender_conv_weights_tiled_col_to_rm_blocks.cpp index 7bcb357c184..61898fed016 
100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/reader_writer_tiled_out_1d_mcast_sender_conv_weights_tiled_col_to_rm_blocks.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/device/kernels/reader_writer_tiled_out_1d_mcast_sender_conv_weights_tiled_col_to_rm_blocks.cpp @@ -94,7 +94,7 @@ void kernel_main() { constexpr uint32_t act_block_num_tiles_read = act_block_num_tiles; - constexpr uint32_t cb_reader_indices = tt::CB::c_in4; + constexpr uint32_t cb_reader_indices = tt::CBIndex::c_4; volatile tt_l1_ptr uint32_t* packed_reader_indices_ptr = reinterpret_cast(get_write_ptr(cb_reader_indices)); uint32_t reader_idx = 0; diff --git a/ttnn/cpp/ttnn/operations/core/work_split/work_split_tilize.hpp b/ttnn/cpp/ttnn/operations/core/work_split/work_split_tilize.hpp index d075f2fca24..d906185f686 100644 --- a/ttnn/cpp/ttnn/operations/core/work_split/work_split_tilize.hpp +++ b/ttnn/cpp/ttnn/operations/core/work_split/work_split_tilize.hpp @@ -177,7 +177,7 @@ struct FullRep { }; inline std::vector> distribute_work( - const ttnn::SimpleShape& logical_shape, const Padding& padding, uint32_t num_cores, uint32_t blocks_per_core, bool has_cliff, uint32_t nblocks_per_core_cliff) { + const ttnn::SimpleShape& logical_shape, const tt::tt_metal::Padding& padding, uint32_t num_cores, uint32_t blocks_per_core, bool has_cliff, uint32_t nblocks_per_core_cliff) { TT_FATAL(logical_shape.rank() >= 2 && logical_shape.rank() <= 4, "Only 2D, 3D, and 4D tensors are supported. Shape: {}", "Error", logical_shape, padding); auto input_w = logical_shape.rank() >= 4 ? 
logical_shape[-4] : 1; diff --git a/ttnn/cpp/ttnn/operations/data_movement/bcast/device/bcast_device_operation.cpp b/ttnn/cpp/ttnn/operations/data_movement/bcast/device/bcast_device_operation.cpp index 3c38f88ed64..56000fa08e0 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/bcast/device/bcast_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/bcast/device/bcast_device_operation.cpp @@ -9,6 +9,7 @@ // using namespace tt; // using namespace tt_metal; // using namespace constants; +using namespace tt::tt_metal; namespace ttnn::operations::data_movement { diff --git a/ttnn/cpp/ttnn/operations/data_movement/bcast/device/bcast_device_operation.hpp b/ttnn/cpp/ttnn/operations/data_movement/bcast/device/bcast_device_operation.hpp index a2a56b717f3..f66af90e100 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/bcast/device/bcast_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/bcast/device/bcast_device_operation.hpp @@ -25,17 +25,17 @@ enum class BcastOpParallelizationStrategy { struct EltwiseBinaryBroadcast { const ttnn::BcastOpMath math_op; const ttnn::BcastOpDim dim; - const MemoryConfig output_mem_config; + const tt::tt_metal::MemoryConfig output_mem_config; const bool in_place; void validate_with_output_tensors(const std::vector &input_tensors, const std::vector> &output_tensors) const; std::vector compute_output_shapes(const std::vector &input_tensors) const; std::vector create_output_tensors(const std::vector &input_tensors, const std::vector> &output_tensors) const; - operation::ProgramWithCallbacks create_program( + tt::tt_metal::operation::ProgramWithCallbacks create_program( const std::vector &input_tensors, std::vector &output_tensors) const; BcastOpParallelizationStrategy get_parallelization_strategy(const std::vector &input_tensors) const; - const operation::Hash compute_program_hash(const std::vector &input_tensors) const; + const tt::tt_metal::operation::Hash compute_program_hash(const std::vector &input_tensors) const; 
}; } // ttnn::operations::data_movement diff --git a/ttnn/cpp/ttnn/operations/data_movement/bcast/device/kernels/compute/bcast_h.cpp b/ttnn/cpp/ttnn/operations/data_movement/bcast/device/kernels/compute/bcast_h.cpp index 1220f3e935d..0d5e2230d28 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/bcast/device/kernels/compute/bcast_h.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/bcast/device/kernels/compute/bcast_h.cpp @@ -12,7 +12,7 @@ void MAIN { uint32_t B = get_arg_val(0); uint32_t Ht = get_arg_val(1); uint32_t Wt = get_arg_val(2); - init_bcast(tt::CB::c_in0, tt::CB::c_in1); + init_bcast(tt::CBIndex::c_0, tt::CBIndex::c_1); for (uint32_t b = 0; b < B; b++) { for (uint32_t h = 0; h < Ht; h++) { @@ -20,23 +20,23 @@ void MAIN { // For this bcast-h op the reader will wrap the RHS source tile around at Wt // so here we just linearly read 2 parallel arrays and apply bcast op per tile // (bcast_h propagates the op down the H dimension, so it can be though of as bcast to H) - cb_wait_front(tt::CB::c_in1, onetile); + cb_wait_front(tt::CBIndex::c_1, onetile); - cb_reserve_back(tt::CB::c_out0, onetile); + cb_reserve_back(tt::CBIndex::c_16, onetile); acquire_dst(); - cb_wait_front(tt::CB::c_in0, onetile); + cb_wait_front(tt::CBIndex::c_0, onetile); - BCAST_OP(tt::CB::c_in0, tt::CB::c_in1, 0, 0, 0); - pack_tile(0, tt::CB::c_out0); + BCAST_OP(tt::CBIndex::c_0, tt::CBIndex::c_1, 0, 0, 0); + pack_tile(0, tt::CBIndex::c_16); - cb_pop_front(tt::CB::c_in0, onetile); + cb_pop_front(tt::CBIndex::c_0, onetile); release_dst(); - cb_push_back(tt::CB::c_out0, onetile); - cb_pop_front(tt::CB::c_in1, onetile); + cb_push_back(tt::CBIndex::c_16, onetile); + cb_pop_front(tt::CBIndex::c_1, onetile); } } } } } // NAMESPACE diff --git a/ttnn/cpp/ttnn/operations/data_movement/bcast/device/kernels/compute/bcast_h_sharded_optimised.cpp b/ttnn/cpp/ttnn/operations/data_movement/bcast/device/kernels/compute/bcast_h_sharded_optimised.cpp index 2d340e9a021..0b454e36fc4 100644 --- 
a/ttnn/cpp/ttnn/operations/data_movement/bcast/device/kernels/compute/bcast_h_sharded_optimised.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/bcast/device/kernels/compute/bcast_h_sharded_optimised.cpp @@ -16,28 +16,28 @@ void MAIN { uint32_t batch_b = get_arg_val(4); uint32_t Ht_per_batch_b = get_arg_val(5); - init_bcast(tt::CB::c_in0, tt::CB::c_in1, tt::CB::c_out0); + init_bcast(tt::CBIndex::c_0, tt::CBIndex::c_1, tt::CBIndex::c_16); - cb_wait_front(tt::CB::c_in0, Wt*Ht); - cb_reserve_back(tt::CB::c_out0, Wt*Ht); + cb_wait_front(tt::CBIndex::c_0, Wt*Ht); + cb_reserve_back(tt::CBIndex::c_16, Wt*Ht); uint32_t b_offset = 0; for (uint32_t bn = 0; bn < batch_b; bn++) { for (uint32_t wt = 0; wt < Wt; wt++) { - cb_wait_front(tt::CB::c_in1, onetile); + cb_wait_front(tt::CBIndex::c_1, onetile); for (uint32_t ht = 0; ht < Ht_per_batch_b; ht+=h_blk) { acquire_dst(); for (uint32_t htr = 0; htr(tt::CB::c_in0, tt::CB::c_in1, current_index, 0, htr); - pack_tile(htr, tt::CB::c_out0, current_index); + BCAST_OP(tt::CBIndex::c_0, tt::CBIndex::c_1, current_index, 0, htr); + pack_tile(htr, tt::CBIndex::c_16, current_index); } release_dst(); } - cb_pop_front(tt::CB::c_in1, onetile); + cb_pop_front(tt::CBIndex::c_1, onetile); } b_offset += Ht_per_batch_b * Wt; } - cb_pop_front(tt::CB::c_in0, Wt*Ht); - cb_push_back(tt::CB::c_out0, Wt*Ht); + cb_pop_front(tt::CBIndex::c_0, Wt*Ht); + cb_push_back(tt::CBIndex::c_16, Wt*Ht); } } // NAMESPACE diff --git a/ttnn/cpp/ttnn/operations/data_movement/bcast/device/kernels/compute/bcast_hw.cpp b/ttnn/cpp/ttnn/operations/data_movement/bcast/device/kernels/compute/bcast_hw.cpp index 499afa82fad..30965b343f1 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/bcast/device/kernels/compute/bcast_hw.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/bcast/device/kernels/compute/bcast_hw.cpp @@ -12,34 +12,34 @@ void MAIN { uint32_t B = get_arg_val(0); uint32_t Ht = get_arg_val(1); uint32_t Wt = get_arg_val(2); - init_bcast(tt::CB::c_in0, tt::CB::c_in1); + 
init_bcast(tt::CBIndex::c_0, tt::CBIndex::c_1); #ifdef BCAST_SCALAR - cb_wait_front(tt::CB::c_in1, onetile); + cb_wait_front(tt::CBIndex::c_1, onetile); #endif for (uint32_t b = 0; b < B; b++) { for (uint32_t h = 0; h < Ht; h++) { for (uint32_t w = 0; w < Wt; w++) { #ifndef BCAST_SCALAR - cb_wait_front(tt::CB::c_in1, onetile); + cb_wait_front(tt::CBIndex::c_1, onetile); #endif - cb_reserve_back(tt::CB::c_out0, onetile); + cb_reserve_back(tt::CBIndex::c_16, onetile); acquire_dst(); - cb_wait_front(tt::CB::c_in0, onetile); + cb_wait_front(tt::CBIndex::c_0, onetile); - BCAST_OP(tt::CB::c_in0, tt::CB::c_in1, 0, 0, 0); - pack_tile(0, tt::CB::c_out0); + BCAST_OP(tt::CBIndex::c_0, tt::CBIndex::c_1, 0, 0, 0); + pack_tile(0, tt::CBIndex::c_16); - cb_pop_front(tt::CB::c_in0, onetile); + cb_pop_front(tt::CBIndex::c_0, onetile); #ifndef BCAST_SCALAR - cb_pop_front(tt::CB::c_in1, onetile); + cb_pop_front(tt::CBIndex::c_1, onetile); #endif release_dst(); - cb_push_back(tt::CB::c_out0, onetile); + cb_push_back(tt::CBIndex::c_16, onetile); } } } } diff --git a/ttnn/cpp/ttnn/operations/data_movement/bcast/device/kernels/compute/bcast_w.cpp b/ttnn/cpp/ttnn/operations/data_movement/bcast/device/kernels/compute/bcast_w.cpp index ec6f71c0023..1f79464d180 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/bcast/device/kernels/compute/bcast_w.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/bcast/device/kernels/compute/bcast_w.cpp @@ -14,28 +14,28 @@ void MAIN { uint32_t Ht = get_arg_val(1); uint32_t Wt = get_arg_val(2); - init_bcast(tt::CB::c_in0, tt::CB::c_in1); + init_bcast(tt::CBIndex::c_0, tt::CBIndex::c_1); for (uint32_t b = 0; b < B; b++) { for (uint32_t h = 0; h < Ht; h++) { - cb_wait_front(tt::CB::c_in1, onetile); + cb_wait_front(tt::CBIndex::c_1, onetile); for (uint32_t w = 0; w < Wt; w++) { - cb_reserve_back(tt::CB::c_out0, onetile); + cb_reserve_back(tt::CBIndex::c_16, onetile); acquire_dst(); - cb_wait_front(tt::CB::c_in0, onetile); - BCAST_OP(tt::CB::c_in0, tt::CB::c_in1, 
0, 0, 0); - pack_tile(0, tt::CB::c_out0); - cb_pop_front(tt::CB::c_in0, onetile); + cb_wait_front(tt::CBIndex::c_0, onetile); + BCAST_OP(tt::CBIndex::c_0, tt::CBIndex::c_1, 0, 0, 0); + pack_tile(0, tt::CBIndex::c_16); + cb_pop_front(tt::CBIndex::c_0, onetile); release_dst(); - cb_push_back(tt::CB::c_out0, onetile); + cb_push_back(tt::CBIndex::c_16, onetile); } - cb_pop_front(tt::CB::c_in1, onetile); + cb_pop_front(tt::CBIndex::c_1, onetile); }} } } // NAMESPACE diff --git a/ttnn/cpp/ttnn/operations/data_movement/bcast/device/multi_core_h/bcast_op_multi_core_h.cpp b/ttnn/cpp/ttnn/operations/data_movement/bcast/device/multi_core_h/bcast_op_multi_core_h.cpp index 3d1358992c2..b5217434bf7 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/bcast/device/multi_core_h/bcast_op_multi_core_h.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/bcast/device/multi_core_h/bcast_op_multi_core_h.cpp @@ -78,7 +78,7 @@ operation::ProgramWithCallbacks bcast_multi_core_h(const Tensor &a, const Tensor .set_page_size(src1_cb_index, src1_single_tile_size); auto cb_src1 = tt_metal::CreateCircularBuffer(program, all_device_cores, src1_cb_config); - uint32_t output_cb_index = 16; // output operands start at index 16 + uint32_t output_cb_index = tt::CBIndex::c_16; uint32_t num_output_tiles = 2; tt_metal::CircularBufferConfig output_cb_config = tt_metal::CircularBufferConfig(num_output_tiles * dst_single_tile_size, {{output_cb_index, dst_cb_data_format}}) .set_page_size(output_cb_index, dst_single_tile_size); diff --git a/ttnn/cpp/ttnn/operations/data_movement/bcast/device/multi_core_h/bcast_op_sharded_h.cpp b/ttnn/cpp/ttnn/operations/data_movement/bcast/device/multi_core_h/bcast_op_sharded_h.cpp index 58feb2f1b3b..f9dc21601e5 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/bcast/device/multi_core_h/bcast_op_sharded_h.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/bcast/device/multi_core_h/bcast_op_sharded_h.cpp @@ -69,7 +69,7 @@ operation::ProgramWithCallbacks bcast_sharded_h(const 
Tensor &a, const Tensor &b TT_ASSERT((shard_spec.shape[0] % TILE_HEIGHT == 0) && (shard_spec.shape[0] % TILE_WIDTH == 0), "Shard shapes must be multiple of TILE_HEIGHT "); - uint32_t src0_cb_index = CB::c_in0; + uint32_t src0_cb_index = CBIndex::c_0; uint32_t aligned_input_tile_nbytes = round_up_to_mul32(input_tile_size); //will have issue if the page is not multiple of 32 uint32_t in_cb_pagesize = aligned_input_tile_nbytes; tt_metal::CircularBufferConfig src0_cb_config = tt_metal::CircularBufferConfig(aligned_input_tile_nbytes * num_tile_per_core, {{src0_cb_index, act_df}}) @@ -77,7 +77,7 @@ operation::ProgramWithCallbacks bcast_sharded_h(const Tensor &a, const Tensor &b .set_globally_allocated_address(*a.buffer()); auto cb_src0 = tt_metal::CreateCircularBuffer(program, all_cores, src0_cb_config); - uint32_t output_cb_index = CB::c_out0; // output operands start at index 16 + uint32_t output_cb_index = tt::CBIndex::c_16; tt_metal::CircularBufferConfig output_cb_config = tt_metal::CircularBufferConfig(aligned_input_tile_nbytes * num_tile_per_core, {{output_cb_index, out_df}}) .set_page_size(output_cb_index, in_cb_pagesize) @@ -85,7 +85,7 @@ operation::ProgramWithCallbacks bcast_sharded_h(const Tensor &a, const Tensor &b auto out_cb = tt_metal::CreateCircularBuffer(program, all_cores, output_cb_config); uint32_t num_input_tiles = (b.get_legacy_shape()[-1] * output.element_size() + TILE_HW - 1)/ TILE_HW; - uint32_t src1_cb_index = CB::c_in1; + uint32_t src1_cb_index = CBIndex::c_1; tt_metal::CircularBufferConfig src1_cb_config = tt_metal::CircularBufferConfig(num_input_tiles * input1_tile_size, {{src1_cb_index, b_df}}) .set_page_size(src1_cb_index, input1_tile_size); auto cb_src1 = tt_metal::CreateCircularBuffer(program, all_cores, src1_cb_config); diff --git a/ttnn/cpp/ttnn/operations/data_movement/bcast/device/multi_core_h/bcast_op_sharded_h_optimised.cpp b/ttnn/cpp/ttnn/operations/data_movement/bcast/device/multi_core_h/bcast_op_sharded_h_optimised.cpp index 
92186bb708a..f98f3350e33 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/bcast/device/multi_core_h/bcast_op_sharded_h_optimised.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/bcast/device/multi_core_h/bcast_op_sharded_h_optimised.cpp @@ -69,7 +69,7 @@ operation::ProgramWithCallbacks bcast_sharded_h_optimised(const Tensor &a, const TT_ASSERT((shard_spec.shape[0] % TILE_HEIGHT == 0) && (shard_spec.shape[0] % TILE_WIDTH == 0), "Shard shapes must be multiple of TILE_HEIGHT "); - uint32_t src0_cb_index = CB::c_in0; + uint32_t src0_cb_index = CBIndex::c_0; uint32_t aligned_input_tile_nbytes = round_up_to_mul32(input_tile_size); //will have issue if the page is not multiple of 32 uint32_t in_cb_pagesize = aligned_input_tile_nbytes; tt::tt_metal::CircularBufferConfig src0_cb_config = tt::tt_metal::CircularBufferConfig(aligned_input_tile_nbytes * num_tile_per_core, {{src0_cb_index, act_df}}) @@ -77,7 +77,7 @@ operation::ProgramWithCallbacks bcast_sharded_h_optimised(const Tensor &a, const .set_globally_allocated_address(*a.buffer()); auto cb_src0 = tt::tt_metal::CreateCircularBuffer(program, all_cores, src0_cb_config); - uint32_t output_cb_index = CB::c_out0; // output operands start at index 16 + uint32_t output_cb_index = tt::CBIndex::c_16; tt::tt_metal::CircularBufferConfig output_cb_config = tt::tt_metal::CircularBufferConfig(aligned_input_tile_nbytes * num_tile_per_core, {{output_cb_index, out_df}}) .set_page_size(output_cb_index, in_cb_pagesize) @@ -88,7 +88,7 @@ operation::ProgramWithCallbacks bcast_sharded_h_optimised(const Tensor &a, const uint32_t w_blk = std::min(Wt, 8u); uint32_t num_input_tiles = w_blk; - uint32_t src1_cb_index = CB::c_in1; + uint32_t src1_cb_index = CBIndex::c_1; tt::tt_metal::CircularBufferConfig src1_cb_config = tt::tt_metal::CircularBufferConfig(num_input_tiles * input1_tile_size, {{src1_cb_index, b_df}}) .set_page_size(src1_cb_index, input1_tile_size); auto cb_src1 = tt::tt_metal::CreateCircularBuffer(program, all_cores, 
src1_cb_config); diff --git a/ttnn/cpp/ttnn/operations/data_movement/bcast/device/multi_core_hw/bcast_op_multi_core_hw.cpp b/ttnn/cpp/ttnn/operations/data_movement/bcast/device/multi_core_hw/bcast_op_multi_core_hw.cpp index 75617bad384..e032ce37c92 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/bcast/device/multi_core_hw/bcast_op_multi_core_hw.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/bcast/device/multi_core_hw/bcast_op_multi_core_hw.cpp @@ -14,7 +14,7 @@ using namespace tt; using namespace constants; - +using namespace tt::tt_metal; namespace ttnn::operations::data_movement { operation::ProgramWithCallbacks bcast_multi_core_hw(const Tensor &a, const Tensor &b, const Tensor& output, BcastOpMath bcast_math, bool inplace) { @@ -98,7 +98,7 @@ operation::ProgramWithCallbacks bcast_multi_core_hw(const Tensor &a, const Tenso .set_page_size(src1_cb_index, src1_single_tile_size); auto cb_src1 = tt_metal::CreateCircularBuffer(program, all_device_cores, src1_cb_config); - uint32_t output_cb_index = 16; // output operands start at index 16 + uint32_t output_cb_index = tt::CBIndex::c_16; uint32_t num_output_tiles = output_sharded ? 
num_tiles_per_shard : 2; tt_metal::CircularBufferConfig output_cb_config = tt_metal::CircularBufferConfig(num_output_tiles * dst_single_tile_size, {{output_cb_index, dst_cb_data_format}}) .set_page_size(output_cb_index, dst_single_tile_size); diff --git a/ttnn/cpp/ttnn/operations/data_movement/bcast/device/multi_core_w/bcast_op_multi_core_w.cpp b/ttnn/cpp/ttnn/operations/data_movement/bcast/device/multi_core_w/bcast_op_multi_core_w.cpp index b628edaf8e0..9efe8ba1677 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/bcast/device/multi_core_w/bcast_op_multi_core_w.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/bcast/device/multi_core_w/bcast_op_multi_core_w.cpp @@ -76,7 +76,7 @@ operation::ProgramWithCallbacks bcast_multi_core_w(const Tensor &a, const Tensor .set_page_size(src1_cb_index, src1_single_tile_size); auto cb_src1 = tt_metal::CreateCircularBuffer(program, all_device_cores, src1_cb_config); - uint32_t output_cb_index = 16; // output operands start at index 16 + uint32_t output_cb_index = tt::CBIndex::c_16; uint32_t num_output_tiles = 2; tt_metal::CircularBufferConfig output_cb_config = tt_metal::CircularBufferConfig(num_output_tiles * dst_single_tile_size, {{output_cb_index, dst_cb_data_format}}) .set_page_size(output_cb_index, dst_single_tile_size); diff --git a/ttnn/cpp/ttnn/operations/data_movement/clone/device/clone_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/clone/device/clone_program_factory.cpp index 1e36a88de03..0de80c507d6 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/clone/device/clone_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/clone/device/clone_program_factory.cpp @@ -36,7 +36,7 @@ CloneOperation::ProgramFactory::cached_program_t CloneOperation::ProgramFactory: auto [num_cores, all_cores, core_group_1, core_group_2, num_units_per_core_group_1, num_units_per_core_group_2] = split_work_to_cores(compute_with_storage_grid_size, num_units); - uint32_t src_cb_id = CB::c_in4; + uint32_t src_cb_id = 
CBIndex::c_4; uint32_t aligned_input_unit_size = round_up_to_mul32(input_unit_size); auto src_cb_config = CircularBufferConfig(2 * aligned_input_unit_size, {{src_cb_id, input_data_format}}) .set_page_size(src_cb_id, aligned_input_unit_size); @@ -44,7 +44,7 @@ CloneOperation::ProgramFactory::cached_program_t CloneOperation::ProgramFactory: uint32_t dst_cb_id = src_cb_id; if (convert_dtype) { - dst_cb_id = CB::c_out4; + dst_cb_id = CBIndex::c_20; uint32_t aligned_output_unit_size = round_up_to_mul32(output_unit_size); auto dst_cb_config = CircularBufferConfig(2 * aligned_output_unit_size, {{dst_cb_id, output_data_format}}) .set_page_size(dst_cb_id, aligned_output_unit_size); diff --git a/ttnn/cpp/ttnn/operations/data_movement/concat/device/concat_device_operation.cpp b/ttnn/cpp/ttnn/operations/data_movement/concat/device/concat_device_operation.cpp index dd7054b7b43..6d12efcb831 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/concat/device/concat_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/concat/device/concat_device_operation.cpp @@ -12,6 +12,8 @@ #include "tt_metal/common/logger.hpp" using namespace tt::constants; +using namespace tt::tt_metal; + namespace ttnn::operations::data_movement { diff --git a/ttnn/cpp/ttnn/operations/data_movement/concat/device/concat_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/concat/device/concat_program_factory.cpp index 241b0e1f8b2..2114c255774 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/concat/device/concat_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/concat/device/concat_program_factory.cpp @@ -10,8 +10,9 @@ #include "ttnn/cpp/ttnn/operations/data_movement/concat/device/concat_device_operation.hpp" #include "ttnn/tensor/tensor.hpp" -using namespace tt::constants; using namespace tt; +using namespace tt::constants; +using namespace tt::tt_metal; namespace { diff --git a/ttnn/cpp/ttnn/operations/data_movement/copy/copy.cpp 
b/ttnn/cpp/ttnn/operations/data_movement/copy/copy.cpp index 6be441de8cc..cde3a9b0407 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/copy/copy.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/copy/copy.cpp @@ -9,6 +9,8 @@ #include "ttnn/decorators.hpp" #include "ttnn/run_operation.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::data_movement { ttnn::Tensor CopyOperation::invoke(uint8_t queue_id, const Tensor& src_tensor, const Tensor& dst_tensor) { diff --git a/ttnn/cpp/ttnn/operations/data_movement/copy/device/copy_device_operation.cpp b/ttnn/cpp/ttnn/operations/data_movement/copy/device/copy_device_operation.cpp index f9dbd77b7a1..64e16145a94 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/copy/device/copy_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/copy/device/copy_device_operation.cpp @@ -6,7 +6,7 @@ #include "ttnn/tensor/tensor_utils.hpp" using namespace tt::constants; - +using namespace tt::tt_metal; namespace ttnn::operations::data_movement { diff --git a/ttnn/cpp/ttnn/operations/data_movement/copy/device/copy_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/copy/device/copy_program_factory.cpp index e1f52f4b6be..172e725189f 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/copy/device/copy_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/copy/device/copy_program_factory.cpp @@ -15,6 +15,7 @@ #include "tt_metal/host_api.hpp" using namespace tt::constants; +using namespace tt::tt_metal; namespace ttnn::operations::data_movement { @@ -38,7 +39,7 @@ operation::ProgramWithCallbacks copy_multi_core(const Tensor &input, const Tenso uint32_t num_cores_y = compute_with_storage_grid_size.y; auto [num_cores, all_cores, core_group_1, core_group_2, num_units_per_core_group_1, num_units_per_core_group_2] = tt::tt_metal::split_work_to_cores(compute_with_storage_grid_size, num_units); - uint32_t src0_cb_index = tt::CB::c_in0; + uint32_t src0_cb_index = tt::CBIndex::c_0; uint32_t num_input_units 
= 2; uint32_t aligned_input_unit_size = round_up_to_mul32(input_unit_size); tt::tt_metal::CircularBufferConfig cb_src0_config = tt::tt_metal::CircularBufferConfig(num_input_units * aligned_input_unit_size, {{src0_cb_index, input_cb_data_format}}) @@ -47,7 +48,7 @@ operation::ProgramWithCallbacks copy_multi_core(const Tensor &input, const Tenso uint32_t output_cb_index = src0_cb_index; // same as input cb if (convert_dtype) { - output_cb_index = 16; // output operands start at index 16 + output_cb_index = tt::CBIndex::c_16; uint32_t num_output_units = 2; uint32_t aligned_output_unit_size = round_up_to_mul32(output_unit_size); tt::tt_metal::CircularBufferConfig output_cb_config = tt::tt_metal::CircularBufferConfig(num_output_units * aligned_output_unit_size, {{output_cb_index, output_cb_data_format}}) diff --git a/ttnn/cpp/ttnn/operations/data_movement/expand/device/expand_rm_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/expand/device/expand_rm_program_factory.cpp index 22e0b56c426..a7ed3835042 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/expand/device/expand_rm_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/expand/device/expand_rm_program_factory.cpp @@ -16,6 +16,8 @@ #include "impl/kernels/kernel_types.hpp" #include "ttnn/tensor/types.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::expand { ExpandOperation::ExpandRowMajorFactory::cached_program_t ExpandOperation::ExpandRowMajorFactory::create( const operation_attributes_t& operation_attributes, @@ -106,14 +108,14 @@ ExpandOperation::ExpandRowMajorFactory::cached_program_t ExpandOperation::Expand const auto sram_buffer_length = 32; // Scratch SRAM buffer - uint32_t scratch_buf_id = tt::CB::c_intermed0; + uint32_t scratch_buf_id = tt::CBIndex::c_24; auto scratch_config = CircularBufferConfig(unexpanded_row_size * sram_buffer_length, {{scratch_buf_id, data_format}}) .set_page_size(scratch_buf_id, unexpanded_row_size); auto scratch_handle = 
CreateCircularBuffer(program, all_cores, scratch_config); // IO SRAM Buffer - uint32_t io_buf_id = tt::CB::c_out0; + uint32_t io_buf_id = tt::CBIndex::c_16; auto io_config = CircularBufferConfig(expanded_row_size * sram_buffer_length, {{io_buf_id, data_format}}) .set_page_size(io_buf_id, expanded_row_size); auto io_handle = CreateCircularBuffer(program, all_cores, io_config); diff --git a/ttnn/cpp/ttnn/operations/data_movement/fill_rm/device/fill_rm_op.cpp b/ttnn/cpp/ttnn/operations/data_movement/fill_rm/device/fill_rm_op.cpp index a3817137053..6b5189a3dac 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/fill_rm/device/fill_rm_op.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/fill_rm/device/fill_rm_op.cpp @@ -9,6 +9,7 @@ #include "tt_metal/host_api.hpp" #include "tt_metal/detail/util.hpp" +using namespace tt::tt_metal; using uint32_t = uint32_t; diff --git a/ttnn/cpp/ttnn/operations/data_movement/fill_rm/fill_rm.cpp b/ttnn/cpp/ttnn/operations/data_movement/fill_rm/fill_rm.cpp index 4c4f93422e2..ee3c3dbd780 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/fill_rm/fill_rm.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/fill_rm/fill_rm.cpp @@ -8,6 +8,8 @@ #include "ttnn/decorators.hpp" #include "ttnn/common/constants.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::data_movement{ ttnn::Tensor FillRMOperation::invoke(uint8_t queue_id, uint32_t N, uint32_t C, uint32_t H, uint32_t W, uint32_t hFill, uint32_t wFill, const ttnn::Tensor& any, float val_hi, float val_lo, const std::optional& memory_config) { diff --git a/ttnn/cpp/ttnn/operations/data_movement/fold/device/fold_multi_core_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/fold/device/fold_multi_core_program_factory.cpp index 71b9e0251e5..a4ab17d2ed4 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/fold/device/fold_multi_core_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/fold/device/fold_multi_core_program_factory.cpp @@ -10,6 +10,8 @@ #include 
"fold_device_op.hpp" #include "ttnn/operations/math.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::data_movement { Fold::MultiCore::cached_program_t fold_multi_core( @@ -36,7 +38,7 @@ Fold::MultiCore::cached_program_t fold_multi_core( uint32_t pixels_per_dst_row = stride_h * width; // input CB - uint32_t cb_src0_index = tt::CB::c_in0; + uint32_t cb_src0_index = tt::CBIndex::c_0; uint32_t aligned_pixel_size = round_up_to_mul32(pixel_size); auto src_cb_config = CircularBufferConfig(num_pixels * aligned_pixel_size, {{cb_src0_index, cb_data_format}}) .set_page_size(cb_src0_index, aligned_pixel_size) @@ -44,7 +46,7 @@ Fold::MultiCore::cached_program_t fold_multi_core( auto cb_src0 = CreateCircularBuffer(program, all_cores, src_cb_config); // output CB - uint32_t cb_dst0_index = tt::CB::c_out0; + uint32_t cb_dst0_index = tt::CBIndex::c_16; uint32_t aligned_dst_pixel_size = round_up_to_mul32(dst_pixel_size); auto dst_cb_config = CircularBufferConfig(num_dst_pixels * aligned_dst_pixel_size, {{cb_dst0_index, cb_data_format}}) diff --git a/ttnn/cpp/ttnn/operations/data_movement/fold/device/fold_op_multi_core.cpp b/ttnn/cpp/ttnn/operations/data_movement/fold/device/fold_op_multi_core.cpp index b32501b8971..9d9cfef2d2d 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/fold/device/fold_op_multi_core.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/fold/device/fold_op_multi_core.cpp @@ -36,7 +36,7 @@ cached_program_t fold_multi_core( uint32_t pixels_per_dst_row = stride_h * width; // input CB - uint32_t cb_src0_index = CB::c_in0; + uint32_t cb_src0_index = CBIndex::c_0; uint32_t aligned_pixel_size = round_up_to_mul32(pixel_size); auto src_cb_config = CircularBufferConfig(num_pixels * aligned_pixel_size, {{cb_src0_index, cb_data_format}}) .set_page_size(cb_src0_index, aligned_pixel_size) @@ -44,7 +44,7 @@ cached_program_t fold_multi_core( auto cb_src0 = CreateCircularBuffer(program, all_cores, src_cb_config); // output CB - uint32_t cb_dst0_index = 
CB::c_out0; + uint32_t cb_dst0_index = CBIndex::c_16; uint32_t aligned_dst_pixel_size = round_up_to_mul32(dst_pixel_size); auto dst_cb_config = CircularBufferConfig(num_dst_pixels * aligned_dst_pixel_size, {{cb_dst0_index, cb_data_format}}) diff --git a/ttnn/cpp/ttnn/operations/data_movement/fold/device/fold_op_single_core.cpp b/ttnn/cpp/ttnn/operations/data_movement/fold/device/fold_op_single_core.cpp index 4d46904dd35..d1795c0d99f 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/fold/device/fold_op_single_core.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/fold/device/fold_op_single_core.cpp @@ -41,7 +41,7 @@ cached_program_t fold_single_core( bool dst_is_dram = (dst_buffer->buffer_type() == tt_metal::BufferType::DRAM); // Setup CB. - uint32_t cb_src0_index = CB::c_in0; + uint32_t cb_src0_index = CBIndex::c_0; uint32_t aligned_pixel_size = round_up_to_mul32(pixel_size); tt_metal::CircularBufferConfig cb_src0_config( 2 * cb_pages_per_dst_row * aligned_pixel_size, {{cb_src0_index, cb_data_format}}); diff --git a/ttnn/cpp/ttnn/operations/data_movement/fold/device/fold_single_core_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/fold/device/fold_single_core_program_factory.cpp index 8e436a53879..ef79d36dff0 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/fold/device/fold_single_core_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/fold/device/fold_single_core_program_factory.cpp @@ -10,6 +10,8 @@ #include "fold_device_op.hpp" #include "ttnn/operations/math.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::data_movement { Fold::SingleCore::cached_program_t fold_single_core( @@ -41,7 +43,7 @@ Fold::SingleCore::cached_program_t fold_single_core( bool dst_is_dram = (dst_buffer->buffer_type() == tt::tt_metal::BufferType::DRAM); // Setup CB. 
- uint32_t cb_src0_index = tt::CB::c_in0; + uint32_t cb_src0_index = tt::CBIndex::c_0; uint32_t aligned_pixel_size = round_up_to_mul32(pixel_size); tt::tt_metal::CircularBufferConfig cb_src0_config( 2 * cb_pages_per_dst_row * aligned_pixel_size, {{cb_src0_index, cb_data_format}}); diff --git a/ttnn/cpp/ttnn/operations/data_movement/indexed_fill/device/indexed_fill_op.cpp b/ttnn/cpp/ttnn/operations/data_movement/indexed_fill/device/indexed_fill_op.cpp index a9ad9e278f0..9fefbc9f9e7 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/indexed_fill/device/indexed_fill_op.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/indexed_fill/device/indexed_fill_op.cpp @@ -4,6 +4,7 @@ #include "ttnn/operations/data_movement/indexed_fill/device/indexed_fill_op.hpp" +using namespace tt::tt_metal; namespace ttnn::operations::data_movement{ diff --git a/ttnn/cpp/ttnn/operations/data_movement/indexed_fill/device/indexed_fill_op_multi_core_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/indexed_fill/device/indexed_fill_op_multi_core_program_factory.cpp index 23953f71c96..f47ee8e58b8 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/indexed_fill/device/indexed_fill_op_multi_core_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/indexed_fill/device/indexed_fill_op_multi_core_program_factory.cpp @@ -11,6 +11,8 @@ #include "tt_metal/host_api.hpp" #include "tt_metal/detail/util.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::data_movement { operation::ProgramWithCallbacks indexed_fill_multi_core(const Tensor &batch_ids, const Tensor &input_a, const Tensor & input_b, const Tensor &output) { diff --git a/ttnn/cpp/ttnn/operations/data_movement/indexed_fill/indexed_fill.cpp b/ttnn/cpp/ttnn/operations/data_movement/indexed_fill/indexed_fill.cpp index 7b062a35723..e798bb04c55 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/indexed_fill/indexed_fill.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/indexed_fill/indexed_fill.cpp @@ -6,6 +6,8 @@ 
#include "ttnn/operations/data_movement/indexed_fill/device/indexed_fill_op.hpp" #include "ttnn/common/constants.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::data_movement{ ttnn::Tensor IndexedFillOperation::invoke(uint8_t queue_id, const ttnn::Tensor& batch_id, const ttnn::Tensor& input_tensor_a, const ttnn::Tensor& input_tensor_b, const std::optional& memory_config, int64_t dim) { diff --git a/ttnn/cpp/ttnn/operations/data_movement/move/device/move_device_operation.cpp b/ttnn/cpp/ttnn/operations/data_movement/move/device/move_device_operation.cpp index 4c1bdbda8da..f6e2872c4ba 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/move/device/move_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/move/device/move_device_operation.cpp @@ -7,7 +7,7 @@ #include "tt_metal/host_api.hpp" using namespace tt::constants; - +using namespace tt::tt_metal; namespace ttnn::operations::data_movement { diff --git a/ttnn/cpp/ttnn/operations/data_movement/move/device/move_device_operation.hpp b/ttnn/cpp/ttnn/operations/data_movement/move/device/move_device_operation.hpp index 1ade8b3b2b8..3744e51f4df 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/move/device/move_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/move/device/move_device_operation.hpp @@ -18,18 +18,18 @@ enum class MoveOpParallelizationStrategy { }; struct MoveDeviceOperation { - const MemoryConfig output_mem_config; + const tt::tt_metal::MemoryConfig output_mem_config; const MoveOpParallelizationStrategy move_op_parallelization_strategy; void validate(const std::vector &input_tensors) const; std::vector compute_output_shapes(const std::vector &input_tensors) const; std::vector create_output_tensors(const std::vector &input_tensors) const; - operation::ProgramWithCallbacks create_program(const std::vector& input_tensors, std::vector &output_tensors) const; + tt::tt_metal::operation::ProgramWithCallbacks create_program(const std::vector& input_tensors, 
std::vector &output_tensors) const; MoveOpParallelizationStrategy get_parallelization_strategy(const std::vector &input_tensors) const; }; -operation::ProgramWithCallbacks move_multi_core(const Tensor &input, Tensor &output); -operation::ProgramWithCallbacks move_multi_core_with_overlap(const Tensor &input, Tensor &output); -operation::ProgramWithCallbacks move_multi_core_sharded(const Tensor &input, Tensor &output); +tt::tt_metal::operation::ProgramWithCallbacks move_multi_core(const Tensor &input, Tensor &output); +tt::tt_metal::operation::ProgramWithCallbacks move_multi_core_with_overlap(const Tensor &input, Tensor &output); +tt::tt_metal::operation::ProgramWithCallbacks move_multi_core_sharded(const Tensor &input, Tensor &output); } // namespace tt_metal diff --git a/ttnn/cpp/ttnn/operations/data_movement/move/device/move_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/move/device/move_program_factory.cpp index 38bb07ff01c..ccf7c4c61b8 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/move/device/move_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/move/device/move_program_factory.cpp @@ -13,6 +13,7 @@ #include using namespace tt::constants; +using namespace tt::tt_metal; namespace ttnn::operations::data_movement { @@ -200,8 +201,8 @@ operation::ProgramWithCallbacks move_multi_core_sharded(const Tensor& input, Ten TT_FATAL( input_layout == output.get_layout() && input_dtype == output.get_dtype() && shard_shape == output.shard_spec().value().shape && input_shape == output.get_legacy_shape(), "Error"); - const uint32_t src_cb_sharded = tt::CB::c_in0; - const uint32_t dst_cb_sharded = tt::CB::c_in1; + const uint32_t src_cb_sharded = tt::CBIndex::c_0; + const uint32_t dst_cb_sharded = tt::CBIndex::c_1; uint32_t tile_size_bytes = tile_size(cb_data_format); uint32_t shard_shape_num_tiles = tt::div_up(shard_shape[0] * shard_shape[1], TILE_HEIGHT * TILE_WIDTH); uint32_t total_size_bytes = 0; diff --git 
a/ttnn/cpp/ttnn/operations/data_movement/move/move.cpp b/ttnn/cpp/ttnn/operations/data_movement/move/move.cpp index 02698efa548..2ce22010e11 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/move/move.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/move/move.cpp @@ -10,6 +10,8 @@ #include "ttnn/run_operation.hpp" #include "ttnn/distributed/api.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::data_movement { bool can_deallocate(const Tensor& input_tensor, bool from_multi_device = false) { diff --git a/ttnn/cpp/ttnn/operations/data_movement/non_zero_indices/device/non_zero_indices_op.cpp b/ttnn/cpp/ttnn/operations/data_movement/non_zero_indices/device/non_zero_indices_op.cpp index 063c721d4bc..987465e7341 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/non_zero_indices/device/non_zero_indices_op.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/non_zero_indices/device/non_zero_indices_op.cpp @@ -4,6 +4,8 @@ #include "non_zero_indices_op.hpp" +using namespace tt::tt_metal; + namespace ttnn { namespace operations::data_movement { diff --git a/ttnn/cpp/ttnn/operations/data_movement/non_zero_indices/device/non_zero_indices_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/non_zero_indices/device/non_zero_indices_program_factory.cpp index ee7c35d2ab5..d6b7c075970 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/non_zero_indices/device/non_zero_indices_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/non_zero_indices/device/non_zero_indices_program_factory.cpp @@ -13,6 +13,7 @@ #include "tt_metal/detail/util.hpp" using namespace tt::constants; +using namespace tt::tt_metal; namespace ttnn { diff --git a/ttnn/cpp/ttnn/operations/data_movement/non_zero_indices/non_zero_indices.cpp b/ttnn/cpp/ttnn/operations/data_movement/non_zero_indices/non_zero_indices.cpp index 8c3c49db946..307b40c8388 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/non_zero_indices/non_zero_indices.cpp +++ 
b/ttnn/cpp/ttnn/operations/data_movement/non_zero_indices/non_zero_indices.cpp @@ -9,6 +9,8 @@ #include "ttnn/decorators.hpp" #include "ttnn/common/constants.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::data_movement { diff --git a/ttnn/cpp/ttnn/operations/data_movement/pad/device/kernels/dataflow/reader_pad_dims_rm_interleaved.cpp b/ttnn/cpp/ttnn/operations/data_movement/pad/device/kernels/dataflow/reader_pad_dims_rm_interleaved.cpp index f9dfc0774be..adcde01c0f4 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/pad/device/kernels/dataflow/reader_pad_dims_rm_interleaved.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/pad/device/kernels/dataflow/reader_pad_dims_rm_interleaved.cpp @@ -51,7 +51,7 @@ void kernel_main() { constexpr bool src0_is_dram = get_compile_time_arg_val(0) == 1; #define src_stick_size_is_pow2 get_compile_time_arg_val(2) == 1 - constexpr uint32_t cb_id = tt::CB::c_in0; + constexpr uint32_t cb_id = tt::CBIndex::c_0; // calculate the offset for alignment of padding in rows/sticks uint32_t l1_addr_partial = get_write_ptr(cb_id) + unpadded_X_nbytes; diff --git a/ttnn/cpp/ttnn/operations/data_movement/pad/device/kernels/dataflow/reader_pad_dims_rm_interleaved_v2.cpp b/ttnn/cpp/ttnn/operations/data_movement/pad/device/kernels/dataflow/reader_pad_dims_rm_interleaved_v2.cpp index fa048c814f7..86cd1d7389b 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/pad/device/kernels/dataflow/reader_pad_dims_rm_interleaved_v2.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/pad/device/kernels/dataflow/reader_pad_dims_rm_interleaved_v2.cpp @@ -54,8 +54,8 @@ void kernel_main() { #endif - constexpr auto cb_in0 = tt::CB::c_in0; - constexpr auto cb_pad = tt::CB::c_in1; + constexpr auto cb_in0 = tt::CBIndex::c_0; + constexpr auto cb_pad = tt::CBIndex::c_1; #define stick_size_is_pow2 get_compile_time_arg_val(19) == 1 #if (stick_size_is_pow2) diff --git 
a/ttnn/cpp/ttnn/operations/data_movement/pad/device/kernels/dataflow/reader_pad_dims_rm_sharded.cpp b/ttnn/cpp/ttnn/operations/data_movement/pad/device/kernels/dataflow/reader_pad_dims_rm_sharded.cpp index 70fe5548ebf..3ce018f9953 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/pad/device/kernels/dataflow/reader_pad_dims_rm_sharded.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/pad/device/kernels/dataflow/reader_pad_dims_rm_sharded.cpp @@ -20,8 +20,8 @@ void kernel_main() { tt_l1_ptr uint32_t * chunk_start_id = (tt_l1_ptr uint32_t*)(get_arg_addr(1 + num_cores_read * 3)); tt_l1_ptr uint32_t * chunk_num_sticks = (tt_l1_ptr uint32_t*)(chunk_start_id + 1); - constexpr auto cb_in0 = tt::CB::c_in0; - constexpr auto cb_out0 = tt::CB::c_out0; + constexpr auto cb_in0 = tt::CBIndex::c_0; + constexpr auto cb_out0 = tt::CBIndex::c_16; cb_reserve_back(cb_out0, num_sticks_padded); uint32_t l1_read_addr = get_write_ptr(cb_in0); diff --git a/ttnn/cpp/ttnn/operations/data_movement/pad/device/kernels/dataflow/writer_pad_dims_rm_interleaved.cpp b/ttnn/cpp/ttnn/operations/data_movement/pad/device/kernels/dataflow/writer_pad_dims_rm_interleaved.cpp index 4a03c1ea474..bd0b4ea8af6 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/pad/device/kernels/dataflow/writer_pad_dims_rm_interleaved.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/pad/device/kernels/dataflow/writer_pad_dims_rm_interleaved.cpp @@ -24,7 +24,7 @@ void kernel_main() { constexpr bool dst_is_dram = get_compile_time_arg_val(1) == 1; #define dst_stick_size_is_pow2 get_compile_time_arg_val(4) == 1 - constexpr uint32_t cb_id = tt::CB::c_in0; + constexpr uint32_t cb_id = tt::CBIndex::c_0; // #if (dst_stick_size_is_pow2) // constexpr uint32_t dst_log_base_2_of_page_size = get_compile_time_arg_val(5); diff --git a/ttnn/cpp/ttnn/operations/data_movement/pad/device/kernels/dataflow/writer_pad_dims_rm_sharded.cpp b/ttnn/cpp/ttnn/operations/data_movement/pad/device/kernels/dataflow/writer_pad_dims_rm_sharded.cpp index 
47acf9ed970..912c31d79a5 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/pad/device/kernels/dataflow/writer_pad_dims_rm_sharded.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/pad/device/kernels/dataflow/writer_pad_dims_rm_sharded.cpp @@ -68,8 +68,8 @@ void kernel_main() { #endif - constexpr auto cb_pad = tt::CB::c_in1; - constexpr auto cb_out0 = tt::CB::c_out0; + constexpr auto cb_pad = tt::CBIndex::c_1; + constexpr auto cb_out0 = tt::CBIndex::c_16; uint32_t pad_val_addr = get_read_ptr(cb_pad); uint64_t pad_val_noc_addr = get_noc_addr(pad_val_addr); diff --git a/ttnn/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.cpp index d6c151edd1e..ccd4eedeb5a 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.cpp @@ -11,6 +11,7 @@ #include "tt_log.h" #include "ttnn/operation.hpp" using namespace tt::constants; +using namespace tt::tt_metal; namespace ttnn::operations::data_movement::detail { @@ -47,7 +48,7 @@ operation::ProgramWithCallbacks pad_rm_reader_writer(const Tensor &a, TT_FATAL(dst_buffer != nullptr, "Output buffer should be allocated on device!"); CoreRange cores({0, 0}, {0, 0}); - uint32_t cb_id = tt::CB::c_in0; + uint32_t cb_id = tt::CBIndex::c_0; uint32_t cb_npages = 16; // multibuffering uint32_t cb_pagesize = tt::round_up(padded_row_size_nbytes, std::max(src0_buffer->alignment(), tt::constants::TILE_WIDTH)); tt::DataFormat in_df = tt::tt_metal::datatype_to_dataformat_converter(a.get_dtype()); @@ -683,7 +684,7 @@ operation::ProgramWithCallbacks pad_rm_reader_writer_multi_core(const Tensor &a, Buffer *dst_buffer = output.buffer(); TT_ASSERT(dst_buffer != nullptr, "Output buffer should be allocated on device!"); - uint32_t cb_id = tt::CB::c_in0; + uint32_t cb_id = tt::CBIndex::c_0; uint32_t cb_npages = 16; // multibuffering for perf // uint32_t cb_npages = 
1; // multibuffering for perf uint32_t cb_page_alignment = std::max(tt::constants::TILE_WIDTH, src0_buffer->alignment()); @@ -1405,7 +1406,7 @@ operation::ProgramWithCallbacks pad_rm_sharded(const Tensor &a, .set_page_size(src0_cb_index, stick_size_unpadded).set_globally_allocated_address(*a.buffer()); auto cb_src0 = tt::tt_metal::CreateCircularBuffer(program, total_cores, cb_src0_config); - uint32_t output_cb_index = tt::CB::c_out0; // output operands start at index 16 + uint32_t output_cb_index = tt::CBIndex::c_16; tt::tt_metal::CircularBufferConfig cb_output_config = tt::tt_metal::CircularBufferConfig(shard_height_padded * stick_size_padded, {{output_cb_index, dst_cb_data_format}}) .set_page_size(output_cb_index, stick_size_padded).set_globally_allocated_address(*output.buffer()); auto cb_output = tt::tt_metal::CreateCircularBuffer(program, total_cores, cb_output_config); diff --git a/ttnn/cpp/ttnn/operations/data_movement/permute/device/kernels/dataflow/reader_permute_interleaved_rm.cpp b/ttnn/cpp/ttnn/operations/data_movement/permute/device/kernels/dataflow/reader_permute_interleaved_rm.cpp index 895312cc5b3..fda5c511bf8 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/permute/device/kernels/dataflow/reader_permute_interleaved_rm.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/permute/device/kernels/dataflow/reader_permute_interleaved_rm.cpp @@ -20,12 +20,12 @@ void kernel_main() { uint32_t curr_addr = src_addr; for (uint32_t i = 0; i < num_rows; ++i) { - cb_reserve_back(tt::CB::c_in0, 1); - uint32_t src_buffer_l1_addr = get_write_ptr(tt::CB::c_in0); + cb_reserve_back(tt::CBIndex::c_0, 1); + uint32_t src_buffer_l1_addr = get_write_ptr(tt::CBIndex::c_0); noc_async_read_page(i, s0, src_buffer_l1_addr); noc_async_read_barrier(); volatile tt_l1_ptr uint16_t* out_stick = reinterpret_cast(src_buffer_l1_addr); - cb_push_back(tt::CB::c_in0, 1); + cb_push_back(tt::CBIndex::c_0, 1); } } diff --git 
a/ttnn/cpp/ttnn/operations/data_movement/permute/device/kernels/dataflow/writer_permute_interleaved_rm.cpp b/ttnn/cpp/ttnn/operations/data_movement/permute/device/kernels/dataflow/writer_permute_interleaved_rm.cpp index 69b14054b98..63b58b42ccf 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/permute/device/kernels/dataflow/writer_permute_interleaved_rm.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/permute/device/kernels/dataflow/writer_permute_interleaved_rm.cpp @@ -26,7 +26,7 @@ void kernel_main() { dest_strides[i - 1] = get_arg_val(i + 2*N); } - uint32_t src_buffer_l1_addr = get_write_ptr(tt::CB::c_in0); + uint32_t src_buffer_l1_addr = get_write_ptr(tt::CBIndex::c_0); uint32_t curr_addr = dst_addr; for (uint32_t row = 0; row < num_rows; ++row) { // Compute multi-dimensional index for the source row @@ -50,12 +50,12 @@ void kernel_main() { for(uint32_t i = 0; i < N - 1; ++i) { dest_linear_idx += dest_multi_idx[i] * dest_strides[i]; } - cb_wait_front(tt::CB::c_in0, 1); - uint32_t l1_read_addr = get_read_ptr(tt::CB::c_in0); + cb_wait_front(tt::CBIndex::c_0, 1); + uint32_t l1_read_addr = get_read_ptr(tt::CBIndex::c_0); uint64_t dst_noc_addr = get_noc_addr(dest_linear_idx, s0); noc_async_write(l1_read_addr, dst_noc_addr, page_size); noc_async_write_barrier(); - cb_pop_front(tt::CB::c_in0, 1); + cb_pop_front(tt::CBIndex::c_0, 1); } } diff --git a/ttnn/cpp/ttnn/operations/data_movement/permute/device/permute_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/permute/device/permute_program_factory.cpp index af8b1d8cb7f..bba7bed1a88 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/permute/device/permute_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/permute/device/permute_program_factory.cpp @@ -54,7 +54,7 @@ PermuteDeviceOperation::SingleCore::cached_program_t PermuteDeviceOperation::Sin tt::tt_metal::Device* device = input_tensor.device(); - uint32_t src0_cb_index = tt::CB::c_in0; + uint32_t src0_cb_index = tt::CBIndex::c_0; 
uint32_t num_input_pages_to_read = 1; CoreRange core({0, 0}, {0, 0}); diff --git a/ttnn/cpp/ttnn/operations/data_movement/repeat/device/repeat_op.cpp b/ttnn/cpp/ttnn/operations/data_movement/repeat/device/repeat_op.cpp index 1f84d4e1cae..8a2eb725e8b 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/repeat/device/repeat_op.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/repeat/device/repeat_op.cpp @@ -7,6 +7,7 @@ #include "ttnn/tensor/tensor_utils.hpp" using namespace tt::constants; +using namespace tt::tt_metal; namespace ttnn::operations::data_movement { diff --git a/ttnn/cpp/ttnn/operations/data_movement/repeat/device/repeat_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/repeat/device/repeat_program_factory.cpp index 96d173d1712..9384b0dbacd 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/repeat/device/repeat_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/repeat/device/repeat_program_factory.cpp @@ -8,6 +8,7 @@ #include "ttnn/operation.hpp" using namespace tt::constants; +using namespace tt::tt_metal; namespace ttnn::operations::data_movement::detail { diff --git a/ttnn/cpp/ttnn/operations/data_movement/reshape_on_device/device/kernels/dataflow/reader_unary_reshape_stick_layout_interleaved_multi_core.cpp b/ttnn/cpp/ttnn/operations/data_movement/reshape_on_device/device/kernels/dataflow/reader_unary_reshape_stick_layout_interleaved_multi_core.cpp index cb0ed4a8e83..c13bec250ab 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/reshape_on_device/device/kernels/dataflow/reader_unary_reshape_stick_layout_interleaved_multi_core.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/reshape_on_device/device/kernels/dataflow/reader_unary_reshape_stick_layout_interleaved_multi_core.cpp @@ -17,7 +17,7 @@ void kernel_main() { constexpr uint32_t old_stick_size = get_compile_time_arg_val(1); - constexpr auto cb_in0 = tt::CB::c_in0; + constexpr auto cb_in0 = tt::CBIndex::c_0; #define stick_size_is_pow2 get_compile_time_arg_val(2) == 1 #if 
(stick_size_is_pow2) diff --git a/ttnn/cpp/ttnn/operations/data_movement/reshape_on_device/device/reshape_op.cpp b/ttnn/cpp/ttnn/operations/data_movement/reshape_on_device/device/reshape_op.cpp index 21ff349a32c..88623809d65 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/reshape_on_device/device/reshape_op.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/reshape_on_device/device/reshape_op.cpp @@ -9,6 +9,7 @@ #include "ttnn/tensor/tensor_utils.hpp" #include "reshape_program_factory.hpp" using namespace tt::constants; +using namespace tt::tt_metal; namespace ttnn::operations::data_movement { diff --git a/ttnn/cpp/ttnn/operations/data_movement/reshape_on_device/device/reshape_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/reshape_on_device/device/reshape_program_factory.cpp index 1ca7f207b56..4a1437cd16e 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/reshape_on_device/device/reshape_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/reshape_on_device/device/reshape_program_factory.cpp @@ -8,6 +8,8 @@ #include "tt_metal/common/constants.hpp" #include "ttnn/operation.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::data_movement::detail { diff --git a/ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape.cpp b/ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape.cpp index 5684d84eff8..8735a5bc174 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape.cpp @@ -114,7 +114,7 @@ ttnn::Shape tiling_reshape_corrector(const ttnn::Shape& shape) { const int8_t correction_1 =(ttnn::types::TILE_SIZE - (int)padded[-1] % ttnn::types::TILE_SIZE) % ttnn::types::TILE_SIZE; if(rank == 1) { - return ttnn::Shape({shape[0]},{padded[0]+correction_1}); + return ttnn::Shape({1, shape[0]}, {32, padded[0]+correction_1}); } const int8_t correction_2 =(ttnn::types::TILE_SIZE - (int)padded[-2] % ttnn::types::TILE_SIZE) % ttnn::types::TILE_SIZE; 
switch(rank) @@ -172,13 +172,16 @@ ttnn::Tensor ReshapeViewOperation::invoke(const ttnn::Tensor& tensor, const ttnn //For view the following cases work: //RM: The last dimension is the same //Tiled: The last two dimensions are the same or there is no padding on the second last dimension + const uint32_t shape_second_last_dim = shape.rank() >= 2 ? shape[-2] : 1; + const uint32_t tensor_shape_second_last_dim = tensor_shape.rank() >= 2 ? tensor_shape[-2] : 1; bool this_is_view = (tensor_shape[-1] == shape[-1]) && ((tensor.get_layout() == ttnn::ROW_MAJOR_LAYOUT) || //Its row major - (tensor_shape[-2]==shape[-2]) || //Second last dimension is the same - (shape[-2]%ttnn::types::TILE_SIZE==0 && tensor_shape[-2]%ttnn::types::TILE_SIZE==0)); //There is no padding on the second last dimension + (shape_second_last_dim==tensor_shape_second_last_dim) || //Second last dimension is the same + (shape_second_last_dim%ttnn::types::TILE_SIZE==0 && tensor_shape_second_last_dim%ttnn::types::TILE_SIZE==0)); //There is no padding on the second last dimension + bool tile_tensor_view_reshape_possible = (layout == ttnn::Layout::TILE and - ((shape.with_tile_padding()[-2] % ttnn::TILE_SIZE == 0) and (shape.with_tile_padding()[-1] % ttnn::TILE_SIZE == 0)) and - (tensor_shape.with_tile_padding()[-1] == shape.with_tile_padding()[-1]) + shape.with_tile_padding().rank() >= 2 and shape.with_tile_padding()[-2] % ttnn::TILE_SIZE == 0 and shape.with_tile_padding()[-1] % ttnn::TILE_SIZE == 0 and + tensor_shape.with_tile_padding()[-1] == shape.with_tile_padding()[-1] ); if (!(ttnn::has_storage_type_of(tensor, ttnn::StorageType::DEVICE)) or tile_tensor_view_reshape_possible) { diff --git a/ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/compute/eltwise_copy.cpp b/ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/compute/eltwise_copy.cpp index a20b181877e..5866e52b20b 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/compute/eltwise_copy.cpp +++ 
b/ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/compute/eltwise_copy.cpp @@ -12,20 +12,20 @@ namespace NAMESPACE { void MAIN { uint32_t per_core_tile_cnt = get_arg_val(0); - unary_op_init_common(tt::CB::c_in0); + unary_op_init_common(tt::CBIndex::c_0); for(uint32_t b=0;b &input_tensors) const; std::vector compute_output_shapes(const std::vector &input_tensors) const; std::vector create_output_tensors(const std::vector &input_tensors) const; - operation::ProgramWithCallbacks create_program( + tt::tt_metal::operation::ProgramWithCallbacks create_program( const std::vector &input_tensors, std::vector &output_tensors) const; static constexpr auto attribute_names = diff --git a/ttnn/cpp/ttnn/operations/data_movement/sharded/interleaved_to_sharded/device/interleaved_to_sharded_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/sharded/interleaved_to_sharded/device/interleaved_to_sharded_program_factory.cpp index b07f464e4ca..5284220b9cc 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/sharded/interleaved_to_sharded/device/interleaved_to_sharded_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/sharded/interleaved_to_sharded/device/interleaved_to_sharded_program_factory.cpp @@ -10,6 +10,7 @@ #include "ttnn/cpp/ttnn/operations/data_movement/sharded_partial/interleaved_to_sharded_partial/device/interleaved_to_sharded_partial_op.hpp" using namespace tt::constants; +using namespace tt::tt_metal; namespace ttnn::operations::data_movement::detail { @@ -75,13 +76,13 @@ operation::ProgramWithCallbacks interleaved_to_sharded_multi_core( auto all_cores = shard_spec.grid; - uint32_t input_cb_index = tt::CB::c_in0; - uint32_t scratch_cb_index = tt::CB::c_in1; + uint32_t input_cb_index = tt::CBIndex::c_0; + uint32_t scratch_cb_index = tt::CBIndex::c_1; uint32_t out_cb_index = input_cb_index; uint32_t num_input_units = num_units_per_shard; uint32_t output_page_size = align(output_unit_size, dst_buffer->alignment()); if (convert_df) { - 
out_cb_index = tt::CB::c_out0; + out_cb_index = tt::CBIndex::c_16; uint32_t input_page_size = align(input_unit_size, src_buffer->alignment()); tt::tt_metal::CircularBufferConfig input_cb_out_config = tt::tt_metal::CircularBufferConfig(num_input_units * input_page_size, {{input_cb_index, input_cb_data_format}}) @@ -148,7 +149,7 @@ operation::ProgramWithCallbacks interleaved_to_sharded_multi_core( if (convert_df) { compute_kernel_id = tt::tt_metal::CreateKernel( program, - "ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels//compute/eltwise_copy.cpp", + "ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/compute/eltwise_copy.cpp", all_cores, tt::tt_metal::ComputeConfig{}); } diff --git a/ttnn/cpp/ttnn/operations/data_movement/sharded/interleaved_to_sharded/interleaved_to_sharded.cpp b/ttnn/cpp/ttnn/operations/data_movement/sharded/interleaved_to_sharded/interleaved_to_sharded.cpp index a3dc125837a..095ed0d6b8c 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/sharded/interleaved_to_sharded/interleaved_to_sharded.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/sharded/interleaved_to_sharded/interleaved_to_sharded.cpp @@ -9,6 +9,8 @@ #include "interleaved_to_sharded.hpp" #include "tt_metal/common/work_split.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::data_movement{ ttnn::Tensor InterleavedToShardedOperation::invoke( diff --git a/ttnn/cpp/ttnn/operations/data_movement/sharded/reshard/device/reshard_op.cpp b/ttnn/cpp/ttnn/operations/data_movement/sharded/reshard/device/reshard_op.cpp index 6d28f031836..03a8ab77e6e 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/sharded/reshard/device/reshard_op.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/sharded/reshard/device/reshard_op.cpp @@ -11,7 +11,7 @@ #include "tt_metal/common/work_split.hpp" using namespace tt::constants; - +using namespace tt::tt_metal; namespace ttnn::operations::data_movement { diff --git 
a/ttnn/cpp/ttnn/operations/data_movement/sharded/reshard/device/reshard_op.hpp b/ttnn/cpp/ttnn/operations/data_movement/sharded/reshard/device/reshard_op.hpp index 42b757d7e42..7aea695160a 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/sharded/reshard/device/reshard_op.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/sharded/reshard/device/reshard_op.hpp @@ -11,12 +11,12 @@ namespace ttnn::operations::data_movement { struct ReshardDeviceOperation { - const MemoryConfig output_mem_config; + const tt::tt_metal::MemoryConfig output_mem_config; void validate_with_output_tensors(const std::vector &input_tensors, const std::vector> &output_tensors) const; std::vector compute_output_shapes(const std::vector &input_tensors) const; std::vector create_output_tensors(const std::vector &input_tensors, const std::vector> &output_tensors) const; - operation::ProgramWithCallbacks create_program( + tt::tt_metal::operation::ProgramWithCallbacks create_program( const std::vector &input_tensors, std::vector &output_tensors) const; static constexpr auto attribute_names = std::make_tuple("output_mem_config"); diff --git a/ttnn/cpp/ttnn/operations/data_movement/sharded/reshard/device/reshard_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/sharded/reshard/device/reshard_program_factory.cpp index d50efaaccc0..256230827c7 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/sharded/reshard/device/reshard_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/sharded/reshard/device/reshard_program_factory.cpp @@ -12,6 +12,7 @@ #include "tt_metal/host_api.hpp" #include "reshard_program_factory.hpp" using namespace tt::constants; +using namespace tt::tt_metal; namespace ttnn::operations::data_movement::detail { @@ -314,7 +315,7 @@ operation::ProgramWithCallbacks reshard_multi_core_same_width(const Tensor& inpu auto local_core_type = local_tensor.buffer()->core_type(); auto remote_core_type = remote_tensor.buffer()->core_type(); - constexpr uint32_t cb_index = 
tt::CB::c_in0; + constexpr uint32_t cb_index = tt::CBIndex::c_0; auto local_cores = corerange_to_cores( local_shard_spec.grid, std::nullopt, local_shard_spec.orientation == ShardOrientation::ROW_MAJOR); auto remote_cores = diff --git a/ttnn/cpp/ttnn/operations/data_movement/sharded/reshard/reshard.cpp b/ttnn/cpp/ttnn/operations/data_movement/sharded/reshard/reshard.cpp index 2b0c1348bc1..797ef66186c 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/sharded/reshard/reshard.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/sharded/reshard/reshard.cpp @@ -7,6 +7,8 @@ #include "device/reshard_op.hpp" #include "reshard.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::data_movement{ ttnn::Tensor ReshardOperation::invoke( diff --git a/ttnn/cpp/ttnn/operations/data_movement/sharded/sharded_to_interleaved/device/sharded_to_interleaved_op.cpp b/ttnn/cpp/ttnn/operations/data_movement/sharded/sharded_to_interleaved/device/sharded_to_interleaved_op.cpp index f736258f7d6..7dd6f841526 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/sharded/sharded_to_interleaved/device/sharded_to_interleaved_op.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/sharded/sharded_to_interleaved/device/sharded_to_interleaved_op.cpp @@ -8,6 +8,8 @@ #include "sharded_to_interleaved_program_factory.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::data_movement { void ShardedToInterleavedDeviceOperation::validate(const std::vector& input_tensors) const { diff --git a/ttnn/cpp/ttnn/operations/data_movement/sharded/sharded_to_interleaved/device/sharded_to_interleaved_op.hpp b/ttnn/cpp/ttnn/operations/data_movement/sharded/sharded_to_interleaved/device/sharded_to_interleaved_op.hpp index 3bfbb018599..b255624b197 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/sharded/sharded_to_interleaved/device/sharded_to_interleaved_op.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/sharded/sharded_to_interleaved/device/sharded_to_interleaved_op.hpp @@ -11,13 +11,13 @@ namespace 
ttnn::operations::data_movement { struct ShardedToInterleavedDeviceOperation { - const MemoryConfig output_mem_config; - const DataType output_dtype; + const tt::tt_metal::MemoryConfig output_mem_config; + const tt::tt_metal::DataType output_dtype; void validate(const std::vector &input_tensors) const; std::vector compute_output_shapes(const std::vector &input_tensors) const; std::vector create_output_tensors(const std::vector &input_tensors) const; - operation::ProgramWithCallbacks create_program( + tt::tt_metal::operation::ProgramWithCallbacks create_program( const std::vector &input_tensors, std::vector &output_tensors) const; static constexpr auto attribute_names = diff --git a/ttnn/cpp/ttnn/operations/data_movement/sharded/sharded_to_interleaved/device/sharded_to_interleaved_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/sharded/sharded_to_interleaved/device/sharded_to_interleaved_program_factory.cpp index b6fc9f0e5c8..8cba763bd54 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/sharded/sharded_to_interleaved/device/sharded_to_interleaved_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/sharded/sharded_to_interleaved/device/sharded_to_interleaved_program_factory.cpp @@ -9,8 +9,10 @@ #include "ttnn/cpp/ttnn/operations/data_movement/sharded/sharded_common.hpp" #include "ttnn/cpp/ttnn/operations/data_movement/sharded_partial/sharded_to_interleaved_partial/device/sharded_to_interleaved_partial_op.hpp" -using namespace tt::constants; using namespace tt; +using namespace tt::constants; +using namespace tt::tt_metal; + namespace ttnn::operations::data_movement::detail { operation::ProgramWithCallbacks sharded_to_interleaved_multi_core( @@ -81,7 +83,7 @@ operation::ProgramWithCallbacks sharded_to_interleaved_multi_core( bool convert_df = input_cb_data_format != output_cb_data_format; - uint32_t src0_cb_index = CB::c_in0; + uint32_t src0_cb_index = CBIndex::c_0; uint32_t out_cb_index = src0_cb_index; uint32_t num_input_units = 
num_units_per_shard; uint32_t input_page_size = align(input_unit_size, input.buffer()->alignment()); @@ -91,7 +93,7 @@ operation::ProgramWithCallbacks sharded_to_interleaved_multi_core( .set_globally_allocated_address(*input.buffer()); auto cb_src0 = tt_metal::CreateCircularBuffer(program, all_cores, cb_src0_config); if (convert_df) { - out_cb_index = CB::c_out0; + out_cb_index = CBIndex::c_16; uint32_t output_page_size = align(output_unit_size, output.buffer()->alignment()); tt_metal::CircularBufferConfig output_cb_out_config = tt_metal::CircularBufferConfig(num_input_units * output_page_size, {{out_cb_index, output_cb_data_format}}) diff --git a/ttnn/cpp/ttnn/operations/data_movement/sharded/sharded_to_interleaved/sharded_to_interleaved.cpp b/ttnn/cpp/ttnn/operations/data_movement/sharded/sharded_to_interleaved/sharded_to_interleaved.cpp index 9a2841aefcd..bb9ee2b6f72 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/sharded/sharded_to_interleaved/sharded_to_interleaved.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/sharded/sharded_to_interleaved/sharded_to_interleaved.cpp @@ -8,6 +8,8 @@ #include "device/sharded_to_interleaved_op.hpp" #include "sharded_to_interleaved.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::data_movement{ ttnn::Tensor ShardedToInterleavedOperation::invoke( diff --git a/ttnn/cpp/ttnn/operations/data_movement/sharded/sharded_to_interleaved/sharded_to_interleaved_pybind.cpp b/ttnn/cpp/ttnn/operations/data_movement/sharded/sharded_to_interleaved/sharded_to_interleaved_pybind.cpp index ad176aeebd6..0cb4a3865c4 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/sharded/sharded_to_interleaved/sharded_to_interleaved_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/sharded/sharded_to_interleaved/sharded_to_interleaved_pybind.cpp @@ -10,6 +10,8 @@ #include "sharded_to_interleaved.hpp" #include "ttnn/types.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::data_movement { namespace detail { diff --git 
a/ttnn/cpp/ttnn/operations/data_movement/sharded_partial/interleaved_to_sharded_partial/device/interleaved_to_sharded_partial_op.cpp b/ttnn/cpp/ttnn/operations/data_movement/sharded_partial/interleaved_to_sharded_partial/device/interleaved_to_sharded_partial_op.cpp index 043dea332b1..4fd48c1bb01 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/sharded_partial/interleaved_to_sharded_partial/device/interleaved_to_sharded_partial_op.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/sharded_partial/interleaved_to_sharded_partial/device/interleaved_to_sharded_partial_op.cpp @@ -8,6 +8,8 @@ #include "ttnn/cpp/ttnn/operations/data_movement/sharded/interleaved_to_sharded/device/interleaved_to_sharded_program_factory.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::data_movement { void InterleavedToShardedPartialDeviceOperation::validate(const std::vector& input_tensors) const { diff --git a/ttnn/cpp/ttnn/operations/data_movement/sharded_partial/sharded_to_interleaved_partial/device/sharded_to_interleaved_partial_op.cpp b/ttnn/cpp/ttnn/operations/data_movement/sharded_partial/sharded_to_interleaved_partial/device/sharded_to_interleaved_partial_op.cpp index 220036be692..5da88109392 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/sharded_partial/sharded_to_interleaved_partial/device/sharded_to_interleaved_partial_op.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/sharded_partial/sharded_to_interleaved_partial/device/sharded_to_interleaved_partial_op.cpp @@ -8,6 +8,7 @@ #include "ttnn/cpp/ttnn/operations/data_movement/sharded/sharded_to_interleaved/device/sharded_to_interleaved_program_factory.hpp" +using namespace tt::tt_metal; namespace ttnn::operations::data_movement { diff --git a/ttnn/cpp/ttnn/operations/data_movement/slice/device/kernels/dataflow/slice_reader_unary_unpad_dims_rm_sharded.cpp b/ttnn/cpp/ttnn/operations/data_movement/slice/device/kernels/dataflow/slice_reader_unary_unpad_dims_rm_sharded.cpp index b987827dccc..15eaa7a64c7 100644 --- 
a/ttnn/cpp/ttnn/operations/data_movement/slice/device/kernels/dataflow/slice_reader_unary_unpad_dims_rm_sharded.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/slice/device/kernels/dataflow/slice_reader_unary_unpad_dims_rm_sharded.cpp @@ -20,8 +20,8 @@ void kernel_main() { tt_l1_ptr uint32_t * chunk_start_id = (tt_l1_ptr uint32_t*)(get_arg_addr(1 + num_cores_read * 3)); tt_l1_ptr uint32_t * chunk_num_sticks = (tt_l1_ptr uint32_t*)(chunk_start_id + 1); - constexpr auto cb_in0 = tt::CB::c_in0; - constexpr auto cb_out0 = tt::CB::c_out0; + constexpr auto cb_in0 = tt::CBIndex::c_0; + constexpr auto cb_out0 = tt::CBIndex::c_16; cb_reserve_back(cb_out0, num_sticks_unpadded); uint32_t l1_read_addr = get_write_ptr(cb_in0); diff --git a/ttnn/cpp/ttnn/operations/data_movement/slice/device/slice_op.cpp b/ttnn/cpp/ttnn/operations/data_movement/slice/device/slice_op.cpp index 6398986a22d..fe18e06f7d7 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/slice/device/slice_op.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/slice/device/slice_op.cpp @@ -6,6 +6,8 @@ #include "slice_op.hpp" #include "slice_program_factory.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::data_movement { inline __attribute__((always_inline)) uint32_t get_upper_dims_compressed(const tt::tt_metal::LegacyShape& shape) { @@ -102,18 +104,11 @@ void SliceDeviceOperation::validate_with_output_tensors( (output_tensor_shape[-1] % TILE_WIDTH == 0) && (this->slice_start[-1] % TILE_WIDTH == 0), "Can only unpad tilized tensor with full tiles"); } else if (input_tensor_a.get_layout() == Layout::ROW_MAJOR) { - TT_FATAL( - (output_tensor_shape[-1] * input_tensor_a.element_size() % sizeof(uint32_t) == 0), - "An unpadding slice operations for a RowMajor layout on the output tensor requires the last dimension to be on a 32 bit boundary. For example, the final dimension needs to be divisible by 2 for bfloat16. 
The resulting tensor shape is {}, which is not 4B aligned as the last dimension is {}", - output_tensor_shape[-1], input_tensor_a.element_size()); if (has_step) { for (uint32_t i = 0; i < input_tensor_a.get_legacy_shape().rank(); i++) { TT_FATAL(step[i] > 0, "Step({}) = {} should be positive", i, step[i]); } } - else { - TT_FATAL(this->slice_start[-1] * input_tensor_a.element_size() % sizeof(uint32_t) == 0, "Slice needs to start at an aligned position"); - } } } diff --git a/ttnn/cpp/ttnn/operations/data_movement/slice/device/slice_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/slice/device/slice_program_factory.cpp index cc8054464f2..9db70c681c6 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/slice/device/slice_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/slice/device/slice_program_factory.cpp @@ -11,6 +11,7 @@ #include "slice_op.hpp" using namespace tt::constants; +using namespace tt::tt_metal; namespace ttnn::operations::data_movement::detail { @@ -277,12 +278,12 @@ operation::ProgramWithCallbacks slice_rm_strided_single_core_n_dims(const Tensor tt::tt_metal::CircularBufferConfig cb_src0_config = - tt::tt_metal::CircularBufferConfig(1*page_size_input, {{tt::CB::c_in0, cb_data_format}}) - .set_page_size(tt::CB::c_in0, page_size_input); + tt::tt_metal::CircularBufferConfig(1*page_size_input, {{tt::CBIndex::c_0, cb_data_format}}) + .set_page_size(tt::CBIndex::c_0, page_size_input); tt::tt_metal::CircularBufferConfig cb_dst0_config = - tt::tt_metal::CircularBufferConfig(2*page_size_output, {{tt::CB::c_intermed0, cb_data_format}}) - .set_page_size(tt::CB::c_intermed0, page_size_output); + tt::tt_metal::CircularBufferConfig(2*page_size_output, {{tt::CBIndex::c_24, cb_data_format}}) + .set_page_size(tt::CBIndex::c_24, page_size_output); CoreRange core({0, 0}, {0, 0}); auto cb_input_tensor = tt::tt_metal::CreateCircularBuffer(program, core, cb_src0_config); @@ -609,7 +610,7 @@ operation::ProgramWithCallbacks 
slice_rm_multi_core_sharded( .set_page_size(src0_cb_index, stick_size_padded).set_globally_allocated_address(*a.buffer()); auto cb_src0 = tt::tt_metal::CreateCircularBuffer(program, total_cores, cb_src0_config); - uint32_t output_cb_index = tt::CB::c_out0; // output operands start at index 16 + uint32_t output_cb_index = tt::CBIndex::c_16; tt::tt_metal::CircularBufferConfig cb_output_config = tt::tt_metal::CircularBufferConfig(shard_height_unpadded * stick_size_unpadded, {{output_cb_index, dst_cb_data_format}}) .set_page_size(output_cb_index, stick_size_unpadded).set_globally_allocated_address(*output.buffer()); auto cb_output = tt::tt_metal::CreateCircularBuffer(program, total_cores, cb_output_config); diff --git a/ttnn/cpp/ttnn/operations/data_movement/split/device/split_op.cpp b/ttnn/cpp/ttnn/operations/data_movement/split/device/split_op.cpp index ae7db6381f2..0584f39c4d3 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/split/device/split_op.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/split/device/split_op.cpp @@ -9,6 +9,7 @@ #include "split_program_factory.hpp" using namespace tt::constants; +using namespace tt::tt_metal; namespace ttnn::operations::data_movement { diff --git a/ttnn/cpp/ttnn/operations/data_movement/split/device/split_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/split/device/split_program_factory.cpp index 45bf21a7f88..17d43bd79ea 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/split/device/split_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/split/device/split_program_factory.cpp @@ -7,6 +7,8 @@ #include "tt_metal/common/constants.hpp" #include "ttnn/operation.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::data_movement::detail { void setup_runtime( diff --git a/ttnn/cpp/ttnn/operations/data_movement/tilize/device/kernels/dataflow/reader_unary_stick_layout_split_rows_interleaved.cpp 
b/ttnn/cpp/ttnn/operations/data_movement/tilize/device/kernels/dataflow/reader_unary_stick_layout_split_rows_interleaved.cpp index 4f5322f8acc..bcc6284d5fa 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/tilize/device/kernels/dataflow/reader_unary_stick_layout_split_rows_interleaved.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/tilize/device/kernels/dataflow/reader_unary_stick_layout_split_rows_interleaved.cpp @@ -8,7 +8,7 @@ void kernel_main() { // Constexpr - constexpr uint32_t cb_id_in0 = tt::CB::c_in0; + constexpr uint32_t cb_id_in0 = tt::CBIndex::c_0; constexpr uint32_t tile_height = 32; const uint32_t src_addr = get_arg_val(0); diff --git a/ttnn/cpp/ttnn/operations/data_movement/tilize/device/tilize_op.cpp b/ttnn/cpp/ttnn/operations/data_movement/tilize/device/tilize_op.cpp index 1df1a5f8243..12d921eb18b 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/tilize/device/tilize_op.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/tilize/device/tilize_op.cpp @@ -6,6 +6,9 @@ #include "tilize_program_factory.hpp" #include "ttnn/run_operation.hpp" #include "tt_metal/common/constants.hpp" + +using namespace tt::tt_metal; + namespace ttnn::operations::data_movement { void Tilize::validate(const std::vector& input_tensors) const { const auto& input_tensor_a = input_tensors.at(0); diff --git a/ttnn/cpp/ttnn/operations/data_movement/tilize/device/tilize_op.hpp b/ttnn/cpp/ttnn/operations/data_movement/tilize/device/tilize_op.hpp index 7d2953d8039..a93a0c8c68d 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/tilize/device/tilize_op.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/tilize/device/tilize_op.hpp @@ -12,15 +12,15 @@ namespace ttnn::operations::data_movement { struct Tilize { - const MemoryConfig output_mem_config; - const DataType output_dtype; + const tt::tt_metal::MemoryConfig output_mem_config; + const tt::tt_metal::DataType output_dtype; const bool use_multicore; void validate(const std::vector& input_tensors) const; std::vector 
compute_output_shapes(const std::vector& input_tensors) const; std::vector create_output_tensors( const std::vector& input_tensors, const std::vector>& output_tensors) const; - operation::ProgramWithCallbacks create_program( + tt::tt_metal::operation::ProgramWithCallbacks create_program( const std::vector& input_tensors, std::vector& output_tensors) const; }; diff --git a/ttnn/cpp/ttnn/operations/data_movement/tilize/device/tilize_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/tilize/device/tilize_program_factory.cpp index 803d300763a..c29e8c152a8 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/tilize/device/tilize_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/tilize/device/tilize_program_factory.cpp @@ -14,6 +14,7 @@ #include "tt_metal/host_api.hpp" using namespace tt::constants; +using namespace tt::tt_metal; namespace ttnn::operations::data_movement::detail { @@ -73,7 +74,7 @@ operation::ProgramWithCallbacks tilize_single_core(const Tensor& a, Tensor& outp .set_page_size(src0_cb_index, input_single_tile_size); auto cb_src0 = tt::tt_metal::CreateCircularBuffer(program, core, src0_cb_config); - uint32_t output_cb_index = 16; // output operands start at index 16 + uint32_t output_cb_index = tt::CBIndex::c_16; uint32_t num_output_tiles = num_tiles_per_block; auto cb_output_config = tt::tt_metal::CircularBufferConfig( num_output_tiles * output_single_tile_size, {{output_cb_index, output_cb_data_format}}) @@ -173,10 +174,10 @@ operation::ProgramWithCallbacks tilize_multi_core_interleaved(const Tensor& a, T auto [ncores, all_cores, core_range, core_range_cliff, nblocks_per_core, nblocks_per_core_cliff] = ttnn::split_blocks_for_tilize(grid_size, nblocks); - create_cb(tt::CB::c_in0, program, all_cores, input_single_tile_size, ntiles_per_block, input_cb_data_format); + create_cb(tt::CBIndex::c_0, program, all_cores, input_single_tile_size, ntiles_per_block, input_cb_data_format); auto [output_cb_index, _] = - create_cb(tt::CB::c_out0, 
program, all_cores, output_single_tile_size, ntiles_per_block, output_cb_data_format); + create_cb(tt::CBIndex::c_16, program, all_cores, output_single_tile_size, ntiles_per_block, output_cb_data_format); Buffer* src0_buffer = a.buffer(); Buffer* dst_buffer = output.buffer(); @@ -332,7 +333,7 @@ operation::ProgramWithCallbacks tilize_multi_core_sharded(const Tensor& input, T uint32_t num_cores = all_cores.num_cores(); auto [src0_cb_index, cb_src0] = create_cb( - tt::CB::c_in0, + tt::CBIndex::c_0, program, all_cores, input_single_tile_size, @@ -341,7 +342,7 @@ operation::ProgramWithCallbacks tilize_multi_core_sharded(const Tensor& input, T input.buffer()); auto [output_cb_index, cb_output] = create_cb( - tt::CB::c_out0, + tt::CBIndex::c_16, program, all_cores, output_single_tile_size, diff --git a/ttnn/cpp/ttnn/operations/data_movement/tilize/device/tilize_program_factory.hpp b/ttnn/cpp/ttnn/operations/data_movement/tilize/device/tilize_program_factory.hpp index 4452fd5aaae..980fc23c4a7 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/tilize/device/tilize_program_factory.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/tilize/device/tilize_program_factory.hpp @@ -8,8 +8,8 @@ namespace ttnn::operations::data_movement::detail { -operation::ProgramWithCallbacks tilize_single_core(const Tensor& a, Tensor& output); -operation::ProgramWithCallbacks tilize_multi_core(const Tensor& a, Tensor& output); +tt::tt_metal::operation::ProgramWithCallbacks tilize_single_core(const Tensor& a, Tensor& output); +tt::tt_metal::operation::ProgramWithCallbacks tilize_multi_core(const Tensor& a, Tensor& output); } // namespace ttnn::operations::data_movement::detail diff --git a/ttnn/cpp/ttnn/operations/data_movement/tilize/tilize.cpp b/ttnn/cpp/ttnn/operations/data_movement/tilize/tilize.cpp index f3df2272c26..9b6a3f5b33f 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/tilize/tilize.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/tilize/tilize.cpp @@ -8,6 +8,8 @@ #include 
"ttnn/common/constants.hpp" #include "ttnn/run_operation.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::data_movement { ttnn::Tensor ExecuteTilize::invoke( diff --git a/ttnn/cpp/ttnn/operations/data_movement/tilize_with_val_padding/device/tilize_with_val_padding_op.cpp b/ttnn/cpp/ttnn/operations/data_movement/tilize_with_val_padding/device/tilize_with_val_padding_op.cpp index 646af783bab..8c49d15598d 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/tilize_with_val_padding/device/tilize_with_val_padding_op.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/tilize_with_val_padding/device/tilize_with_val_padding_op.cpp @@ -8,6 +8,8 @@ #include "tilize_with_val_padding_program_factory.hpp" #include "ttnn/run_operation.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::data_movement { void TilizeWithValPadding::validate(const std::vector& input_tensors) const { diff --git a/ttnn/cpp/ttnn/operations/data_movement/tilize_with_val_padding/device/tilize_with_val_padding_op.hpp b/ttnn/cpp/ttnn/operations/data_movement/tilize_with_val_padding/device/tilize_with_val_padding_op.hpp index 61a7f86c2a3..2317ba86ab5 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/tilize_with_val_padding/device/tilize_with_val_padding_op.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/tilize_with_val_padding/device/tilize_with_val_padding_op.hpp @@ -15,15 +15,15 @@ namespace ttnn::operations::data_movement { struct TilizeWithValPadding { const tt::tt_metal::LegacyShape output_tensor_shape; const PadValue pad_value; - const MemoryConfig output_mem_config; - const DataType output_dtype; + const tt::tt_metal::MemoryConfig output_mem_config; + const tt::tt_metal::DataType output_dtype; const bool use_multicore; void validate(const std::vector& input_tensors) const; std::vector compute_output_shapes(const std::vector& input_tensors) const; std::vector create_output_tensors( const std::vector& input_tensors, const std::vector>& output_tensors) const; - 
operation::ProgramWithCallbacks create_program( + tt::tt_metal::operation::ProgramWithCallbacks create_program( const std::vector& input_tensors, std::vector& output_tensors) const; }; diff --git a/ttnn/cpp/ttnn/operations/data_movement/tilize_with_val_padding/device/tilize_with_val_padding_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/tilize_with_val_padding/device/tilize_with_val_padding_program_factory.cpp index 409f6bde303..87c47efdd4f 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/tilize_with_val_padding/device/tilize_with_val_padding_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/tilize_with_val_padding/device/tilize_with_val_padding_program_factory.cpp @@ -16,6 +16,7 @@ using namespace tt::constants; +using namespace tt::tt_metal; namespace ttnn::operations::data_movement::detail { @@ -138,7 +139,7 @@ operation::ProgramWithCallbacks tilize_with_val_padding_single_core( .set_page_size(src0_cb_index, input_single_tile_size); auto cb_src0 = tt::tt_metal::CreateCircularBuffer(program, core, src0_cb_config); - uint32_t output_cb_index = 16; // output operands start at index 16 + uint32_t output_cb_index = tt::CBIndex::c_16; uint32_t num_output_tiles = num_tiles_per_block; tt::tt_metal::CircularBufferConfig cb_output_config = tt::tt_metal::CircularBufferConfig( @@ -252,10 +253,10 @@ operation::ProgramWithCallbacks tilize_with_val_padding_multi_core_interleaved( uint32_t padded_row_size_bytes = output.get_legacy_shape()[-1] * a.element_size(); // Assuming bfloat16 dataformat auto [src0_cb_index, cb_src0] = - create_cb(tt::CB::c_in0, program, all_cores, input_single_tile_size, num_tiles_per_row, input_cb_data_format); + create_cb(tt::CBIndex::c_0, program, all_cores, input_single_tile_size, num_tiles_per_row, input_cb_data_format); auto [output_cb_index, cb_output] = create_cb( - tt::CB::c_out0, program, all_cores, output_single_tile_size, num_tiles_per_row, output_cb_data_format); + tt::CBIndex::c_16, program, all_cores, 
output_single_tile_size, num_tiles_per_row, output_cb_data_format); Buffer* src0_buffer = a.buffer(); Buffer* dst_buffer = output.buffer(); @@ -412,7 +413,7 @@ operation::ProgramWithCallbacks tilize_with_val_padding_multi_core_sharded( uint32_t num_padded_rows = output.get_legacy_shape()[-2] - a.get_legacy_shape()[-2]; auto [src0_cb_index, cb_src0] = create_cb( - tt::CB::c_in1, + tt::CBIndex::c_1, program, all_cores, input_shard_width_bytes, @@ -421,13 +422,13 @@ operation::ProgramWithCallbacks tilize_with_val_padding_multi_core_sharded( src_sharded ? a.buffer() : nullptr); auto [src1_cb_index, cb_src1] = create_cb( - tt::CB::c_in0, program, all_cores, input_single_tile_size, ntiles_per_batch * 2, input_cb_data_format); + tt::CBIndex::c_0, program, all_cores, input_single_tile_size, ntiles_per_batch * 2, input_cb_data_format); auto [src2_cb_index, cb_src2] = - create_cb(tt::CB::c_in2, program, all_cores, input_shard_width_bytes, 1, input_cb_data_format); + create_cb(tt::CBIndex::c_2, program, all_cores, input_shard_width_bytes, 1, input_cb_data_format); auto [output_cb_index, cb_output] = create_cb( - tt::CB::c_out0, + tt::CBIndex::c_16, program, all_cores, output_single_tile_size, diff --git a/ttnn/cpp/ttnn/operations/data_movement/tilize_with_val_padding/device/tilize_with_val_padding_program_factory.hpp b/ttnn/cpp/ttnn/operations/data_movement/tilize_with_val_padding/device/tilize_with_val_padding_program_factory.hpp index 6bc1ce44042..c8a50e2da7a 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/tilize_with_val_padding/device/tilize_with_val_padding_program_factory.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/tilize_with_val_padding/device/tilize_with_val_padding_program_factory.hpp @@ -10,11 +10,11 @@ using namespace tt::constants; namespace ttnn::operations::data_movement::detail { -operation::ProgramWithCallbacks tilize_with_val_padding_single_core( +tt::tt_metal::operation::ProgramWithCallbacks tilize_with_val_padding_single_core( const Tensor& a, 
Tensor& output, const ttnn::PadValue pad_value); -operation::ProgramWithCallbacks tilize_with_val_padding_multi_core( +tt::tt_metal::operation::ProgramWithCallbacks tilize_with_val_padding_multi_core( const Tensor& a, Tensor& output, const ttnn::PadValue pad_value); diff --git a/ttnn/cpp/ttnn/operations/data_movement/tilize_with_val_padding/tilize_with_val_padding.cpp b/ttnn/cpp/ttnn/operations/data_movement/tilize_with_val_padding/tilize_with_val_padding.cpp index 81728917577..fd553befe1c 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/tilize_with_val_padding/tilize_with_val_padding.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/tilize_with_val_padding/tilize_with_val_padding.cpp @@ -8,6 +8,7 @@ #include "ttnn/common/constants.hpp" #include "ttnn/run_operation.hpp" +using namespace tt::tt_metal; namespace ttnn::operations::data_movement { diff --git a/ttnn/cpp/ttnn/operations/data_movement/transpose/device/kernels/compute/transpose_wh.cpp b/ttnn/cpp/ttnn/operations/data_movement/transpose/device/kernels/compute/transpose_wh.cpp index e92a9f06809..2d4449a18f8 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/transpose/device/kernels/compute/transpose_wh.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/transpose/device/kernels/compute/transpose_wh.cpp @@ -12,23 +12,23 @@ void MAIN { uint32_t NHtWt = get_arg_val(0); - transpose_wh_init(tt::CB::c_in0); + transpose_wh_init(tt::CBIndex::c_0); // transpose a row-major block: // - assumes the tiles come in in column major order from reader // - uses reader_unary_transpose_wh // - transpose_wh each tile for (uint32_t n = 0; n < NHtWt; n++) { - cb_wait_front(tt::CB::c_in0, 1); - cb_reserve_back(tt::CB::c_out0, 1); + cb_wait_front(tt::CBIndex::c_0, 1); + cb_reserve_back(tt::CBIndex::c_16, 1); acquire_dst(); - transpose_wh_tile(tt::CB::c_in0, 0, 0); - pack_tile(0, tt::CB::c_out0); + transpose_wh_tile(tt::CBIndex::c_0, 0, 0); + pack_tile(0, tt::CBIndex::c_16); release_dst(); - cb_push_back(tt::CB::c_out0, 1); - 
cb_pop_front(tt::CB::c_in0, 1); + cb_push_back(tt::CBIndex::c_16, 1); + cb_pop_front(tt::CBIndex::c_0, 1); } } } diff --git a/ttnn/cpp/ttnn/operations/data_movement/transpose/device/kernels/compute/transpose_wh_rm.cpp b/ttnn/cpp/ttnn/operations/data_movement/transpose/device/kernels/compute/transpose_wh_rm.cpp index 72e36bc478a..9332714f14e 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/transpose/device/kernels/compute/transpose_wh_rm.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/transpose/device/kernels/compute/transpose_wh_rm.cpp @@ -127,15 +127,15 @@ void MAIN { #ifdef SHARDED - constexpr auto cb_in = tt::CB::c_intermed0; - constexpr auto cb_tilize = tt::CB::c_intermed1; - constexpr auto cb_untilize = tt::CB::c_intermed2; - constexpr auto cb_out = (Ht > 8) ? tt::CB::c_intermed3 : tt::CB::c_out0; // temporary fix until pack_untilze is fully fixed + constexpr auto cb_in = tt::CBIndex::c_24; + constexpr auto cb_tilize = tt::CBIndex::c_25; + constexpr auto cb_untilize = tt::CBIndex::c_26; + constexpr auto cb_out = (Ht > 8) ? 
tt::CBIndex::c_27 : tt::CBIndex::c_16; // temporary fix until pack_untilze is fully fixed #else - constexpr auto cb_in = tt::CB::c_in0; - constexpr auto cb_tilize = tt::CB::c_intermed0; - constexpr auto cb_untilize = tt::CB::c_intermed1; - constexpr auto cb_out = tt::CB::c_out0; + constexpr auto cb_in = tt::CBIndex::c_0; + constexpr auto cb_tilize = tt::CBIndex::c_24; + constexpr auto cb_untilize = tt::CBIndex::c_25; + constexpr auto cb_out = tt::CBIndex::c_16; #endif unary_op_init_common(cb_in, cb_out); diff --git a/ttnn/cpp/ttnn/operations/data_movement/transpose/device/kernels/dataflow/reader_unary_transpose_hc_interleaved_partitioned_rm.cpp b/ttnn/cpp/ttnn/operations/data_movement/transpose/device/kernels/dataflow/reader_unary_transpose_hc_interleaved_partitioned_rm.cpp index 71012f38ce1..d837d51973d 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/transpose/device/kernels/dataflow/reader_unary_transpose_hc_interleaved_partitioned_rm.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/transpose/device/kernels/dataflow/reader_unary_transpose_hc_interleaved_partitioned_rm.cpp @@ -24,7 +24,7 @@ void kernel_main() { constexpr uint32_t CH = C * H; - constexpr auto cb_in0 = tt::CB::c_in0; + constexpr auto cb_in0 = tt::CBIndex::c_0; const uint32_t stick_size_bytes = W_size_bytes; diff --git a/ttnn/cpp/ttnn/operations/data_movement/transpose/device/kernels/dataflow/reader_unary_transpose_hc_interleaved_tiled_padding_aware.cpp b/ttnn/cpp/ttnn/operations/data_movement/transpose/device/kernels/dataflow/reader_unary_transpose_hc_interleaved_tiled_padding_aware.cpp index a2ae4767150..e6160cdfa5f 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/transpose/device/kernels/dataflow/reader_unary_transpose_hc_interleaved_tiled_padding_aware.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/transpose/device/kernels/dataflow/reader_unary_transpose_hc_interleaved_tiled_padding_aware.cpp @@ -45,11 +45,11 @@ void kernel_main() { } if constexpr (needs_padding) { // Add padding - 
cb_reserve_back(tt::CB::c_in1, 1); - uint32_t l1_write_addr = get_write_ptr(tt::CB::c_in1); + cb_reserve_back(tt::CBIndex::c_1, 1); + uint32_t l1_write_addr = get_write_ptr(tt::CBIndex::c_1); // Fill with padding value // if bfloat16 num_writes = FACE_WIDTH / (sizeof(uint32_t))/(element_size) tt::data_movement::common::fill_with_val(l1_write_addr, num_writes, padding_val_packed); - cb_push_back(tt::CB::c_in1, 1); + cb_push_back(tt::CBIndex::c_1, 1); } } diff --git a/ttnn/cpp/ttnn/operations/data_movement/transpose/device/kernels/dataflow/reader_unary_transpose_wh_interleaved_start_id_rm.cpp b/ttnn/cpp/ttnn/operations/data_movement/transpose/device/kernels/dataflow/reader_unary_transpose_wh_interleaved_start_id_rm.cpp index c063d4eb109..60c28b4e5de 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/transpose/device/kernels/dataflow/reader_unary_transpose_wh_interleaved_start_id_rm.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/transpose/device/kernels/dataflow/reader_unary_transpose_wh_interleaved_start_id_rm.cpp @@ -21,7 +21,7 @@ void kernel_main() { constexpr uint32_t W_size_bytes = get_compile_time_arg_val(7); constexpr uint32_t l1_write_offset_bytes = get_compile_time_arg_val(8); - constexpr auto cb_in0 = tt::CB::c_in0; + constexpr auto cb_in0 = tt::CBIndex::c_0; const uint32_t stick_size_bytes = W_size_bytes; diff --git a/ttnn/cpp/ttnn/operations/data_movement/transpose/device/kernels/dataflow/reader_unary_transpose_wh_sharded_rm.cpp b/ttnn/cpp/ttnn/operations/data_movement/transpose/device/kernels/dataflow/reader_unary_transpose_wh_sharded_rm.cpp index cb1775c0ec8..dbd230adcdd 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/transpose/device/kernels/dataflow/reader_unary_transpose_wh_sharded_rm.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/transpose/device/kernels/dataflow/reader_unary_transpose_wh_sharded_rm.cpp @@ -17,8 +17,8 @@ void kernel_main() { constexpr uint32_t W_size_bytes = get_compile_time_arg_val(5); constexpr uint32_t 
l1_write_offset_bytes = get_compile_time_arg_val(6); - constexpr auto cb_in0 = tt::CB::c_in0; - constexpr auto cb_in = tt::CB::c_intermed0; + constexpr auto cb_in0 = tt::CBIndex::c_0; + constexpr auto cb_in = tt::CBIndex::c_24; const uint32_t stick_size_bytes = W_size_bytes; diff --git a/ttnn/cpp/ttnn/operations/data_movement/transpose/device/kernels/dataflow/writer_unary_transpose_hc_interleaved_tiled_padding_aware.cpp b/ttnn/cpp/ttnn/operations/data_movement/transpose/device/kernels/dataflow/writer_unary_transpose_hc_interleaved_tiled_padding_aware.cpp index d1c8de82377..77343dc50fa 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/transpose/device/kernels/dataflow/writer_unary_transpose_hc_interleaved_tiled_padding_aware.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/transpose/device/kernels/dataflow/writer_unary_transpose_hc_interleaved_tiled_padding_aware.cpp @@ -160,9 +160,9 @@ void kernel_main() { // add padding if constexpr (needs_padding) { - cb_wait_front(tt::CB::c_in1, 1); + cb_wait_front(tt::CBIndex::c_1, 1); - uint32_t l1_read_ptr = get_read_ptr(tt::CB::c_in1); + uint32_t l1_read_ptr = get_read_ptr(tt::CBIndex::c_1); constexpr uint32_t c_t = C_t - 1; constexpr uint8_t C_in_tile = C % TILE_HEIGHT; @@ -203,6 +203,6 @@ void kernel_main() { } } noc_async_write_barrier(); - cb_pop_front(tt::CB::c_in1, 1); + cb_pop_front(tt::CBIndex::c_1, 1); } } diff --git a/ttnn/cpp/ttnn/operations/data_movement/transpose/device/kernels/dataflow/writer_unary_transpose_wh_sharded_rm.cpp b/ttnn/cpp/ttnn/operations/data_movement/transpose/device/kernels/dataflow/writer_unary_transpose_wh_sharded_rm.cpp index 23e76b658be..583491377d4 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/transpose/device/kernels/dataflow/writer_unary_transpose_wh_sharded_rm.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/transpose/device/kernels/dataflow/writer_unary_transpose_wh_sharded_rm.cpp @@ -16,8 +16,8 @@ void kernel_main() { constexpr uint32_t H_size_bytes = 
get_compile_time_arg_val(5); constexpr uint32_t l1_read_offset_bytes = get_compile_time_arg_val(6); - constexpr auto cb_out = tt::CB::c_intermed3; - constexpr auto cb_out0 = tt::CB::c_out0; + constexpr auto cb_out = tt::CBIndex::c_27; + constexpr auto cb_out0 = tt::CBIndex::c_16; const uint32_t stick_size_bytes = H_size_bytes; diff --git a/ttnn/cpp/ttnn/operations/data_movement/transpose/device/transpose_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/transpose/device/transpose_program_factory.cpp index 9c43c604645..eb254bfbe15 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/transpose/device/transpose_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/transpose/device/transpose_program_factory.cpp @@ -11,6 +11,8 @@ #include "tt_log.h" #include "ttnn/operation.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::data_movement::detail { using namespace tt::constants; @@ -566,8 +568,8 @@ operation::ProgramWithCallbacks transpose_hc_multi_core_tiled_interleaved(const uint32_t num_cores_total = num_cores_x * num_cores_y; CoreRange total_cores({0, 0}, {num_cores_x-1, num_cores_y-1}); - uint32_t src0_cb_index = tt::CB::c_in0; - uint32_t padding_cb_index = tt::CB::c_in1; + uint32_t src0_cb_index = tt::CBIndex::c_0; + uint32_t padding_cb_index = tt::CBIndex::c_1; tt::tt_metal::CircularBufferConfig cb_src0_config = tt::tt_metal::CircularBufferConfig(2 * single_tile_size, {{src0_cb_index, cb_data_format}}) @@ -614,7 +616,7 @@ operation::ProgramWithCallbacks transpose_hc_multi_core_tiled_interleaved(const tt::tt_metal::Buffer *dst_buffer = output.buffer(); bool dst_is_dram = dst_buffer->buffer_type() == tt::tt_metal::BufferType::DRAM ? 
1 : 0; std::vector writer_compile_time_args = - {(std::uint32_t)dst_is_dram, a.element_size(), tt::CB::c_in0, C, H, W, tile_shape[0], tile_shape[1], face_shape[0], face_shape[1], (uint32_t) needs_padding}; + {(std::uint32_t)dst_is_dram, a.element_size(), tt::CBIndex::c_0, C, H, W, tile_shape[0], tile_shape[1], face_shape[0], face_shape[1], (uint32_t) needs_padding}; tt::tt_metal::KernelHandle unary_writer_kernel_id = tt::tt_metal::CreateKernel( program, @@ -1210,12 +1212,12 @@ operation::ProgramWithCallbacks transpose_hc_multi_core_sharded(const Tensor &a, tt::tt_metal::Buffer *dst_buffer = output.buffer(); - uint32_t src0_cb_index = tt::CB::c_in0; + uint32_t src0_cb_index = tt::CBIndex::c_0; tt::tt_metal::CircularBufferConfig cb_src0_config = tt::tt_metal::CircularBufferConfig(shard_height * stick_size_bytes, {{src0_cb_index, src0_cb_data_format}}) .set_page_size(src0_cb_index, stick_size_bytes).set_globally_allocated_address(*a.buffer()); auto cb_src0 = tt::tt_metal::CreateCircularBuffer(program, all_cores, cb_src0_config); - uint32_t output_cb_index = tt::CB::c_out0; // output operands start at index 16 + uint32_t output_cb_index = tt::CBIndex::c_16; tt::tt_metal::CircularBufferConfig cb_output_config = tt::tt_metal::CircularBufferConfig(shard_height * stick_size_bytes, {{output_cb_index, dst_cb_data_format}}) .set_page_size(output_cb_index, stick_size_bytes).set_globally_allocated_address(*output.buffer()); auto cb_output = tt::tt_metal::CreateCircularBuffer(program, all_cores, cb_output_config); @@ -1616,7 +1618,7 @@ operation::ProgramWithCallbacks transpose_wh_multi_core(const Tensor &a, Tensor .set_page_size(src0_cb_index, src0_single_tile_size); auto cb_src0 = tt::tt_metal::CreateCircularBuffer(program, total_cores, cb_src0_config); - uint32_t output_cb_index = 16; // output operands start at index 16 + uint32_t output_cb_index = tt::CBIndex::c_16; uint32_t num_output_tiles = row_major ? 
ht * 2 : 2; tt::tt_metal::CircularBufferConfig cb_output_config = tt::tt_metal::CircularBufferConfig(num_output_tiles * dst_single_tile_size, {{output_cb_index, dst_cb_data_format}}) .set_page_size(output_cb_index, dst_single_tile_size); @@ -1813,13 +1815,13 @@ operation::ProgramWithCallbacks transpose_wh_multi_core_sharded(const Tensor &a, tt::tt_metal::Buffer *dst_buffer = output.buffer(); - uint32_t src0_cb_index = tt::CB::c_in0; + uint32_t src0_cb_index = tt::CBIndex::c_0; uint32_t num_input_tiles = num_tiles_per_shard; tt::tt_metal::CircularBufferConfig cb_src0_config = tt::tt_metal::CircularBufferConfig(num_input_tiles * src0_single_tile_size, {{src0_cb_index, src0_cb_data_format}}) .set_page_size(src0_cb_index, src0_single_tile_size).set_globally_allocated_address(*a.buffer()); auto cb_src0 = tt::tt_metal::CreateCircularBuffer(program, all_cores, cb_src0_config); - uint32_t output_cb_index = tt::CB::c_out0; // output operands start at index 16 + uint32_t output_cb_index = tt::CBIndex::c_16; uint32_t num_output_tiles = num_tiles_per_shard; tt::tt_metal::CircularBufferConfig cb_output_config = tt::tt_metal::CircularBufferConfig(num_output_tiles * dst_single_tile_size, {{output_cb_index, dst_cb_data_format}}) .set_page_size(output_cb_index, dst_single_tile_size).set_globally_allocated_address(*output.buffer()); @@ -2027,26 +2029,26 @@ operation::ProgramWithCallbacks transpose_wh_multi_core_sharded_rm(const Tensor tt::tt_metal::LegacyShape output_shape = output.get_legacy_shape(); // sharded cb - uint32_t src0_cb_index = tt::CB::c_in0; + uint32_t src0_cb_index = tt::CBIndex::c_0; tt::tt_metal::CircularBufferConfig cb_src0_config = tt::tt_metal::CircularBufferConfig(shard_height * stick_size_bytes, {{src0_cb_index, src0_cb_data_format}}) .set_page_size(src0_cb_index, stick_size_bytes).set_globally_allocated_address(*a.buffer()); auto cb_src0 = tt::tt_metal::CreateCircularBuffer(program, all_cores, cb_src0_config); // sharded cb - uint32_t output_cb_index = 
tt::CB::c_out0; // output operands start at index 16 + uint32_t output_cb_index = tt::CBIndex::c_16; tt::tt_metal::CircularBufferConfig cb_output_config = tt::tt_metal::CircularBufferConfig(stick_size_bytes * shard_height, {{output_cb_index, dst_cb_data_format}}) .set_page_size(output_cb_index, output_page_size).set_globally_allocated_address(*output.buffer()); auto cb_output = tt::tt_metal::CreateCircularBuffer(program, all_cores, cb_output_config); // cb_in - uint32_t in_cb_index = tt::CB::c_intermed0; + uint32_t in_cb_index = tt::CBIndex::c_24; uint32_t num_in_tiles = wt * 2; // double buffer tt::tt_metal::CircularBufferConfig cb_in_config = tt::tt_metal::CircularBufferConfig(num_in_tiles * src0_single_tile_size, {{in_cb_index, src0_cb_data_format}}) .set_page_size(in_cb_index, src0_single_tile_size); auto cb_in = tt::tt_metal::CreateCircularBuffer(program, all_cores, cb_in_config); // tilize cb - uint32_t im_cb_index = tt::CB::c_intermed1; + uint32_t im_cb_index = tt::CBIndex::c_25; uint32_t num_im_tiles = ht * wt; tt::tt_metal::CircularBufferConfig cb_im_config = tt::tt_metal::CircularBufferConfig(num_im_tiles * src0_single_tile_size, {{im_cb_index, src0_cb_data_format}}) .set_page_size(im_cb_index, src0_single_tile_size); @@ -2054,14 +2056,14 @@ operation::ProgramWithCallbacks transpose_wh_multi_core_sharded_rm(const Tensor // untilize cb if (ht > 8) { - uint32_t im2_cb_index = tt::CB::c_intermed2; + uint32_t im2_cb_index = tt::CBIndex::c_26; uint32_t num_im2_tiles = ht; tt::tt_metal::CircularBufferConfig cb_im2_config = tt::tt_metal::CircularBufferConfig(num_im2_tiles * dst_single_tile_size, {{im2_cb_index, dst_cb_data_format}}) .set_page_size(im2_cb_index, dst_single_tile_size); auto cb_im2 = tt::tt_metal::CreateCircularBuffer(program, all_cores, cb_im2_config); // compute_output_cb - uint32_t out_cb_index = tt::CB::c_intermed3; + uint32_t out_cb_index = tt::CBIndex::c_27; uint32_t num_out_tiles = ht * 2; // double buffer tt::tt_metal::CircularBufferConfig 
cb_out_config = tt::tt_metal::CircularBufferConfig(num_out_tiles * dst_single_tile_size, {{out_cb_index, dst_cb_data_format}}) .set_page_size(out_cb_index, dst_single_tile_size); diff --git a/ttnn/cpp/ttnn/operations/data_movement/untilize/device/kernels/compute/pack_untilize.cpp b/ttnn/cpp/ttnn/operations/data_movement/untilize/device/kernels/compute/pack_untilize.cpp index a35a1c0c193..28a05978959 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/untilize/device/kernels/compute/pack_untilize.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/untilize/device/kernels/compute/pack_untilize.cpp @@ -13,16 +13,16 @@ void MAIN { constexpr uint32_t per_core_block_cnt = get_compile_time_arg_val(0); constexpr uint32_t per_core_block_tile_cnt = get_compile_time_arg_val(1); - pack_untilize_init(tt::CB::c_in0, tt::CB::c_out0); + pack_untilize_init(tt::CBIndex::c_0, tt::CBIndex::c_16); for(uint32_t b = 0; b < per_core_block_cnt; ++ b) { - cb_wait_front(tt::CB::c_in0, per_core_block_tile_cnt); - cb_reserve_back(tt::CB::c_out0, per_core_block_tile_cnt); + cb_wait_front(tt::CBIndex::c_0, per_core_block_tile_cnt); + cb_reserve_back(tt::CBIndex::c_16, per_core_block_tile_cnt); - pack_untilize_block(tt::CB::c_in0, 1, tt::CB::c_out0); + pack_untilize_block(tt::CBIndex::c_0, 1, tt::CBIndex::c_16); - cb_push_back(tt::CB::c_out0, per_core_block_tile_cnt); - cb_pop_front(tt::CB::c_in0, per_core_block_tile_cnt); + cb_push_back(tt::CBIndex::c_16, per_core_block_tile_cnt); + cb_pop_front(tt::CBIndex::c_0, per_core_block_tile_cnt); } pack_untilize_uninit(); diff --git a/ttnn/cpp/ttnn/operations/data_movement/untilize/device/kernels/compute/untilize.cpp b/ttnn/cpp/ttnn/operations/data_movement/untilize/device/kernels/compute/untilize.cpp index 9aa79b8981b..c55d0073a20 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/untilize/device/kernels/compute/untilize.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/untilize/device/kernels/compute/untilize.cpp @@ -12,18 +12,18 @@ void MAIN { uint32_t 
per_core_block_cnt = get_compile_time_arg_val(0); uint32_t per_core_block_tile_cnt = get_compile_time_arg_val(1); - untilize_init(tt::CB::c_in0); + untilize_init(tt::CBIndex::c_0); //UNPACK(( DPRINT << "Block count=" << uint32_t(per_core_block_cnt) << " tile count=" << per_core_block_tile_cnt << ENDL() )); for(uint32_t b = 0; b < per_core_block_cnt; ++ b) { - cb_wait_front(tt::CB::c_in0, per_core_block_tile_cnt); - cb_reserve_back(tt::CB::c_out0, per_core_block_tile_cnt); + cb_wait_front(tt::CBIndex::c_0, per_core_block_tile_cnt); + cb_reserve_back(tt::CBIndex::c_16, per_core_block_tile_cnt); - untilize_block(tt::CB::c_in0, per_core_block_tile_cnt, tt::CB::c_out0); + untilize_block(tt::CBIndex::c_0, per_core_block_tile_cnt, tt::CBIndex::c_16); - cb_push_back(tt::CB::c_out0, per_core_block_tile_cnt); - cb_pop_front(tt::CB::c_in0, per_core_block_tile_cnt); + cb_push_back(tt::CBIndex::c_16, per_core_block_tile_cnt); + cb_pop_front(tt::CBIndex::c_0, per_core_block_tile_cnt); } } } diff --git a/ttnn/cpp/ttnn/operations/data_movement/untilize/device/untilize_op.cpp b/ttnn/cpp/ttnn/operations/data_movement/untilize/device/untilize_op.cpp index c9c2ff7431a..a40909dd1af 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/untilize/device/untilize_op.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/untilize/device/untilize_op.cpp @@ -8,6 +8,8 @@ #include "tt_metal/common/work_split.hpp" #include "untilize_program_factory.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::data_movement { namespace untilize_helpers { diff --git a/ttnn/cpp/ttnn/operations/data_movement/untilize/device/untilize_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/untilize/device/untilize_program_factory.cpp index 60cd1285c6e..f2eda3a4476 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/untilize/device/untilize_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/untilize/device/untilize_program_factory.cpp @@ -16,6 +16,7 @@ #include "tt_metal/host_api.hpp" 
using namespace tt::constants; +using namespace tt::tt_metal; namespace ttnn::operations::data_movement::detail { @@ -87,7 +88,7 @@ operation::ProgramWithCallbacks untilize_multi_core_parallelize_column( uint32_t num_input_tiles = ntiles_per_block * 2; auto [src0_cb_index, cb_src0] = create_cb( - tt::CB::c_in0, + tt::CBIndex::c_0, program, all_cores, input_single_tile_size, @@ -97,7 +98,7 @@ operation::ProgramWithCallbacks untilize_multi_core_parallelize_column( uint32_t num_output_tiles = ntiles_per_block * 2; auto [output_cb_index, cb_output] = create_cb( - tt::CB::c_out0, + tt::CBIndex::c_16, program, all_cores, output_single_tile_size, @@ -348,7 +349,7 @@ operation::ProgramWithCallbacks untilize_multi_core( uint32_t num_input_tiles = src_sharded ? ntiles_per_block * nblocks_per_core : ntiles_per_block * 2; auto [src0_cb_index, cb_src0] = create_cb( - tt::CB::c_in0, + tt::CBIndex::c_0, program, all_cores, input_single_tile_size, @@ -358,7 +359,7 @@ operation::ProgramWithCallbacks untilize_multi_core( uint32_t num_output_tiles = out_sharded ? 
ntiles_per_block * nblocks_per_core : ntiles_per_block * 2; auto [output_cb_index, cb_output] = create_cb( - tt::CB::c_out0, + tt::CBIndex::c_16, program, all_cores, output_single_tile_size, @@ -742,7 +743,7 @@ operation::ProgramWithCallbacks untilize_single_core( .set_page_size(src0_cb_index, input_single_tile_size); auto cb_src0 = tt::tt_metal::CreateCircularBuffer(program, core, cb_src0_config); - uint32_t output_cb_index = 16; // output operands start at index 16 + uint32_t output_cb_index = tt::CBIndex::c_16; uint32_t num_output_tiles = num_tiles_per_block; auto cb_output_config = tt::tt_metal::CircularBufferConfig( num_output_tiles * output_single_tile_size, {{output_cb_index, output_cb_data_format}}) diff --git a/ttnn/cpp/ttnn/operations/data_movement/untilize/untilize.cpp b/ttnn/cpp/ttnn/operations/data_movement/untilize/untilize.cpp index 6dbfd49f0aa..53b04fb7a36 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/untilize/untilize.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/untilize/untilize.cpp @@ -8,6 +8,8 @@ #include "ttnn/common/constants.hpp" #include "ttnn/run_operation.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::data_movement { ttnn::Tensor ExecuteUntilize::invoke( diff --git a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/device/untilize_with_halo_v2_op.cpp b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/device/untilize_with_halo_v2_op.cpp index 8c3f07035de..b954358b04d 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/device/untilize_with_halo_v2_op.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/device/untilize_with_halo_v2_op.cpp @@ -8,6 +8,8 @@ #include "tt_metal/common/work_split.hpp" #include "untilize_with_halo_v2_program_factory.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::data_movement { void UntilizeWithHaloV2::validate(const std::vector& input_tensors) const { diff --git 
a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/device/untilize_with_halo_v2_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/device/untilize_with_halo_v2_program_factory.cpp index 7699c7a3403..3960311c0aa 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/device/untilize_with_halo_v2_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/device/untilize_with_halo_v2_program_factory.cpp @@ -15,6 +15,7 @@ #include "ttnn/operation.hpp" using namespace tt::constants; +using namespace tt::tt_metal; namespace ttnn::operations::data_movement::detail { @@ -68,10 +69,10 @@ operation::ProgramWithCallbacks untilize_with_halo_multi_core_v2( // Construct CBs // // - uint32_t src_cb_id = tt::CB::c_in0; - uint32_t pad_cb_id = tt::CB::c_in1; - uint32_t untilize_out_cb_id = tt::CB::c_out0; - uint32_t out_cb_id = tt::CB::c_out1; + uint32_t src_cb_id = tt::CBIndex::c_0; + uint32_t pad_cb_id = tt::CBIndex::c_1; + uint32_t untilize_out_cb_id = tt::CBIndex::c_16; + uint32_t out_cb_id = tt::CBIndex::c_17; // input CB (sharded) auto src_cb_config = CircularBufferConfig(input_npages * in_page_size, {{src_cb_id, in_df}}) @@ -111,9 +112,9 @@ operation::ProgramWithCallbacks untilize_with_halo_multi_core_v2( log_debug(tt::LogOp, "CB {} :: npages = {}, pagesize = {}", pad_cb_id, pad_cb_npages, pad_cb_pagesize); // Additional CBs for sharded data kernel configs - uint32_t padding_config_cb_id = tt::CB::c_in2; - uint32_t local_config_cb_id = tt::CB::c_in3; - uint32_t remote_config_cb_id = tt::CB::c_in4; + uint32_t padding_config_cb_id = tt::CBIndex::c_2; + uint32_t local_config_cb_id = tt::CBIndex::c_3; + uint32_t remote_config_cb_id = tt::CBIndex::c_4; tt::DataFormat kernel_config_df = tt::DataFormat::RawUInt16; // NOTE: UInt16 is not supported for CB types uint32_t config_nbytes = diff --git a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/untilize_with_halo_v2.cpp 
b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/untilize_with_halo_v2.cpp index d691bdd954b..55a5bb43539 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/untilize_with_halo_v2.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/untilize_with_halo_v2.cpp @@ -8,6 +8,8 @@ #include "ttnn/common/constants.hpp" #include "ttnn/run_operation.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::data_movement { ttnn::Tensor ExecuteUntilizeWithHaloV2::invoke( diff --git a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/device/untilize_with_unpadding_op.cpp b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/device/untilize_with_unpadding_op.cpp index 4143b8dfa26..13ee19124cf 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/device/untilize_with_unpadding_op.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/device/untilize_with_unpadding_op.cpp @@ -17,7 +17,6 @@ void UntilizeWithUnpadding::validate(const std::vector& input_tensors) c TT_FATAL(input_tensor_a.get_layout() == Layout::TILE, "Can only untilize tile major data"); TT_FATAL(input_tensor_a.volume() % tt::constants::TILE_HW == 0, "Error"); - TT_FATAL(((this->output_tensor_end[-1] + 1) % 2 == 0), "Can only unpad to row major tensor of even width"); if (input_tensor_a.memory_config().is_sharded()) { if (input_tensor_a.memory_config().memory_layout == TensorMemoryLayout::BLOCK_SHARDED) { diff --git a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/device/untilize_with_unpadding_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/device/untilize_with_unpadding_program_factory.cpp index 92964fbbf0d..7f8c59b9f12 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/device/untilize_with_unpadding_program_factory.cpp +++ 
b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/device/untilize_with_unpadding_program_factory.cpp @@ -16,6 +16,7 @@ #include "ttnn/operation.hpp" using namespace tt::constants; +using namespace tt::tt_metal; namespace ttnn::operations::data_movement::detail { @@ -104,7 +105,7 @@ operation::ProgramWithCallbacks untilize_with_unpadding_single_core( .set_page_size(src0_cb_index, input_single_tile_size); auto cb_src0 = tt::tt_metal::CreateCircularBuffer(program, core, cb_src0_config); - uint32_t output_cb_index = 16; // output operands start at index 16 + uint32_t output_cb_index = tt::CBIndex::c_16; uint32_t num_output_tiles = num_tiles_per_block; auto cb_output_config = tt::tt_metal::CircularBufferConfig( num_output_tiles * output_single_tile_size, {{output_cb_index, output_cb_data_format}}) @@ -230,8 +231,8 @@ operation::ProgramWithCallbacks untilize_with_unpadding_multi_core_interleaved( uint32_t padded_row_size_bytes = input_shape[-1] * a.element_size(); // Assuming bfloat16 dataformat uint32_t unpadded_row_size_bytes = output_shape[-1] * a.element_size(); // Assuming bfloat16 dataformat - create_cb(tt::CB::c_in0, program, all_cores, input_single_tile_size, num_tiles_per_row, input_cb_data_format); - create_cb(tt::CB::c_out0, program, all_cores, output_single_tile_size, num_tiles_per_row, output_cb_data_format); + create_cb(tt::CBIndex::c_0, program, all_cores, input_single_tile_size, num_tiles_per_row, input_cb_data_format); + create_cb(tt::CBIndex::c_16, program, all_cores, output_single_tile_size, num_tiles_per_row, output_cb_data_format); Buffer* src0_buffer = a.buffer(); Buffer* dst_buffer = output.buffer(); @@ -436,7 +437,7 @@ operation::ProgramWithCallbacks untilize_with_unpadding_multi_core_sharded( uint32_t num_input_tiles = ntiles_per_block * nblocks_per_core; auto [src0_cb_index, cb_src0] = create_cb( - tt::CB::c_in0, + tt::CBIndex::c_0, program, all_cores, input_single_tile_size, @@ -446,17 +447,17 @@ operation::ProgramWithCallbacks 
untilize_with_unpadding_multi_core_sharded( uint32_t num_output_tiles = out_sharded ? (unpad_tensor_w_16 ? 16 : ntiles_per_batch * 2) : ntiles_per_block * 2; auto [output_cb_index, cb_output] = - create_cb(tt::CB::c_out0, program, all_cores, output_single_tile_size, num_output_tiles, output_cb_data_format); + create_cb(tt::CBIndex::c_16, program, all_cores, output_single_tile_size, num_output_tiles, output_cb_data_format); auto [sharded_output_cb_index, cb_sharded_output] = out_sharded ? create_cb( - tt::CB::c_out1, + tt::CBIndex::c_17, program, all_cores, block_row_size, num_output_rows_unpadded, output_cb_data_format, output.buffer()) - : std::make_tuple(tt::CB::c_out1, CBHandle{}); + : std::make_tuple(tt::CBIndex::c_17, CBHandle{}); Buffer* src0_buffer = a.buffer(); Buffer* dst_buffer = output.buffer(); diff --git a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/untilize_with_unpadding.cpp b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/untilize_with_unpadding.cpp index 6e9f899ac7d..e9b16bfc104 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/untilize_with_unpadding.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_unpadding/untilize_with_unpadding.cpp @@ -8,6 +8,8 @@ #include "ttnn/common/constants.hpp" #include "ttnn/run_operation.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::data_movement { ttnn::Tensor ExecuteUntilizeWithUnpadding::invoke( diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/binary_composite.hpp b/ttnn/cpp/ttnn/operations/eltwise/binary/binary_composite.hpp index 6481f71d227..e6d1c061ed7 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary/binary_composite.hpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/binary_composite.hpp @@ -250,6 +250,11 @@ struct ExecutePrelu const Tensor& input_tensor_b, const std::optional& memory_config = std::nullopt); + static Tensor invoke( + const Tensor& input_tensor, + const std::array& weight, + const std::optional& 
memory_config = std::nullopt); + static Tensor invoke( const Tensor& input_tensor, float scalar, diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/binary_pybind.hpp b/ttnn/cpp/ttnn/operations/eltwise/binary/binary_pybind.hpp index 8be9e9ef579..4fcf084c781 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary/binary_pybind.hpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/binary_pybind.hpp @@ -497,6 +497,104 @@ void bind_binary_composite_overload( py::arg("memory_config") = std::nullopt}); } +template +void bind_prelu( + py::module& module, + const binary_operation_t& operation, + const std::string& description, + const std::string& supported_dtype = "BFLOAT16", + const std::string& supported_rank = "2, 3, 4", + const std::string& example_tensor1 = "ttnn.from_torch(torch.tensor([[1, 2], [3, 4]], dtype=torch.bfloat16), layout=ttnn.TILE_LAYOUT, device=device)", + const std::string& example_tensor2 = "ttnn.from_torch(torch.tensor([[1, 2], [3, 4]], dtype=torch.bfloat16), layout=ttnn.TILE_LAYOUT, device=device)", + const std::string& note="") { + auto doc = fmt::format( + R"doc( + {2} + + .. math:: + \mathrm{{output\_tensor}} = \verb|{0}|(\mathrm{{input\_tensor\_a,input\_tensor\_b}}) + + Args: + input_tensor_a (ttnn.Tensor): the input tensor. + input_tensor_b (ttnn.Tensor or List[float] of length 1 or Number): weight. + + Keyword Args: + memory_config (ttnn.MemoryConfig, optional): memory configuration for the operation. Defaults to `None`. + + Returns: + ttnn.Tensor: the output tensor. + + Note: + Supported dtypes, layouts, and ranks: + + .. 
list-table:: + :header-rows: 1 + + * - Dtypes + - Layouts + - Ranks + * - {3} + - TILE + - {4} + + {7} + + Example: + >>> tensor1 = {5} + >>> tensor2 = {6} + >>> output = {1}(tensor1, tensor2/scalar) + )doc", + operation.base_name(), + operation.python_fully_qualified_name(), + description, + supported_dtype, + supported_rank, + example_tensor1, + example_tensor2, + note); + + bind_registered_operation( + module, + operation, + doc, + ttnn::pybind_overload_t{ + [](const binary_operation_t& self, + const Tensor& input_tensor_a, + const Tensor& input_tensor_b, + const std::optional& memory_config) { + return self(input_tensor_a, input_tensor_b, memory_config); + }, + py::arg("input_tensor_a"), + py::arg("weight"), + py::kw_only(), + py::arg("memory_config") = std::nullopt}, + + ttnn::pybind_overload_t{ + [](const binary_operation_t& self, + const Tensor& input_tensor_a, + float value, + const std::optional& memory_config) { + return self(input_tensor_a, value, memory_config); + }, + py::arg("input_tensor_a"), + py::arg("weight"), + py::kw_only(), + py::arg("memory_config") = std::nullopt}, + + ttnn::pybind_overload_t{ + [](const binary_operation_t& self, + const Tensor& input_tensor_a, + const std::array &weight, + const std::optional& memory_config) { + return self(input_tensor_a, weight, memory_config); + }, + py::arg("input_tensor_a"), + py::arg("weight"), + py::kw_only(), + py::arg("memory_config") = std::nullopt} + ); +} + template void bind_div(py::module& module, const binary_operation_t& operation, const std::string& description, const std::string& math) { auto doc = fmt::format( @@ -1182,14 +1280,15 @@ void py_module(py::module& module) { R"doc(Computes maximum for :attr:`input_tensor_a` and :attr:`input_tensor_b` and returns the tensor with the same layout as :attr:`input_tensor_a`)doc", R"doc(BFLOAT16, BFLOAT8_B)doc"); - detail::bind_binary_composite_overload( + detail::bind_prelu( module, ttnn::prelu, - R"doc(Perform an eltwise-prelu operation. 
PReLU supports the case where the size of input_tensor_b matches the number of channels in input_tensor_a.)doc", + R"doc(Perform an eltwise-prelu operation.)doc", R"doc(BFLOAT16, BFLOAT8_B)doc", R"doc(2, 3, 4, 5)doc", R"doc(ttnn.from_torch(torch.rand([1, 2, 32, 32], dtype=torch.bfloat16), device=device))doc", - R"doc(ttnn.from_torch(torch.tensor([1, 2], dtype=torch.bfloat16), device=device))doc"); + R"doc(ttnn.from_torch(torch.tensor([1, 2], dtype=torch.bfloat16), device=device))doc", + R"doc(PReLU supports the case where weight is a scalar or 1D list/array of size=1 or a 1D tensor :attr:`input_tensor_b` of size = the second dimension in :attr:`input_tensor_a`)doc"); detail::bind_binary_composite( module, diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_composite_op.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_composite_op.cpp index f0cee3b7eb8..51d65dac416 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_composite_op.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_composite_op.cpp @@ -271,8 +271,13 @@ Tensor _div_no_nan(const Tensor& input_a, const Tensor& input_b, const std::opti return ttnn::where(ttnn::eqz(input_b, output_mem_config), 0, div_result); } -Tensor ExecutePrelu::invoke(const Tensor& input, float scalar, const std::optional& output_mem_config) { - return ttnn::prelu_sfpu(input, scalar); +Tensor ExecutePrelu::invoke(const Tensor& input, float weight, const std::optional& output_mem_config) { + return ttnn::prelu_sfpu(input, weight); +} + +Tensor ExecutePrelu::invoke(const Tensor& input, const std::array& weight, const std::optional& output_mem_config) { + float scalar_weight = weight[0]; + return ttnn::prelu_sfpu(input, scalar_weight); } Tensor ExecutePrelu::invoke(const Tensor& input_a, const Tensor& input_b, const std::optional& output_mem_config) { @@ -286,6 +291,7 @@ Tensor ExecutePrelu::invoke(const Tensor& input_a, const Tensor& input_b, const reshape[1] = s_a[1]; b = 
ttnn::reshape(input_b, ttnn::Shape(reshape)); } + Tensor result = ttnn::where(ttnn::ltz(input_a, output_mem_config), ttnn::multiply(input_a, b), input_a); return result; } diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_height_and_width_multi_core_program_factory.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_height_and_width_multi_core_program_factory.cpp index e7623a0ee26..0d91baae93a 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_height_and_width_multi_core_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_height_and_width_multi_core_program_factory.cpp @@ -117,14 +117,14 @@ BinaryDeviceOperation::BroadcastHeightAndWidthMultiCore::create( uint32_t num_input_tiles_cb0 = src0_sharded ? num_tiles_per_shard : num_input_tiles; auto* cb_src0_buffer = src0_sharded ? src0_buffer : nullptr; - auto [cb_src0, cb_handle_src0] = create_cb(tt::CB::c_in0, program, all_device_cores, src0_single_tile_size, num_input_tiles_cb0, src0_cb_data_format, cb_src0_buffer); + auto [cb_src0, cb_handle_src0] = create_cb(tt::CBIndex::c_0, program, all_device_cores, src0_single_tile_size, num_input_tiles_cb0, src0_cb_data_format, cb_src0_buffer); uint32_t num_input_tiles_cb1 = src1_buffer != nullptr ? num_input_tiles : 1; - create_cb(tt::CB::c_in1, program, all_device_cores, src1_single_tile_size, num_input_tiles_cb1, src1_cb_data_format); + create_cb(tt::CBIndex::c_1, program, all_device_cores, src1_single_tile_size, num_input_tiles_cb1, src1_cb_data_format); uint32_t num_output_tiles = output_sharded ? num_tiles_per_shard : 2; auto* cb_output_buffer = output_sharded ? 
dst_buffer : nullptr; - auto [cb_output, cb_handle_output] = create_cb(tt::CB::c_out0, program, all_device_cores, dst_single_tile_size, num_output_tiles, dst_cb_data_format, cb_output_buffer); + auto [cb_output, cb_handle_output] = create_cb(tt::CBIndex::c_2, program, all_device_cores, dst_single_tile_size, num_output_tiles, dst_cb_data_format, cb_output_buffer); auto src0_is_dram = static_cast(src0_buffer->buffer_type() == tt_metal::BufferType::DRAM); auto dst_is_dram = static_cast(dst_buffer->buffer_type() == tt_metal::BufferType::DRAM); @@ -145,14 +145,14 @@ BinaryDeviceOperation::BroadcastHeightAndWidthMultiCore::create( auto src1_is_dram = static_cast(src1_buffer->buffer_type() == tt_metal::BufferType::DRAM); binary_reader_kernel_id = tt_metal::CreateKernel( program, - "ttnn/cpp/ttnn/operations/data_movement/bcast/device/kernels/dataflow/" + "ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/" "reader_bcast_hw_interleaved_partitioned.cpp", all_device_cores, tt_metal::ReaderDataMovementConfig({src0_is_dram, src1_is_dram}, reader_defines)); } else { binary_reader_kernel_id = tt_metal::CreateKernel( program, - "ttnn/cpp/ttnn/operations/data_movement/bcast/device/kernels/dataflow/" + "ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/" "reader_bcast_scalar_interleaved_partitioned.cpp", all_device_cores, tt_metal::ReaderDataMovementConfig({src0_is_dram}, reader_defines)); @@ -170,7 +170,7 @@ BinaryDeviceOperation::BroadcastHeightAndWidthMultiCore::create( auto bcast_kernel_id = tt_metal::CreateKernel( program, - "ttnn/cpp/ttnn/operations/data_movement/bcast/device/kernels/compute/bcast_hw.cpp", + "ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/bcast_hw.cpp", all_device_cores, tt_metal::ComputeConfig{.compile_args = {}, .defines = bcast_compute_defines}); diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_height_multi_core_program_factory.cpp 
b/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_height_multi_core_program_factory.cpp index a32598a6bd8..c8f0d0c9ea4 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_height_multi_core_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_height_multi_core_program_factory.cpp @@ -93,7 +93,7 @@ BinaryDeviceOperation ::BroadcastHeightMultiCore::create( auto dst_buffer = output.buffer(); TT_ASSERT(dst_buffer != nullptr, "Output buffer should be allocated on device!"); - uint32_t src0_cb_index = 0; + uint32_t src0_cb_index = tt::CBIndex::c_0; uint32_t num_input_tiles = 2; tt_metal::CircularBufferConfig src0_cb_config = @@ -101,13 +101,13 @@ BinaryDeviceOperation ::BroadcastHeightMultiCore::create( .set_page_size(src0_cb_index, src0_single_tile_size); auto cb_src0 = tt_metal::CreateCircularBuffer(program, all_device_cores, src0_cb_config); - uint32_t src1_cb_index = 1; + uint32_t src1_cb_index = tt::CBIndex::c_1; tt_metal::CircularBufferConfig src1_cb_config = tt_metal::CircularBufferConfig(num_input_tiles * src1_single_tile_size, {{src1_cb_index, src1_cb_data_format}}) .set_page_size(src1_cb_index, src1_single_tile_size); auto cb_src1 = tt_metal::CreateCircularBuffer(program, all_device_cores, src1_cb_config); - uint32_t output_cb_index = 16; // output operands start at index 16 + uint32_t output_cb_index = tt::CBIndex::c_2; uint32_t num_output_tiles = 2; tt_metal::CircularBufferConfig output_cb_config = tt_metal::CircularBufferConfig(num_output_tiles * dst_single_tile_size, {{output_cb_index, dst_cb_data_format}}) @@ -129,14 +129,14 @@ BinaryDeviceOperation ::BroadcastHeightMultiCore::create( KernelHandle unary_writer_kernel_id = tt_metal::CreateKernel( program, - "ttnn/cpp/ttnn/operations/data_movement/bcast/device/kernels/dataflow/writer_unary_interleaved_input_cols_batched.cpp", + "ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/writer_unary_interleaved_input_cols_batched.cpp", 
all_device_cores, tt_metal::WriterDataMovementConfig(writer_compile_time_args)); std::map bcast_defines = bcast_op_utils::get_defines(BcastOpDim::H, bcast_math); auto bcast_kernel_id = tt_metal::CreateKernel( program, - "ttnn/cpp/ttnn/operations/data_movement/bcast/device/kernels/compute/bcast_h.cpp", + "ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/bcast_h.cpp", all_device_cores, tt_metal::ComputeConfig{.compile_args = {}, .defines = bcast_defines}); diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_height_multi_core_sharded_optimized_program_factory.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_height_multi_core_sharded_optimized_program_factory.cpp index b52c1dd8bbf..af3a9ece43e 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_height_multi_core_sharded_optimized_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_height_multi_core_sharded_optimized_program_factory.cpp @@ -118,7 +118,7 @@ BinaryDeviceOperation::BroadcastHeightMultiCoreShardedOptimized::create( (shard_spec.shape[0] % TILE_HEIGHT == 0) && (shard_spec.shape[0] % TILE_WIDTH == 0), "Shard shapes must be multiple of TILE_HEIGHT "); - uint32_t src0_cb_index = CB::c_in0; + uint32_t src0_cb_index = tt::CBIndex::c_0; uint32_t aligned_input_tile_nbytes = round_up_to_mul32(input_tile_size); // will have issue if the page is not multiple of 32 uint32_t in_cb_pagesize = aligned_input_tile_nbytes; @@ -128,7 +128,7 @@ BinaryDeviceOperation::BroadcastHeightMultiCoreShardedOptimized::create( .set_globally_allocated_address(*a.buffer()); auto cb_src0 = tt_metal::CreateCircularBuffer(program, all_cores, src0_cb_config); - uint32_t output_cb_index = CB::c_out0; // output operands start at index 16 + uint32_t output_cb_index = tt::CBIndex::c_2; tt_metal::CircularBufferConfig output_cb_config = tt_metal::CircularBufferConfig(aligned_input_tile_nbytes * num_tile_per_core, {{output_cb_index, out_df}}) 
.set_page_size(output_cb_index, in_cb_pagesize) @@ -139,7 +139,7 @@ BinaryDeviceOperation::BroadcastHeightMultiCoreShardedOptimized::create( uint32_t w_blk = std::min(Wt, 8u); uint32_t num_input_tiles = w_blk; - uint32_t src1_cb_index = CB::c_in1; + uint32_t src1_cb_index = tt::CBIndex::c_1; tt_metal::CircularBufferConfig src1_cb_config = tt_metal::CircularBufferConfig(num_input_tiles * input1_tile_size, {{src1_cb_index, b_df}}) .set_page_size(src1_cb_index, input1_tile_size); @@ -164,7 +164,7 @@ BinaryDeviceOperation::BroadcastHeightMultiCoreShardedOptimized::create( // const char* compute_name = bcast_op_utils::get_compute_name(BcastOpDim::H)); auto bcast_kernel_id = tt_metal::CreateKernel( program, - "ttnn/cpp/ttnn/operations/data_movement/bcast/device/kernels/compute/bcast_h_sharded_optimised.cpp", + "ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/bcast_h_sharded_optimised.cpp", all_cores, tt_metal::ComputeConfig{.compile_args = {}, .defines = bcast_defines}); diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_height_multi_core_sharded_program_factory.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_height_multi_core_sharded_program_factory.cpp index c1ea1a028f2..1cb84b83721 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_height_multi_core_sharded_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_height_multi_core_sharded_program_factory.cpp @@ -102,7 +102,7 @@ BinaryDeviceOperation::BroadcastHeightMultiCoreSharded::create( TT_ASSERT((shard_spec.shape[0] % TILE_HEIGHT == 0) && (shard_spec.shape[0] % TILE_WIDTH == 0), "Shard shapes must be multiple of TILE_HEIGHT "); - uint32_t src0_cb_index = CB::c_in0; + uint32_t src0_cb_index = tt::CBIndex::c_0; uint32_t aligned_input_tile_nbytes = round_up_to_mul32(input_tile_size); //will have issue if the page is not multiple of 32 uint32_t in_cb_pagesize = aligned_input_tile_nbytes; tt_metal::CircularBufferConfig 
src0_cb_config = tt_metal::CircularBufferConfig(aligned_input_tile_nbytes * num_tile_per_core, {{src0_cb_index, act_df}}) @@ -110,7 +110,7 @@ BinaryDeviceOperation::BroadcastHeightMultiCoreSharded::create( .set_globally_allocated_address(*a.buffer()); auto cb_src0 = tt_metal::CreateCircularBuffer(program, all_cores, src0_cb_config); - uint32_t output_cb_index = CB::c_out0; // output operands start at index 16 + uint32_t output_cb_index = tt::CBIndex::c_2; tt_metal::CircularBufferConfig output_cb_config = tt_metal::CircularBufferConfig(aligned_input_tile_nbytes * num_tile_per_core, {{output_cb_index, out_df}}) .set_page_size(output_cb_index, in_cb_pagesize) @@ -118,7 +118,7 @@ BinaryDeviceOperation::BroadcastHeightMultiCoreSharded::create( auto out_cb = tt_metal::CreateCircularBuffer(program, all_cores, output_cb_config); uint32_t num_input_tiles = (b->get_legacy_shape()[-1] * output.element_size() + TILE_HW - 1)/ TILE_HW; - uint32_t src1_cb_index = CB::c_in1; + uint32_t src1_cb_index = tt::CBIndex::c_1; tt_metal::CircularBufferConfig src1_cb_config = tt_metal::CircularBufferConfig(num_input_tiles * input1_tile_size, {{src1_cb_index, b_df}}) .set_page_size(src1_cb_index, input1_tile_size); auto cb_src1 = tt_metal::CreateCircularBuffer(program, all_cores, src1_cb_config); @@ -142,7 +142,7 @@ BinaryDeviceOperation::BroadcastHeightMultiCoreSharded::create( //const char* compute_name = bcast_op_utils::get_compute_name(BcastOpDim::H)); auto bcast_kernel_id = tt_metal::CreateKernel( program, - "ttnn/cpp/ttnn/operations/data_movement/bcast/device/kernels/compute/bcast_h.cpp", + "ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/bcast_h.cpp", all_cores, tt_metal::ComputeConfig{.compile_args = {}, .defines = bcast_defines} ); diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_width_multi_core_program_factory.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_width_multi_core_program_factory.cpp index ab0f77ddde8..3cfb7ebca27 100644 
--- a/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_width_multi_core_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_width_multi_core_program_factory.cpp @@ -92,7 +92,7 @@ BinaryDeviceOperation::BroadcastWidthMultiCore::cached_program_t BinaryDeviceOpe auto dst_buffer = output.buffer(); TT_ASSERT(dst_buffer != nullptr, "Output buffer should be allocated on device!"); - uint32_t src0_cb_index = 0; + uint32_t src0_cb_index = tt::CBIndex::c_0; uint32_t num_input_tiles = 2; tt_metal::CircularBufferConfig src0_cb_config = @@ -100,13 +100,13 @@ BinaryDeviceOperation::BroadcastWidthMultiCore::cached_program_t BinaryDeviceOpe .set_page_size(src0_cb_index, src0_single_tile_size); auto cb_src0 = tt_metal::CreateCircularBuffer(program, all_device_cores, src0_cb_config); - uint32_t src1_cb_index = 1; + uint32_t src1_cb_index = tt::CBIndex::c_1; tt_metal::CircularBufferConfig src1_cb_config = tt_metal::CircularBufferConfig(num_input_tiles * src1_single_tile_size, {{src1_cb_index, src1_cb_data_format}}) .set_page_size(src1_cb_index, src1_single_tile_size); auto cb_src1 = tt_metal::CreateCircularBuffer(program, all_device_cores, src1_cb_config); - uint32_t output_cb_index = 16; // output operands start at index 16 + uint32_t output_cb_index = tt::CBIndex::c_2; uint32_t num_output_tiles = 2; tt_metal::CircularBufferConfig output_cb_config = tt_metal::CircularBufferConfig(num_output_tiles * dst_single_tile_size, {{output_cb_index, dst_cb_data_format}}) @@ -122,20 +122,20 @@ BinaryDeviceOperation::BroadcastWidthMultiCore::cached_program_t BinaryDeviceOpe KernelHandle binary_reader_kernel_id = tt_metal::CreateKernel( program, - "ttnn/cpp/ttnn/operations/data_movement/bcast/device/kernels/dataflow/reader_bcast_w_interleaved_input_cols_partitioned.cpp", + "ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_bcast_w_interleaved_input_cols_partitioned.cpp", all_device_cores, 
tt_metal::ReaderDataMovementConfig(reader_compile_time_args)); KernelHandle unary_writer_kernel_id = tt_metal::CreateKernel( program, - "ttnn/cpp/ttnn/operations/data_movement/bcast/device/kernels/dataflow/writer_unary_interleaved_input_cols_batched.cpp", + "ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/writer_unary_interleaved_input_cols_batched.cpp", all_device_cores, tt_metal::WriterDataMovementConfig(writer_compile_time_args)); std::map bcast_defines = bcast_op_utils::get_defines(BcastOpDim::W, bcast_math); auto bcast_kernel_id = tt_metal::CreateKernel( program, - "ttnn/cpp/ttnn/operations/data_movement/bcast/device/kernels/compute/bcast_w.cpp", + "ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/bcast_w.cpp", all_device_cores, tt_metal::ComputeConfig{.compile_args = {}, .defines = bcast_defines}); diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/device/element_wise_multi_core_program_factory.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary/device/element_wise_multi_core_program_factory.cpp index 5045954dd29..ad12557c284 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary/device/element_wise_multi_core_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/device/element_wise_multi_core_program_factory.cpp @@ -342,7 +342,7 @@ BinaryDeviceOperation::ElementWiseMultiCore::cached_program_t BinaryDeviceOperat auto all_device_cores = CoreRange({0, 0}, {num_cores_x - 1, num_cores_y - 1}); - uint32_t src0_cb_index = 0; + uint32_t src0_cb_index = tt::CBIndex::c_0; uint32_t num_input_tiles = src0_sharded ? 
num_tiles_per_shard : 2 * max_block_size; tt_metal::CircularBufferConfig cb_src0_config = tt_metal::CircularBufferConfig(num_input_tiles * src0_single_tile_size, {{src0_cb_index, src0_cb_data_format}}) @@ -352,7 +352,7 @@ BinaryDeviceOperation::ElementWiseMultiCore::cached_program_t BinaryDeviceOperat } auto cb_src0 = tt_metal::CreateCircularBuffer(program, all_device_cores, cb_src0_config); - uint32_t src1_cb_index = 1; + uint32_t src1_cb_index = tt::CBIndex::c_1; num_input_tiles = src1_sharded ? num_tiles_per_shard : 2 * max_block_size; tt_metal::CircularBufferConfig cb_src1_config = tt_metal::CircularBufferConfig(num_input_tiles * src1_single_tile_size, {{src1_cb_index, src1_cb_data_format}}) @@ -372,8 +372,8 @@ BinaryDeviceOperation::ElementWiseMultiCore::cached_program_t BinaryDeviceOperat } uint32_t interim0_single_tile_size = tt_metal::detail::TileSize(interim_cb0_format); tt_metal::CircularBufferConfig cb_interm_config = - tt_metal::CircularBufferConfig(max_block_size * interim0_single_tile_size, {{CB::c_intermed0, interim_cb0_format}}) - .set_page_size(CB::c_intermed0, interim0_single_tile_size); + tt_metal::CircularBufferConfig(max_block_size * interim0_single_tile_size, {{tt::CBIndex::c_3, interim_cb0_format}}) + .set_page_size(tt::CBIndex::c_3, interim0_single_tile_size); auto cb_interm = tt_metal::CreateCircularBuffer(program, all_device_cores, cb_interm_config); } if (eltwise_defines.find("SFPU_OP_INIT_PRE_IN1_0") != eltwise_defines.end()) { @@ -383,12 +383,12 @@ BinaryDeviceOperation::ElementWiseMultiCore::cached_program_t BinaryDeviceOperat } uint32_t interim1_single_tile_size = tt_metal::detail::TileSize(interim_cb1_format); tt_metal::CircularBufferConfig cb_interm2_config = - tt_metal::CircularBufferConfig(max_block_size * interim1_single_tile_size, {{CB::c_intermed1, interim_cb1_format}}) - .set_page_size(CB::c_intermed1, interim1_single_tile_size); + tt_metal::CircularBufferConfig(max_block_size * interim1_single_tile_size, {{tt::CBIndex::c_4, 
interim_cb1_format}}) + .set_page_size(tt::CBIndex::c_4, interim1_single_tile_size); auto cb_interm2 = tt_metal::CreateCircularBuffer(program, all_device_cores, cb_interm2_config); } - uint32_t output_cb_index = 16; // output operands start at index 16 + uint32_t output_cb_index = tt::CBIndex::c_2; uint32_t num_output_tiles = (out_sharded || block_or_width_sharded) ? num_tiles_per_shard : 2 * max_block_size; tt_metal::CircularBufferConfig cb_output_config = tt_metal::CircularBufferConfig(num_output_tiles * dst_single_tile_size, {{output_cb_index, dst_cb_data_format}}) diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/bcast_h.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/bcast_h.cpp new file mode 100644 index 00000000000..619dfbed4b8 --- /dev/null +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/bcast_h.cpp @@ -0,0 +1,42 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "compute_kernel_api/bcast.h" + + +namespace NAMESPACE { +void MAIN { + constexpr uint32_t onetile = 1; + uint32_t B = get_arg_val(0); + uint32_t Ht = get_arg_val(1); + uint32_t Wt = get_arg_val(2); + init_bcast(tt::CBIndex::c_0, tt::CBIndex::c_1, tt::CBIndex::c_2); + + for (uint32_t b = 0; b < B; b++) { + for (uint32_t h = 0; h < Ht; h++) { + for (uint32_t w = 0; w < Wt; w++) { + // For this bcast-h op the reader will wrap the RHS source tile around at Wt + // so here we just linearly read 2 parallel arrays and apply bcast op per tile + // (bcast_h propagates the op down the H dimension, so it can be though of as bcast to H) + cb_wait_front(tt::CBIndex::c_1, onetile); + + cb_reserve_back(tt::CBIndex::c_2, onetile); + + acquire_dst(); + + cb_wait_front(tt::CBIndex::c_0, onetile); + + BCAST_OP(tt::CBIndex::c_0, tt::CBIndex::c_1, 0, 0, 0); + pack_tile(0, tt::CBIndex::c_2); + + cb_pop_front(tt::CBIndex::c_0, onetile); + + release_dst(); + + 
cb_push_back(tt::CBIndex::c_2, onetile); + cb_pop_front(tt::CBIndex::c_1, onetile); + } } } +} +} // NAMESPACE diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/bcast_h_sharded_optimised.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/bcast_h_sharded_optimised.cpp new file mode 100644 index 00000000000..3771417f522 --- /dev/null +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/bcast_h_sharded_optimised.cpp @@ -0,0 +1,43 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "compute_kernel_api/bcast.h" + + +namespace NAMESPACE { +void MAIN { + constexpr uint32_t onetile = 1; + uint32_t NC = get_arg_val(0); + uint32_t Ht = get_arg_val(1); + uint32_t Wt = get_arg_val(2); + uint32_t h_blk = get_arg_val(3); + uint32_t batch_b = get_arg_val(4); + uint32_t Ht_per_batch_b = get_arg_val(5); + + init_bcast(tt::CBIndex::c_0, tt::CBIndex::c_1, tt::CBIndex::c_2); + + cb_wait_front(tt::CBIndex::c_0, Wt*Ht); + cb_reserve_back(tt::CBIndex::c_2, Wt*Ht); + uint32_t b_offset = 0; + for (uint32_t bn = 0; bn < batch_b; bn++) { + for (uint32_t wt = 0; wt < Wt; wt++) { + cb_wait_front(tt::CBIndex::c_1, onetile); + for (uint32_t ht = 0; ht < Ht_per_batch_b; ht+=h_blk) { + acquire_dst(); + for (uint32_t htr = 0; htr(tt::CBIndex::c_0, tt::CBIndex::c_1, current_index, 0, htr); + pack_tile(htr, tt::CBIndex::c_2, current_index); + } + release_dst(); + } + cb_pop_front(tt::CBIndex::c_1, onetile); + } + b_offset += Ht_per_batch_b * Wt; + } + cb_pop_front(tt::CBIndex::c_0, Wt*Ht); + cb_push_back(tt::CBIndex::c_2, Wt*Ht); +} +} // NAMESPACE diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/bcast_hw.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/bcast_hw.cpp new file mode 100644 index 00000000000..cff6024d060 --- /dev/null +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/bcast_hw.cpp @@ -0,0 +1,46 @@ 
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "compute_kernel_api/bcast.h" + +namespace NAMESPACE { +void MAIN { + constexpr uint32_t onetile = 1; + uint32_t B = get_arg_val(0); + uint32_t Ht = get_arg_val(1); + uint32_t Wt = get_arg_val(2); + init_bcast(tt::CBIndex::c_0, tt::CBIndex::c_1, tt::CBIndex::c_2); + + #ifdef BCAST_SCALAR + cb_wait_front(tt::CBIndex::c_1, onetile); + #endif + + for (uint32_t b = 0; b < B; b++) { + for (uint32_t h = 0; h < Ht; h++) { + for (uint32_t w = 0; w < Wt; w++) { + #ifndef BCAST_SCALAR + cb_wait_front(tt::CBIndex::c_1, onetile); + #endif + cb_reserve_back(tt::CBIndex::c_2, onetile); + + acquire_dst(); + + cb_wait_front(tt::CBIndex::c_0, onetile); + + BCAST_OP(tt::CBIndex::c_0, tt::CBIndex::c_1, 0, 0, 0); + pack_tile(0, tt::CBIndex::c_2); + + cb_pop_front(tt::CBIndex::c_0, onetile); + #ifndef BCAST_SCALAR + cb_pop_front(tt::CBIndex::c_1, onetile); + #endif + release_dst(); + + cb_push_back(tt::CBIndex::c_2, onetile); + } } } + +} +} // NAMESPACE diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/bcast_w.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/bcast_w.cpp new file mode 100644 index 00000000000..8475a47cd1f --- /dev/null +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/bcast_w.cpp @@ -0,0 +1,41 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "compute_kernel_api/bcast.h" + +namespace NAMESPACE { +void MAIN { + uint32_t w = 0; + constexpr uint32_t onetile = 1; + uint32_t B = get_arg_val(0); + uint32_t Ht = get_arg_val(1); + uint32_t Wt = get_arg_val(2); + + init_bcast(tt::CBIndex::c_0, tt::CBIndex::c_1, tt::CBIndex::c_2); + + for (uint32_t b = 0; b < B; b++) { + for (uint32_t h = 0; h < Ht; h++) { + cb_wait_front(tt::CBIndex::c_1, onetile); + for (uint32_t w = 0; w < Wt; w++) { + + cb_reserve_back(tt::CBIndex::c_2, onetile); + + acquire_dst(); + + cb_wait_front(tt::CBIndex::c_0, onetile); + BCAST_OP(tt::CBIndex::c_0, tt::CBIndex::c_1, 0, 0, 0); + pack_tile(0, tt::CBIndex::c_2); + cb_pop_front(tt::CBIndex::c_0, onetile); + + release_dst(); + + cb_push_back(tt::CBIndex::c_2, onetile); + + } + cb_pop_front(tt::CBIndex::c_1, onetile); + }} +} +} // NAMESPACE diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_kernel.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_kernel.cpp index 0b33d79f593..f88b15ca9ef 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_kernel.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_kernel.cpp @@ -15,21 +15,21 @@ void MAIN { uint32_t per_core_block_cnt = get_arg_val(0); uint32_t per_core_block_size = get_arg_val(1); - constexpr auto cb_in0 = tt::CB::c_in0; - constexpr auto cb_in1 = tt::CB::c_in1; + constexpr auto cb_in0 = tt::CBIndex::c_0; + constexpr auto cb_in1 = tt::CBIndex::c_1; #ifdef SFPU_OP_INIT_PRE_IN0_0 - constexpr auto cb_inp0 = tt::CB::c_intermed0; + constexpr auto cb_inp0 = tt::CBIndex::c_3; #else constexpr auto cb_inp0 = cb_in0; #endif #ifdef SFPU_OP_INIT_PRE_IN1_0 - constexpr auto cb_inp1 = tt::CB::c_intermed1; + constexpr auto cb_inp1 = tt::CBIndex::c_4; #else constexpr auto cb_inp1 = cb_in1; #endif - constexpr auto cb_out0 = tt::CB::c_out0; + 
constexpr auto cb_out0 = tt::CBIndex::c_2; binary_op_init_common(cb_inp0, cb_inp1, cb_out0); diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_bcast_h_interleaved.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_bcast_h_interleaved.cpp new file mode 100644 index 00000000000..7001ecf9034 --- /dev/null +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_bcast_h_interleaved.cpp @@ -0,0 +1,80 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +// This code is temporarily copied from ttnn/cpp/ttnn/operations/datamovement/binary/device/ to demonstrate +// the new ability to keep the CircularBufferConfigs continuous during dispatching. See the use of CBIndex::c_2 below. +// When broadcating is properly supported we expect this code to be deleted or refactored substantially. + +#include +#include "dataflow_api.h" + +void kernel_main() { + uint32_t src0_addr = get_arg_val(0); + uint32_t src0_num_tiles = get_arg_val(3); + uint32_t src1_addr = get_arg_val(4); + // skip args 1,2,5,6,7 for compat with single bank readers and reader_diff_lengths + uint32_t NCHtWt = get_arg_val(8); + uint32_t NC = get_arg_val(9); + uint32_t Ht = get_arg_val(10); + uint32_t Wt = get_arg_val(11); + uint32_t nc1 = get_arg_val(12); // if 1 we expect the bcast tensor to have NC=1 + + constexpr bool src0_is_dram = get_compile_time_arg_val(0) == 1; + constexpr bool src1_is_dram = get_compile_time_arg_val(1) == 1; + + constexpr uint32_t cb_id_in0 = 0; + constexpr uint32_t cb_id_in1 = 1; + constexpr uint32_t onetile = 1; + + // single-tile ublocks + const uint32_t tile_bytes = get_tile_size(cb_id_in0); + const DataFormat data_format = get_dataformat(cb_id_in0); + + const InterleavedAddrGenFast s0 = { + .bank_base_address = src0_addr, + .page_size = tile_bytes, + .data_format = data_format + }; + + const InterleavedAddrGenFast s1 = { + .bank_base_address = src1_addr, + 
.page_size = tile_bytes, + .data_format = data_format + }; + + uint32_t l1_write_addr_in0; + uint32_t l1_write_addr_in1; + + uint32_t num_tiles = src0_num_tiles; + uint32_t i = 0; + uint32_t i1 = 0; + for (uint32_t nc = 0; nc < NC; nc++) { + for (uint32_t ht = 0; ht < Ht; ht++) { + for (uint32_t wt = 0; wt < Wt; wt++) { + cb_reserve_back(cb_id_in0, onetile); + l1_write_addr_in0 = get_write_ptr(cb_id_in0); + noc_async_read_tile(i, s0, l1_write_addr_in0); + noc_async_read_barrier(); + cb_push_back(cb_id_in0, onetile); + + // for each W-tile of the first tensor we push one tile from the second arg tile list + // but we loop the second list around + cb_reserve_back(cb_id_in1, onetile); + l1_write_addr_in1 = get_write_ptr(cb_id_in1); + noc_async_read_tile(i1, s1, l1_write_addr_in1); + noc_async_read_barrier(); + cb_push_back(cb_id_in1, onetile); + i1 ++; + i ++; // input tile iterates over NC Ht Wt + } + + // bcast tensor should be NC1W (actually NC32W padded with 0s in H) + // wrap W around for each h (broadcast) + i1 -= Wt; + } + // we reused Wt tiles out of NCWt bcast tensor Ht times, now advance for next NC + if (nc1 == 0) // if bcast NC==1 we don't advance but reuse the tensor + i1 += Wt; + } +} diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_bcast_h_interleaved_input_rows_partitioned.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_bcast_h_interleaved_input_rows_partitioned.cpp new file mode 100644 index 00000000000..67abb200068 --- /dev/null +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_bcast_h_interleaved_input_rows_partitioned.cpp @@ -0,0 +1,87 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +// This code is temporarily copied from ttnn/cpp/ttnn/operations/datamovement/binary/device/ to demonstrate +// the new ability to keep the CircularBufferConfigs continuous during dispatching. See the use of CBIndex::c_2 below. 
+// When broadcating is properly supported we expect this code to be deleted or refactored substantially. + +#include +#include "dataflow_api.h" + +void kernel_main() { + uint32_t src0_addr = get_arg_val(0); + uint32_t src0_num_tiles = get_arg_val(3); + uint32_t src1_addr = get_arg_val(4); + // skip args 1,2,5,6,7 for compat with single bank readers and reader_diff_lengths + uint32_t NCHtWt = get_arg_val(8); + uint32_t NC = get_arg_val(9); + uint32_t Ht = get_arg_val(10); + uint32_t Wt = get_arg_val(11); + uint32_t nc1 = get_arg_val(12); // if 1 we expect the bcast tensor to have NC=1 + uint32_t start_id = get_arg_val(13); + uint32_t HtWt = get_arg_val(14); // HtWt of input tensor + + constexpr bool src0_is_dram = get_compile_time_arg_val(0) == 1; + constexpr bool src1_is_dram = get_compile_time_arg_val(1) == 1; + + constexpr uint32_t cb_id_in0 = 0; + constexpr uint32_t cb_id_in1 = 1; + constexpr uint32_t onetile = 1; + + // single-tile ublocks + const uint32_t in0_tile_bytes = get_tile_size(cb_id_in0); + const DataFormat in0_data_format = get_dataformat(cb_id_in0); + const uint32_t in1_tile_bytes = get_tile_size(cb_id_in1); + const DataFormat in1_data_format = get_dataformat(cb_id_in1); + + const InterleavedAddrGenFast s0 = { + .bank_base_address = src0_addr, + .page_size = in0_tile_bytes, + .data_format = in0_data_format + }; + + const InterleavedAddrGenFast s1 = { + .bank_base_address = src1_addr, + .page_size = in1_tile_bytes, + .data_format = in1_data_format + }; + + uint32_t l1_write_addr_in0; + uint32_t l1_write_addr_in1; + + uint32_t num_tiles = src0_num_tiles; + uint32_t i = 0; + uint32_t i1 = 0; + uint32_t i_nc = 0; + for (uint32_t nc = 0; nc < NC; nc++) { + i = i_nc + start_id; + for (uint32_t ht = 0; ht < Ht; ht++) { + for (uint32_t wt = 0; wt < Wt; wt++) { + cb_reserve_back(cb_id_in0, onetile); + l1_write_addr_in0 = get_write_ptr(cb_id_in0); + noc_async_read_tile(i, s0, l1_write_addr_in0); + noc_async_read_barrier(); + cb_push_back(cb_id_in0, onetile); 
+ + // for each W-tile of the first tensor we push one tile from the second arg tile list + // but we loop the second list around + cb_reserve_back(cb_id_in1, onetile); + l1_write_addr_in1 = get_write_ptr(cb_id_in1); + noc_async_read_tile(i1, s1, l1_write_addr_in1); + noc_async_read_barrier(); + cb_push_back(cb_id_in1, onetile); + i1 ++; + i ++; // input tile iterates over NC Ht Wt + } + + // bcast tensor should be NC1W (actually NC32W padded with 0s in H) + // wrap W around for each h (broadcast) + i1 -= Wt; + } + // we reused Wt tiles out of NCWt bcast tensor Ht times, now advance for next NC + if (nc1 == 0) // if bcast NC==1 we don't advance but reuse the tensor + i1 += Wt; + i_nc += HtWt; + } +} diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_bcast_h_sharded.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_bcast_h_sharded.cpp new file mode 100644 index 00000000000..4ff8d62f6ca --- /dev/null +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_bcast_h_sharded.cpp @@ -0,0 +1,64 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +// This code is temporarily copied from ttnn/cpp/ttnn/operations/datamovement/binary/device/ to demonstrate +// the new ability to keep the CircularBufferConfigs continuous during dispatching. See the use of CBIndex::c_2 below. +// When broadcating is properly supported we expect this code to be deleted or refactored substantially. 
+ +#include +#include "dataflow_api.h" + +void kernel_main() { + uint32_t src1_addr = get_arg_val(0); + uint32_t Ht = get_arg_val(1); + uint32_t Wt = get_arg_val(2); + uint32_t offset = get_arg_val(3); + uint32_t NC = get_arg_val(4); + uint32_t batch_offset= get_arg_val(5); //if weight has multiple batches + + //constexpr bool src0_is_dram = get_compile_time_arg_val(0) == 1; + constexpr bool src1_is_dram = get_compile_time_arg_val(1) == 1; + constexpr uint32_t cb_id_in0 = get_compile_time_arg_val(0); + + //constexpr uint32_t cb_id_in0 = 0; + constexpr uint32_t cb_id_in1 = 1; + constexpr uint32_t onetile = 1; + + // single-tile ublocks + const uint32_t tile_bytes = get_tile_size(cb_id_in1); + const DataFormat data_format = get_dataformat(cb_id_in1); + + const InterleavedAddrGenFast s1 = { + .bank_base_address = src1_addr, + .page_size = tile_bytes, + .data_format = data_format + }; + + + uint32_t l1_write_addr_in0; + uint32_t l1_write_addr_in1; + + uint32_t i = 0; + cb_push_back(cb_id_in0, Ht * Wt); + for (uint32_t ht = 0; ht < Ht; ht++) { + for (uint32_t wt = 0; wt < Wt; wt++) { + // for each W-tile of the first tensor we push one tile from the second arg tile list + // but we loop the second list around + cb_reserve_back(cb_id_in1, onetile); + l1_write_addr_in1 = get_write_ptr(cb_id_in1); + noc_async_read_tile(offset, s1, l1_write_addr_in1); + noc_async_read_barrier(); + cb_push_back(cb_id_in1, onetile); + offset ++; + } + + + // bcast tensor should be NC1W (actually NC32W padded with 0s in H) + // wrap W around for each h (broadcast) + offset -= Wt; + if(ht % NC == (NC -1)){ + offset += batch_offset; //switching to next batch + } + } +} diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_bcast_h_sharded_optimised.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_bcast_h_sharded_optimised.cpp new file mode 100644 index 00000000000..1fb87475132 --- /dev/null +++ 
b/ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_bcast_h_sharded_optimised.cpp @@ -0,0 +1,61 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +// This code is temporarily copied from ttnn/cpp/ttnn/operations/datamovement/binary/device/ to demonstrate +<<<<<<< HEAD +// the new ability to keep the CircularBufferConfigs continuous during dispatching. See the use of CBIndex::c_16 below. +======= +// the new ability to keep the CircularBufferConfigs continuous during dispatching. See the use of CBIndex::c_2 below. +>>>>>>> 500923c2b7... #7493: Updating some ops to use c_2 instead of c_16 given the dependency on eltwise +// When broadcating is properly supported we expect this code to be deleted or refactored substantially. + +#include +#include "dataflow_api.h" + +void kernel_main() { + uint32_t src1_addr = get_arg_val(0); + uint32_t Ht = get_arg_val(1); + uint32_t Wt = get_arg_val(2); + uint32_t offset = get_arg_val(3); + uint32_t batch_offset = get_arg_val(4); //if weight has multiple batches + uint32_t w_blk = get_arg_val(5); + uint32_t batch_b = get_arg_val(6); + + //constexpr bool src0_is_dram = get_compile_time_arg_val(0) == 1; + constexpr bool src1_is_dram = get_compile_time_arg_val(1) == 1; + constexpr uint32_t cb_id_in0 = get_compile_time_arg_val(0); + + //constexpr uint32_t cb_id_in0 = 0; + constexpr uint32_t cb_id_in1 = 1; + constexpr uint32_t onetile = 1; + + // single-tile ublocks + const uint32_t tile_bytes = get_tile_size(cb_id_in1); + const DataFormat data_format = get_dataformat(cb_id_in1); + + const InterleavedAddrGenFast s1 = { + .bank_base_address = src1_addr, + .page_size = tile_bytes, + .data_format = data_format + }; + + + uint32_t l1_write_addr_in0; + uint32_t l1_write_addr_in1; + + cb_push_back(cb_id_in0, Ht * Wt); + for (uint32_t b = 0; b < batch_b; b ++) { + for (uint32_t wt = 0; wt < Wt; wt += w_blk) { + cb_reserve_back(cb_id_in1, w_blk); + l1_write_addr_in1 = 
get_write_ptr(cb_id_in1); + for (uint32_t r = 0; r +#include "dataflow_api.h" + +void kernel_main() { + uint32_t src0_addr = get_arg_val(0); + uint32_t src0_num_tiles = get_arg_val(3); + uint32_t src1_addr = get_arg_val(4); + // skip args 1,2,5,6,7 for compat with single bank readers and reader_diff_lengths + uint32_t NCHtWt = get_arg_val(8); + uint32_t NC = get_arg_val(9); + uint32_t Ht = get_arg_val(10); + uint32_t Wt = get_arg_val(11); + uint32_t nc1 = get_arg_val(12); // if 1 we expect the bcast tensor to have NC=1 and wrap around in NC + + #ifndef IN0_SHARDED + constexpr bool src0_is_dram = get_compile_time_arg_val(0) == 1; + #endif + + constexpr bool src1_is_dram = get_compile_time_arg_val(1) == 1; + + constexpr uint32_t cb_id_in0 = 0; + constexpr uint32_t cb_id_in1 = 1; + constexpr uint32_t onetile = 1; + + // single-tile ublocks + const uint32_t in0_tile_bytes = get_tile_size(cb_id_in0); + const DataFormat in0_data_format = get_dataformat(cb_id_in0); + const uint32_t in1_tile_bytes = get_tile_size(cb_id_in1); + const DataFormat in1_data_format = get_dataformat(cb_id_in1); + + uint32_t l1_write_addr_in0; + uint32_t l1_write_addr_in1; + + uint32_t num_tiles = src0_num_tiles; + uint32_t i1 = 0; + + #ifndef IN0_SHARDED + uint32_t i = 0; + const InterleavedAddrGenFast s0 = { + .bank_base_address = src0_addr, + .page_size = in0_tile_bytes, + .data_format = in0_data_format + }; + #else + cb_reserve_back(cb_id_in0, num_tiles); + cb_push_back(cb_id_in0, num_tiles); + #endif + + const InterleavedAddrGenFast s1 = { + .bank_base_address = src1_addr, + .page_size = in1_tile_bytes, + .data_format = in1_data_format + }; + + #ifdef BCAST_SCALAR + cb_reserve_back(cb_id_in1, onetile); + l1_write_addr_in1 = get_write_ptr(cb_id_in1); + noc_async_read_tile(i1, s1, l1_write_addr_in1); + noc_async_read_barrier(); + cb_push_back(cb_id_in1, onetile); + #endif + + for (uint32_t nc = 0; nc < NC; nc++) { + for (uint32_t ht = 0; ht < Ht; ht++) { + for (uint32_t wt = 0; wt < Wt; wt++) { 
+ #ifndef IN0_SHARDED + cb_reserve_back(cb_id_in0, onetile); + l1_write_addr_in0 = get_write_ptr(cb_id_in0); + noc_async_read_tile(i, s0, l1_write_addr_in0); + noc_async_read_barrier(); + cb_push_back(cb_id_in0, onetile); + i++; // input tile iterates over NC Ht Wt + #endif + + #ifndef BCAST_SCALAR + // for each H,W-tile of the first tensor we push one tile from the second arg tile list + // but we don't advance the second tile index for H,W + cb_reserve_back(cb_id_in1, onetile); + l1_write_addr_in1 = get_write_ptr(cb_id_in1); + noc_async_read_tile(i1, s1, l1_write_addr_in1); + noc_async_read_barrier(); + cb_push_back(cb_id_in1, onetile); + #endif + } // wt loop + } // ht loop + #ifndef BCAST_SCALAR + if (nc1 == 0) { + i1 ++; // bcast-HW tile iterates only for nc loop and only if NC>1 + } + #endif + } // nc loop +} diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_bcast_hw_interleaved_partitioned.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_bcast_hw_interleaved_partitioned.cpp new file mode 100644 index 00000000000..d7592b98230 --- /dev/null +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_bcast_hw_interleaved_partitioned.cpp @@ -0,0 +1,94 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +// This code is temporarily copied from ttnn/cpp/ttnn/operations/datamovement/binary/device/ to demonstrate +// the new ability to keep the CircularBufferConfigs continuous during dispatching. See the use of CBIndex::c_2 below. +// When broadcating is properly supported we expect this code to be deleted or refactored substantially. 
+ +#include +#include "dataflow_api.h" + +void kernel_main() { + uint32_t src0_addr = get_arg_val(0); + uint32_t src1_addr = get_arg_val(1); + uint32_t num_tiles = get_arg_val(2); + uint32_t HtWt = get_arg_val(3); + uint32_t base_start_id_HtWt = get_arg_val(4); + uint32_t curr_id_from_base = get_arg_val(5); + uint32_t bcast_id = get_arg_val(6); + + #ifndef IN0_SHARDED + constexpr bool src0_is_dram = get_compile_time_arg_val(0) == 1; + #endif + + constexpr bool src1_is_dram = get_compile_time_arg_val(1) == 1; + + constexpr uint32_t cb_id_in0 = 0; + constexpr uint32_t cb_id_in1 = 1; + constexpr uint32_t onetile = 1; + + // single-tile ublocks + const uint32_t in0_tile_bytes = get_tile_size(cb_id_in0); + const DataFormat in0_data_format = get_dataformat(cb_id_in0); + const uint32_t in1_tile_bytes = get_tile_size(cb_id_in1); + const DataFormat in1_data_format = get_dataformat(cb_id_in1); + + uint32_t l1_write_addr_in0; + uint32_t l1_write_addr_in1; + + #ifndef IN0_SHARDED + const InterleavedAddrGenFast s0 = { + .bank_base_address = src0_addr, + .page_size = in0_tile_bytes, + .data_format = in0_data_format + }; + #else + cb_reserve_back(cb_id_in0, num_tiles); + cb_push_back(cb_id_in0, num_tiles); + #endif + + const InterleavedAddrGenFast s1 = { + .bank_base_address = src1_addr, + .page_size = in1_tile_bytes, + .data_format = in1_data_format + }; + + #ifdef BCAST_SCALAR + cb_reserve_back(cb_id_in1, onetile); + l1_write_addr_in1 = get_write_ptr(cb_id_in1); + noc_async_read_tile(bcast_id, s1, l1_write_addr_in1); + noc_async_read_barrier(); + cb_push_back(cb_id_in1, onetile); + #endif + + for (uint32_t i = 0; i < num_tiles; i++) { + uint32_t curr_id = base_start_id_HtWt + curr_id_from_base; + + #ifndef IN0_SHARDED + cb_reserve_back(cb_id_in0, onetile); + l1_write_addr_in0 = get_write_ptr(cb_id_in0); + noc_async_read_tile(curr_id, s0, l1_write_addr_in0); + noc_async_read_barrier(); + cb_push_back(cb_id_in0, onetile); + #endif + + curr_id_from_base++; + + #ifndef BCAST_SCALAR 
+ cb_reserve_back(cb_id_in1, onetile); + l1_write_addr_in1 = get_write_ptr(cb_id_in1); + noc_async_read_tile(bcast_id, s1, l1_write_addr_in1); + noc_async_read_barrier(); + cb_push_back(cb_id_in1, onetile); + + if (curr_id_from_base == HtWt) { + bcast_id++; + #else + if (curr_id_from_base == HtWt) { + #endif + base_start_id_HtWt += HtWt; + curr_id_from_base = 0; + } + } +} diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_bcast_scalar_interleaved_partitioned.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_bcast_scalar_interleaved_partitioned.cpp new file mode 100644 index 00000000000..b3f4f03852d --- /dev/null +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_bcast_scalar_interleaved_partitioned.cpp @@ -0,0 +1,71 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +// This code is temporarily copied from ttnn/cpp/ttnn/operations/datamovement/binary/device/ to demonstrate +// the new ability to keep the CircularBufferConfigs continuous during dispatching. See the use of CBIndex::c_2 below. +// When broadcating is properly supported we expect this code to be deleted or refactored substantially. 
+ +#include +#include "dataflow_api.h" +#include "ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/dataflow/generate_bcast_scalar.hpp" + + + +void kernel_main() { + auto src0_addr = get_arg_val(0); + auto packed_scalar = get_arg_val(1); + auto num_tiles = get_arg_val(2); + auto HtWt = get_arg_val(3); + auto base_start_id_HtWt = get_arg_val(4); + auto curr_id_from_base = get_arg_val(5); + auto bcast_id = get_arg_val(6); + + #ifndef IN0_SHARDED + constexpr bool src0_is_dram = get_compile_time_arg_val(0) == 1; + #endif + + constexpr uint32_t cb_id_in0 = 0; + constexpr uint32_t cb_id_in1 = 1; + constexpr uint32_t onetile = 1; + + // single-tile ublocks + const uint32_t in0_tile_bytes = get_tile_size(cb_id_in0); + const DataFormat in0_data_format = get_dataformat(cb_id_in0); + const DataFormat in1_data_format = DataFormat::Float16_b; + + uint32_t l1_write_addr_in0; + uint32_t l1_write_addr_in1; + + #ifndef IN0_SHARDED + const InterleavedAddrGenFast s0 = { + .bank_base_address = src0_addr, + .page_size = in0_tile_bytes, + .data_format = in0_data_format + }; + #else + cb_reserve_back(cb_id_in0, num_tiles); + cb_push_back(cb_id_in0, num_tiles); + #endif + + generate_bcast_unary_scalar(cb_id_in1, packed_scalar); + + for (uint32_t i = 0; i < num_tiles; i++) { + uint32_t curr_id = base_start_id_HtWt + curr_id_from_base; + + #ifndef IN0_SHARDED + cb_reserve_back(cb_id_in0, onetile); + l1_write_addr_in0 = get_write_ptr(cb_id_in0); + noc_async_read_tile(curr_id, s0, l1_write_addr_in0); + noc_async_read_barrier(); + cb_push_back(cb_id_in0, onetile); + #endif + + curr_id_from_base++; + + if (curr_id_from_base == HtWt) { + base_start_id_HtWt += HtWt; + curr_id_from_base = 0; + } + } +} diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_bcast_w_interleaved.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_bcast_w_interleaved.cpp new file mode 100644 index 00000000000..fe9e3afcda3 --- /dev/null +++ 
b/ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_bcast_w_interleaved.cpp @@ -0,0 +1,81 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +// This code is temporarily copied from ttnn/cpp/ttnn/operations/datamovement/binary/device/ to demonstrate +// the new ability to keep the CircularBufferConfigs continuous during dispatching. See the use of CBIndex::c_2 below. +// When broadcating is properly supported we expect this code to be deleted or refactored substantially. + +#include +#include "dataflow_api.h" + +void kernel_main() { + uint32_t src0_addr = get_arg_val(0); + uint32_t src0_num_tiles = get_arg_val(3); + uint32_t src1_addr = get_arg_val(4); + // skip args 1,2,5,6,7 for compat with single-bank readers and reader_diff_lengths + uint32_t NCHtWt = get_arg_val(8); + uint32_t NC = get_arg_val(9); + uint32_t Ht = get_arg_val(10); + uint32_t Wt = get_arg_val(11); + uint32_t nc1 = get_arg_val(12); // if 1 we expect the bcast tensor to have NC=1 + + constexpr bool src0_is_dram = get_compile_time_arg_val(0) == 1; + constexpr bool src1_is_dram = get_compile_time_arg_val(1) == 1; + + constexpr uint32_t cb_id_in0 = 0; + constexpr uint32_t cb_id_in1 = 1; + constexpr uint32_t onetile = 1; + + // single-tile ublocks + const uint32_t in0_tile_bytes = get_tile_size(cb_id_in0); + const DataFormat in0_data_format = get_dataformat(cb_id_in0); + const uint32_t in1_tile_bytes = get_tile_size(cb_id_in1); + const DataFormat in1_data_format = get_dataformat(cb_id_in1); + + uint32_t l1_write_addr_in0; + uint32_t l1_write_addr_in1; + + uint32_t num_tiles = src0_num_tiles; + uint32_t i = 0; + uint32_t i_bcast = 0; + + const InterleavedAddrGenFast s0 = { + .bank_base_address = src0_addr, + .page_size = in0_tile_bytes, + .data_format = in0_data_format + }; + + const InterleavedAddrGenFast s1 = { + .bank_base_address = src1_addr, + .page_size = in1_tile_bytes, + .data_format = in1_data_format + }; + + for (uint32_t 
nc = 0; nc < NC; nc ++ ) { + for (uint32_t ht = 0; ht < Ht; ht++ ) { + { + // only read one tile in H per W-line of tiles + // So we push a total of NC*H tiles from src1 + cb_reserve_back(cb_id_in1, onetile); + l1_write_addr_in1 = get_write_ptr(cb_id_in1); + noc_async_read_tile(i_bcast, s1, l1_write_addr_in1); + noc_async_read_barrier(); + cb_push_back(cb_id_in1, onetile); + i_bcast++; + } + + for (uint32_t wt = 0; wt < Wt; wt++) { + cb_reserve_back(cb_id_in0, onetile); + l1_write_addr_in0 = get_write_ptr(cb_id_in0); + noc_async_read_tile(i, s0, l1_write_addr_in0); + noc_async_read_barrier(); + cb_push_back(cb_id_in0, onetile); + i++; + } // Wt loop + } // Ht loop + + if (nc1) // if we also bcast from NC=1, go back Ht tiles on bcasted tensor + i_bcast -= Ht; + } // NC loop +} diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_bcast_w_interleaved_input_cols_partitioned.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_bcast_w_interleaved_input_cols_partitioned.cpp new file mode 100644 index 00000000000..a1320bea08b --- /dev/null +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_bcast_w_interleaved_input_cols_partitioned.cpp @@ -0,0 +1,87 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +// This code is temporarily copied from ttnn/cpp/ttnn/operations/datamovement/binary/device/ to demonstrate +// the new ability to keep the CircularBufferConfigs continuous during dispatching. See the use of CBIndex::c_2 below. +// When broadcating is properly supported we expect this code to be deleted or refactored substantially. 
+ +#include +#include "dataflow_api.h" + +void kernel_main() { + uint32_t src0_addr = get_arg_val(0); + uint32_t src0_num_tiles = get_arg_val(3); + uint32_t src1_addr = get_arg_val(4); + // skip args 1,2,5,6,7 for compat with single-bank readers and reader_diff_lengths + uint32_t NCHtWt = get_arg_val(8); + uint32_t NC = get_arg_val(9); + uint32_t Ht = get_arg_val(10); + uint32_t Wt = get_arg_val(11); + uint32_t nc1 = get_arg_val(12); // if 1 we expect the bcast tensor to have NC=1 + uint32_t start_id = get_arg_val(13); + uint32_t HtWt = get_arg_val(14); // HtWt of input tensor + uint32_t Wt_skip = get_arg_val(15); + + constexpr bool src0_is_dram = get_compile_time_arg_val(0) == 1; + constexpr bool src1_is_dram = get_compile_time_arg_val(1) == 1; + + constexpr uint32_t cb_id_in0 = 0; + constexpr uint32_t cb_id_in1 = 1; + constexpr uint32_t onetile = 1; + + // single-tile ublocks + const uint32_t in0_tile_bytes = get_tile_size(cb_id_in0); + const DataFormat in0_data_format = get_dataformat(cb_id_in0); + const uint32_t in1_tile_bytes = get_tile_size(cb_id_in1); + const DataFormat in1_data_format = get_dataformat(cb_id_in1); + + uint32_t l1_write_addr_in0; + uint32_t l1_write_addr_in1; + + uint32_t num_tiles = src0_num_tiles; + uint32_t i = 0; + uint32_t i_bcast = 0; + + const InterleavedAddrGenFast s0 = { + .bank_base_address = src0_addr, + .page_size = in0_tile_bytes, + .data_format = in0_data_format + }; + + const InterleavedAddrGenFast s1 = { + .bank_base_address = src1_addr, + .page_size = in1_tile_bytes, + .data_format = in1_data_format + }; + + uint32_t i_nc = 0; + for (uint32_t nc = 0; nc < NC; nc ++ ) { + i = i_nc + start_id; + for (uint32_t ht = 0; ht < Ht; ht++ ) { + { + // only read one tile in H per W-line of tiles + // So we push a total of NC*H tiles from src1 + cb_reserve_back(cb_id_in1, onetile); + l1_write_addr_in1 = get_write_ptr(cb_id_in1); + noc_async_read_tile(i_bcast, s1, l1_write_addr_in1); + noc_async_read_barrier(); + cb_push_back(cb_id_in1, 
onetile); + i_bcast++; + } + + for (uint32_t wt = 0; wt < Wt; wt++) { + cb_reserve_back(cb_id_in0, onetile); + l1_write_addr_in0 = get_write_ptr(cb_id_in0); + noc_async_read_tile(i, s0, l1_write_addr_in0); + noc_async_read_barrier(); + cb_push_back(cb_id_in0, onetile); + i++; + } // Wt loop + i += Wt_skip; + } // Ht loop + if (nc1) // if we also bcast from NC=1, go back Ht tiles on bcasted tensor + i_bcast -= Ht; + i_nc += HtWt; + } // NC loop +} diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_binary_interleaved_start_id.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_binary_interleaved_start_id.cpp index 54e534e271b..c2a47a8f3d5 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_binary_interleaved_start_id.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_binary_interleaved_start_id.cpp @@ -2,6 +2,10 @@ // // SPDX-License-Identifier: Apache-2.0 +// This code is temporarily copied from ttnn/cpp/ttnn/operations/datamovement/binary/device/ to demonstrate +// the new ability to keep the CircularBufferConfigs continuous during dispatching. See the use of CBIndex::c_2 below. +// When broadcating is properly supported we expect this code to be deleted or refactored substantially. 
+ #include #include "dataflow_api.h" @@ -15,8 +19,8 @@ void kernel_main() { uint32_t block_width = get_arg_val(5); uint32_t num_cores_y = get_arg_val(6); - constexpr uint32_t cb_id_in0 = 0; - constexpr uint32_t cb_id_in1 = 1; + constexpr uint32_t cb_id_in0 = tt::CBIndex::c_0; + constexpr uint32_t cb_id_in1 = tt::CBIndex::c_1; constexpr bool block_or_width_sharded = get_compile_time_arg_val(2) == 1; #ifdef IN0_SHARDED cb_reserve_back(cb_id_in0, num_tiles); diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/writer_unary_interleaved_input_cols_batched.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/writer_unary_interleaved_input_cols_batched.cpp new file mode 100644 index 00000000000..9db3df428c2 --- /dev/null +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/writer_unary_interleaved_input_cols_batched.cpp @@ -0,0 +1,57 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +// This code is temporarily copied from ttnn/cpp/ttnn/operations/datamovement/binary/device/ to demonstrate +// the new ability to keep the CircularBufferConfigs continuous during dispatching. See the use of CBIndex::c_2 below. +// When broadcating is properly supported we expect this code to be deleted or refactored substantially. 
+ +#include "dataflow_api.h" + +void kernel_main() { + + uint32_t dst_addr = get_arg_val(0); + uint32_t Ht = get_arg_val(3); // Index 3 to match with regular writer_unary + uint32_t Wt = get_arg_val(4); + uint32_t Wt_read = get_arg_val(5); + uint32_t Wt_skip = get_arg_val(6); + uint32_t NC = get_arg_val(7); + uint32_t HtWt = get_arg_val(8); // HtWt of input tensor + + constexpr bool dst_is_dram = get_compile_time_arg_val(0) == 1; + + constexpr uint32_t cb_id_out0 = tt::CBIndex::c_2; + + // single-tile ublocks + constexpr uint32_t onetile = 1; + const uint32_t tile_bytes = get_tile_size(cb_id_out0); + const DataFormat data_format = get_dataformat(cb_id_out0); + + const InterleavedAddrGenFast s = { + .bank_base_address = dst_addr, + .page_size = tile_bytes, + .data_format = data_format + }; + + uint32_t tile_id = 0; + uint32_t i_nc = 0; + for (uint32_t nc = 0; nc < NC; nc++) { + tile_id = i_nc + Wt_read; + for (uint32_t i = 0; i < Ht; i++) { + for (uint32_t j = 0; j < Wt; j++) { + cb_wait_front(cb_id_out0, onetile); + uint32_t l1_read_addr = get_read_ptr(cb_id_out0); + + noc_async_write_tile(tile_id, s, l1_read_addr); + + noc_async_write_barrier(); + + cb_pop_front(cb_id_out0, onetile); + + tile_id++; + } + tile_id += Wt_skip; + } + i_nc += HtWt; + } +} diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary/common/unary_op_utils.cpp b/ttnn/cpp/ttnn/operations/eltwise/unary/common/unary_op_utils.cpp index bed48d6a143..b1f77cb1b52 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/unary/common/unary_op_utils.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/unary/common/unary_op_utils.cpp @@ -7,6 +7,8 @@ #include "tt_metal/common/assert.hpp" #include "ttnn/cpp/ttnn/tensor/types.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::unary::utils { namespace { diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/compute/eltwise_sfpu.cpp b/ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/compute/eltwise_sfpu.cpp new file mode 100644 index 
00000000000..b1b4e8d299d --- /dev/null +++ b/ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/compute/eltwise_sfpu.cpp @@ -0,0 +1,45 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "compute_kernel_api/common.h" +#include "compute_kernel_api/tile_move_copy.h" +#include "compute_kernel_api/eltwise_unary/eltwise_unary.h" +#include "compute_kernel_api/eltwise_unary/sfpu_split_includes.h" + +namespace NAMESPACE { +void MAIN { + uint32_t per_core_block_cnt = get_compile_time_arg_val(0); + uint32_t per_core_block_dim = get_compile_time_arg_val(1); + + init_sfpu(tt::CBIndex::c_0, tt::CBIndex::c_2); + for (uint32_t block_index = 0; block_index < per_core_block_cnt; block_index++) { + cb_reserve_back(tt::CBIndex::c_2, per_core_block_dim); + for(uint32_t tile_index = 0; tile_index < per_core_block_dim; ++tile_index) { + tile_regs_acquire(); + + // Pop tile after tile, copy to DST and pack + cb_wait_front(tt::CBIndex::c_0, 1); + + copy_tile(tt::CBIndex::c_0, 0, 0); + + #ifdef SFPU_OP_CHAIN_0 + SFPU_OP_CHAIN_0 + #endif + + tile_regs_commit(); + + tile_regs_wait(); + + pack_tile(0, tt::CBIndex::c_2); + + cb_pop_front(tt::CBIndex::c_0, 1); + + tile_regs_release(); + } + cb_push_back(tt::CBIndex::c_2, per_core_block_dim); + } + +} +} diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_composite_op.cpp b/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_composite_op.cpp index 5395587302d..980a1f97bab 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_composite_op.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_composite_op.cpp @@ -543,11 +543,9 @@ Tensor _selu(const Tensor& x, const float scale, const float alpha, const std::o } // threshold(a,t,v) = (a <= t)*v + (a > t)*a -Tensor _threshold(const Tensor& input_tensor, float threshold, float value, const std::optional& output_mem_config) { - Tensor t0 = ttnn::subtract(input_tensor, threshold, std::nullopt, 
output_mem_config); - Tensor t1 = ttnn::multiply(ttnn::lez(t0), value, std::nullopt, output_mem_config); - Tensor t2 = ttnn::multiply(ttnn::gtz(t0, output_mem_config), input_tensor, std::nullopt, output_mem_config); - return ttnn::add(t1, t2, std::nullopt, output_mem_config); +Tensor ExecuteUnaryCompositeThreshold::invoke(const Tensor& input_tensor, float threshold, float value, const std::optional& output_mem_config) { + Tensor sub_result = ttnn::subtract(input_tensor, threshold, std::nullopt, output_mem_config); + return ttnn::where(ttnn::lez(sub_result), value, input_tensor, output_mem_config); } std::vector split_tensor_for_glu(const Tensor& input_a, int32_t dim, const std::optional& output_mem_config) { diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_composite_op.hpp b/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_composite_op.hpp index 8194a669e76..95d5eaa7614 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_composite_op.hpp +++ b/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_composite_op.hpp @@ -37,7 +37,6 @@ enum class UnaryCompositeOpType { HARDSIGMOID, HARDTANH, SELU, - THRESHOLD, GLU, REGLU, GEGLU, @@ -82,7 +81,6 @@ Tensor _hardswish(const Tensor&, float scale = 1.0f/6.0f, float shift = 0.5f, c Tensor _hardsigmoid(const Tensor&, float scale = 1.0f/6.0f, float shift = 0.5f, const std::optional& output_mem_config = std::nullopt); Tensor _hardtanh(const Tensor&, float min = -1, float max = 1, const std::optional& output_mem_config = std::nullopt); Tensor _selu(const Tensor&, float scale = 1.0507, float alpha = 1.67326, const std::optional& output_mem_config = std::nullopt); -Tensor _threshold(const Tensor&, float, float, const std::optional& ); Tensor _glu(const Tensor&, int32_t, const std::optional& ); Tensor _reglu(const Tensor&, int32_t, const std::optional& ); Tensor _geglu(const Tensor&, int32_t, const std::optional& ); @@ -267,13 +265,6 @@ struct OpHandler { } }; -template <> -struct OpHandler { - static 
Tensor handle(const Tensor& t1, float threshold, float value, const std::optional& mem_cfg ) { - return _threshold(t1, threshold, value, mem_cfg); - } -}; - //glu (geglu, reglu, swiglu, glu) varinats are supported only for last dimension. template <> struct OpHandler { diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_device_operation.cpp b/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_device_operation.cpp index 3f611e54bd2..79995d76d33 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_device_operation.cpp @@ -11,6 +11,8 @@ #include "ttnn/operations/eltwise/unary/common/unary_op_utils.hpp" #include "ttnn/tensor/tensor_utils.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::unary { namespace { diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_program_factory.cpp b/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_program_factory.cpp index ab8166c1f4c..bd9a7a976de 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_program_factory.cpp @@ -44,14 +44,14 @@ UnaryProgramFactory::cached_program_t UnaryProgramFactory::create( auto [num_cores, all_cores, core_group_1, core_group_2, num_tiles_per_core_group_1, num_tiles_per_core_group_2] = tt::tt_metal::split_work_to_cores(compute_with_storage_grid_size, num_tiles); - uint32_t src0_cb_index = 0; + uint32_t src0_cb_index = tt::CBIndex::c_0; uint32_t num_input_tiles = 2; tt::tt_metal::CircularBufferConfig cb_src0_config = tt::tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size, {{src0_cb_index, cb_data_format}}) .set_page_size(src0_cb_index, single_tile_size); auto cb_src0 = tt::tt_metal::CreateCircularBuffer(program, all_cores, cb_src0_config); - uint32_t output_cb_index = 16; // output operands start at index 16 + uint32_t output_cb_index = tt::CBIndex::c_2; uint32_t 
num_output_tiles = 2; tt::tt_metal::CircularBufferConfig cb_output_config = tt::tt_metal::CircularBufferConfig( @@ -95,7 +95,7 @@ UnaryProgramFactory::cached_program_t UnaryProgramFactory::create( std::map unary_defines = utils::get_block_defines(args.op_chain); auto eltwise_unary_kernel_group_1_id = tt::tt_metal::CreateKernel( program, - "tt_metal/kernels/compute/eltwise_sfpu.cpp", + "ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/compute/eltwise_sfpu.cpp", core_group_1, tt::tt_metal::ComputeConfig{ .math_fidelity = MathFidelity::HiFi4, @@ -114,7 +114,7 @@ UnaryProgramFactory::cached_program_t UnaryProgramFactory::create( auto eltwise_unary_kernel_group_2_id = tt::tt_metal::CreateKernel( program, - "tt_metal/kernels/compute/eltwise_sfpu.cpp", + "ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/compute/eltwise_sfpu.cpp", core_group_2, tt::tt_metal::ComputeConfig{ .math_fidelity = MathFidelity::HiFi4, diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_sharded_program_factory.cpp b/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_sharded_program_factory.cpp index b693504d98a..38fc3cdfbc8 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_sharded_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/unary/device/unary_sharded_program_factory.cpp @@ -65,7 +65,7 @@ UnaryShardedProgramFactory::cached_program_t UnaryShardedProgramFactory::create( num_tile_per_core = (shard_size_in_bytes + input_tile_size - 1) / input_tile_size; // ceil value } - uint32_t in_cb_id = tt::CB::c_in0; + uint32_t in_cb_id = tt::CBIndex::c_0; uint32_t buffering_factor = 1; // data is already fully buffered in the CBs since its sharded uint32_t aligned_input_tile_nbytes = round_up_to_mul32(input_tile_size); // will have issue if the page is not multiple of 32 @@ -79,7 +79,7 @@ UnaryShardedProgramFactory::cached_program_t UnaryShardedProgramFactory::create( auto cb_src0 = tt::tt_metal::CreateCircularBuffer(program, all_cores, cb_src0_config); // 
output sharded CB - uint32_t out_cb_id = tt::CB::c_out0; + uint32_t out_cb_id = tt::CBIndex::c_2; tt::tt_metal::CircularBufferConfig out_cb_config = tt::tt_metal::CircularBufferConfig( in_cb_pagesize * in_cb_npages, {{out_cb_id, out_df}}) @@ -124,7 +124,7 @@ UnaryShardedProgramFactory::cached_program_t UnaryShardedProgramFactory::create( std::map unary_defines = utils::get_block_defines(args.op_chain); auto eltwise_unary_kernel_group_1_id = tt::tt_metal::CreateKernel( program, - "tt_metal/kernels/compute/eltwise_sfpu.cpp", + "ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/compute/eltwise_sfpu.cpp", all_cores, tt::tt_metal::ComputeConfig{ .math_fidelity = MathFidelity::HiFi4, diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary/unary_composite.hpp b/ttnn/cpp/ttnn/operations/eltwise/unary/unary_composite.hpp index 2532bfcf36b..3c4cb7ba7ca 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/unary/unary_composite.hpp +++ b/ttnn/cpp/ttnn/operations/eltwise/unary/unary_composite.hpp @@ -129,6 +129,14 @@ struct ExecuteUnaryCompositeClamp { const std::optional &memory_config = std::nullopt); }; +struct ExecuteUnaryCompositeThreshold { + static Tensor invoke( + const Tensor &input_tensor, + float threshold, + float value, + const std::optional &memory_config = std::nullopt); +}; + struct ExecuteUnaryCompositeClip { static Tensor invoke( const Tensor &input_tensor, @@ -305,8 +313,7 @@ constexpr auto selu = ttnn::register_operation_with_auto_launch_op< operations::unary::ExecuteUnaryCompositeOpWithFloats>(); constexpr auto threshold = ttnn::register_operation_with_auto_launch_op< "ttnn::threshold", - operations::unary::ExecuteUnaryCompositeOpWithFloats>(); - + operations::unary::ExecuteUnaryCompositeThreshold>(); constexpr auto glu = ttnn::register_operation_with_auto_launch_op< "ttnn::glu", operations::unary::ExecuteUnaryCompositeOpWithDim>(); diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary/unary_pybind.hpp b/ttnn/cpp/ttnn/operations/eltwise/unary/unary_pybind.hpp index 
ece1f409400..f22fb9008f3 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/unary/unary_pybind.hpp +++ b/ttnn/cpp/ttnn/operations/eltwise/unary/unary_pybind.hpp @@ -44,8 +44,8 @@ void bind_unary_composite_optional_floats_with_default( input_tensor (ttnn.Tensor): the input tensor. Keyword args: - {2} (float or ttnn.Tensor): {3}. Defaults to `{4}`. - {5} (float or ttnn.Tensor): {6}. Defaults to `{7}`. + {2} (float or ttnn.Tensor): {3}. Defaults to `None`. + {5} (float or ttnn.Tensor): {6}. Defaults to `None`. memory_config (ttnn.MemoryConfig, optional): Memory configuration for the operation. Defaults to `None`. Returns: @@ -67,8 +67,13 @@ void bind_unary_composite_optional_floats_with_default( {10} Example: - >>> tensor = ttnn.from_torch(torch.tensor((1, 2), dtype=torch.bfloat16), device=device) - >>> output = {1}(tensor, 5.0, 7.0) + >>> input_tensor = ttnn.from_torch(torch.tensor([[1, 2], [3,4]], dtype=torch.bfloat16), dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device) + >>> min_tensor = ttnn.from_torch(torch.tensor([[0, 2], [0,4]], dtype=torch.bfloat16), dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device) + >>> max_tensor = ttnn.from_torch(torch.tensor([[1, 2], [3,4]], dtype=torch.bfloat16), dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device) + >>> output = {1}(input_tensor, min_tensor, max_tensor) + + >>> input_tensor = ttnn.from_torch(torch.tensor([[1, 2], [3,4]], dtype=torch.bfloat16), dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device) + >>> output = {1}(input_tensor, min = 2, max = 9) )doc", operation.base_name(), operation.python_fully_qualified_name(), @@ -351,7 +356,7 @@ void bind_unary_operation_with_fast_and_approximate_mode(py::module& module, con input_tensor (ttnn.Tensor): the input tensor. Keyword Args: - fast_and_approximate_mode (bool): Use the fast and approximate mode. + fast_and_approximate_mode (bool, optional): Use the fast and approximate mode. Defaults to `False`. 
memory_config (ttnn.MemoryConfig, optional): Memory configuration for the operation. Defaults to `None`. output_tensor (ttnn.Tensor, optional): preallocated output tensor. Defaults to `None`. queue_id (int, optional): command queue id. Defaults to `0`. @@ -375,8 +380,8 @@ void bind_unary_operation_with_fast_and_approximate_mode(py::module& module, con {3} Example: - >>> tensor = ttnn.from_torch(torch.tensor((1, 2), dtype=torch.bfloat16), device=device) - >>> output = {1}(tensor, fast_and_approximate_mode=true) + >>> tensor = ttnn.from_torch(torch.tensor([[1, 2], [3, 4]], dtype=torch.bfloat16), dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device) + >>> output = {1}(tensor, fast_and_approximate_mode=True) )doc", operation.base_name(), operation.python_fully_qualified_name(), @@ -1113,7 +1118,7 @@ void bind_unary_composite_floats_with_default( const std::string& parameter_name_b, const std::string& parameter_b_doc, float parameter_b_value, - const std::string& supported_dtype = "BFLOAT16", + const std::string& supported_dtype = "BFLOAT16, BFLOAT8_B", const std::string& info_doc = "") { auto doc = fmt::format( R"doc( @@ -1123,8 +1128,8 @@ void bind_unary_composite_floats_with_default( input_tensor (ttnn.Tensor): the input tensor. Keyword args: - {2} (float): {3}. Defaults to `{4}`. - {5} (float): {6}. Defaults to `{7}`. + {2} (float, optional): {3}. Defaults to `{4}`. + {5} (float, optional): {6}. Defaults to `{7}`. memory_config (ttnn.MemoryConfig, optional): Memory configuration for the operation. Defaults to `None`. 
Returns: @@ -1146,7 +1151,7 @@ void bind_unary_composite_floats_with_default( {9} Example: - >>> tensor = ttnn.from_torch(torch.tensor((1, 2), dtype=torch.bfloat16), device=device) + >>> tensor = ttnn.from_torch(torch.tensor([[1, 2], [3, 4]], dtype=torch.bfloat16), dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device) >>> output = {1}(tensor, {2} = {4}, {5} = {7}) )doc", operation.base_name(), @@ -1264,17 +1269,30 @@ void bind_unary_composite_int(py::module& module, const unary_operation_t& opera Args: input_tensor (ttnn.Tensor): the input tensor. + {2} (int): {3}. Keyword args: - {2} (int): {3}. memory_config (ttnn.MemoryConfig, optional): Memory configuration for the operation. Defaults to `None`. Returns: ttnn.Tensor: the output tensor. + Note: + Supported dtypes, layouts, and ranks: + + .. list-table:: + :header-rows: 1 + + * - Dtypes + - Layouts + - Ranks + * - BFLOAT16 + - TILE + - 2, 3, 4 + Example: - >>> tensor = ttnn.from_torch(torch.tensor((1, 2), dtype=torch.bfloat16), device=device) - >>> output = {1}(tensor, {2}) + >>> tensor = ttnn.from_torch(torch.tensor([[1, 2], [3, 4]], dtype=torch.bfloat16), dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device) + >>> output = {1}(tensor, 3) )doc", operation.base_name(), operation.python_fully_qualified_name(), @@ -1301,7 +1319,7 @@ void bind_unary_composite_int(py::module& module, const unary_operation_t& opera //OpHandler_two_float_with_default template -void bind_unary_composite_floats( +void bind_unary_composite_threshold( py::module& module, const unary_operation_t& operation, const std::string& parameter_name_a, @@ -1324,8 +1342,23 @@ void bind_unary_composite_floats( Returns: ttnn.Tensor: the output tensor. + Note: + Supported dtypes, layouts, and ranks: + + .. 
list-table:: + :header-rows: 1 + + * - Dtypes + - Layouts + - Ranks + * - BFLOAT16 + - TILE + - 2, 3, 4 + Example: - >>> tensor = ttnn.from_torch(torch.tensor((1, 2), dtype=torch.bfloat16), device=device) + >>> tensor = ttnn.from_torch(torch.tensor([[1, 2], [3, 4]], dtype=torch.bfloat16), dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device) + >>> {2} = 1.0 + >>> {4} = 10.0 >>> output = {1}(tensor, {2}, {4}) )doc", operation.base_name(), @@ -1957,11 +1990,11 @@ void py_module(py::module& module) { ttnn::selu, "scale", "Scale value", 1.0507, "alpha", "Alpha value", 1.67326); - detail::bind_unary_composite_floats( + detail::bind_unary_composite_threshold( module, ttnn::threshold, "threshold", "Threshold value", - "value", "Value value", + "value", "Replacing value", R"doc(Performs threshold function on :attr:`input_tensor`, :attr:`threshold`, :attr:`value`.)doc"); detail::bind_unary_composite_int_with_default( module, diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary_backward/unary_backward.cpp b/ttnn/cpp/ttnn/operations/eltwise/unary_backward/unary_backward.cpp index 8df7ba78e45..a4728c4fe01 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/unary_backward/unary_backward.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/unary_backward/unary_backward.cpp @@ -134,12 +134,12 @@ std::vector ExecuteUnaryBackwardSoftplus::invoke( } std::vector ExecuteUnaryBackwardRdiv::invoke( - const Tensor& grad, const Tensor& input, float scalar, string round_mode, const std::optional& output_mem_config) { + const Tensor& grad, const Tensor& input, float scalar, const std::optional round_mode, const std::optional& output_mem_config) { std::vector grad_tensor; - TT_FATAL((round_mode == "None" || round_mode == "trunc" || round_mode == "floor"), "Incorrect rounding mode (expected 'None', 'trunc', or 'floor')"); + TT_FATAL((round_mode == std::nullopt || round_mode == "trunc" || round_mode == "floor"), "Incorrect rounding mode (expected None, 'trunc', or 'floor')"); float t_nan = 
std::nanf(""); float t_inf = std::numeric_limits::infinity(); - if (round_mode == "None") { + if (round_mode == std::nullopt) { Tensor result = ttnn::where( ttnn::nez(input), ttnn::multiply(ttnn::neg(grad, output_mem_config), diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary_backward/unary_backward.hpp b/ttnn/cpp/ttnn/operations/eltwise/unary_backward/unary_backward.hpp index 229667b3ae0..6aae889cd31 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/unary_backward/unary_backward.hpp +++ b/ttnn/cpp/ttnn/operations/eltwise/unary_backward/unary_backward.hpp @@ -488,7 +488,7 @@ struct ExecuteUnaryBackwardRdiv { const Tensor &grad_tensor_arg, const Tensor &input_tensor_arg, float parameter_a, - string parameter_b, + const std::optional parameter_b = std::nullopt, const std::optional &memory_config = std::nullopt); }; diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary_backward/unary_backward_pybind.hpp b/ttnn/cpp/ttnn/operations/eltwise/unary_backward/unary_backward_pybind.hpp index cbd2d9e2f8c..6c5be4ed4cb 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/unary_backward/unary_backward_pybind.hpp +++ b/ttnn/cpp/ttnn/operations/eltwise/unary_backward/unary_backward_pybind.hpp @@ -723,7 +723,7 @@ void bind_unary_backward_optional_float_params_with_default( } template -void bind_unary_backward_float_string_default( +void bind_unary_backward_rdiv( py::module& module, const unary_backward_operation_t& operation, const std::string& parameter_name_a, @@ -744,7 +744,7 @@ void bind_unary_backward_float_string_default( {2} (float): {3}. Keyword args: - {4} (string, optional): {5}. Defaults to `{6}`. + {4} (string, optional): {5}. Defaults to None. memory_config (ttnn.MemoryConfig, optional): memory configuration for the operation. Defaults to `None`. 
Returns: @@ -770,7 +770,7 @@ void bind_unary_backward_float_string_default( >>> grad_tensor = ttnn.from_torch(torch.tensor([[1, 2], [3, 4]], dtype=torch.bfloat16), layout=ttnn.TILE_LAYOUT, device=device) >>> input = ttnn.from_torch(torch.tensor([[1, 2], [3, 4]], dtype=torch.bfloat16, requires_grad=True), layout=ttnn.TILE_LAYOUT, device=device) >>> {2} = 0.5 - >>> output = {1}(grad_tensor, input, {2}, {4} = {6}) + >>> output = {1}(grad_tensor, input, {2}, {4} = None) )doc", operation.base_name(), operation.python_fully_qualified_name(), @@ -792,7 +792,7 @@ void bind_unary_backward_float_string_default( const ttnn::Tensor& grad_tensor, const ttnn::Tensor& input_tensor, float parameter_a, - string parameter_b, + const std::optional parameter_b, const std::optional& memory_config) { return self(grad_tensor, input_tensor, parameter_a, parameter_b, memory_config); }, @@ -800,7 +800,7 @@ void bind_unary_backward_float_string_default( py::arg("input_tensor"), py::arg(parameter_name_a.c_str()), py::kw_only(), - py::arg(parameter_name_b.c_str()) = parameter_b_value, + py::arg(parameter_name_b.c_str()) = std::nullopt, py::arg("memory_config") = std::nullopt}); } @@ -1353,7 +1353,7 @@ void py_module(py::module& module) { 20.0, R"doc(Performs backward operations for softplus on :attr:`input_tensor`, :attr:`beta`, :attr:`threshold` with given :attr:`grad_tensor`.)doc"); - detail::bind_unary_backward_float_string_default( + detail::bind_unary_backward_rdiv( module, ttnn::rdiv_bw, "scalar", diff --git a/ttnn/cpp/ttnn/operations/embedding/device/embedding_program_factory.hpp b/ttnn/cpp/ttnn/operations/embedding/device/embedding_program_factory.hpp index b925b9bf8e5..c722efc5ef5 100644 --- a/ttnn/cpp/ttnn/operations/embedding/device/embedding_program_factory.hpp +++ b/ttnn/cpp/ttnn/operations/embedding/device/embedding_program_factory.hpp @@ -125,7 +125,7 @@ operation::ProgramWithCallbacks embeddings_tilized( auto cb_src2 = tt_metal::CreateCircularBuffer(program, all_cores, 
cb_src2_config); } - uint32_t output_cb_index = 16; // output operands start at index 16 + uint32_t output_cb_index = CBIndex::c_16; tt_metal::CircularBufferConfig cb_output_config = tt_metal::CircularBufferConfig( buffering * num_tiles_per_block * output_single_tile_size, {{output_cb_index, output_cb_data_format}}) diff --git a/ttnn/cpp/ttnn/operations/embedding_backward/device/embedding_backward_program_factory.cpp b/ttnn/cpp/ttnn/operations/embedding_backward/device/embedding_backward_program_factory.cpp index 7b2538c6ede..83842b065f9 100644 --- a/ttnn/cpp/ttnn/operations/embedding_backward/device/embedding_backward_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/embedding_backward/device/embedding_backward_program_factory.cpp @@ -12,6 +12,7 @@ using namespace tt; using namespace tt::constants; +using namespace tt::tt_metal; namespace ttnn::operations::embedding_backward::detail { @@ -80,23 +81,23 @@ operation::ProgramWithCallbacks embedding_backward_multi_core( //////////////////////////////////////////////////////////////////////////// // To read from grad tensor - create_cb(CB::c_in0, program, all_cores, grad_single_tile_size, max_tiles_per_core, grad_cb_data_format); + create_cb(CBIndex::c_0, program, all_cores, grad_single_tile_size, max_tiles_per_core, grad_cb_data_format); // To store index values for a single tile - create_cb(CB::c_in1, program, all_cores, index_single_page_size, 1, index_cb_data_format); + create_cb(CBIndex::c_1, program, all_cores, index_single_page_size, 1, index_cb_data_format); // To read from output tensor - create_cb(CB::c_in2, program, all_cores, output_single_tile_size, max_tiles_per_core, output_cb_data_format); + create_cb(CBIndex::c_2, program, all_cores, output_single_tile_size, max_tiles_per_core, output_cb_data_format); // To store mask values for a single tile - create_cb(CB::c_intermed0, program, all_cores, mask_single_page_size, 1, mask_cb_data_format); + create_cb(CBIndex::c_24, program, all_cores, 
mask_single_page_size, 1, mask_cb_data_format); // L1 scratch space to pass chunk_count from reader to UNPACK create_cb( - CB::c_intermed1, program, all_cores, 16, 1, grad_cb_data_format); // grad_cb_data_format doesn't matter here + CBIndex::c_25, program, all_cores, 16, 1, grad_cb_data_format); // grad_cb_data_format doesn't matter here // For tiles to be written to the output - create_cb(CB::c_out0, program, all_cores, output_single_tile_size, max_tiles_per_core, output_cb_data_format); + create_cb(CBIndex::c_16, program, all_cores, output_single_tile_size, max_tiles_per_core, output_cb_data_format); //////////////////////////////////////////////////////////////////////////// // Kernels diff --git a/ttnn/cpp/ttnn/operations/embedding_backward/device/kernels/compute/embedding_backward.cpp b/ttnn/cpp/ttnn/operations/embedding_backward/device/kernels/compute/embedding_backward.cpp index c5720ddde99..9bd20bb239a 100644 --- a/ttnn/cpp/ttnn/operations/embedding_backward/device/kernels/compute/embedding_backward.cpp +++ b/ttnn/cpp/ttnn/operations/embedding_backward/device/kernels/compute/embedding_backward.cpp @@ -13,12 +13,12 @@ void MAIN { constexpr uint32_t max_tiles_per_core = get_compile_time_arg_val(0); constexpr uint32_t input_height = get_compile_time_arg_val(1); - constexpr uint32_t cb_grad = tt::CB::c_in0; - constexpr uint32_t cb_index = tt::CB::c_in1; - constexpr uint32_t cb_out_intermed = tt::CB::c_in2; - constexpr uint32_t cb_mask = tt::CB::c_intermed0; - constexpr uint32_t cb_chunk_count_scratch = tt::CB::c_intermed1; - constexpr uint32_t cb_out = tt::CB::c_out0; + constexpr uint32_t cb_grad = tt::CBIndex::c_0; + constexpr uint32_t cb_index = tt::CBIndex::c_1; + constexpr uint32_t cb_out_intermed = tt::CBIndex::c_2; + constexpr uint32_t cb_mask = tt::CBIndex::c_24; + constexpr uint32_t cb_chunk_count_scratch = tt::CBIndex::c_25; + constexpr uint32_t cb_out = tt::CBIndex::c_16; unary_op_init_common(cb_grad); diff --git 
a/ttnn/cpp/ttnn/operations/embedding_backward/device/kernels/dataflow/reader_embedding_backward.cpp b/ttnn/cpp/ttnn/operations/embedding_backward/device/kernels/dataflow/reader_embedding_backward.cpp index 90cb4b6da09..433bced24c8 100644 --- a/ttnn/cpp/ttnn/operations/embedding_backward/device/kernels/dataflow/reader_embedding_backward.cpp +++ b/ttnn/cpp/ttnn/operations/embedding_backward/device/kernels/dataflow/reader_embedding_backward.cpp @@ -116,12 +116,12 @@ void kernel_main() { constexpr uint32_t seq_len_tiles = get_compile_time_arg_val(10); constexpr uint32_t num_embeddings = get_compile_time_arg_val(11); - constexpr uint32_t cb_grad = tt::CB::c_in0; - constexpr uint32_t cb_index = tt::CB::c_in1; - constexpr uint32_t cb_out_intermed = tt::CB::c_in2; - constexpr uint32_t cb_mask = tt::CB::c_intermed0; - constexpr uint32_t cb_chunk_count_scratch = tt::CB::c_intermed1; - constexpr uint32_t cb_id_out0 = tt::CB::c_out0; + constexpr uint32_t cb_grad = tt::CBIndex::c_0; + constexpr uint32_t cb_index = tt::CBIndex::c_1; + constexpr uint32_t cb_out_intermed = tt::CBIndex::c_2; + constexpr uint32_t cb_mask = tt::CBIndex::c_24; + constexpr uint32_t cb_chunk_count_scratch = tt::CBIndex::c_25; + constexpr uint32_t cb_id_out0 = tt::CBIndex::c_16; constexpr uint32_t grad_page_size = get_tile_size(cb_grad); constexpr uint32_t out_page_size = get_tile_size(cb_id_out0); diff --git a/ttnn/cpp/ttnn/operations/examples/example/device/kernels/compute/eltwise_sfpu.cpp b/ttnn/cpp/ttnn/operations/examples/example/device/kernels/compute/eltwise_sfpu.cpp new file mode 100644 index 00000000000..b1b4e8d299d --- /dev/null +++ b/ttnn/cpp/ttnn/operations/examples/example/device/kernels/compute/eltwise_sfpu.cpp @@ -0,0 +1,45 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "compute_kernel_api/common.h" +#include "compute_kernel_api/tile_move_copy.h" +#include "compute_kernel_api/eltwise_unary/eltwise_unary.h" +#include "compute_kernel_api/eltwise_unary/sfpu_split_includes.h" + +namespace NAMESPACE { +void MAIN { + uint32_t per_core_block_cnt = get_compile_time_arg_val(0); + uint32_t per_core_block_dim = get_compile_time_arg_val(1); + + init_sfpu(tt::CBIndex::c_0, tt::CBIndex::c_2); + for (uint32_t block_index = 0; block_index < per_core_block_cnt; block_index++) { + cb_reserve_back(tt::CBIndex::c_2, per_core_block_dim); + for(uint32_t tile_index = 0; tile_index < per_core_block_dim; ++tile_index) { + tile_regs_acquire(); + + // Pop tile after tile, copy to DST and pack + cb_wait_front(tt::CBIndex::c_0, 1); + + copy_tile(tt::CBIndex::c_0, 0, 0); + + #ifdef SFPU_OP_CHAIN_0 + SFPU_OP_CHAIN_0 + #endif + + tile_regs_commit(); + + tile_regs_wait(); + + pack_tile(0, tt::CBIndex::c_2); + + cb_pop_front(tt::CBIndex::c_0, 1); + + tile_regs_release(); + } + cb_push_back(tt::CBIndex::c_2, per_core_block_dim); + } + +} +} diff --git a/tests/tt_metal/tt_metal/unit_tests_frequent/tests_main.cpp b/ttnn/cpp/ttnn/operations/examples/example/device/kernels/dataflow/blank.cpp similarity index 79% rename from tests/tt_metal/tt_metal/unit_tests_frequent/tests_main.cpp rename to ttnn/cpp/ttnn/operations/examples/example/device/kernels/dataflow/blank.cpp index 1e42f41a46c..04ba7e3c561 100644 --- a/tests/tt_metal/tt_metal/unit_tests_frequent/tests_main.cpp +++ b/ttnn/cpp/ttnn/operations/examples/example/device/kernels/dataflow/blank.cpp @@ -2,4 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "gtest/gtest.h" + +void kernel_main() { + +} diff --git a/ttnn/cpp/ttnn/operations/examples/example/device/kernels/dataflow/reader_binary_diff_lengths.cpp b/ttnn/cpp/ttnn/operations/examples/example/device/kernels/dataflow/reader_binary_diff_lengths.cpp new file mode 100644 index 
00000000000..6462b737b3d --- /dev/null +++ b/ttnn/cpp/ttnn/operations/examples/example/device/kernels/dataflow/reader_binary_diff_lengths.cpp @@ -0,0 +1,63 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include "dataflow_api.h" + +void kernel_main() { + uint32_t src0_addr = get_arg_val(0); + uint32_t src0_noc_x = get_arg_val(1); + uint32_t src0_noc_y = get_arg_val(2); + uint32_t src0_num_tiles = get_arg_val(3); + uint32_t src1_addr = get_arg_val(4); + uint32_t src1_noc_x = get_arg_val(5); + uint32_t src1_noc_y = get_arg_val(6); + uint32_t src1_num_tiles = get_arg_val(7); + + constexpr uint32_t cb_id_in0 = 0; + constexpr uint32_t cb_id_in1 = 1; + + // single-tile ublocks + uint32_t ublock_size_bytes_0 = get_tile_size(cb_id_in0); + uint32_t ublock_size_bytes_1 = get_tile_size(cb_id_in1); + uint32_t ublock_size_tiles = 1; + + uint32_t l1_write_addr_in0; + uint32_t l1_write_addr_in1; + + uint32_t num_tiles = src0_num_tiles > src1_num_tiles ? 
src0_num_tiles : src1_num_tiles; + + // read ublocks from src0/src1 to CB0/CB1, then push ublocks to compute (unpacker) + for (uint32_t i=0; i + +#include "dataflow_api.h" + +void kernel_main() { + uint32_t src_addr = get_arg_val(0); + uint32_t src_noc_x = get_arg_val(1); + uint32_t src_noc_y = get_arg_val(2); + uint32_t num_tiles = get_arg_val(3); + + constexpr uint32_t cb_id_in0 = 0; + + // ublocks size defined in tiles + constexpr uint32_t ublock_size_tiles = 1; + uint32_t ublock_size_bytes = get_tile_size(cb_id_in0) * ublock_size_tiles; + + // read a ublock of tiles from src to CB, and then push the ublock to unpacker + for (uint32_t i = 0; i(0); + uint32_t dst_noc_x = get_arg_val(1); + uint32_t dst_noc_y = get_arg_val(2); + uint32_t num_tiles = get_arg_val(3); + + constexpr uint32_t cb_id_out0 = tt::CBIndex::c_2; + + // single-tile ublocks + uint32_t ublock_size_bytes = get_tile_size(cb_id_out0); + uint32_t ublock_size_tiles = 1; + + for (uint32_t i = 0; i < num_tiles; i += ublock_size_tiles) { + uint64_t dst_noc_addr = get_noc_addr(dst_noc_x, dst_noc_y, dst_addr); + + cb_wait_front(cb_id_out0, ublock_size_tiles); + uint32_t l1_read_addr = get_read_ptr(cb_id_out0); + noc_async_write(l1_read_addr, dst_noc_addr, ublock_size_bytes); + + noc_async_write_barrier(); + + cb_pop_front(cb_id_out0, ublock_size_tiles); + dst_addr += ublock_size_bytes; + } +} diff --git a/ttnn/cpp/ttnn/operations/examples/example/device/multi_core_program_factory.cpp b/ttnn/cpp/ttnn/operations/examples/example/device/multi_core_program_factory.cpp index 8aae580595d..e8ec9f59fba 100644 --- a/ttnn/cpp/ttnn/operations/examples/example/device/multi_core_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/examples/example/device/multi_core_program_factory.cpp @@ -36,14 +36,14 @@ ExampleDeviceOperation::MultiCore::cached_program_t ExampleDeviceOperation::Mult auto [num_cores, all_cores, core_group_1, core_group_2, num_tiles_per_core_group_1, num_tiles_per_core_group_2] = 
tt::tt_metal::split_work_to_cores(compute_with_storage_grid_size, num_tiles); - uint32_t src0_cb_index = 0; + uint32_t src0_cb_index = tt::CBIndex::c_0; uint32_t num_input_tiles = 2; tt::tt_metal::CircularBufferConfig cb_src0_config = tt::tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size, {{src0_cb_index, cb_data_format}}) .set_page_size(src0_cb_index, single_tile_size); auto cb_src0 = tt::tt_metal::CreateCircularBuffer(program, all_cores, cb_src0_config); - uint32_t output_cb_index = 16; // output_tensor operands start at index 16 + uint32_t output_cb_index = tt::CBIndex::c_2; uint32_t num_output_tiles = 2; tt::tt_metal::CircularBufferConfig cb_output_config = tt::tt_metal::CircularBufferConfig( @@ -76,7 +76,7 @@ ExampleDeviceOperation::MultiCore::cached_program_t ExampleDeviceOperation::Mult bool math_approx_mode = false; auto eltwise_unary_kernel_group_1_id = tt::tt_metal::CreateKernel( program, - "tt_metal/kernels/compute/eltwise_sfpu.cpp", + "ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/compute/eltwise_sfpu.cpp", core_group_1, tt::tt_metal::ComputeConfig{ .math_fidelity = MathFidelity::HiFi4, @@ -91,7 +91,7 @@ ExampleDeviceOperation::MultiCore::cached_program_t ExampleDeviceOperation::Mult auto eltwise_unary_kernel_group_2_id = tt::tt_metal::CreateKernel( program, - "tt_metal/kernels/compute/eltwise_sfpu.cpp", + "ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/compute/eltwise_sfpu.cpp", core_group_2, tt::tt_metal::ComputeConfig{ .math_fidelity = MathFidelity::HiFi4, diff --git a/ttnn/cpp/ttnn/operations/examples/example/device/single_core_program_factory.cpp b/ttnn/cpp/ttnn/operations/examples/example/device/single_core_program_factory.cpp index 6f7283be91c..1adc3c57f2c 100644 --- a/ttnn/cpp/ttnn/operations/examples/example/device/single_core_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/examples/example/device/single_core_program_factory.cpp @@ -36,14 +36,14 @@ ExampleDeviceOperation::SingleCore::cached_program_t 
ExampleDeviceOperation::Sin auto [num_cores, all_cores, core_group_1, core_group_2, num_tiles_per_core_group_1, num_tiles_per_core_group_2] = tt::tt_metal::split_work_to_cores(compute_with_storage_grid_size, num_tiles); - uint32_t src0_cb_index = 0; + uint32_t src0_cb_index = tt::CBIndex::c_0; uint32_t num_input_tiles = 2; tt::tt_metal::CircularBufferConfig cb_src0_config = tt::tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size, {{src0_cb_index, cb_data_format}}) .set_page_size(src0_cb_index, single_tile_size); auto cb_src0 = tt::tt_metal::CreateCircularBuffer(program, all_cores, cb_src0_config); - uint32_t output_cb_index = 16; // output_tensor operands start at index 16 + uint32_t output_cb_index = tt::CBIndex::c_2; uint32_t num_output_tiles = 2; tt::tt_metal::CircularBufferConfig cb_output_config = tt::tt_metal::CircularBufferConfig( @@ -76,7 +76,7 @@ ExampleDeviceOperation::SingleCore::cached_program_t ExampleDeviceOperation::Sin bool math_approx_mode = false; auto eltwise_unary_kernel_group_1_id = tt::tt_metal::CreateKernel( program, - "tt_metal/kernels/compute/eltwise_sfpu.cpp", + "ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/compute/eltwise_sfpu.cpp", core_group_1, tt::tt_metal::ComputeConfig{ .math_fidelity = MathFidelity::HiFi4, @@ -91,7 +91,7 @@ ExampleDeviceOperation::SingleCore::cached_program_t ExampleDeviceOperation::Sin auto eltwise_unary_kernel_group_2_id = tt::tt_metal::CreateKernel( program, - "tt_metal/kernels/compute/eltwise_sfpu.cpp", + "ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/compute/eltwise_sfpu.cpp", core_group_2, tt::tt_metal::ComputeConfig{ .math_fidelity = MathFidelity::HiFi4, diff --git a/ttnn/cpp/ttnn/operations/examples/example_multiple_return/device/single_core_program_factory.cpp b/ttnn/cpp/ttnn/operations/examples/example_multiple_return/device/single_core_program_factory.cpp index e4daa118cb0..f0ec696f69c 100644 --- 
a/ttnn/cpp/ttnn/operations/examples/example_multiple_return/device/single_core_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/examples/example_multiple_return/device/single_core_program_factory.cpp @@ -39,14 +39,14 @@ ExampleMultipleReturnDeviceOperation::SingleCore::cached_program_t ExampleMultip auto [num_cores, all_cores, core_group_1, core_group_2, num_tiles_per_core_group_1, num_tiles_per_core_group_2] = split_work_to_cores(compute_with_storage_grid_size, num_tiles); - uint32_t src0_cb_index = 0; + uint32_t src0_cb_index = tt::CBIndex::c_0; uint32_t num_input_tiles = 2; tt::tt_metal::CircularBufferConfig cb_src0_config = tt::tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size, {{src0_cb_index, cb_data_format}}) .set_page_size(src0_cb_index, single_tile_size); auto cb_src0 = tt::tt_metal::CreateCircularBuffer(program, all_cores, cb_src0_config); - uint32_t output_cb_index = 16; // output_tensor operands start at index 16 + uint32_t output_cb_index = tt::CBIndex::c_2; uint32_t num_output_tiles = 2; tt::tt_metal::CircularBufferConfig cb_output_config = tt::tt_metal::CircularBufferConfig( @@ -81,7 +81,7 @@ ExampleMultipleReturnDeviceOperation::SingleCore::cached_program_t ExampleMultip bool math_approx_mode = false; auto eltwise_unary_kernel_group_1_id = tt::tt_metal::CreateKernel( program, - "tt_metal/kernels/compute/eltwise_sfpu.cpp", + "ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/compute/eltwise_sfpu.cpp", core_group_1, tt::tt_metal::ComputeConfig{ .math_fidelity = MathFidelity::HiFi4, @@ -96,7 +96,7 @@ ExampleMultipleReturnDeviceOperation::SingleCore::cached_program_t ExampleMultip auto eltwise_unary_kernel_group_2_id = tt::tt_metal::CreateKernel( program, - "tt_metal/kernels/compute/eltwise_sfpu.cpp", + "ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/compute/eltwise_sfpu.cpp", core_group_2, tt::tt_metal::ComputeConfig{ .math_fidelity = MathFidelity::HiFi4, diff --git 
a/ttnn/cpp/ttnn/operations/examples/example_multiple_return/example_multiple_return.cpp b/ttnn/cpp/ttnn/operations/examples/example_multiple_return/example_multiple_return.cpp index 12b28e54c70..07ee156d2ad 100644 --- a/ttnn/cpp/ttnn/operations/examples/example_multiple_return/example_multiple_return.cpp +++ b/ttnn/cpp/ttnn/operations/examples/example_multiple_return/example_multiple_return.cpp @@ -5,6 +5,8 @@ #include "example_multiple_return.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::examples { std::vector> CompositeExampleMutipleReturnOperation::invoke(const Tensor& input_tensor, bool return_output1, bool return_output2) { diff --git a/ttnn/cpp/ttnn/operations/experimental/auto_format/auto_format.cpp b/ttnn/cpp/ttnn/operations/experimental/auto_format/auto_format.cpp index f884df1f18a..e83aa961d5a 100644 --- a/ttnn/cpp/ttnn/operations/experimental/auto_format/auto_format.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/auto_format/auto_format.cpp @@ -16,6 +16,8 @@ #include "ttnn/operations/data_movement/untilize_with_unpadding/untilize_with_unpadding.hpp" #include "ttnn/tensor/tensor.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::experimental::auto_format { Tensor AutoFormat::move_tensor_to_device(const Tensor& input, Device* device, const MemoryConfig& mem_config) { diff --git a/ttnn/cpp/ttnn/operations/experimental/auto_format/auto_format.hpp b/ttnn/cpp/ttnn/operations/experimental/auto_format/auto_format.hpp index 4eb274bb820..bf79c0069b8 100644 --- a/ttnn/cpp/ttnn/operations/experimental/auto_format/auto_format.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/auto_format/auto_format.hpp @@ -18,17 +18,17 @@ namespace ttnn::operations::experimental::auto_format{ struct FormatParams { tt::tt_metal::LegacyShape pad_shape; float pad_value; - Layout target_layout; + tt::tt_metal::Layout target_layout; }; class AutoFormat { private: - inline static Device* device = nullptr; + inline static tt::tt_metal::Device* device = 
nullptr; AutoFormat() {} public: - static void SetDefaultDevice(Device * dev) { device = dev; } - static Device * GetDefaultDevice() { return device; } + static void SetDefaultDevice(tt::tt_metal::Device * dev) { device = dev; } + static tt::tt_metal::Device * GetDefaultDevice() { return device; } static tt::tt_metal::LegacyShape pad_to_tile_shape(const tt::tt_metal::LegacyShape& unpadded_shape, @@ -68,7 +68,7 @@ class AutoFormat { return padded_shape; } - static tt::tt_metal::LegacyShape pad_to_legal_shape(const tt::tt_metal::LegacyShape& unpadded_shape, Layout layout) { + static tt::tt_metal::LegacyShape pad_to_legal_shape(const tt::tt_metal::LegacyShape& unpadded_shape, tt::tt_metal::Layout layout) { tt::tt_metal::LegacyShape padded_shape = unpadded_shape; switch (layout) { case Layout::ROW_MAJOR: padded_shape = pad_to_rm_shape(unpadded_shape); break; @@ -87,7 +87,7 @@ class AutoFormat { return (shape[3] % 2 == 0); } - static bool legal_device_shape(const tt::tt_metal::LegacyShape& shape, Layout layout) { + static bool legal_device_shape(const tt::tt_metal::LegacyShape& shape, tt::tt_metal::Layout layout) { switch (layout) { case Layout::ROW_MAJOR: return legal_rm_shape(shape); case Layout::TILE: return legal_tile_shape(shape); @@ -96,7 +96,7 @@ class AutoFormat { } - static bool check_input_tensor_format(const Tensor &a, const tt::tt_metal::LegacyShape& shape, Layout target_layout = Layout::TILE) { + static bool check_input_tensor_format(const Tensor &a, const tt::tt_metal::LegacyShape& shape, tt::tt_metal::Layout target_layout = Layout::TILE) { if (a.get_layout() == target_layout && a.get_legacy_shape() == shape && a.storage_type() == StorageType::DEVICE) { return true; } @@ -107,15 +107,15 @@ class AutoFormat { // are not quite ready. So here we basically just put the tensor back on device. 
// Used in backward_ops.cpp // See: Remove auto format within permute_op.cpp #9404 - static Tensor move_tensor_to_device_and_pad(const Tensor& input, Device *device, Layout target_layout, std::optional target_mem_config); + static Tensor move_tensor_to_device_and_pad(const Tensor& input, tt::tt_metal::Device *device, tt::tt_metal::Layout target_layout, std::optional target_mem_config); - static Tensor move_tensor_to_device(const Tensor &input, Device * device, const MemoryConfig& mem_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG); + static Tensor move_tensor_to_device(const Tensor &input, tt::tt_metal::Device * device, const tt::tt_metal::MemoryConfig& mem_config = tt::tt_metal::operation::DEFAULT_OUTPUT_MEMORY_CONFIG); - static Tensor move_tensor_to_mem_config(const Tensor &input, const MemoryConfig& mem_config); + static Tensor move_tensor_to_mem_config(const Tensor &input, const tt::tt_metal::MemoryConfig& mem_config); - static Tensor format_input_tensor(const Tensor &input, Device * device, const tt::tt_metal::LegacyShape& padded_shape, float pad_value, Layout target_layout, std::optional target_mem_config = std::nullopt); + static Tensor format_input_tensor(const Tensor &input, tt::tt_metal::Device * device, const tt::tt_metal::LegacyShape& padded_shape, float pad_value, tt::tt_metal::Layout target_layout, std::optional target_mem_config = std::nullopt); - static Tensor format_output_tensor(const Tensor &output, const tt::tt_metal::LegacyShape& shape, Device* device, Layout target_layout, std::optional target_mem_config = std::nullopt); + static Tensor format_output_tensor(const Tensor &output, const tt::tt_metal::LegacyShape& shape, tt::tt_metal::Device* device, Layout target_layout, std::optional target_mem_config = std::nullopt); }; diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_matmul/device/kernels/datacopy.cpp b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_matmul/device/kernels/datacopy.cpp index 5d5447c645a..5ebc7d803af 
100644 --- a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_matmul/device/kernels/datacopy.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_matmul/device/kernels/datacopy.cpp @@ -50,7 +50,7 @@ void kernel_main() { const uint32_t* matmul_cores_noc_coords = (uint32_t*)get_arg_addr(increment_arg_idx(rt_args_idx, 2 * num_matmul_cores_to_signal)); // Matmul core NOC coordinates [x1, y1, x2, y2...] // Setup buffers - constexpr uint32_t cb_id_in0 = tt::CB::c_in0; + constexpr uint32_t cb_id_in0 = tt::CBIndex::c_0; const DataFormat in0_df = get_dataformat(cb_id_in0); // DRAM reader in diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_matmul/device/multi_core/all_gather_matmul_op_multi_core.cpp b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_matmul/device/multi_core/all_gather_matmul_op_multi_core.cpp index 792d60a2d6e..cfd21401f32 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_matmul/device/multi_core/all_gather_matmul_op_multi_core.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_matmul/device/multi_core/all_gather_matmul_op_multi_core.cpp @@ -125,7 +125,7 @@ DatacopyParams setup_datacopy( static_cast(matmul_fused_op_signaler.num_fused_op_cores_to_signal) }; - uint32_t cb_id_in0 = tt::CB::c_in0; + uint32_t cb_id_in0 = tt::CBIndex::c_0; tt::tt_metal::CircularBufferConfig cb_in0_config = tt::tt_metal::CircularBufferConfig( page_size * datacopy_buffer_size, {{cb_id_in0, cb_data_format}}) diff --git a/ttnn/cpp/ttnn/operations/experimental/matmul/attn_matmul/device/attn_matmul_device_operation.cpp b/ttnn/cpp/ttnn/operations/experimental/matmul/attn_matmul/device/attn_matmul_device_operation.cpp index 522ae71e180..b706c36f342 100644 --- a/ttnn/cpp/ttnn/operations/experimental/matmul/attn_matmul/device/attn_matmul_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/matmul/attn_matmul/device/attn_matmul_device_operation.cpp @@ -5,6 +5,8 @@ #include "attn_matmul_device_operation.hpp" #include 
"tt_metal/common/work_split.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::experimental::matmul { diff --git a/ttnn/cpp/ttnn/operations/experimental/matmul/attn_matmul/device/attn_matmul_program_factory.cpp b/ttnn/cpp/ttnn/operations/experimental/matmul/attn_matmul/device/attn_matmul_program_factory.cpp index 2ce5db247e2..39ff13bcba3 100644 --- a/ttnn/cpp/ttnn/operations/experimental/matmul/attn_matmul/device/attn_matmul_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/matmul/attn_matmul/device/attn_matmul_program_factory.cpp @@ -11,8 +11,9 @@ namespace ttnn::operations::experimental::matmul { -using namespace tt::constants; using namespace tt; +using namespace tt::constants; +using namespace tt::tt_metal; operation::ProgramWithCallbacks multi_core_attn_matmul(const Tensor &a, const Tensor &b, Tensor& output, std::optional num_tokens, std::optional transpose_hw, CoreCoord compute_with_storage_grid_size, ttnn::DeviceComputeKernelConfig compute_kernel_config) { @@ -89,34 +90,34 @@ operation::ProgramWithCallbacks multi_core_attn_matmul(const Tensor &a, const Te uint32_t src1_addr = src1_buffer->address(); uint32_t dst_addr = dst_buffer->address(); - uint32_t src0_cb_index = tt::CB::c_in0; + uint32_t src0_cb_index = tt::CBIndex::c_0; uint32_t cb0_num_input_tiles = Kt * 2; tt::tt_metal::CircularBufferConfig src0_cb_config = tt::tt_metal::CircularBufferConfig(cb0_num_input_tiles * in0_single_tile_size, {{src0_cb_index, in0_data_format}}) .set_page_size(src0_cb_index, in0_single_tile_size); auto cb_src0 = tt::tt_metal::CreateCircularBuffer(program, all_device_cores, src0_cb_config); - uint32_t src1_cb_index = tt::CB::c_in1; + uint32_t src1_cb_index = tt::CBIndex::c_1; uint32_t cb1_num_input_tiles = 2; tt::tt_metal::CircularBufferConfig cb_src1_config = tt::tt_metal::CircularBufferConfig(cb1_num_input_tiles * in1_single_tile_size, {{src1_cb_index, in1_data_format}}) .set_page_size(src1_cb_index, in1_single_tile_size); auto cb_src1 = 
tt::tt_metal::CreateCircularBuffer(program, all_device_cores, cb_src1_config); - uint32_t cb_intermed0_index = tt::CB::c_intermed0; + uint32_t cb_intermed0_index = tt::CBIndex::c_24; tt::tt_metal::CircularBufferConfig cb_interm0_config = tt::tt_metal::CircularBufferConfig(1 * interm_single_tile_size, {{cb_intermed0_index, interm_data_format}}) .set_page_size(cb_intermed0_index, interm_single_tile_size); auto cb_interm0 = tt::tt_metal::CreateCircularBuffer(program, all_device_cores, cb_interm0_config); - uint32_t cb_intermed1_index = tt::CB::c_intermed1; + uint32_t cb_intermed1_index = tt::CBIndex::c_25; tt::tt_metal::CircularBufferConfig cb_interm1_config = tt::tt_metal::CircularBufferConfig(1 * interm_single_tile_size, {{cb_intermed1_index, interm_data_format}}) .set_page_size(cb_intermed1_index, interm_single_tile_size); auto cb_interm1 = tt::tt_metal::CreateCircularBuffer(program, all_device_cores, cb_interm1_config); - uint32_t cb_intermed2_index = tt::CB::c_intermed2; + uint32_t cb_intermed2_index = tt::CBIndex::c_26; tt::tt_metal::CircularBufferConfig cb_interm2_config = tt::tt_metal::CircularBufferConfig(1 * interm_single_tile_size, {{cb_intermed2_index, interm_data_format}}) .set_page_size(cb_intermed2_index, interm_single_tile_size); auto cb_interm2 = tt::tt_metal::CreateCircularBuffer(program, all_device_cores, cb_interm2_config); - uint32_t output_cb_index = tt::CB::c_out0; // output operands start at index 16 + uint32_t output_cb_index = tt::CBIndex::c_16; uint32_t num_output_tiles = 2; tt::tt_metal::CircularBufferConfig cb_output_config = tt::tt_metal::CircularBufferConfig(num_output_tiles * output_single_tile_size, {{output_cb_index, output_data_format}}) .set_page_size(output_cb_index, output_single_tile_size); diff --git a/ttnn/cpp/ttnn/operations/experimental/matmul/attn_matmul/device/kernels/compute/transformer_attn_matmul.cpp b/ttnn/cpp/ttnn/operations/experimental/matmul/attn_matmul/device/kernels/compute/transformer_attn_matmul.cpp index 
dd0bd4c0a9f..025dd76292a 100644 --- a/ttnn/cpp/ttnn/operations/experimental/matmul/attn_matmul/device/kernels/compute/transformer_attn_matmul.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/matmul/attn_matmul/device/kernels/compute/transformer_attn_matmul.cpp @@ -58,7 +58,7 @@ void MAIN { tile_regs_release(); cb_push_back(cb_intermed0, onetile); - // untilize tile and write to CB::c_intermed1 + // untilize tile and write to CBIndex::c_25 reconfig_data_format_srca(cb_in1, cb_intermed0); cb_wait_front(cb_intermed0, onetile); untilize_init_short(cb_intermed0); @@ -79,7 +79,7 @@ void MAIN { cb_wait_front(cb_intermed2, onetile); cb_reserve_back(out_cb_id, onetile); - // tilize CB::intermed2 and write to CB::c_out0 + // tilize CB::intermed2 and write to CBIndex::c_16 tilize_init_short_with_dt(cb_in1, cb_intermed2, onetile); tilize_block(cb_intermed2, onetile, out_cb_id); cb_push_back(out_cb_id, onetile); diff --git a/ttnn/cpp/ttnn/operations/experimental/matmul/group_attn_matmul/device/group_attn_matmul_device_operation.cpp b/ttnn/cpp/ttnn/operations/experimental/matmul/group_attn_matmul/device/group_attn_matmul_device_operation.cpp index e2cfe3a80b6..80ba3f82661 100644 --- a/ttnn/cpp/ttnn/operations/experimental/matmul/group_attn_matmul/device/group_attn_matmul_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/matmul/group_attn_matmul/device/group_attn_matmul_device_operation.cpp @@ -6,6 +6,8 @@ #include "tt_metal/common/work_split.hpp" #include "tt_metal/common/constants.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::experimental::matmul { void GroupAttnMatmulDeviceOperation::validate(const std::vector& input_tensors) const { diff --git a/ttnn/cpp/ttnn/operations/experimental/matmul/group_attn_matmul/device/group_attn_matmul_program_factory.cpp b/ttnn/cpp/ttnn/operations/experimental/matmul/group_attn_matmul/device/group_attn_matmul_program_factory.cpp index 306d4271d81..e358b849394 100644 --- 
a/ttnn/cpp/ttnn/operations/experimental/matmul/group_attn_matmul/device/group_attn_matmul_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/matmul/group_attn_matmul/device/group_attn_matmul_program_factory.cpp @@ -11,8 +11,9 @@ namespace ttnn::operations::experimental::matmul { -using namespace tt::constants; using namespace tt; +using namespace tt::constants; +using namespace tt::tt_metal; operation::ProgramWithCallbacks multi_core_group_attn_matmul(const Tensor &a, const Tensor &b, Tensor& output, std::optional num_tokens, std::optional transpose_hw, const uint32_t out_subblock_w, CoreCoord compute_with_storage_grid_size, const bool row_major, ttnn::DeviceComputeKernelConfig compute_kernel_config) { @@ -107,7 +108,7 @@ operation::ProgramWithCallbacks multi_core_group_attn_matmul(const Tensor &a, co const bool output_is_sharded = output.is_sharded(); // CB for in0 (ie. q_heads) - uint32_t src0_cb_index = tt::CB::c_in0; + uint32_t src0_cb_index = tt::CBIndex::c_0; CBHandle cb_src0; if (in0_is_sharded) { uint32_t cb0_num_input_tiles = a.shard_spec().value().numel() / TILE_HW; // Should be full MtKt and C should be 1 @@ -123,7 +124,7 @@ operation::ProgramWithCallbacks multi_core_group_attn_matmul(const Tensor &a, co // CB for interleaved/sharded KV heads for mcasting; mcasts to same CB // Then, push all KV_HEADS to compute and compute chooses which head to use for matmul - uint32_t src1_cb_index = tt::CB::c_in1; + uint32_t src1_cb_index = tt::CBIndex::c_1; uint32_t cb1_num_input_tiles = 2 * in1_block_num_tiles; tt::tt_metal::CircularBufferConfig cb_src1_config = tt::tt_metal::CircularBufferConfig(cb1_num_input_tiles * in1_single_tile_size, {{src1_cb_index, in1_data_format}}) .set_page_size(src1_cb_index, in1_single_tile_size); @@ -132,7 +133,7 @@ operation::ProgramWithCallbacks multi_core_group_attn_matmul(const Tensor &a, co // CB for sharded KV heads CBHandle cb_src2 = 0; // unused if KV heads is interleaved if (in1_is_sharded) { - uint32_t 
src2_cb_index = tt::CB::c_in2; + uint32_t src2_cb_index = tt::CBIndex::c_2; uint32_t cb2_num_input_tiles = b.shard_spec().value().numel() / TILE_HW; // Should be full CKtNt and batch must be 32 tt::tt_metal::CircularBufferConfig cb_src2_config = tt::tt_metal::CircularBufferConfig(cb2_num_input_tiles * in1_single_tile_size, {{src2_cb_index, in1_data_format}}) .set_page_size(src2_cb_index, in1_single_tile_size).set_globally_allocated_address(*src1_buffer); @@ -141,18 +142,18 @@ operation::ProgramWithCallbacks multi_core_group_attn_matmul(const Tensor &a, co // Intermediate CBs for handling untilizing, copying rows, and tilizing to output CB uint32_t interm_cb_num_tiles = 2 * intermediate_num_tiles; // TODO: Generalize; double buffering should help when we are not reader bound - uint32_t cb_intermed0_index = tt::CB::c_intermed0; + uint32_t cb_intermed0_index = tt::CBIndex::c_24; tt::tt_metal::CircularBufferConfig cb_interm0_config = tt::tt_metal::CircularBufferConfig(interm_cb_num_tiles * interm_single_tile_size, {{cb_intermed0_index, interm_data_format}}) .set_page_size(cb_intermed0_index, interm_single_tile_size); auto cb_interm0 = tt::tt_metal::CreateCircularBuffer(program, all_device_cores, cb_interm0_config); - uint32_t cb_intermed1_index = tt::CB::c_intermed1; + uint32_t cb_intermed1_index = tt::CBIndex::c_25; tt::tt_metal::CircularBufferConfig cb_interm1_config = tt::tt_metal::CircularBufferConfig(MtNt * interm_single_tile_size, {{cb_intermed1_index, interm_data_format}}) .set_page_size(cb_intermed1_index, interm_single_tile_size); auto cb_interm1 = tt::tt_metal::CreateCircularBuffer(program, all_device_cores, cb_interm1_config); // CB for output (if sharded, full num tiles per core) - uint32_t output_cb_index = tt::CB::c_out0; // output operands start at index 16 + uint32_t output_cb_index = tt::CBIndex::c_16; CBHandle cb_output; if (output_is_sharded) { uint32_t num_output_tiles = output.shard_spec().value().numel() / TILE_HW; // Should be full MtNt and C 
should be 1 diff --git a/ttnn/cpp/ttnn/operations/experimental/matmul/group_attn_matmul/device/kernels/compute/transformer_group_attn_matmul.cpp b/ttnn/cpp/ttnn/operations/experimental/matmul/group_attn_matmul/device/kernels/compute/transformer_group_attn_matmul.cpp index 2e56726ad82..7fb014c9f08 100644 --- a/ttnn/cpp/ttnn/operations/experimental/matmul/group_attn_matmul/device/kernels/compute/transformer_group_attn_matmul.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/matmul/group_attn_matmul/device/kernels/compute/transformer_group_attn_matmul.cpp @@ -158,7 +158,7 @@ void MAIN { cb_reserve_back(out_cb_id, out_num_tiles); - // tilize CB::intermed1 and write to CB::c_out0 + // tilize CB::intermed1 and write to CBIndex::c_16 tilize_init_short_with_dt(cb_in1, cb_intermed1, out_num_tiles); tilize_block(cb_intermed1, out_num_tiles, out_cb_id); cb_push_back(out_cb_id, out_num_tiles); diff --git a/ttnn/cpp/ttnn/operations/experimental/matmul/group_attn_matmul/group_attn_matmul_pybind.cpp b/ttnn/cpp/ttnn/operations/experimental/matmul/group_attn_matmul/group_attn_matmul_pybind.cpp index 0b96b2ac597..e9dc2067b63 100644 --- a/ttnn/cpp/ttnn/operations/experimental/matmul/group_attn_matmul/group_attn_matmul_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/matmul/group_attn_matmul/group_attn_matmul_pybind.cpp @@ -7,6 +7,7 @@ #include "ttnn/operations/experimental/matmul/group_attn_matmul/group_attn_matmul_pybind.hpp" #include "ttnn/operations/experimental/matmul/group_attn_matmul/group_attn_matmul.hpp" +using namespace tt::tt_metal; namespace ttnn::operations::experimental::matmul::detail { diff --git a/ttnn/cpp/ttnn/operations/experimental/paged_cache/device/paged_cache_operation.cpp b/ttnn/cpp/ttnn/operations/experimental/paged_cache/device/paged_cache_operation.cpp index 435ffb56c14..0d9c3527b4b 100644 --- a/ttnn/cpp/ttnn/operations/experimental/paged_cache/device/paged_cache_operation.cpp +++ 
b/ttnn/cpp/ttnn/operations/experimental/paged_cache/device/paged_cache_operation.cpp @@ -8,6 +8,8 @@ #include "paged_fused_update_cache_program_factory.hpp" #include "paged_fill_cache_program_factory.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::experimental::paged_cache { diff --git a/ttnn/cpp/ttnn/operations/experimental/paged_cache/device/paged_fill_cache_program_factory.cpp b/ttnn/cpp/ttnn/operations/experimental/paged_cache/device/paged_fill_cache_program_factory.cpp index bf65a8965b3..2527e3d08be 100644 --- a/ttnn/cpp/ttnn/operations/experimental/paged_cache/device/paged_fill_cache_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/paged_cache/device/paged_fill_cache_program_factory.cpp @@ -10,6 +10,8 @@ #include "tt_metal/common/work_split.hpp" #include "ttnn/operations/experimental/paged_cache/device/paged_fill_cache_program_factory.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::experimental::paged_cache::detail { using namespace tt::constants; @@ -63,8 +65,8 @@ operation::ProgramWithCallbacks paged_fill_cache_multi_core(const Tensor& cache_ std::tie(num_cores, all_cores, core_group_1, core_group_2, num_blocks_per_core_group_1, num_blocks_per_core_group_2) = tt::tt_metal::split_work_to_cores(compute_with_storage_grid_size, num_blocks_of_work, row_major); uint32_t num_input_tiles = Wt * 2; // double buffered - tt::CB src0_cb_index = tt::CB::c_in0; - tt::CB page_table_cb_index = tt::CB::c_in1; + tt::CBIndex src0_cb_index = tt::CBIndex::c_0; + tt::CBIndex page_table_cb_index = tt::CBIndex::c_1; create_cb(src0_cb_index, program, all_cores, single_tile_size, num_input_tiles, cb_data_format); create_cb(page_table_cb_index, program, all_cores, page_table_stick_size_B, 1, page_table_data_format); diff --git a/ttnn/cpp/ttnn/operations/experimental/paged_cache/device/paged_fused_update_cache_program_factory.cpp b/ttnn/cpp/ttnn/operations/experimental/paged_cache/device/paged_fused_update_cache_program_factory.cpp 
index f083b77e991..518e2cc599e 100644 --- a/ttnn/cpp/ttnn/operations/experimental/paged_cache/device/paged_fused_update_cache_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/paged_cache/device/paged_fused_update_cache_program_factory.cpp @@ -10,6 +10,8 @@ #include "tt_metal/common/work_split.hpp" #include "ttnn/operations/experimental/paged_cache/device/paged_fused_update_cache_program_factory.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::experimental::paged_cache::detail { using namespace tt::constants; @@ -234,15 +236,15 @@ operation::ProgramWithCallbacks paged_fused_update_cache_multi_core(const Tensor uint32_t num_interm_tiles = 2 * Wt; // double buffered uint32_t num_output_tiles = B * Wt; - const tt::CB cache_cb_index = CB::c_in0; - const tt::CB src1_cb_index = CB::c_in1; - const tt::CB src2_cb_index = CB::c_in2; - const tt::CB cb_index_id = CB::c_in3; - const tt::CB cb_pagetable_id = CB::c_in4; - const tt::CB intermed0_cb_index = CB::c_intermed0; - const tt::CB intermed1_cb_index = CB::c_intermed1; - const tt::CB intermed2_cb_index = CB::c_intermed2; - const tt::CB output_cb_index = CB::c_out0; + const tt::CBIndex cache_cb_index = CBIndex::c_0; + const tt::CBIndex src1_cb_index = CBIndex::c_1; + const tt::CBIndex src2_cb_index = CBIndex::c_2; + const tt::CBIndex cb_index_id = CBIndex::c_3; + const tt::CBIndex cb_pagetable_id = CBIndex::c_4; + const tt::CBIndex intermed0_cb_index = CBIndex::c_24; + const tt::CBIndex intermed1_cb_index = CBIndex::c_25; + const tt::CBIndex intermed2_cb_index = CBIndex::c_26; + const tt::CBIndex output_cb_index = CBIndex::c_16; create_cb(cache_cb_index, program, all_cores, cache_single_tile_size, num_cache_tiles, cache_cb_data_format); auto [_1, cb_src1] = create_cb(src1_cb_index, program, input1_cores, input_single_tile_size, num_input_tiles, input_cb_data_format, in1_buffer_address); diff --git a/ttnn/cpp/ttnn/operations/experimental/paged_cache/device/paged_update_cache_program_factory.cpp 
b/ttnn/cpp/ttnn/operations/experimental/paged_cache/device/paged_update_cache_program_factory.cpp index 178634f494a..9f735d283a1 100644 --- a/ttnn/cpp/ttnn/operations/experimental/paged_cache/device/paged_update_cache_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/paged_cache/device/paged_update_cache_program_factory.cpp @@ -10,6 +10,8 @@ #include "tt_metal/common/work_split.hpp" #include "ttnn/operations/experimental/paged_cache/device/paged_update_cache_program_factory.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::experimental::paged_cache::detail { using namespace tt::constants; @@ -113,14 +115,14 @@ operation::ProgramWithCallbacks paged_update_cache_multi_core(const Tensor& cach uint32_t num_interm_tiles = 2 * Wt; // double buffered uint32_t num_output_tiles = B * Wt; - const tt::CB src0_cb_index = CB::c_in0; - const tt::CB src1_cb_index = CB::c_in1; - const tt::CB cb_index_id = CB::c_in2; - const tt::CB cb_pagetable_id = CB::c_in3; - const tt::CB intermed0_cb_index = CB::c_intermed0; - const tt::CB intermed1_cb_index = CB::c_intermed1; - const tt::CB intermed2_cb_index = CB::c_intermed2; - const tt::CB output_cb_index = CB::c_out0; + const tt::CBIndex src0_cb_index = CBIndex::c_0; + const tt::CBIndex src1_cb_index = CBIndex::c_1; + const tt::CBIndex cb_index_id = CBIndex::c_2; + const tt::CBIndex cb_pagetable_id = CBIndex::c_3; + const tt::CBIndex intermed0_cb_index = CBIndex::c_24; + const tt::CBIndex intermed1_cb_index = CBIndex::c_25; + const tt::CBIndex intermed2_cb_index = CBIndex::c_26; + const tt::CBIndex output_cb_index = CBIndex::c_16; create_cb(src0_cb_index, program, all_cores, cache_single_tile_size, num_cache_tiles, cache_cb_data_format); auto [_, cb_src1] = create_cb(src1_cb_index, program, all_cores, input_single_tile_size, num_input_tiles, input_cb_data_format, in1_buffer_address); diff --git a/ttnn/cpp/ttnn/operations/experimental/plusone/device/plusone_op.cpp 
b/ttnn/cpp/ttnn/operations/experimental/plusone/device/plusone_op.cpp index f8232371f10..0f3339cae0a 100644 --- a/ttnn/cpp/ttnn/operations/experimental/plusone/device/plusone_op.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/plusone/device/plusone_op.cpp @@ -5,6 +5,8 @@ #include "plusone_op.hpp" #include "plusone_program_factory.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::experimental { void PlusOne::validate_with_output_tensors( diff --git a/ttnn/cpp/ttnn/operations/experimental/plusone/device/plusone_program_factory.cpp b/ttnn/cpp/ttnn/operations/experimental/plusone/device/plusone_program_factory.cpp index 79c24ea431d..ace4b90a4bc 100644 --- a/ttnn/cpp/ttnn/operations/experimental/plusone/device/plusone_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/plusone/device/plusone_program_factory.cpp @@ -13,6 +13,7 @@ namespace ttnn::operations::experimental::detail { using namespace tt::constants; +using namespace tt::tt_metal; operation::ProgramWithCallbacks plusone_single_core( const Tensor &input) { @@ -33,7 +34,7 @@ operation::ProgramWithCallbacks plusone_single_core( const auto &input_shape = input.get_legacy_shape(); const uint32_t W = input_shape[0]; - uint32_t src0_cb_index = tt::CB::c_in0; + uint32_t src0_cb_index = tt::CBIndex::c_0; uint32_t num_input_units = W; uint32_t aligned_input_unit_size = round_up_to_mul32(num_input_units * input_unit_size); tt::tt_metal::CircularBufferConfig cb_src0_config = diff --git a/ttnn/cpp/ttnn/operations/experimental/reduction/fast_reduce_nc/device/fast_reduce_nc_program_factory.cpp b/ttnn/cpp/ttnn/operations/experimental/reduction/fast_reduce_nc/device/fast_reduce_nc_program_factory.cpp index 73dec69b234..a8e6004be53 100644 --- a/ttnn/cpp/ttnn/operations/experimental/reduction/fast_reduce_nc/device/fast_reduce_nc_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/reduction/fast_reduce_nc/device/fast_reduce_nc_program_factory.cpp @@ -93,23 +93,23 @@ 
operation::ProgramWithCallbacks reduce_nc_factory(const ttnn::Tensor &input, con // CircularBuffer Setup //////////////////////////////////////////////////////////////////////////// tt_metal::CircularBufferConfig cb_scr0_config = - tt_metal::CircularBufferConfig(in0_t*single_tile_size, {{CB::c_in0, cb_data_format}}) - .set_page_size(CB::c_in0, single_tile_size); + tt_metal::CircularBufferConfig(in0_t*single_tile_size, {{CBIndex::c_0, cb_data_format}}) + .set_page_size(CBIndex::c_0, single_tile_size); auto cb_scr0 = tt_metal::CreateCircularBuffer(program, all_cores, cb_scr0_config); tt_metal::CircularBufferConfig cb_scr1_config = - tt_metal::CircularBufferConfig(in1_t*cb_1_tile_size, {{CB::c_in1, cb_1_data_format}}) - .set_page_size(CB::c_in1, cb_1_tile_size); + tt_metal::CircularBufferConfig(in1_t*cb_1_tile_size, {{CBIndex::c_1, cb_1_data_format}}) + .set_page_size(CBIndex::c_1, cb_1_tile_size); auto cb_scr1 = tt_metal::CreateCircularBuffer(program, all_cores, cb_scr1_config); tt_metal::CircularBufferConfig cb_intermed0_config = - tt_metal::CircularBufferConfig(intermed0_t*intermed_cb_single_tile_size, {{CB::c_intermed0, intermed_cb_data_format}}) - .set_page_size(CB::c_intermed0, intermed_cb_single_tile_size); + tt_metal::CircularBufferConfig(intermed0_t*intermed_cb_single_tile_size, {{CBIndex::c_24, intermed_cb_data_format}}) + .set_page_size(CBIndex::c_24, intermed_cb_single_tile_size); auto cb_intermed0 = tt_metal::CreateCircularBuffer(program, all_cores, cb_intermed0_config); tt_metal::CircularBufferConfig cb_output_config = - tt_metal::CircularBufferConfig(out0_t*single_tile_size, {{CB::c_out0, cb_data_format}}) - .set_page_size(CB::c_out0, single_tile_size); + tt_metal::CircularBufferConfig(out0_t*single_tile_size, {{CBIndex::c_16, cb_data_format}}) + .set_page_size(CBIndex::c_16, single_tile_size); auto cb_output = tt_metal::CreateCircularBuffer(program, all_cores, cb_output_config); 
//////////////////////////////////////////////////////////////////////////// diff --git a/ttnn/cpp/ttnn/operations/experimental/reduction/fast_reduce_nc/device/kernels/reduce_nc.cpp b/ttnn/cpp/ttnn/operations/experimental/reduction/fast_reduce_nc/device/kernels/reduce_nc.cpp index 7c59ebfd264..8d53309c39b 100644 --- a/ttnn/cpp/ttnn/operations/experimental/reduction/fast_reduce_nc/device/kernels/reduce_nc.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/reduction/fast_reduce_nc/device/kernels/reduce_nc.cpp @@ -12,9 +12,9 @@ void MAIN { constexpr uint32_t num_input_tiles = get_compile_time_arg_val(1); constexpr uint32_t input_granularity = get_compile_time_arg_val(2); - constexpr auto cb_in0 = tt::CB::c_in0; - constexpr auto cb_in1 = tt::CB::c_in1; - constexpr auto cb_out0 = tt::CB::c_out0; + constexpr auto cb_in0 = tt::CBIndex::c_0; + constexpr auto cb_in1 = tt::CBIndex::c_1; + constexpr auto cb_out0 = tt::CBIndex::c_16; constexpr uint32_t onetile = 1; constexpr uint32_t dst0 = 0; constexpr uint32_t dst1 = 1; diff --git a/ttnn/cpp/ttnn/operations/experimental/ssm/hc_sum_reduce/device/hc_sum_reduce_op.cpp b/ttnn/cpp/ttnn/operations/experimental/ssm/hc_sum_reduce/device/hc_sum_reduce_op.cpp index 34a9811bcc7..e7468ce8114 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ssm/hc_sum_reduce/device/hc_sum_reduce_op.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/ssm/hc_sum_reduce/device/hc_sum_reduce_op.cpp @@ -6,6 +6,8 @@ #include "hc_sum_reduce_program_factory.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::experimental::ssm { void HCSumReduce::validate(const std::vector& input_tensors) const { diff --git a/ttnn/cpp/ttnn/operations/experimental/ssm/hc_sum_reduce/device/hc_sum_reduce_program_factory.cpp b/ttnn/cpp/ttnn/operations/experimental/ssm/hc_sum_reduce/device/hc_sum_reduce_program_factory.cpp index bd942daf22b..4351c9a91e2 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ssm/hc_sum_reduce/device/hc_sum_reduce_program_factory.cpp +++ 
b/ttnn/cpp/ttnn/operations/experimental/ssm/hc_sum_reduce/device/hc_sum_reduce_program_factory.cpp @@ -10,6 +10,7 @@ namespace ttnn::operations::experimental::ssm::detail { using namespace tt::constants; +using namespace tt::tt_metal; operation::ProgramWithCallbacks multi_core_ssm_1d_sum_reduce( const Tensor& a, Tensor& output, MathFidelity math_fidelity, CoreCoord compute_with_storage_grid_size) { @@ -56,30 +57,30 @@ operation::ProgramWithCallbacks multi_core_ssm_1d_sum_reduce( const uint32_t cb_size = 2; // Reader writes input tiles to this - const uint32_t input_cb_id = tt::CB::c_in0; + const uint32_t input_cb_id = tt::CBIndex::c_0; const auto input_cb = create_circular_buffer(input_cb_id, cb_size, input_tile_size, input_format); // Reader writes scaling tile to this CB. We need it because the reduce LLK requires a scaling factor tile. - const uint32_t scalar_cb_id = tt::CB::c_in2; + const uint32_t scalar_cb_id = tt::CBIndex::c_2; const auto scalar_cb = create_circular_buffer(scalar_cb_id, cb_size, intermediary_tile_size, intermediary_format); // Compute writes transposed tile (loopback) - const uint32_t intermed_cb_id0 = tt::CB::c_intermed0; + const uint32_t intermed_cb_id0 = tt::CBIndex::c_24; const auto intermed_cb0 = create_circular_buffer(intermed_cb_id0, cb_size, intermediary_tile_size, intermediary_format); // Compute writes reduced tile for writer - const uint32_t intermed_cb_id1 = tt::CB::c_intermed1; + const uint32_t intermed_cb_id1 = tt::CBIndex::c_25; const auto intermed_cb1 = create_circular_buffer(intermed_cb_id1, cb_size, intermediary_tile_size, intermediary_format); // Writer concats and writes back to compute - const uint32_t intermed_cb_id2 = tt::CB::c_intermed2; + const uint32_t intermed_cb_id2 = tt::CBIndex::c_26; const auto intermed_cb2 = create_circular_buffer(intermed_cb_id2, cb_size, intermediary_tile_size, intermediary_format); // Compute transposes and writes back to writer - const uint32_t output_cb_id = tt::CB::c_out0; + const 
uint32_t output_cb_id = tt::CBIndex::c_16; const auto output_cb = create_circular_buffer(output_cb_id, cb_size, input_tile_size, input_format); const bfloat16 bfloat_scaler_value = bfloat16(1.0f); diff --git a/ttnn/cpp/ttnn/operations/experimental/ssm/hc_sum_reduce/hc_sum_reduce.cpp b/ttnn/cpp/ttnn/operations/experimental/ssm/hc_sum_reduce/hc_sum_reduce.cpp index bb6063716d8..36955ec9e83 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ssm/hc_sum_reduce/hc_sum_reduce.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/ssm/hc_sum_reduce/hc_sum_reduce.cpp @@ -7,6 +7,8 @@ #include "device/hc_sum_reduce_op.hpp" #include "ttnn/common/constants.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::experimental::ssm { ttnn::Tensor ExecuteHCSumReduce::invoke( diff --git a/ttnn/cpp/ttnn/operations/experimental/ssm/prefix_scan/device/prefix_scan_op.cpp b/ttnn/cpp/ttnn/operations/experimental/ssm/prefix_scan/device/prefix_scan_op.cpp index 118e6a6fcc0..f0d2c4f80f8 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ssm/prefix_scan/device/prefix_scan_op.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/ssm/prefix_scan/device/prefix_scan_op.cpp @@ -6,6 +6,8 @@ #include "tt_metal/common/constants.hpp" #include "prefix_scan_program_factory.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::experimental::ssm { void PrefixScan::validate(const std::vector& input_tensors) const { diff --git a/ttnn/cpp/ttnn/operations/experimental/ssm/prefix_scan/device/prefix_scan_program_factory.cpp b/ttnn/cpp/ttnn/operations/experimental/ssm/prefix_scan/device/prefix_scan_program_factory.cpp index 575285937c4..2c7a1615ca7 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ssm/prefix_scan/device/prefix_scan_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/ssm/prefix_scan/device/prefix_scan_program_factory.cpp @@ -6,6 +6,8 @@ #include "ttnn/tensor/tensor.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::experimental::ssm::detail { using namespace 
tt::constants; @@ -56,43 +58,43 @@ operation::ProgramWithCallbacks multi_core_ssm_prefix_scan( const uint32_t num_tiles_in_chunk = 32; const uint32_t num_chunks_per_row = tt::div_up(total_tiles_per_row, num_tiles_in_chunk); - const uint32_t cb_a_in_id = tt::CB::c_in0; + const uint32_t cb_a_in_id = tt::CBIndex::c_0; const auto cb_a_in = create_circular_buffer(cb_a_in_id, total_tiles, input_tile_size, input_format, a_buffer); - const uint32_t cb_bx_in_id = tt::CB::c_in1; + const uint32_t cb_bx_in_id = tt::CBIndex::c_1; const auto cb_bx_in = create_circular_buffer(cb_bx_in_id, total_tiles, input_tile_size, input_format, bx_buffer); // Hidden state is in row-major so must be bfloat16 - const uint32_t cb_h_in_id = tt::CB::c_in2; + const uint32_t cb_h_in_id = tt::CBIndex::c_2; const auto cb_h_in = create_circular_buffer(cb_h_in_id, num_chunks_per_row, intermediary_tile_size, intermediary_format, h_buffer); - const uint32_t cb_out_id = tt::CB::c_out0; + const uint32_t cb_out_id = tt::CBIndex::c_16; const auto cb_out = create_circular_buffer(cb_out_id, total_tiles, input_tile_size, input_format, output_buffer); const uint32_t num_tiles_in_row_to_tile_cb = 32; // Tilizing 32 tiles will pack tensor rows into seperate tiles - const uint32_t cb_a_tilize_in_id = tt::CB::c_intermed0; + const uint32_t cb_a_tilize_in_id = tt::CBIndex::c_24; const auto cb_a_tilize_in = create_circular_buffer( cb_a_tilize_in_id, num_tiles_in_row_to_tile_cb, intermediary_tile_size, intermediary_format); - const uint32_t cb_bx_tilize_in_id = tt::CB::c_intermed1; + const uint32_t cb_bx_tilize_in_id = tt::CBIndex::c_25; const auto cb_b_tilize_in = create_circular_buffer( cb_bx_tilize_in_id, num_tiles_in_row_to_tile_cb, intermediary_tile_size, intermediary_format); - const uint32_t cb_tilize_out_id = tt::CB::c_intermed2; + const uint32_t cb_tilize_out_id = tt::CBIndex::c_26; const auto cb_tilize_out = create_circular_buffer( cb_tilize_out_id, num_tiles_in_row_to_tile_cb, intermediary_tile_size, 
intermediary_format); - const uint32_t cb_h_prev_id = tt::CB::c_intermed3; + const uint32_t cb_h_prev_id = tt::CBIndex::c_27; const auto cb_h_prev = create_circular_buffer(cb_h_prev_id, 2, intermediary_tile_size, intermediary_format); - const uint32_t cb_ah_id = tt::CB::c_intermed4; + const uint32_t cb_ah_id = tt::CBIndex::c_28; const auto cb_ah = create_circular_buffer(cb_ah_id, 2, intermediary_tile_size, intermediary_format); - const uint32_t cb_h_id = tt::CB::c_intermed5; + const uint32_t cb_h_id = tt::CBIndex::c_29; const auto cb_h = create_circular_buffer(cb_h_id, 2, intermediary_tile_size, intermediary_format); - const uint32_t cb_h_acc_id = tt::CB::c_intermed7; + const uint32_t cb_h_acc_id = tt::CBIndex::c_31; const auto cb_h_acc = create_circular_buffer(cb_h_acc_id, num_chunks_per_row, intermediary_tile_size, intermediary_format); diff --git a/ttnn/cpp/ttnn/operations/experimental/ssm/prefix_scan/prefix_scan.cpp b/ttnn/cpp/ttnn/operations/experimental/ssm/prefix_scan/prefix_scan.cpp index e72abde40c7..71235041dd4 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ssm/prefix_scan/prefix_scan.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/ssm/prefix_scan/prefix_scan.cpp @@ -7,6 +7,8 @@ #include "device/prefix_scan_op.hpp" #include "ttnn/common/constants.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::experimental::ssm { ttnn::Tensor ExecutePrefixScan::invoke( diff --git a/ttnn/cpp/ttnn/operations/experimental/ssm/repeat_and_interleave_eltwise_mul/device/repeat_and_interleave_eltwise_mul_op.cpp b/ttnn/cpp/ttnn/operations/experimental/ssm/repeat_and_interleave_eltwise_mul/device/repeat_and_interleave_eltwise_mul_op.cpp index 26229a58a0f..ca429ab5659 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ssm/repeat_and_interleave_eltwise_mul/device/repeat_and_interleave_eltwise_mul_op.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/ssm/repeat_and_interleave_eltwise_mul/device/repeat_and_interleave_eltwise_mul_op.cpp @@ -7,6 +7,8 @@ #include 
"repeat_and_interleave_eltwise_mul_program_factory.hpp" #include "tt_metal/common/constants.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::experimental::ssm { void RepeatAndInterleaveEltwiseMul::validate(const std::vector& input_tensors) const { diff --git a/ttnn/cpp/ttnn/operations/experimental/ssm/repeat_and_interleave_eltwise_mul/device/repeat_and_interleave_eltwise_mul_program_factory.cpp b/ttnn/cpp/ttnn/operations/experimental/ssm/repeat_and_interleave_eltwise_mul/device/repeat_and_interleave_eltwise_mul_program_factory.cpp index 4a52586555b..c50261cbc23 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ssm/repeat_and_interleave_eltwise_mul/device/repeat_and_interleave_eltwise_mul_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/ssm/repeat_and_interleave_eltwise_mul/device/repeat_and_interleave_eltwise_mul_program_factory.cpp @@ -10,6 +10,7 @@ namespace ttnn::operations::experimental::ssm::detail { using namespace tt::constants; +using namespace tt::tt_metal; operation::ProgramWithCallbacks multi_core_ssm_eltwise_mul( const Tensor& a, @@ -55,14 +56,14 @@ operation::ProgramWithCallbacks multi_core_ssm_eltwise_mul( grid_to_cores(num_cores, compute_with_storage_grid_size.x, compute_with_storage_grid_size.y, row_major); // Create circular buffers - uint32_t src0_cb_index = tt::CB::c_in0; + uint32_t src0_cb_index = tt::CBIndex::c_0; uint32_t cb0_tiles = ONE_TILE * 2; // double buffer tt::tt_metal::CircularBufferConfig cb_src0_config = tt::tt_metal::CircularBufferConfig(cb0_tiles * in0_single_tile_size, {{src0_cb_index, in0_data_format}}) .set_page_size(src0_cb_index, in0_single_tile_size); auto cb_src0 = tt::tt_metal::CreateCircularBuffer(program, all_cores, cb_src0_config); - uint32_t src1_cb_index = tt::CB::c_in1; + uint32_t src1_cb_index = tt::CBIndex::c_1; uint32_t cb1_tiles = ONE_TILE * 2; // double buffer tt::tt_metal::CircularBufferConfig cb_src1_config = tt::tt_metal::CircularBufferConfig(cb1_tiles * in1_single_tile_size, 
{{src1_cb_index, in1_data_format}}) @@ -79,25 +80,25 @@ operation::ProgramWithCallbacks multi_core_ssm_eltwise_mul( uint32_t interm_num_tiles = ONE_TILE * 2; // double buffer uint32_t interm_cb_size = interm_num_tiles * interm_single_tile_size; - uint32_t cb_intermed0_index = tt::CB::c_intermed0; // cb_in0_transposed + uint32_t cb_intermed0_index = tt::CBIndex::c_24; // cb_in0_transposed tt::tt_metal::CircularBufferConfig cb_intermed0_config = tt::tt_metal::CircularBufferConfig(interm_cb_size, {{cb_intermed0_index, interm_data_format}}) .set_page_size(cb_intermed0_index, interm_single_tile_size); auto cb_intermed0 = tt::tt_metal::CreateCircularBuffer(program, all_cores, cb_intermed0_config); - uint32_t cb_intermed1_index = tt::CB::c_intermed1; // cb_in1_transposed + uint32_t cb_intermed1_index = tt::CBIndex::c_25; // cb_in1_transposed tt::tt_metal::CircularBufferConfig cb_intermed1_config = tt::tt_metal::CircularBufferConfig(interm_cb_size, {{cb_intermed1_index, interm_data_format}}) .set_page_size(cb_intermed1_index, interm_single_tile_size); auto cb_intermed1 = tt::tt_metal::CreateCircularBuffer(program, all_cores, cb_intermed1_config); - uint32_t cb_intermed2_index = tt::CB::c_intermed2; // cb_in1_bcast_row + uint32_t cb_intermed2_index = tt::CBIndex::c_26; // cb_in1_bcast_row tt::tt_metal::CircularBufferConfig cb_intermed2_config = tt::tt_metal::CircularBufferConfig(interm_cb_size, {{cb_intermed2_index, interm_data_format}}) .set_page_size(cb_intermed2_index, interm_single_tile_size); auto cb_intermed2 = tt::tt_metal::CreateCircularBuffer(program, all_cores, cb_intermed2_config); - uint32_t cb_intermed3_index = tt::CB::c_intermed3; // cb_out_transposed + uint32_t cb_intermed3_index = tt::CBIndex::c_27; // cb_out_transposed tt::tt_metal::CircularBufferConfig cb_intermed3_config = tt::tt_metal::CircularBufferConfig(interm_cb_size, {{cb_intermed3_index, interm_data_format}}) .set_page_size(cb_intermed3_index, interm_single_tile_size); diff --git 
a/ttnn/cpp/ttnn/operations/experimental/ssm/repeat_and_interleave_eltwise_mul/repeat_and_interleave_eltwise_mul.cpp b/ttnn/cpp/ttnn/operations/experimental/ssm/repeat_and_interleave_eltwise_mul/repeat_and_interleave_eltwise_mul.cpp index 6b68059ef9b..7f60bbaa80f 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ssm/repeat_and_interleave_eltwise_mul/repeat_and_interleave_eltwise_mul.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/ssm/repeat_and_interleave_eltwise_mul/repeat_and_interleave_eltwise_mul.cpp @@ -7,6 +7,8 @@ #include "device/repeat_and_interleave_eltwise_mul_op.hpp" #include "ttnn/common/constants.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::experimental::ssm { ttnn::Tensor ExecuteRepeatAndInterleaveEltwiseMul::invoke( diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads/device/create_qkv_heads_program_factory.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads/device/create_qkv_heads_program_factory.cpp index 38ab2561ca6..e9b87130234 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads/device/create_qkv_heads_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads/device/create_qkv_heads_program_factory.cpp @@ -115,25 +115,25 @@ namespace ttnn::operations::experimental::transformer { uint32_t v_size = block_ht*num_tiles_per_group[2]*single_tile_size*groups_per_block; // qkv tensor - auto c_in0_config = CircularBufferConfig(input_size, {{CB::c_in0, data_format}}).set_page_size(CB::c_in0, single_tile_size).set_globally_allocated_address(*input_tensor.buffer()); + auto c_in0_config = CircularBufferConfig(input_size, {{CBIndex::c_0, data_format}}).set_page_size(CBIndex::c_0, single_tile_size).set_globally_allocated_address(*input_tensor.buffer()); auto cb_in0_id = CreateCircularBuffer(program, all_cores, c_in0_config); // q sharded - auto c_out0_config = CircularBufferConfig(q_size, {{CB::c_out0, data_format}}) - 
.set_page_size(CB::c_out0, single_tile_size).set_globally_allocated_address(*output[0].buffer()); + auto c_out0_config = CircularBufferConfig(q_size, {{CBIndex::c_16, data_format}}) + .set_page_size(CBIndex::c_16, single_tile_size).set_globally_allocated_address(*output[0].buffer()); auto cb_out0_id = CreateCircularBuffer( program, all_cores, c_out0_config ); // k sharded - auto c_out1_config = CircularBufferConfig(k_size, {{CB::c_out1, data_format}}) - .set_page_size(CB::c_out1, single_tile_size).set_globally_allocated_address(*output[1].buffer()); + auto c_out1_config = CircularBufferConfig(k_size, {{CBIndex::c_17, data_format}}) + .set_page_size(CBIndex::c_17, single_tile_size).set_globally_allocated_address(*output[1].buffer()); auto cb_out1_id = CreateCircularBuffer( program, all_cores, c_out1_config ); // v sharded - auto c_out2_config = CircularBufferConfig(v_size, {{CB::c_out2, data_format}}) - .set_page_size(CB::c_out2, single_tile_size).set_globally_allocated_address(*output[2].buffer()); + auto c_out2_config = CircularBufferConfig(v_size, {{CBIndex::c_18, data_format}}) + .set_page_size(CBIndex::c_18, single_tile_size).set_globally_allocated_address(*output[2].buffer()); auto cb_out2_id = CreateCircularBuffer( program, all_cores, c_out2_config ); if (transpose_k) { - auto c_im0_config = CircularBufferConfig(k_size, {{CB::c_intermed0, data_format}}) - .set_page_size(CB::c_intermed0, single_tile_size); + auto c_im0_config = CircularBufferConfig(k_size, {{CBIndex::c_24, data_format}}) + .set_page_size(CBIndex::c_24, single_tile_size); auto cb_im0_id = CreateCircularBuffer(program, all_cores, c_im0_config); } diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads/device/kernels/reader_create_qkv_heads_sharded.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads/device/kernels/reader_create_qkv_heads_sharded.cpp index 394bcdc3f07..001d0f26d83 100644 --- 
a/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads/device/kernels/reader_create_qkv_heads_sharded.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads/device/kernels/reader_create_qkv_heads_sharded.cpp @@ -27,15 +27,15 @@ void kernel_main() { constexpr uint32_t k_size_per_group_t_bytes = get_compile_time_arg_val(13); // total size of all K heads (expecting 1) in a group constexpr uint32_t v_size_per_group_t_bytes = get_compile_time_arg_val(14); // total size of all V heads (expecting 1) in a group - constexpr uint32_t cb_in0 = tt::CB::c_in0; + constexpr uint32_t cb_in0 = tt::CBIndex::c_0; - constexpr uint32_t cb_outq = tt::CB::c_out0; + constexpr uint32_t cb_outq = tt::CBIndex::c_16; #ifdef TRANSPOSE_K_HEADS - constexpr uint32_t cb_outk = tt::CB::c_intermed0; + constexpr uint32_t cb_outk = tt::CBIndex::c_24; #else - constexpr uint32_t cb_outk = tt::CB::c_out1; + constexpr uint32_t cb_outk = tt::CBIndex::c_17; #endif - constexpr uint32_t cb_outv = tt::CB::c_out2; + constexpr uint32_t cb_outv = tt::CBIndex::c_18; // copy one entire head_dim tile, then go to next sequence tile and do another head_dim. 
diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads_from_separate_tensors/device/create_qkv_heads_from_separate_tensors_program_factory.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads_from_separate_tensors/device/create_qkv_heads_from_separate_tensors_program_factory.cpp index e1bcd87da92..3d742854ba3 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads_from_separate_tensors/device/create_qkv_heads_from_separate_tensors_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads_from_separate_tensors/device/create_qkv_heads_from_separate_tensors_program_factory.cpp @@ -79,30 +79,30 @@ namespace ttnn::operations::experimental::transformer { // qkv tensor auto c_in0_config = CircularBufferConfig(q_size, - {{CB::c_in0, q_data_format}}).set_page_size(CB::c_in0, single_tile_size).set_globally_allocated_address(*input_tensor_q.buffer()); + {{CBIndex::c_0, q_data_format}}).set_page_size(CBIndex::c_0, single_tile_size).set_globally_allocated_address(*input_tensor_q.buffer()); auto cb_in0_id = CreateCircularBuffer(program, all_cores, c_in0_config); auto c_in1_config = CircularBufferConfig(kv_size, { - {CB::c_in1, kv_data_format}} - ).set_page_size(CB::c_in1, single_tile_size).set_globally_allocated_address(*input_tensor_kv.buffer()); + {CBIndex::c_1, kv_data_format}} + ).set_page_size(CBIndex::c_1, single_tile_size).set_globally_allocated_address(*input_tensor_kv.buffer()); auto cb_in1_id = CreateCircularBuffer(program, all_cores, c_in1_config); // q sharded - auto c_out0_config = CircularBufferConfig(q_size, {{CB::c_out0, q_data_format}}) - .set_page_size(CB::c_out0, single_tile_size).set_globally_allocated_address(*output[0].buffer()); + auto c_out0_config = CircularBufferConfig(q_size, {{CBIndex::c_16, q_data_format}}) + .set_page_size(CBIndex::c_16, single_tile_size).set_globally_allocated_address(*output[0].buffer()); auto cb_out0_id = CreateCircularBuffer( 
program, all_cores, c_out0_config ); // k sharded - auto c_out1_config = CircularBufferConfig(k_size, {{CB::c_out1, kv_data_format}}) - .set_page_size(CB::c_out1, single_tile_size).set_globally_allocated_address(*output[1].buffer()); + auto c_out1_config = CircularBufferConfig(k_size, {{CBIndex::c_17, kv_data_format}}) + .set_page_size(CBIndex::c_17, single_tile_size).set_globally_allocated_address(*output[1].buffer()); auto cb_out1_id = CreateCircularBuffer( program, all_cores, c_out1_config ); // v sharded - auto c_out2_config = CircularBufferConfig(v_size, {{CB::c_out2, kv_data_format}}) - .set_page_size(CB::c_out2, single_tile_size).set_globally_allocated_address(*output[2].buffer()); + auto c_out2_config = CircularBufferConfig(v_size, {{CBIndex::c_18, kv_data_format}}) + .set_page_size(CBIndex::c_18, single_tile_size).set_globally_allocated_address(*output[2].buffer()); auto cb_out2_id = CreateCircularBuffer( program, all_cores, c_out2_config ); if (transpose_k) { - auto c_im0_config = CircularBufferConfig(k_size, {{CB::c_intermed0, kv_data_format}}) - .set_page_size(CB::c_intermed0, single_tile_size); + auto c_im0_config = CircularBufferConfig(k_size, {{CBIndex::c_24, kv_data_format}}) + .set_page_size(CBIndex::c_24, single_tile_size); auto cb_im0_id = CreateCircularBuffer(program, all_cores, c_im0_config); } diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads_from_separate_tensors/device/kernels/reader_create_qkv_heads_sharded_separate.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads_from_separate_tensors/device/kernels/reader_create_qkv_heads_sharded_separate.cpp index 0e2719ae9fc..a632543a638 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads_from_separate_tensors/device/kernels/reader_create_qkv_heads_sharded_separate.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/create_qkv_heads_from_separate_tensors/device/kernels/reader_create_qkv_heads_sharded_separate.cpp 
@@ -14,16 +14,16 @@ void kernel_main() { constexpr uint32_t k_num_heads_per_core = get_compile_time_arg_val(5); constexpr uint32_t tiles_per_head = get_compile_time_arg_val(6); // size of a K head `` `` - constexpr uint32_t cb_inq = tt::CB::c_in0; - constexpr uint32_t cb_inkv = tt::CB::c_in1; + constexpr uint32_t cb_inq = tt::CBIndex::c_0; + constexpr uint32_t cb_inkv = tt::CBIndex::c_1; - constexpr uint32_t cb_outq = tt::CB::c_out0; + constexpr uint32_t cb_outq = tt::CBIndex::c_16; #ifdef TRANSPOSE_K_HEADS - constexpr uint32_t cb_outk = tt::CB::c_intermed0; + constexpr uint32_t cb_outk = tt::CBIndex::c_24; #else - constexpr uint32_t cb_outk = tt::CB::c_out1; + constexpr uint32_t cb_outk = tt::CBIndex::c_17; #endif - constexpr uint32_t cb_outv = tt::CB::c_out2; + constexpr uint32_t cb_outv = tt::CBIndex::c_18; // copy one entire head_dim tile, then go to next sequence tile and do another head_dim. diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/nlp_concat_heads_device_operation.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/nlp_concat_heads_device_operation.cpp index c3115d93f5a..690f185e049 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/nlp_concat_heads_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/nlp_concat_heads_device_operation.cpp @@ -4,6 +4,8 @@ #include "nlp_concat_heads_device_operation.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::experimental::transformer { // Generic NLP ConcatHeads op diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/nlp_concat_heads_program_factory.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/nlp_concat_heads_program_factory.cpp index 4f83def6d87..490b7385ec6 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/nlp_concat_heads_program_factory.cpp 
+++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/nlp_concat_heads_program_factory.cpp @@ -8,6 +8,8 @@ #include "nlp_concat_heads_device_operation.hpp" #include "tt_metal/common/work_split.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::experimental::transformer { using namespace tt::constants; diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads_decode/device/nlp_concat_heads_decode_device_operation.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads_decode/device/nlp_concat_heads_decode_device_operation.cpp index 6e5993c4d7e..8536b667806 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads_decode/device/nlp_concat_heads_decode_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads_decode/device/nlp_concat_heads_decode_device_operation.cpp @@ -5,6 +5,8 @@ #include "nlp_concat_heads_decode_device_operation.hpp" #include "tt_metal/common/work_split.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::experimental::transformer { // NLP ConcatHeads op for decode diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads_decode/device/nlp_concat_heads_decode_program_factory.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads_decode/device/nlp_concat_heads_decode_program_factory.cpp index 9d1c862792c..89a5b84d75c 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads_decode/device/nlp_concat_heads_decode_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads_decode/device/nlp_concat_heads_decode_program_factory.cpp @@ -10,8 +10,9 @@ namespace ttnn::operations::experimental::transformer { -using namespace tt::constants; using namespace tt; +using namespace tt::constants; +using namespace tt::tt_metal; operation::ProgramWithCallbacks multi_core_nlp_concat_heads_decode(const Tensor &input_tensor, 
Tensor& output, CoreCoord compute_with_storage_grid_size) { @@ -39,7 +40,7 @@ operation::ProgramWithCallbacks multi_core_nlp_concat_heads_decode(const Tensor auto in_cores = in_shard_spec.grid; auto in_num_tiles = in_shard_spec.shape[0] * in_shard_spec.shape[1] / TILE_HW; - uint32_t q_output_cb_index = CB::c_out0; + uint32_t q_output_cb_index = CBIndex::c_16; tt_metal::CircularBufferConfig cb_q_output_config = tt_metal::CircularBufferConfig( q_num_tiles * single_tile_size, {{q_output_cb_index, cb_data_format}}) diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/nlp_create_qkv_heads_program_factory.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/nlp_create_qkv_heads_program_factory.cpp index b6492205502..20b348cad33 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/nlp_create_qkv_heads_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/nlp_create_qkv_heads_program_factory.cpp @@ -358,7 +358,7 @@ NlpCreateHeadsDeviceOperation::Sharded::cached_program_t NlpCreateHeadsDeviceOpe uint32_t per_risc1_out_q_heads = per_core_out_q_heads / 2; uint32_t per_core_in_q_heads = num_q_heads / input_tensor.shard_spec().value().num_cores(); - uint32_t q_output_cb_index = CB::c_out0; + uint32_t q_output_cb_index = CBIndex::c_16; tt_metal::CircularBufferConfig cb_q_output_config = tt_metal::CircularBufferConfig( q_num_tiles * single_tile_size, {{q_output_cb_index, cb_data_format}}) @@ -369,7 +369,7 @@ NlpCreateHeadsDeviceOperation::Sharded::cached_program_t NlpCreateHeadsDeviceOpe auto k_cores = k_shard_spec.grid; auto k_num_tiles = k_shard_spec.shape[0] * k_shard_spec.shape[1] / TILE_HW; - uint32_t k_output_cb_index = CB::c_out1; + uint32_t k_output_cb_index = CBIndex::c_17; tt_metal::CircularBufferConfig cb_k_output_config = tt_metal::CircularBufferConfig( k_num_tiles * single_tile_size, {{k_output_cb_index, 
cb_data_format}}) @@ -380,7 +380,7 @@ NlpCreateHeadsDeviceOperation::Sharded::cached_program_t NlpCreateHeadsDeviceOpe auto v_cores = q_shard_spec.grid; auto v_num_tiles = v_shard_spec.shape[0] * v_shard_spec.shape[1] / TILE_HW; - uint32_t v_output_cb_index = CB::c_out2; + uint32_t v_output_cb_index = CBIndex::c_18; tt_metal::CircularBufferConfig cb_v_output_config = tt_metal::CircularBufferConfig( v_num_tiles * single_tile_size, {{v_output_cb_index, cb_data_format}}) diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_decode/device/nlp_create_qkv_heads_decode_program_factory.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_decode/device/nlp_create_qkv_heads_decode_program_factory.cpp index d4d751f60d9..81339415aa0 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_decode/device/nlp_create_qkv_heads_decode_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_decode/device/nlp_create_qkv_heads_decode_program_factory.cpp @@ -47,7 +47,7 @@ namespace ttnn::operations::experimental::transformer { auto in_shape = input_tensor.get_legacy_shape(); auto in_num_tiles = in_shape[-2] * in_shape[-1] / TILE_HW; - uint32_t q_output_cb_index = CB::c_out0; + uint32_t q_output_cb_index = CBIndex::c_16; tt_metal::CircularBufferConfig cb_q_output_config = tt_metal::CircularBufferConfig( q_num_tiles * single_tile_size, {{q_output_cb_index, cb_data_format}}) @@ -58,7 +58,7 @@ namespace ttnn::operations::experimental::transformer { auto k_cores = k_shard_spec.grid; auto k_num_tiles = k_shard_spec.shape[0] * k_shard_spec.shape[1] / TILE_HW; - uint32_t k_output_cb_index = CB::c_out1; + uint32_t k_output_cb_index = CBIndex::c_17; tt_metal::CircularBufferConfig cb_k_output_config = tt_metal::CircularBufferConfig( k_num_tiles * single_tile_size, {{k_output_cb_index, cb_data_format}}) @@ -69,7 +69,7 @@ namespace ttnn::operations::experimental::transformer { 
auto v_cores = q_shard_spec.grid; auto v_num_tiles = v_shard_spec.shape[0] * v_shard_spec.shape[1] / TILE_HW; - uint32_t v_output_cb_index = CB::c_out2; + uint32_t v_output_cb_index = CBIndex::c_18; tt_metal::CircularBufferConfig cb_v_output_config = tt_metal::CircularBufferConfig( v_num_tiles * single_tile_size, {{v_output_cb_index, cb_data_format}}) @@ -198,7 +198,7 @@ namespace ttnn::operations::experimental::transformer { auto in_cores = in_shard_spec.grid; auto in_num_tiles = in_shard_spec.shape[0] * in_shard_spec.shape[1] / TILE_HW; - uint32_t q_output_cb_index = CB::c_out0; + uint32_t q_output_cb_index = CBIndex::c_16; tt_metal::CircularBufferConfig cb_q_output_config = tt_metal::CircularBufferConfig( q_num_tiles * single_tile_size, {{q_output_cb_index, cb_data_format}}) @@ -209,7 +209,7 @@ namespace ttnn::operations::experimental::transformer { auto k_cores = k_shard_spec.grid; auto k_num_tiles = k_shard_spec.shape[0] * k_shard_spec.shape[1] / TILE_HW; - uint32_t k_output_cb_index = CB::c_out1; + uint32_t k_output_cb_index = CBIndex::c_17; tt_metal::CircularBufferConfig cb_k_output_config = tt_metal::CircularBufferConfig( k_num_tiles * single_tile_size, {{k_output_cb_index, cb_data_format}}) @@ -220,7 +220,7 @@ namespace ttnn::operations::experimental::transformer { auto v_cores = q_shard_spec.grid; auto v_num_tiles = v_shard_spec.shape[0] * v_shard_spec.shape[1] / TILE_HW; - uint32_t v_output_cb_index = CB::c_out2; + uint32_t v_output_cb_index = CBIndex::c_18; tt_metal::CircularBufferConfig cb_v_output_config = tt_metal::CircularBufferConfig( v_num_tiles * single_tile_size, {{v_output_cb_index, cb_data_format}}) diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_falcon7b/device/nlp_create_qkv_heads_falcon7b_device_operation.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_falcon7b/device/nlp_create_qkv_heads_falcon7b_device_operation.cpp index 052e583eb41..ac1131b4bb5 100644 --- 
a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_falcon7b/device/nlp_create_qkv_heads_falcon7b_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_falcon7b/device/nlp_create_qkv_heads_falcon7b_device_operation.cpp @@ -6,6 +6,8 @@ #include "tt_metal/common/work_split.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::experimental::transformer { // Hard-coded for Falcon7B diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/device/nlp_create_qkv_heads_segformer_device_operation.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/device/nlp_create_qkv_heads_segformer_device_operation.cpp index c3dcffdc410..bb608499c4d 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/device/nlp_create_qkv_heads_segformer_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_segformer/device/nlp_create_qkv_heads_segformer_device_operation.cpp @@ -6,6 +6,8 @@ #include "tt_metal/common/work_split.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::experimental::transformer { // Hard-coded for Segformer diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/device/nlp_create_qkv_heads_vit_device_operation.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/device/nlp_create_qkv_heads_vit_device_operation.cpp index f528f4bf966..992709edd6d 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/device/nlp_create_qkv_heads_vit_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads_vit/device/nlp_create_qkv_heads_vit_device_operation.cpp @@ -6,6 +6,8 @@ #include "tt_metal/common/work_split.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::experimental::transformer { // Hard-coded for 
Vit diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_kv_cache_load_slice/device/nlp_kv_cache_load_slice_device_operation.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_kv_cache_load_slice/device/nlp_kv_cache_load_slice_device_operation.cpp index 40fc256a047..c19ec81a43c 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_kv_cache_load_slice/device/nlp_kv_cache_load_slice_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_kv_cache_load_slice/device/nlp_kv_cache_load_slice_device_operation.cpp @@ -5,6 +5,8 @@ #include "nlp_kv_cache_load_slice_device_operation.hpp" #include "tt_metal/common/work_split.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::experimental::transformer { // NLP KV Cache Unpad To Sharded op diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_kv_cache_load_slice/device/nlp_kv_cache_load_slice_program_factory.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_kv_cache_load_slice/device/nlp_kv_cache_load_slice_program_factory.cpp index c4a346f3e61..f8f83d3d438 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_kv_cache_load_slice/device/nlp_kv_cache_load_slice_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/nlp_kv_cache_load_slice/device/nlp_kv_cache_load_slice_program_factory.cpp @@ -9,6 +9,8 @@ #include "tt_metal/common/work_split.hpp" #include "ttnn/operations/data_movement/slice/device/slice_op.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::experimental::transformer { using namespace tt::constants; @@ -75,7 +77,7 @@ operation::ProgramWithCallbacks multi_core_nlp_kv_cache_load_slice( tt::DataFormat cb_data_format = tt_metal::datatype_to_dataformat_converter(a.get_dtype()); uint32_t single_tile_size = tt_metal::detail::TileSize(cb_data_format); - uint32_t src0_cb_index = CB::c_in0; + uint32_t src0_cb_index = CBIndex::c_0; uint32_t num_input_tiles = num_tiles_per_core; 
tt_metal::CircularBufferConfig cb_src0_config = tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size, {{src0_cb_index, cb_data_format}}) diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding/device/rotary_embedding_program_factory.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding/device/rotary_embedding_program_factory.cpp index 50a9012ceb7..1b8260949f5 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding/device/rotary_embedding_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding/device/rotary_embedding_program_factory.cpp @@ -96,7 +96,7 @@ operation::ProgramWithCallbacks rotary_embedding_multi_core( num_output_tiles = num_input_tiles; } - uint32_t input_cb_index = CB::c_in0; + uint32_t input_cb_index = CBIndex::c_0; tt_metal::CircularBufferConfig cb_input_config = tt_metal::CircularBufferConfig( num_input_tiles * input_single_tile_size, {{input_cb_index, input_cb_data_format}}) @@ -106,7 +106,7 @@ operation::ProgramWithCallbacks rotary_embedding_multi_core( } auto cb_input = tt_metal::CreateCircularBuffer(program, all_cores, cb_input_config); - uint32_t rotated_input_cb_index = CB::c_in1; + uint32_t rotated_input_cb_index = CBIndex::c_1; uint32_t num_rotated_input_tiles = 2 * Wt; tt_metal::CircularBufferConfig cb_rotated_input_config = tt_metal::CircularBufferConfig( @@ -115,20 +115,20 @@ operation::ProgramWithCallbacks rotary_embedding_multi_core( auto cb_rotated_input = tt_metal::CreateCircularBuffer(program, all_cores, cb_rotated_input_config); uint32_t num_cos_sin_tiles = token_idx.has_value() ? 
Wt : 2 * Wt; - uint32_t cos_cb_index = CB::c_in2; + uint32_t cos_cb_index = CBIndex::c_2; tt_metal::CircularBufferConfig cb_cos_config = tt_metal::CircularBufferConfig(num_cos_sin_tiles * cos_single_tile_size, {{cos_cb_index, cos_cb_data_format}}) .set_page_size(cos_cb_index, cos_single_tile_size); auto cb_cos = tt_metal::CreateCircularBuffer(program, all_cores, cb_cos_config); - uint32_t sin_cb_index = CB::c_in3; + uint32_t sin_cb_index = CBIndex::c_3; tt_metal::CircularBufferConfig cb_sin_config = tt_metal::CircularBufferConfig(num_cos_sin_tiles * sin_single_tile_size, {{sin_cb_index, sin_cb_data_format}}) .set_page_size(sin_cb_index, sin_single_tile_size); auto cb_sin = tt_metal::CreateCircularBuffer(program, all_cores, cb_sin_config); // Used for bcast scalar - uint32_t src_scalar_cb_index = CB::c_in4; + uint32_t src_scalar_cb_index = CBIndex::c_4; uint32_t num_scalar_tiles = 1; tt_metal::CircularBufferConfig cb_src1_config = tt_metal::CircularBufferConfig( @@ -137,28 +137,28 @@ operation::ProgramWithCallbacks rotary_embedding_multi_core( auto cb_src1 = tt_metal::CreateCircularBuffer(program, all_cores, cb_src1_config); uint32_t num_interm_tiles = 1; - uint32_t rotated_input_interm_cb_index = CB::c_intermed0; + uint32_t rotated_input_interm_cb_index = CBIndex::c_24; tt_metal::CircularBufferConfig cb_rotated_input_interm_config = tt_metal::CircularBufferConfig( num_interm_tiles * input_single_tile_size, {{rotated_input_interm_cb_index, input_cb_data_format}}) .set_page_size(rotated_input_interm_cb_index, input_single_tile_size); auto cb_rotated_input_interm = tt_metal::CreateCircularBuffer(program, all_cores, cb_rotated_input_interm_config); - uint32_t cos_interm_cb_index = CB::c_intermed1; + uint32_t cos_interm_cb_index = CBIndex::c_25; tt_metal::CircularBufferConfig cb_cos_interm_config = tt_metal::CircularBufferConfig( num_interm_tiles * cos_single_tile_size, {{cos_interm_cb_index, cos_cb_data_format}}) .set_page_size(cos_interm_cb_index, 
cos_single_tile_size); auto cb_cos_interm = tt_metal::CreateCircularBuffer(program, all_cores, cb_cos_interm_config); - uint32_t sin_interm_cb_index = CB::c_intermed2; + uint32_t sin_interm_cb_index = CBIndex::c_26; tt_metal::CircularBufferConfig cb_sin_interm_config = tt_metal::CircularBufferConfig( num_interm_tiles * sin_single_tile_size, {{sin_interm_cb_index, sin_cb_data_format}}) .set_page_size(sin_interm_cb_index, sin_single_tile_size); auto cb_sin_interm = tt_metal::CreateCircularBuffer(program, all_cores, cb_sin_interm_config); - uint32_t output_cb_index = CB::c_out0; // output operands start at index 16 + uint32_t output_cb_index = CBIndex::c_16; // output operands start at index 16 tt_metal::CircularBufferConfig cb_output_config = tt_metal::CircularBufferConfig( num_output_tiles * output_single_tile_size, {{output_cb_index, output_cb_data_format}}) @@ -168,12 +168,12 @@ operation::ProgramWithCallbacks rotary_embedding_multi_core( } auto cb_output = tt_metal::CreateCircularBuffer(program, all_cores, cb_output_config); - uint32_t untilized_cos_interm_cb_index = CB::c_intermed3; - uint32_t untilized_cos_sync_cb_index = CB::c_in5; - uint32_t untilized_sin_interm_cb_index = CB::c_intermed4; - uint32_t untilized_sin_sync_cb_index = CB::c_in6; - uint32_t retilized_cos_cb_index = CB::c_intermed5; - uint32_t retilized_sin_cb_index = CB::c_intermed6; + uint32_t untilized_cos_interm_cb_index = CBIndex::c_27; + uint32_t untilized_cos_sync_cb_index = CBIndex::c_5; + uint32_t untilized_sin_interm_cb_index = CBIndex::c_28; + uint32_t untilized_sin_sync_cb_index = CBIndex::c_6; + uint32_t retilized_cos_cb_index = CBIndex::c_29; + uint32_t retilized_sin_cb_index = CBIndex::c_30; std::map reader_kernel_defines, writer_kernel_defines, compute_kernel_defines; if (token_idx.has_value()) { tt_metal::CircularBufferConfig cb_cos2_config = diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding/rotary_embedding.cpp 
b/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding/rotary_embedding.cpp index f4bad301516..b957e8f9aa4 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding/rotary_embedding.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding/rotary_embedding.cpp @@ -8,6 +8,8 @@ #include "ttnn/operation.hpp" #include "ttnn/operations/experimental/auto_format/auto_format.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::experimental::transformer { ttnn::Tensor RotaryEmbeddingOperation::invoke( diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama/device/rotary_embedding_llama_program_factory.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama/device/rotary_embedding_llama_program_factory.cpp index 28bafefaf60..b9894a0fe8f 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama/device/rotary_embedding_llama_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama/device/rotary_embedding_llama_program_factory.cpp @@ -90,26 +90,26 @@ operation::ProgramWithCallbacks rotary_embedding_llama_multi_core( } - uint32_t input_cb_index = CB::c_in0; + uint32_t input_cb_index = CBIndex::c_0; tt_metal::CircularBufferConfig cb_input_config = tt_metal::CircularBufferConfig( input_cb_num_tiles * input_single_tile_size, {{input_cb_index, input_cb_data_format}}) .set_page_size(input_cb_index, input_single_tile_size); auto cb_input = tt_metal::CreateCircularBuffer(program, all_cores, cb_input_config); - uint32_t cos_cb_index = CB::c_in1; + uint32_t cos_cb_index = CBIndex::c_1; tt_metal::CircularBufferConfig cb_cos_config = tt_metal::CircularBufferConfig(num_cos_sin_tiles * cos_single_tile_size, {{cos_cb_index, cos_cb_data_format}}) .set_page_size(cos_cb_index, cos_single_tile_size); auto cb_cos = tt_metal::CreateCircularBuffer(program, all_cores, cb_cos_config); - uint32_t sin_cb_index = 
CB::c_in2; + uint32_t sin_cb_index = CBIndex::c_2; tt_metal::CircularBufferConfig cb_sin_config = tt_metal::CircularBufferConfig(num_cos_sin_tiles * sin_single_tile_size, {{sin_cb_index, sin_cb_data_format}}) .set_page_size(sin_cb_index, sin_single_tile_size); auto cb_sin = tt_metal::CreateCircularBuffer(program, all_cores, cb_sin_config); - uint32_t trans_mat_cb_index = CB::c_in3; + uint32_t trans_mat_cb_index = CBIndex::c_3; // We only take one tile of trans_mat uint32_t num_trans_mat_tiles = 1; tt_metal::CircularBufferConfig cb_trans_mat_config = @@ -118,28 +118,28 @@ operation::ProgramWithCallbacks rotary_embedding_llama_multi_core( auto cb_trans_mat = tt_metal::CreateCircularBuffer(program, all_cores, cb_trans_mat_config); uint32_t num_interm_tiles = head_dim_t; - uint32_t rotated_input_interm_cb_index = CB::c_intermed0; + uint32_t rotated_input_interm_cb_index = CBIndex::c_24; tt_metal::CircularBufferConfig cb_rotated_input_interm_config = tt_metal::CircularBufferConfig( num_interm_tiles * input_single_tile_size, {{rotated_input_interm_cb_index, input_cb_data_format}}) .set_page_size(rotated_input_interm_cb_index, input_single_tile_size); auto cb_rotated_input_interm = tt_metal::CreateCircularBuffer(program, all_cores, cb_rotated_input_interm_config); - uint32_t cos_interm_cb_index = CB::c_intermed1; + uint32_t cos_interm_cb_index = CBIndex::c_25; tt_metal::CircularBufferConfig cb_cos_interm_config = tt_metal::CircularBufferConfig( num_interm_tiles * cos_single_tile_size, {{cos_interm_cb_index, cos_cb_data_format}}) .set_page_size(cos_interm_cb_index, cos_single_tile_size); auto cb_cos_interm = tt_metal::CreateCircularBuffer(program, all_cores, cb_cos_interm_config); - uint32_t sin_interm_cb_index = CB::c_intermed2; + uint32_t sin_interm_cb_index = CBIndex::c_26; tt_metal::CircularBufferConfig cb_sin_interm_config = tt_metal::CircularBufferConfig( num_interm_tiles * sin_single_tile_size, {{sin_interm_cb_index, sin_cb_data_format}}) 
.set_page_size(sin_interm_cb_index, sin_single_tile_size); auto cb_sin_interm = tt_metal::CreateCircularBuffer(program, all_cores, cb_sin_interm_config); - uint32_t output_cb_index = CB::c_out0; // output operands start at index 16 + uint32_t output_cb_index = CBIndex::c_16; // output operands start at index 16 tt_metal::CircularBufferConfig cb_output_config = tt_metal::CircularBufferConfig( num_output_tiles * output_single_tile_size, {{output_cb_index, output_cb_data_format}}) @@ -405,7 +405,7 @@ operation::ProgramWithCallbacks rotary_embedding_llama_multi_core_sharded( auto trans_mat_buffer = trans_mat.buffer(); auto dst_buffer = output.buffer(); - uint32_t input_cb_index = CB::c_in0; + uint32_t input_cb_index = CBIndex::c_0; tt_metal::CircularBufferConfig cb_input_config = tt_metal::CircularBufferConfig( num_input_tiles * input_single_tile_size, {{input_cb_index, input_cb_data_format}}) @@ -413,21 +413,21 @@ operation::ProgramWithCallbacks rotary_embedding_llama_multi_core_sharded( .set_globally_allocated_address(*src_buffer); auto cb_input = tt_metal::CreateCircularBuffer(program, all_cores, cb_input_config); - uint32_t cos_cb_index = CB::c_in1; + uint32_t cos_cb_index = CBIndex::c_1; tt_metal::CircularBufferConfig cb_cos_config = tt_metal::CircularBufferConfig(num_cos_sin_tiles * cos_single_tile_size, {{cos_cb_index, cos_cb_data_format}}) .set_page_size(cos_cb_index, cos_single_tile_size) .set_globally_allocated_address(*cos_buffer); auto cb_cos = tt_metal::CreateCircularBuffer(program, all_cores, cb_cos_config); - uint32_t sin_cb_index = CB::c_in2; + uint32_t sin_cb_index = CBIndex::c_2; tt_metal::CircularBufferConfig cb_sin_config = tt_metal::CircularBufferConfig(num_cos_sin_tiles * sin_single_tile_size, {{sin_cb_index, sin_cb_data_format}}) .set_page_size(sin_cb_index, sin_single_tile_size) .set_globally_allocated_address(*sin_buffer); auto cb_sin = tt_metal::CreateCircularBuffer(program, all_cores, cb_sin_config); - uint32_t trans_mat_cb_index = CB::c_in3; 
+ uint32_t trans_mat_cb_index = CBIndex::c_3; // We only take one tile of trans_mat uint32_t num_trans_mat_tiles = 1; tt_metal::CircularBufferConfig cb_trans_mat_config = @@ -437,28 +437,28 @@ operation::ProgramWithCallbacks rotary_embedding_llama_multi_core_sharded( auto cb_trans_mat = tt_metal::CreateCircularBuffer(program, all_cores, cb_trans_mat_config); uint32_t num_interm_tiles = head_dim_t; - uint32_t rotated_input_interm_cb_index = CB::c_intermed0; + uint32_t rotated_input_interm_cb_index = CBIndex::c_24; tt_metal::CircularBufferConfig cb_rotated_input_interm_config = tt_metal::CircularBufferConfig( num_interm_tiles * input_single_tile_size, {{rotated_input_interm_cb_index, input_cb_data_format}}) .set_page_size(rotated_input_interm_cb_index, input_single_tile_size); auto cb_rotated_input_interm = tt_metal::CreateCircularBuffer(program, all_cores, cb_rotated_input_interm_config); - uint32_t cos_interm_cb_index = CB::c_intermed1; + uint32_t cos_interm_cb_index = CBIndex::c_25; tt_metal::CircularBufferConfig cb_cos_interm_config = tt_metal::CircularBufferConfig( num_interm_tiles * input_single_tile_size, {{cos_interm_cb_index, cos_cb_data_format}}) .set_page_size(cos_interm_cb_index, cos_single_tile_size); auto cb_cos_interm = tt_metal::CreateCircularBuffer(program, all_cores, cb_cos_interm_config); - uint32_t sin_interm_cb_index = CB::c_intermed2; + uint32_t sin_interm_cb_index = CBIndex::c_26; tt_metal::CircularBufferConfig cb_sin_interm_config = tt_metal::CircularBufferConfig( num_interm_tiles * input_single_tile_size, {{sin_interm_cb_index, sin_cb_data_format}}) .set_page_size(sin_interm_cb_index, sin_single_tile_size); auto cb_sin_interm = tt_metal::CreateCircularBuffer(program, all_cores, cb_sin_interm_config); - uint32_t output_cb_index = CB::c_out0; // output operands start at index 16 + uint32_t output_cb_index = CBIndex::c_16; // output operands start at index 16 tt_metal::CircularBufferConfig cb_output_config = tt_metal::CircularBufferConfig( 
num_output_tiles * output_single_tile_size, {{output_cb_index, output_cb_data_format}}) diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama/rotary_embedding_llama.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama/rotary_embedding_llama.cpp index 0ac240064c5..6fc5a6e8d39 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama/rotary_embedding_llama.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama/rotary_embedding_llama.cpp @@ -6,6 +6,8 @@ #include "device/rotary_embedding_llama_device_operation.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::experimental::transformer { Tensor RotaryEmbeddingLlamaOperation::invoke( diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama_fused_qk/device/rotary_embedding_llama_fused_qk_program_factory.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama_fused_qk/device/rotary_embedding_llama_fused_qk_program_factory.cpp index 7e5d610176c..211998991f0 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama_fused_qk/device/rotary_embedding_llama_fused_qk_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama_fused_qk/device/rotary_embedding_llama_fused_qk_program_factory.cpp @@ -93,7 +93,7 @@ operation::ProgramWithCallbacks rotary_embedding_llama_fused_qk_multi_core_shard auto q_dst_buffer = q_output.buffer(); auto k_dst_buffer = k_output.buffer(); - uint32_t q_input_cb_index = CB::c_in0; + uint32_t q_input_cb_index = CBIndex::c_0; tt_metal::CircularBufferConfig cb_q_input_config = tt_metal::CircularBufferConfig( num_q_input_tiles * input_single_tile_size, {{q_input_cb_index, input_cb_data_format}}) @@ -101,7 +101,7 @@ operation::ProgramWithCallbacks rotary_embedding_llama_fused_qk_multi_core_shard .set_globally_allocated_address(*q_src_buffer); auto cb_q_input = 
tt_metal::CreateCircularBuffer(program, q_cores, cb_q_input_config); - uint32_t k_input_cb_index = CB::c_in1; + uint32_t k_input_cb_index = CBIndex::c_1; tt_metal::CircularBufferConfig cb_k_input_config = tt_metal::CircularBufferConfig( num_k_input_tiles * input_single_tile_size, {{k_input_cb_index, input_cb_data_format}}) @@ -109,21 +109,21 @@ operation::ProgramWithCallbacks rotary_embedding_llama_fused_qk_multi_core_shard .set_globally_allocated_address(*k_src_buffer); auto cb_k_input = tt_metal::CreateCircularBuffer(program, k_cores, cb_k_input_config); - uint32_t cos_cb_index = CB::c_in2; + uint32_t cos_cb_index = CBIndex::c_2; tt_metal::CircularBufferConfig cb_cos_config = tt_metal::CircularBufferConfig(num_cos_sin_tiles * cos_single_tile_size, {{cos_cb_index, cos_cb_data_format}}) .set_page_size(cos_cb_index, cos_single_tile_size) .set_globally_allocated_address(*cos_buffer); auto cb_cos = tt_metal::CreateCircularBuffer(program, all_cores, cb_cos_config); - uint32_t sin_cb_index = CB::c_in3; + uint32_t sin_cb_index = CBIndex::c_3; tt_metal::CircularBufferConfig cb_sin_config = tt_metal::CircularBufferConfig(num_cos_sin_tiles * sin_single_tile_size, {{sin_cb_index, sin_cb_data_format}}) .set_page_size(sin_cb_index, sin_single_tile_size) .set_globally_allocated_address(*sin_buffer); auto cb_sin = tt_metal::CreateCircularBuffer(program, all_cores, cb_sin_config); - uint32_t trans_mat_cb_index = CB::c_in4; + uint32_t trans_mat_cb_index = CBIndex::c_4; // We only take one tile of trans_mat uint32_t num_trans_mat_tiles = 1; tt_metal::CircularBufferConfig cb_trans_mat_config = @@ -133,35 +133,35 @@ operation::ProgramWithCallbacks rotary_embedding_llama_fused_qk_multi_core_shard auto cb_trans_mat = tt_metal::CreateCircularBuffer(program, all_cores, cb_trans_mat_config); uint32_t num_interm_tiles = head_dim_t; - uint32_t rotated_input_interm_cb_index = CB::c_intermed0; + uint32_t rotated_input_interm_cb_index = CBIndex::c_24; tt_metal::CircularBufferConfig 
cb_rotated_input_interm_config = tt_metal::CircularBufferConfig( num_interm_tiles * input_single_tile_size, {{rotated_input_interm_cb_index, input_cb_data_format}}) .set_page_size(rotated_input_interm_cb_index, input_single_tile_size); auto cb_rotated_input_interm = tt_metal::CreateCircularBuffer(program, all_cores, cb_rotated_input_interm_config); - uint32_t cos_interm_cb_index = CB::c_intermed1; + uint32_t cos_interm_cb_index = CBIndex::c_25; tt_metal::CircularBufferConfig cb_cos_interm_config = tt_metal::CircularBufferConfig( num_interm_tiles * input_single_tile_size, {{cos_interm_cb_index, cos_cb_data_format}}) .set_page_size(cos_interm_cb_index, cos_single_tile_size); auto cb_cos_interm = tt_metal::CreateCircularBuffer(program, all_cores, cb_cos_interm_config); - uint32_t sin_interm_cb_index = CB::c_intermed2; + uint32_t sin_interm_cb_index = CBIndex::c_26; tt_metal::CircularBufferConfig cb_sin_interm_config = tt_metal::CircularBufferConfig( num_interm_tiles * input_single_tile_size, {{sin_interm_cb_index, sin_cb_data_format}}) .set_page_size(sin_interm_cb_index, sin_single_tile_size); auto cb_sin_interm = tt_metal::CreateCircularBuffer(program, all_cores, cb_sin_interm_config); - uint32_t q_output_cb_index = CB::c_out0; // output operands start at index 16 + uint32_t q_output_cb_index = CBIndex::c_16; // output operands start at index 16 tt_metal::CircularBufferConfig cb_q_output_config = tt_metal::CircularBufferConfig( num_q_output_tiles * output_single_tile_size, {{q_output_cb_index, output_cb_data_format}}) .set_page_size(q_output_cb_index, output_single_tile_size) .set_globally_allocated_address(*q_dst_buffer); auto cb_q_output = tt_metal::CreateCircularBuffer(program, q_cores, cb_q_output_config); - uint32_t k_output_cb_index = CB::c_out1; // output operands start at index 17 + uint32_t k_output_cb_index = CBIndex::c_17; // output operands start at index 17 tt_metal::CircularBufferConfig cb_k_output_config = tt_metal::CircularBufferConfig( 
num_k_output_tiles * output_single_tile_size, {{k_output_cb_index, output_cb_data_format}}) diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama_fused_qk/rotary_embedding_llama_fused_qk.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama_fused_qk/rotary_embedding_llama_fused_qk.cpp index ab32489933e..07f3b388a31 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama_fused_qk/rotary_embedding_llama_fused_qk.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/rotary_embedding_llama_fused_qk/rotary_embedding_llama_fused_qk.cpp @@ -6,6 +6,8 @@ #include "device/rotary_embedding_llama_fused_qk_device_operation.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::experimental::transformer { std::tuple RotaryEmbeddingLlamaFusedQKOperation::invoke( diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/rotate_half/device/rotate_half_device_operation.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/rotate_half/device/rotate_half_device_operation.cpp index d1dcecf3a44..33ef8595400 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/rotate_half/device/rotate_half_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/rotate_half/device/rotate_half_device_operation.cpp @@ -11,6 +11,7 @@ namespace ttnn::operations::experimental::transformer { using namespace tt::constants; +using namespace tt::tt_metal; void RotateHalf::validate(const std::vector& input_tensors) const { const auto& input_tensor = input_tensors.at(0); diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/rotate_half/device/single_core/rotate_half_program_factory.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/rotate_half/device/single_core/rotate_half_program_factory.cpp index 6e93c51af21..4c1725e2b74 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/rotate_half/device/single_core/rotate_half_program_factory.cpp +++ 
b/ttnn/cpp/ttnn/operations/experimental/transformer/rotate_half/device/single_core/rotate_half_program_factory.cpp @@ -8,6 +8,8 @@ #include "tt_metal/common/constants.hpp" #include "tt_metal/detail/util.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::experimental::transformer::detail { using namespace tt; @@ -50,7 +52,7 @@ operation::ProgramWithCallbacks rotate_half_single_core(const Tensor &input, Ten .set_page_size(src_no_mul_cb_index, single_tile_size); auto cb_src_no_mul = tt_metal::CreateCircularBuffer(program, core, cb_src_no_mul_config); - uint32_t output_mul_cb_index = 16; // output operands start at index 16 + uint32_t output_mul_cb_index = tt::CBIndex::c_16; uint32_t num_output_tiles = 2; tt_metal::CircularBufferConfig cb_output_config = tt_metal::CircularBufferConfig(num_output_tiles * single_tile_size, {{output_mul_cb_index, cb_data_format}}) .set_page_size(output_mul_cb_index, single_tile_size); diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/rotate_half/rotate_half.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/rotate_half/rotate_half.cpp index d775d210201..c25d7fe7ffe 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/rotate_half/rotate_half.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/rotate_half/rotate_half.cpp @@ -6,6 +6,8 @@ #include "device/rotate_half_device_operation.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::experimental::transformer { Tensor RotateHalfOperation::invoke(const Tensor& input_tensor, const std::optional& memory_config) { diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/split_query_key_value_and_split_heads/device/kernels/compute/transpose_wh_sharded.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/split_query_key_value_and_split_heads/device/kernels/compute/transpose_wh_sharded.cpp index 4a92f2a7e1d..983e22d79da 100644 --- 
a/ttnn/cpp/ttnn/operations/experimental/transformer/split_query_key_value_and_split_heads/device/kernels/compute/transpose_wh_sharded.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/split_query_key_value_and_split_heads/device/kernels/compute/transpose_wh_sharded.cpp @@ -11,10 +11,10 @@ void MAIN { uint32_t num_tiles = get_compile_time_arg_val(0); - transpose_wh_init(tt::CB::c_intermed0); + transpose_wh_init(tt::CBIndex::c_24); - constexpr uint32_t cb_im0 = tt::CB::c_intermed0; - constexpr uint32_t cb_out1 = tt::CB::c_out1; + constexpr uint32_t cb_im0 = tt::CBIndex::c_24; + constexpr uint32_t cb_out1 = tt::CBIndex::c_17; // transpose a row-major block: // - assumes the tiles come in in column major order from reader diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/split_query_key_value_and_split_heads/device/kernels/dataflow/reader_tm_tile_layout_create_qkv_heads_sharded.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/split_query_key_value_and_split_heads/device/kernels/dataflow/reader_tm_tile_layout_create_qkv_heads_sharded.cpp index cfab2446f58..bce0c91ddb6 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/split_query_key_value_and_split_heads/device/kernels/dataflow/reader_tm_tile_layout_create_qkv_heads_sharded.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/split_query_key_value_and_split_heads/device/kernels/dataflow/reader_tm_tile_layout_create_qkv_heads_sharded.cpp @@ -16,11 +16,11 @@ void kernel_main() { constexpr uint32_t tensor_stride_size_bytes = get_compile_time_arg_val(7); - constexpr uint32_t cb_in0 = tt::CB::c_in0; - constexpr uint32_t cb_im0 = tt::CB::c_intermed0; - constexpr uint32_t cb_out0 = tt::CB::c_out0; - constexpr uint32_t cb_out1 = tt::CB::c_out1; - constexpr uint32_t cb_out2 = tt::CB::c_out2; + constexpr uint32_t cb_in0 = tt::CBIndex::c_0; + constexpr uint32_t cb_im0 = tt::CBIndex::c_24; + constexpr uint32_t cb_out0 = tt::CBIndex::c_16; + constexpr uint32_t cb_out1 = 
tt::CBIndex::c_17; + constexpr uint32_t cb_out2 = tt::CBIndex::c_18; const uint32_t single_tile_size_bytes = get_tile_size(cb_in0); const DataFormat data_format = get_dataformat(cb_in0); diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/split_query_key_value_and_split_heads/device/kernels/dataflow/writer_tm_tile_layout_create_qkv_heads_sharded.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/split_query_key_value_and_split_heads/device/kernels/dataflow/writer_tm_tile_layout_create_qkv_heads_sharded.cpp index 41f3d69dde2..1147ec326cc 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/split_query_key_value_and_split_heads/device/kernels/dataflow/writer_tm_tile_layout_create_qkv_heads_sharded.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/split_query_key_value_and_split_heads/device/kernels/dataflow/writer_tm_tile_layout_create_qkv_heads_sharded.cpp @@ -13,11 +13,11 @@ void kernel_main() { constexpr uint32_t tensor_stride_size_bytes = get_compile_time_arg_val(7); - constexpr uint32_t cb_in0 = tt::CB::c_in0; - constexpr uint32_t cb_im0 = tt::CB::c_intermed0; - constexpr uint32_t cb_out0 = tt::CB::c_out0; - constexpr uint32_t cb_out1 = tt::CB::c_out1; - constexpr uint32_t cb_out2 = tt::CB::c_out2; + constexpr uint32_t cb_in0 = tt::CBIndex::c_0; + constexpr uint32_t cb_im0 = tt::CBIndex::c_24; + constexpr uint32_t cb_out0 = tt::CBIndex::c_16; + constexpr uint32_t cb_out1 = tt::CBIndex::c_17; + constexpr uint32_t cb_out2 = tt::CBIndex::c_18; const uint32_t single_tile_size_bytes = get_tile_size(cb_in0); const DataFormat data_format = get_dataformat(cb_in0); diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/split_query_key_value_and_split_heads/device/split_query_key_value_and_split_heads_device_operation.cpp b/ttnn/cpp/ttnn/operations/experimental/transformer/split_query_key_value_and_split_heads/device/split_query_key_value_and_split_heads_device_operation.cpp index dad2e4ce8d0..f7b4452ff29 100644 --- 
a/ttnn/cpp/ttnn/operations/experimental/transformer/split_query_key_value_and_split_heads/device/split_query_key_value_and_split_heads_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/split_query_key_value_and_split_heads/device/split_query_key_value_and_split_heads_device_operation.cpp @@ -6,6 +6,8 @@ #include "split_query_key_value_and_split_heads_program_factory.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::experimental::transformer { void SplitFusedQKVAndSplitHeadsDeviceOperation::validate_with_output_tensors(const std::vector& input_tensors, diff --git a/ttnn/cpp/ttnn/operations/experimental/transformer/split_query_key_value_and_split_heads/device/split_query_key_value_and_split_heads_program_factory.hpp b/ttnn/cpp/ttnn/operations/experimental/transformer/split_query_key_value_and_split_heads/device/split_query_key_value_and_split_heads_program_factory.hpp index 057841dd619..88688310e63 100644 --- a/ttnn/cpp/ttnn/operations/experimental/transformer/split_query_key_value_and_split_heads/device/split_query_key_value_and_split_heads_program_factory.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/transformer/split_query_key_value_and_split_heads/device/split_query_key_value_and_split_heads_program_factory.hpp @@ -315,24 +315,24 @@ operation::ProgramWithCallbacks multi_core_split_query_key_value_and_split_heads // Create circular buffers // in0 sharded - auto c_in0_config = CircularBufferConfig(in0_CB_size, {{CB::c_in0, cb_data_format}}) - .set_page_size(CB::c_in0, single_tile_size).set_globally_allocated_address(*a.buffer()); + auto c_in0_config = CircularBufferConfig(in0_CB_size, {{CBIndex::c_0, cb_data_format}}) + .set_page_size(CBIndex::c_0, single_tile_size).set_globally_allocated_address(*a.buffer()); auto cb_in0_id = CreateCircularBuffer(program, all_cores, c_in0_config); // im - auto c_im0_config = CircularBufferConfig(im0_CB_size, {{CB::c_intermed0, cb_data_format}}) - .set_page_size(CB::c_intermed0, 
single_tile_size); + auto c_im0_config = CircularBufferConfig(im0_CB_size, {{CBIndex::c_24, cb_data_format}}) + .set_page_size(CBIndex::c_24, single_tile_size); auto cb_im0_id = CreateCircularBuffer(program, all_cores, c_im0_config); // q sharded - auto c_out0_config = CircularBufferConfig(out_CB_size, {{CB::c_out0, cb_data_format}}) - .set_page_size(CB::c_out0, single_tile_size).set_globally_allocated_address(*output[0].buffer());; + auto c_out0_config = CircularBufferConfig(out_CB_size, {{CBIndex::c_16, cb_data_format}}) + .set_page_size(CBIndex::c_16, single_tile_size).set_globally_allocated_address(*output[0].buffer());; auto cb_out0_id = CreateCircularBuffer( program, all_cores, c_out0_config ); // k sharded - auto c_out1_config = CircularBufferConfig(out_CB_size, {{CB::c_out1, cb_data_format}}) - .set_page_size(CB::c_out1, single_tile_size).set_globally_allocated_address(*output[1].buffer());; + auto c_out1_config = CircularBufferConfig(out_CB_size, {{CBIndex::c_17, cb_data_format}}) + .set_page_size(CBIndex::c_17, single_tile_size).set_globally_allocated_address(*output[1].buffer());; auto cb_out1_id = CreateCircularBuffer( program, all_cores, c_out1_config ); // v sharded - auto c_out2_config = CircularBufferConfig(out_CB_size, {{CB::c_out2, cb_data_format}}) - .set_page_size(CB::c_out2, single_tile_size).set_globally_allocated_address(*output[2].buffer());; + auto c_out2_config = CircularBufferConfig(out_CB_size, {{CBIndex::c_18, cb_data_format}}) + .set_page_size(CBIndex::c_18, single_tile_size).set_globally_allocated_address(*output[2].buffer());; auto cb_out2_id = CreateCircularBuffer( program, all_cores, c_out2_config ); auto override_runtime_args_callback = [ diff --git a/ttnn/cpp/ttnn/operations/full/device/full_program_factory.cpp b/ttnn/cpp/ttnn/operations/full/device/full_program_factory.cpp index 30285085191..35ffd1f73d6 100644 --- a/ttnn/cpp/ttnn/operations/full/device/full_program_factory.cpp +++ 
b/ttnn/cpp/ttnn/operations/full/device/full_program_factory.cpp @@ -39,7 +39,7 @@ FullOperation::ProgramFactory::cached_program_t FullOperation::ProgramFactory::c Program program = Program(); // Create circular buffer - auto cb_index = tt::CB::c_intermed0; + auto cb_index = tt::CBIndex::c_24; CreateCircularBuffer( program, all_cores, diff --git a/ttnn/cpp/ttnn/operations/full_like/device/full_like_factory.cpp b/ttnn/cpp/ttnn/operations/full_like/device/full_like_factory.cpp index 3c8c10b9a1e..4f004fb3228 100644 --- a/ttnn/cpp/ttnn/operations/full_like/device/full_like_factory.cpp +++ b/ttnn/cpp/ttnn/operations/full_like/device/full_like_factory.cpp @@ -53,7 +53,7 @@ FullLikeOperation::ProgramFactory::cached_program_t FullLikeOperation::ProgramFa auto [num_cores, all_cores, core_group_1, core_group_2, num_tiles_per_core_group_1, num_tiles_per_core_group_2] = tt_metal::split_work_to_cores(compute_with_storage_grid_size, num_tiles); - constexpr CB cb_fill_value_id = CB::c_intermed0; + constexpr CBIndex cb_fill_value_id = CBIndex::c_24; CircularBufferConfig cb_value_config = CircularBufferConfig(single_tile_size, {{cb_fill_value_id, data_format}}) .set_page_size(cb_fill_value_id, single_tile_size); diff --git a/ttnn/cpp/ttnn/operations/index_fill/device/index_fill_multi_core_factory.cpp b/ttnn/cpp/ttnn/operations/index_fill/device/index_fill_multi_core_factory.cpp index 0a3c53b0018..96a80e6f216 100644 --- a/ttnn/cpp/ttnn/operations/index_fill/device/index_fill_multi_core_factory.cpp +++ b/ttnn/cpp/ttnn/operations/index_fill/device/index_fill_multi_core_factory.cpp @@ -69,7 +69,7 @@ IndexFillOperation::MultiCore::cached_program_t IndexFillOperation::MultiCore::c uint32_t output_unit_size = output.get_logical_shape()[-1] * output.element_size(); uint32_t rounded_output_unit_size = round_up_to_mul32(output_unit_size); - auto src_cb_index = CB::c_in0; + auto src_cb_index = CBIndex::c_0; CircularBufferConfig cb_src_config = CircularBufferConfig(rounded_input_unit_size, 
{{src_cb_index, input_data_format}}) .set_page_size(src_cb_index, rounded_input_unit_size); @@ -85,13 +85,13 @@ IndexFillOperation::MultiCore::cached_program_t IndexFillOperation::MultiCore::c break; } - auto index_cb_index = CB::c_in1; + auto index_cb_index = CBIndex::c_1; CircularBufferConfig cb_index_config = CircularBufferConfig(rounded_index_unit_size, {{index_cb_index, index_data_format}}) .set_page_size(index_cb_index, rounded_index_unit_size); auto cb_index = CreateCircularBuffer(program, all_cores, cb_index_config); - auto dst_cb_index = CB::c_out0; + auto dst_cb_index = CBIndex::c_16; CircularBufferConfig dst_cb_config = CircularBufferConfig(rounded_output_unit_size, {{dst_cb_index, output_data_format}}) .set_page_size(dst_cb_index, rounded_output_unit_size); diff --git a/ttnn/cpp/ttnn/operations/index_fill/device/kernels/writer_index_fill.cpp b/ttnn/cpp/ttnn/operations/index_fill/device/kernels/writer_index_fill.cpp index 3ecfca0c0a7..2fa5d17132d 100644 --- a/ttnn/cpp/ttnn/operations/index_fill/device/kernels/writer_index_fill.cpp +++ b/ttnn/cpp/ttnn/operations/index_fill/device/kernels/writer_index_fill.cpp @@ -10,8 +10,8 @@ void kernel_main() { uint32_t start_id = get_arg_val(2); uint32_t output_unit_size = get_arg_val(3); - constexpr uint32_t dst_cb_id = tt::CB::c_out0; - constexpr uint32_t src_cb_id = tt::CB::c_in0; + constexpr uint32_t dst_cb_id = tt::CBIndex::c_16; + constexpr uint32_t src_cb_id = tt::CBIndex::c_0; constexpr bool output_is_dram = get_compile_time_arg_val(0) == 1; constexpr uint32_t onetile = 1; diff --git a/ttnn/cpp/ttnn/operations/kv_cache/device/update_cache_op_multi_core.cpp b/ttnn/cpp/ttnn/operations/kv_cache/device/update_cache_op_multi_core.cpp index 11a8a1b9bb6..09ce028d9b9 100644 --- a/ttnn/cpp/ttnn/operations/kv_cache/device/update_cache_op_multi_core.cpp +++ b/ttnn/cpp/ttnn/operations/kv_cache/device/update_cache_op_multi_core.cpp @@ -85,21 +85,21 @@ operation::ProgramWithCallbacks update_cache_multi_core(const Tensor& 
cache_tens std::tie(num_cores, all_cores, core_group_1, core_group_2, num_batched_heads_per_core_group_1, num_batched_heads_per_core_group_2) = tt::tt_metal::split_work_to_cores(compute_with_storage_grid_size, num_batched_heads, row_major); num_input_tiles = 2 * Wt; // double buffered } - uint32_t src0_cb_index = tt::CB::c_in0; + uint32_t src0_cb_index = tt::CBIndex::c_0; uint32_t num_cache_tiles = 2 * granularity * Wt; // double buffered tt::tt_metal::CircularBufferConfig cb_src0_config = tt::tt_metal::CircularBufferConfig(num_cache_tiles * cache_single_tile_size, {{src0_cb_index, cache_cb_data_format}}) .set_page_size(src0_cb_index, cache_single_tile_size); auto cb_src0 = tt::tt_metal::CreateCircularBuffer(program, all_cores, cb_src0_config); - uint32_t src1_cb_index = tt::CB::c_in1; + uint32_t src1_cb_index = tt::CBIndex::c_1; tt::tt_metal::CircularBufferConfig cb_src1_config = tt::tt_metal::CircularBufferConfig(num_input_tiles * input_single_tile_size, {{src1_cb_index, input_cb_data_format}}) .set_page_size(src1_cb_index, input_single_tile_size); if (shard_spec.has_value()) { cb_src1_config = cb_src1_config.set_globally_allocated_address(*input_tensor.buffer()); } auto cb_src1 = tt::tt_metal::CreateCircularBuffer(program, all_cores, cb_src1_config); - uint32_t interm0_cb_index = tt::CB::c_intermed0; - uint32_t interm1_cb_index = tt::CB::c_intermed1; + uint32_t interm0_cb_index = tt::CBIndex::c_24; + uint32_t interm1_cb_index = tt::CBIndex::c_25; uint32_t num_interm_tiles = 2 * granularity * Wt; // double buffered std::map interim_data_format_spec = { @@ -111,13 +111,13 @@ operation::ProgramWithCallbacks update_cache_multi_core(const Tensor& cache_tens .set_page_size(interm1_cb_index, interm_single_tile_size); auto cb_interm0 = tt::tt_metal::CreateCircularBuffer(program, all_cores, cb_interm0_config); - uint32_t interm2_cb_index = tt::CB::c_intermed2; + uint32_t interm2_cb_index = tt::CBIndex::c_26; tt::tt_metal::CircularBufferConfig cb_interm2_config = 
tt::tt_metal::CircularBufferConfig(num_interm_tiles * interm_single_tile_size, {{interm2_cb_index, interm_cb_data_format}}) .set_page_size(interm2_cb_index, interm_single_tile_size); auto cb_interm2 = tt::tt_metal::CreateCircularBuffer(program, all_cores, cb_interm2_config); // Output is same tensor as cache input, so cb/tile size is same - uint32_t output_cb_index = tt::CB::c_out0; + uint32_t output_cb_index = tt::CBIndex::c_16; // Must buffer all tiles for a single head uint32_t num_output_tiles = B * Wt; diff --git a/ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm.cpp b/ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm.cpp index abdcf8e0ffb..ff49479a0d7 100644 --- a/ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm.cpp +++ b/ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm.cpp @@ -33,18 +33,18 @@ void MAIN { { acquire_dst(); for (uint32_t kt = 0; kt < Kt; kt++) { - cb_wait_front(tt::CB::c_in0, onetile); - cb_wait_front(tt::CB::c_in1, onetile); + cb_wait_front(tt::CBIndex::c_0, onetile); + cb_wait_front(tt::CBIndex::c_1, onetile); - matmul_tiles(tt::CB::c_in0, tt::CB::c_in1, 0, 0, 0, false); + matmul_tiles(tt::CBIndex::c_0, tt::CBIndex::c_1, 0, 0, 0, false); - cb_pop_front(tt::CB::c_in0, onetile); - cb_pop_front(tt::CB::c_in1, onetile); + cb_pop_front(tt::CBIndex::c_0, onetile); + cb_pop_front(tt::CBIndex::c_1, onetile); } - cb_reserve_back(tt::CB::c_out0, onetile); - pack_tile(0, tt::CB::c_out0); - cb_push_back(tt::CB::c_out0, onetile); + cb_reserve_back(tt::CBIndex::c_16, onetile); + pack_tile(0, tt::CBIndex::c_16); + cb_push_back(tt::CBIndex::c_16, onetile); release_dst(); } diff --git a/ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm.cpp b/ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm.cpp index 93e17cd3182..33b29c04ecb 100644 --- a/ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm.cpp +++ 
b/ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm.cpp @@ -22,7 +22,7 @@ void MAIN { uint32_t out_subblock_num_tiles = get_compile_time_arg_val(10); // out_subblock_h * out_subblock_w; uint32_t batch = get_compile_time_arg_val(11); // batch dim - mm_init(tt::CB::c_in0, tt::CB::c_in1, tt::CB::c_intermed0); + mm_init(tt::CBIndex::c_0, tt::CBIndex::c_1, tt::CBIndex::c_24); for (uint32_t b = 0; b < batch; b++) { bool spill = num_blocks > 1; @@ -32,8 +32,8 @@ void MAIN { for (uint32_t block = 0; block < num_blocks; block++) { bool last_out = block == (num_blocks - 1); - cb_wait_front(tt::CB::c_in0, in0_block_num_tiles); - cb_wait_front(tt::CB::c_in1, in1_block_num_tiles); + cb_wait_front(tt::CBIndex::c_0, in0_block_num_tiles); + cb_wait_front(tt::CBIndex::c_1, in1_block_num_tiles); int in0_index_subblock_offset = 0; for (uint32_t in0_subblock = 0; in0_subblock < in0_num_subblocks; in0_subblock++) { int in1_index_subblock_offset = 0; @@ -41,13 +41,13 @@ void MAIN { acquire_dst(); if (enable_reload) { - copy_tile_to_dst_init_short_with_dt(tt::CB::c_in1, tt::CB::c_intermed0); - cb_wait_front(tt::CB::c_intermed0, out_subblock_num_tiles); + copy_tile_to_dst_init_short_with_dt(tt::CBIndex::c_1, tt::CBIndex::c_24); + cb_wait_front(tt::CBIndex::c_24, out_subblock_num_tiles); for (uint32_t i = 0; i < out_subblock_num_tiles; i++) { - copy_tile(tt::CB::c_intermed0, i, i); + copy_tile(tt::CBIndex::c_24, i, i); } - cb_pop_front(tt::CB::c_intermed0, out_subblock_num_tiles); - mm_init_short_with_dt(tt::CB::c_in0, tt::CB::c_in1, tt::CB::c_intermed0); + cb_pop_front(tt::CBIndex::c_24, out_subblock_num_tiles); + mm_init_short_with_dt(tt::CBIndex::c_0, tt::CBIndex::c_1, tt::CBIndex::c_24); } // Compute output sub-block from in0_subblock x in1_subblock @@ -60,8 +60,8 @@ void MAIN { int in0_index = in0_index_subblock_offset + in0_index_h_offset + inner_dim; int in1_index = in1_index_subblock_offset + in1_index_inner_dim_offset + w; matmul_tiles( - tt::CB::c_in0, - 
tt::CB::c_in1, + tt::CBIndex::c_0, + tt::CBIndex::c_1, in0_index, in1_index, dst_index, @@ -75,23 +75,23 @@ void MAIN { if (last_out) { // Pack out to output buffer - cb_reserve_back(tt::CB::c_out0, out_subblock_num_tiles); + cb_reserve_back(tt::CBIndex::c_16, out_subblock_num_tiles); for (uint32_t i = 0; i < out_subblock_num_tiles; i++) { - pack_tile(i, tt::CB::c_out0); + pack_tile(i, tt::CBIndex::c_16); } - cb_push_back(tt::CB::c_out0, out_subblock_num_tiles); + cb_push_back(tt::CBIndex::c_16, out_subblock_num_tiles); } else { // Wait for tiles in output buffer to be written out since interm and output share memory if (block == 0) { - cb_reserve_back(tt::CB::c_out0, out_num_tiles_to_wait); + cb_reserve_back(tt::CBIndex::c_16, out_num_tiles_to_wait); out_num_tiles_to_wait += out_subblock_num_tiles; } // Move partial result to interm buffer - cb_reserve_back(tt::CB::c_intermed0, out_subblock_num_tiles); + cb_reserve_back(tt::CBIndex::c_24, out_subblock_num_tiles); for (uint32_t i = 0; i < out_subblock_num_tiles; i++) { - pack_tile(i, tt::CB::c_intermed0); + pack_tile(i, tt::CBIndex::c_24); } - cb_push_back(tt::CB::c_intermed0, out_subblock_num_tiles); + cb_push_back(tt::CBIndex::c_24, out_subblock_num_tiles); } release_dst(); @@ -103,8 +103,8 @@ void MAIN { if (spill) enable_reload = true; - cb_pop_front(tt::CB::c_in0, in0_block_num_tiles); - cb_pop_front(tt::CB::c_in1, in1_block_num_tiles); + cb_pop_front(tt::CBIndex::c_0, in0_block_num_tiles); + cb_pop_front(tt::CBIndex::c_1, in1_block_num_tiles); } } } diff --git a/ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp b/ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp index ca541292bf8..ab5489b4456 100644 --- a/ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp +++ b/ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp @@ 
-112,15 +112,15 @@ void MAIN { constexpr uint32_t out_block_w = out_subblock_w * in1_num_subblocks; - constexpr uint32_t in0_cb_id = tt::CB::c_in0; - constexpr uint32_t in1_cb_id = tt::CB::c_in1; - constexpr uint32_t out_cb_id = tt::CB::c_out0; - constexpr uint32_t mm_partials_cb_id = tt::CB::c_intermed0; + constexpr uint32_t in0_cb_id = tt::CBIndex::c_0; + constexpr uint32_t in1_cb_id = tt::CBIndex::c_1; + constexpr uint32_t out_cb_id = tt::CBIndex::c_16; + constexpr uint32_t mm_partials_cb_id = tt::CBIndex::c_24; constexpr uint32_t untilize_mode_out_cb_id = untilize_out ? mm_partials_cb_id : out_cb_id; #ifdef FUSE_BIAS - constexpr uint32_t bias_cb_id = tt::CB::c_in3; + constexpr uint32_t bias_cb_id = tt::CBIndex::c_3; constexpr uint32_t mm_out_cb_id = mm_partials_cb_id; #else constexpr uint32_t mm_out_cb_id = untilize_mode_out_cb_id; diff --git a/ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation_inline_untilize_out.cpp b/ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation_inline_untilize_out.cpp index 8cf757cf218..385fb67184d 100644 --- a/ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation_inline_untilize_out.cpp +++ b/ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation_inline_untilize_out.cpp @@ -101,13 +101,13 @@ void MAIN { constexpr uint32_t out_block_w = out_subblock_w * in1_num_subblocks; - constexpr uint32_t in0_cb_id = tt::CB::c_in0; - constexpr uint32_t in1_cb_id = tt::CB::c_in1; - constexpr uint32_t out_cb_id = tt::CB::c_out0; - constexpr uint32_t mm_partials_cb_id = tt::CB::c_intermed0; + constexpr uint32_t in0_cb_id = tt::CBIndex::c_0; + constexpr uint32_t in1_cb_id = tt::CBIndex::c_1; + constexpr uint32_t out_cb_id = tt::CBIndex::c_16; + constexpr uint32_t mm_partials_cb_id = tt::CBIndex::c_24; #ifdef FUSE_BIAS - constexpr uint32_t bias_cb_id = tt::CB::c_in3; + constexpr 
uint32_t bias_cb_id = tt::CBIndex::c_3; constexpr uint32_t mm_out_cb_id = mm_partials_cb_id; #else constexpr uint32_t mm_out_cb_id = out_cb_id; diff --git a/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_single_core.cpp b/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_single_core.cpp index 8cafe7682ef..98416ce76ff 100644 --- a/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_single_core.cpp +++ b/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_single_core.cpp @@ -46,8 +46,8 @@ void kernel_main() { uint32_t in1_next_block_stride_h = get_arg_val(20); uint32_t in1_next_block_stride_w = get_arg_val(21); - constexpr uint32_t in0_cb_id = tt::CB::c_in0; - constexpr uint32_t in1_cb_id = tt::CB::c_in1; + constexpr uint32_t in0_cb_id = tt::CBIndex::c_0; + constexpr uint32_t in1_cb_id = tt::CBIndex::c_1; const uint32_t in0_tile_nbytes = get_tile_size(in0_cb_id); const uint32_t in1_tile_nbytes = get_tile_size(in1_cb_id); @@ -80,7 +80,7 @@ void kernel_main() { // read bias first if defined #ifdef FUSE_BIAS - constexpr uint32_t bias_cb_id = tt::CB::c_in2; + constexpr uint32_t bias_cb_id = tt::CBIndex::c_2; uint32_t bias_addr = get_arg_val(22); uint32_t bias_width_ntiles = get_arg_val(23); uint32_t bias_log2_of_pagesize = get_arg_val(24); diff --git a/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_single_core_tilize_untilize.cpp b/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_single_core_tilize_untilize.cpp index b75062062e4..4104d682d18 100644 --- a/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_single_core_tilize_untilize.cpp +++ b/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_single_core_tilize_untilize.cpp @@ -37,8 +37,8 @@ void kernel_main() { constexpr uint32_t TILE_HEIGHT = 32; // TODO (AS): use a common source of truth constexpr uint32_t TILE_WIDTH = 32; // TODO (AS): use a common source of truth - constexpr 
uint32_t in0_cb_id = tt::CB::c_in0; - constexpr uint32_t in1_cb_id = tt::CB::c_in1; + constexpr uint32_t in0_cb_id = tt::CBIndex::c_0; + constexpr uint32_t in1_cb_id = tt::CBIndex::c_1; const DataFormat in0_df = get_dataformat(in0_cb_id); @@ -54,7 +54,7 @@ void kernel_main() { // read bias first if defined #ifdef FUSE_BIAS - constexpr uint32_t bias_cb_id = tt::CB::c_in2; + constexpr uint32_t bias_cb_id = tt::CBIndex::c_2; uint32_t bias_addr = get_arg_val(22); uint32_t bias_width_ntiles = get_arg_val(23); uint32_t bias_log2_of_pagesize = get_arg_val(24); diff --git a/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_receiver_padding_block_sharded.cpp b/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_receiver_padding_block_sharded.cpp index 581481ff504..5d7279d5629 100644 --- a/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_receiver_padding_block_sharded.cpp +++ b/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_receiver_padding_block_sharded.cpp @@ -67,7 +67,7 @@ void kernel_main() { reinterpret_cast(in0_mcast_sender_semaphore_addr); // L1 array - constexpr uint32_t cb_l1_array = tt::CB::c_in5; + constexpr uint32_t cb_l1_array = tt::CBIndex::c_5; uint32_t in0_mcast_sender_semaphore_valid_addr = get_write_ptr(cb_l1_array); volatile tt_l1_ptr uint32_t* in0_mcast_sender_semaphore_valid_addr_ptr = reinterpret_cast(in0_mcast_sender_semaphore_valid_addr); diff --git a/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/writer_bmm_single_core_tiled.cpp b/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/writer_bmm_single_core_tiled.cpp index e4e3fe333d4..d3fc7aab097 100644 --- a/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/writer_bmm_single_core_tiled.cpp +++ b/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/writer_bmm_single_core_tiled.cpp @@ -19,7 +19,7 @@ void kernel_main() { 
uint32_t out_num_subblocks_w = get_arg_val(9); uint32_t out_num_subblocks_h = get_arg_val(10); - constexpr uint32_t out_cb_id = tt::CB::c_out0; + constexpr uint32_t out_cb_id = tt::CBIndex::c_16; const uint32_t tile_nbytes = get_tile_size(out_cb_id); const DataFormat out_df = get_dataformat(out_cb_id); diff --git a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.cpp b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.cpp index 39c2a4ce69c..19e443d2fa7 100644 --- a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.cpp +++ b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.cpp @@ -439,9 +439,9 @@ inline MatmulProgramConfig create_simple_matmul_program_config( && mem_config.buffer_type == BufferType::DRAM && num_cores_x == 8 && num_cores_y == 8) { - in0_block_w = !transpose_mcast ? Kt / num_cores_x : Kt / num_cores_y; - per_core_M = !transpose_mcast ? Mt / num_cores_y : Mt / num_cores_x; - per_core_N = !transpose_mcast ? Nt / num_cores_x : Nt / num_cores_y; + in0_block_w = !transpose_mcast ? (Kt % num_cores_x == 0 ? Kt / num_cores_x : 1) : (Kt % num_cores_x == 0 ? Kt / num_cores_y : 1); + per_core_M = !transpose_mcast ? tt::div_up(Mt, num_cores_y) : tt::div_up(Mt, num_cores_x); + per_core_N = !transpose_mcast ? 
tt::div_up(Nt, num_cores_x) : tt::div_up(Nt, num_cores_y); auto mutlti_dim_per_core_factor = get_multi_dim_per_core_factor(input_tensor_a, input_tensor_b, per_core_M, per_core_N, in0_block_w, tt_metal::detail::TileSize(tt::DataFormat::Float16_b)); out_block_h = mutlti_dim_per_core_factor[0]; @@ -449,7 +449,7 @@ inline MatmulProgramConfig create_simple_matmul_program_config( in0_block_w = mutlti_dim_per_core_factor[2]; auto subblock_hw = bmm_op_utils::get_matmul_subblock_params( - per_core_M, per_core_N, false, false, false); + out_block_h, out_block_w, false, false, false); out_subblock_h = std::get<0>(subblock_hw); out_subblock_w = std::get<1>(subblock_hw); } diff --git a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_program_factory.cpp b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_program_factory.cpp index ddaeaa504cb..eb24a228191 100644 --- a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_program_factory.cpp @@ -82,7 +82,7 @@ operation::ProgramWithCallbacks matmul_multi_core(const Tensor &a, const Tensor .set_page_size(src1_cb_index, in1_single_tile_size); auto cb_src1 = tt_metal::CreateCircularBuffer(program, all_cores, src1_cb_config); - uint32_t output_cb_index = 16; // output operands start at index 16 + uint32_t output_cb_index = tt::CBIndex::c_16; uint32_t num_output_tiles = 2; tt_metal::CircularBufferConfig output_cb_config = tt_metal::CircularBufferConfig( diff --git a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_1d_program_factory.cpp b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_1d_program_factory.cpp index 1ef14eaf848..cbd527a6bec 100644 --- a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_1d_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_1d_program_factory.cpp @@ -585,7 +585,7 @@ 
operation::ProgramWithCallbacks create_program_mcast_in0( tt_metal::CreateCircularBuffer(program, all_cores, cb_for_l1_array_config); } - uint32_t output_cb_index = 16; // output operands start at index 16 + uint32_t output_cb_index = tt::CBIndex::c_16; uint32_t interm0_cb_index = 24; tt_metal::CircularBufferConfig interm0_cb_config = tt_metal::CircularBufferConfig(0, {{interm0_cb_index, interm0_data_format}}); @@ -1344,7 +1344,7 @@ operation::ProgramWithCallbacks create_program_mcast_in1( in1_CB_size / in1_single_tile_size, in1_CB_size); - uint32_t output_cb_index = 16; // output operands start at index 16 + uint32_t output_cb_index = tt::CBIndex::c_16; uint32_t interm0_cb_index = 24; tt_metal::CircularBufferConfig interm0_cb_config = tt_metal::CircularBufferConfig(0, {{interm0_cb_index, interm0_data_format}}); diff --git a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_2d_program_factory.cpp b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_2d_program_factory.cpp index 70736290e0d..55ec0fc7677 100644 --- a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_2d_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_2d_program_factory.cpp @@ -769,7 +769,7 @@ operation::ProgramWithCallbacks create_program_mcast_in0_in1( tt_metal::CreateCircularBuffer(program, all_cores, cb_for_l1_array_config); } - uint32_t output_cb_index = 16; // output operands start at index 16 + uint32_t output_cb_index = tt::CBIndex::c_16; uint32_t interm0_cb_index = 24; tt_metal::CircularBufferConfig interm0_cb_config = tt_metal::CircularBufferConfig(0, {{interm0_cb_index, interm0_data_format}}); diff --git a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_dram_sharded_program_factory.cpp b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_dram_sharded_program_factory.cpp index a4996e3b19c..b6994c59a3c 100644 --- 
a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_dram_sharded_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_dram_sharded_program_factory.cpp @@ -853,7 +853,7 @@ operation::ProgramWithCallbacks create_program_dram_sharded( in2_CB_size / in0_single_tile_size, in2_CB_size); - uint32_t output_cb_index = 16; // output operands start at index 16 + uint32_t output_cb_index = tt::CBIndex::c_16; uint32_t interm0_cb_index = 24; tt_metal::CircularBufferConfig interm0_cb_config = tt_metal::CircularBufferConfig(0, {{interm0_cb_index, interm0_data_format}}); diff --git a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_optimized_program_factory.cpp b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_optimized_program_factory.cpp index 614d1627844..13cb924f2dc 100644 --- a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_optimized_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_optimized_program_factory.cpp @@ -296,7 +296,7 @@ operation::ProgramWithCallbacks create_program( } auto cb_src1 = tt_metal::CreateCircularBuffer(program, all_cores, cb_src1_config); - uint32_t output_cb_index = 16; // output operands start at index 16 + uint32_t output_cb_index = tt::CBIndex::c_16; uint32_t interm0_cb_index = 24; tt_metal::CircularBufferConfig interm0_cb_config = tt_metal::CircularBufferConfig(0, {{interm0_cb_index, interm0_data_format}}); diff --git a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_program_factory.cpp b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_program_factory.cpp index c5e0ee8c241..cc0eb389500 100644 --- a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_program_factory.cpp @@ -100,7 +100,7 @@ tt_metal::operation::ProgramWithCallbacks create_program( 
.set_page_size(src1_cb_index, in1_single_tile_size); auto cb_src1 = tt_metal::CreateCircularBuffer(program, all_cores, src1_cb_config); - uint32_t output_cb_index = 16; // output operands start at index 16 + uint32_t output_cb_index = tt::CBIndex::c_16; uint32_t interm0_cb_index = 24; std::map output_cb_data_format_spec{ {output_cb_index, out_cb_data_format}, {interm0_cb_index, out_cb_data_format}}; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_abs_pow/device/kernels/moreh_abs_pow_kernel.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_abs_pow/device/kernels/moreh_abs_pow_kernel.cpp new file mode 100644 index 00000000000..cd752419f59 --- /dev/null +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_abs_pow/device/kernels/moreh_abs_pow_kernel.cpp @@ -0,0 +1,91 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 +#include "debug/dprint.h" +#include "ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/moreh_common.hpp" + +namespace NAMESPACE { +void MAIN { + int i{0}; + const auto num_rows_per_core = get_arg_val(i++); + const auto Wt = get_arg_val(i++); + const auto origin_w = get_arg_val(i++); + const auto p = get_arg_val(i++); + const bool p_is_negative = get_arg_val(i++) == 1; + + std::uint8_t input_id{tt::CB::c_in0}; + const auto cb_x = input_id++; // input + const auto cb_one = input_id++; // one + const auto cb_decimal = input_id++; // decimal + const auto cb_mask_w = input_id++; // mask_w + + std::uint8_t output_id{tt::CB::c_out0}; + const auto cb_y = output_id++; // output + + std::uint8_t intermed_id{tt::CB::c_intermed0}; + const auto cb_tmp0 = intermed_id++; + const auto cb_tmp1 = intermed_id++; + const auto cb_tmp2 = intermed_id++; + const auto cb_tmp3 = intermed_id++; + + const auto cb_xabs = cb_tmp0; // |x| + const auto cb_xpow = cb_tmp1; // |x|^p + const auto cb_logx = cb_tmp2; // log(|x|) + const auto cb_exp_lxmd = cb_tmp3; // exp(log(|x|) * decimal) + + constexpr uint32_t onetile = 1; + constexpr uint32_t dst0 = 0; + 
constexpr uint32_t dst1 = 1; + + binary_op_init_common(tt::CB::c_in0, tt::CB::c_in0); + + cb_wait_front(cb_one, onetile); // comes from the reader + cb_wait_front(cb_decimal, onetile); // comes from the reader + + constexpr uint32_t TILE_W = 32; + const bool do_mask_w = (origin_w % TILE_W) != 0; + const auto mask_w = do_mask_w ? (origin_w % TILE_W) : TILE_W; + + if (do_mask_w) { + cb_wait_front(cb_mask_w, onetile); // comes from the reader + } + for (uint32_t row_idx = 0; row_idx < num_rows_per_core; ++row_idx) { + for (uint32_t col_idx = 0; col_idx < Wt; ++col_idx) { + // |x| + tile_regs_acquire(); + cb_wait_front(cb_x, onetile); // comes from the reader + cb_reserve_back(cb_xabs, onetile); + + copy_tile_init_with_dt(cb_x); + copy_tile(cb_x, 0, dst0); + + if (do_mask_w && (col_idx == Wt - 1)) { + copy_tile_init_with_dt(cb_mask_w); + copy_tile(cb_mask_w, 0, dst1); + + mask_tile_init(); + mask_tile(dst0, dst1); + } + + abs_tile_init(); + abs_tile(dst0); + tile_regs_commit(); + + tile_regs_wait(); + pack_tile_with_dt(dst0, cb_xabs); + tile_regs_release(); + + cb_pop_front(cb_x, onetile); + cb_push_back(cb_xabs, onetile); + + power_tile_to_cb(cb_xabs, cb_xpow, cb_logx, cb_decimal, cb_exp_lxmd, cb_y, p, p_is_negative); + } + } + + cb_pop_front(cb_one, onetile); + cb_pop_front(cb_decimal, onetile); + if (do_mask_w) { + cb_pop_front(cb_mask_w, onetile); + } +} // void MAIN +} // namespace NAMESPACE diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_abs_pow/device/kernels/reader_moreh_abs_pow.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_abs_pow/device/kernels/reader_moreh_abs_pow.cpp new file mode 100644 index 00000000000..e938be428af --- /dev/null +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_abs_pow/device/kernels/reader_moreh_abs_pow.cpp @@ -0,0 +1,62 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include "ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/dataflow/moreh_common.hpp" + +void kernel_main() { + int i{0}; + const auto input_addr = get_arg_val(i++); + const bool input_is_dram = get_arg_val(i++) == 1; + const auto decimal = get_arg_val(i++); + const auto num_rows_per_core = get_arg_val(i++); + const auto Wt = get_arg_val(i++); + const auto tile_offset = get_arg_val(i++); + const auto origin_w = get_arg_val(i++); + + uint32_t cb_id{0}; + const auto cb_id_input = cb_id++; + const auto cb_id_one = cb_id++; + const auto cb_id_decimal = cb_id++; + const auto cb_id_mask_w = cb_id++; + + const uint32_t input_tile_bytes = get_tile_size(cb_id_input); + const auto input_data_format = get_dataformat(cb_id_input); + + const InterleavedAddrGenFast dram_input_addrg = { + .bank_base_address = input_addr, .page_size = input_tile_bytes, .data_format = input_data_format}; + + const InterleavedAddrGenFast l1_input_addrg = { + .bank_base_address = input_addr, .page_size = input_tile_bytes, .data_format = input_data_format}; + + Scalar one; + one.f = 1.0f; + fill_cb_with_value(cb_id_one, one.u); + fill_cb_with_value(cb_id_decimal, decimal); + + constexpr uint32_t TILE_W = 32; + const bool do_mask_w = (origin_w % TILE_W) != 0; + const auto mask_w = do_mask_w ? 
(origin_w % TILE_W) : TILE_W; + + if (do_mask_w) { + generate_mask_w(cb_id_mask_w, mask_w); + } + + const auto start_tile_idx = tile_offset; + const auto input_l1_write_ptr = get_write_ptr(cb_id_input); + + for (uint32_t row_idx = 0; row_idx < num_rows_per_core; ++row_idx) { + for (uint32_t col_idx = 0; col_idx < Wt; ++col_idx) { + const auto tile_idx = start_tile_idx + row_idx * Wt + col_idx; + cb_reserve_back(cb_id_input, 1); + if (input_is_dram) { + noc_async_read_tile(tile_idx, dram_input_addrg, input_l1_write_ptr); + } else { + noc_async_read_tile(tile_idx, l1_input_addrg, input_l1_write_ptr); + } + noc_async_read_barrier(); + cb_push_back(cb_id_input, 1); + } + } + +} // void kernel_main() diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_abs_pow/device/kernels/writer_moreh_abs_pow.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_abs_pow/device/kernels/writer_moreh_abs_pow.cpp new file mode 100644 index 00000000000..33d164d781c --- /dev/null +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_abs_pow/device/kernels/writer_moreh_abs_pow.cpp @@ -0,0 +1,45 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "dataflow_api.h" + +void kernel_main() { + int i{0}; + const auto output_addr = get_arg_val(i++); + const bool output_is_dram = get_arg_val(i++) == 1; + const auto num_rows_per_core = get_arg_val(i++); + const auto Wt = get_arg_val(i++); + const auto tile_offset = get_arg_val(i++); + + uint32_t cb_id{16}; + const auto cb_id_output = cb_id++; + + const uint32_t output_tile_bytes = get_tile_size(cb_id_output); + const auto output_data_format = get_dataformat(cb_id_output); + + const InterleavedAddrGenFast dram_output_addrg = { + .bank_base_address = output_addr, .page_size = output_tile_bytes, .data_format = output_data_format}; + + const InterleavedAddrGenFast l1_output_addrg = { + .bank_base_address = output_addr, .page_size = output_tile_bytes, .data_format = output_data_format}; + + const auto start_tile_idx = tile_offset; + const auto output_l1_read_addr = get_read_ptr(cb_id_output); + + for (uint32_t row_idx = 0; row_idx < num_rows_per_core; ++row_idx) { + for (uint32_t col_idx = 0; col_idx < Wt; ++col_idx) { + const auto tile_idx = start_tile_idx + row_idx * Wt + col_idx; + cb_wait_front(cb_id_output, 1); + if (output_is_dram) { + noc_async_write_tile(tile_idx, dram_output_addrg, output_l1_read_addr); + } else { + noc_async_write_tile(tile_idx, l1_output_addrg, output_l1_read_addr); + } + noc_async_write_barrier(); + cb_pop_front(cb_id_output, 1); + } + } +} // void kernel_main() diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_abs_pow/device/moreh_abs_pow_device_operation.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_abs_pow/device/moreh_abs_pow_device_operation.cpp new file mode 100644 index 00000000000..61b4be98bc0 --- /dev/null +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_abs_pow/device/moreh_abs_pow_device_operation.cpp @@ -0,0 +1,84 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include "moreh_abs_pow_device_operation.hpp" + +#include "ttnn/operations/moreh/moreh_helper_functions.hpp" +#include "ttnn/tensor/tensor.hpp" + +namespace ttnn::operations::moreh::moreh_abs_pow { + +std::tuple<uint32_t, float, bool> get_floored_p_and_decimal_and_p_is_negative(float p) { + auto floored_p = std::floor(p); + auto decimal = p - floored_p; + bool p_is_negative = floored_p < 0.0f; + if (p_is_negative) + floored_p = -floored_p; + return std::make_tuple(static_cast<uint32_t>(floored_p), decimal, p_is_negative); +} + +MorehAbsPowOperation::program_factory_t MorehAbsPowOperation::select_program_factory( + const operation_attributes_t& operation_attributes, const tensor_args_t& tensor_args) { + // Case for int32 + return MorehAbsPowFactory{}; +} + +void validate_tensors( + const MorehAbsPowOperation::operation_attributes_t& operation_attributes, + const MorehAbsPowOperation::tensor_args_t& tensor_args) { + const auto& input = tensor_args.input; + auto& output = tensor_args.output; + + check_tensor(input, "moreh_abs_pow", "input", {DataType::BFLOAT16, DataType::INT32}); + check_tensor(output, "moreh_abs_pow", "output", {DataType::BFLOAT16, DataType::INT32}); +} + +void MorehAbsPowOperation::validate_on_program_cache_miss( + const operation_attributes_t& operation_attributes, const tensor_args_t& tensor_args) { + validate_tensors(operation_attributes, tensor_args); +}; + +void MorehAbsPowOperation::validate_on_program_cache_hit( + const operation_attributes_t& operation_attributes, const tensor_args_t& tensor_args) { + validate_tensors(operation_attributes, tensor_args); +}; +MorehAbsPowOperation::shape_return_value_t MorehAbsPowOperation::compute_output_shapes( + const operation_attributes_t& operation_attributes, const tensor_args_t& tensor_args) { + const auto& input = tensor_args.input; + const auto& input_shape = input.get_shape(); + return input_shape; +}; + +MorehAbsPowOperation::tensor_return_value_t 
MorehAbsPowOperation::create_output_tensors( + const operation_attributes_t& operation_attributes, const tensor_args_t& tensor_args) { + if (tensor_args.output.has_value()) { + log_debug(tt::LogOp, "{}:{} use output tensor", __func__, __LINE__); + return {tensor_args.output.value()}; + } + + log_debug(tt::LogOp, "{}:{} create output tensor", __func__, __LINE__); + return create_device_tensor( + compute_output_shapes(operation_attributes, tensor_args), + tensor_args.input.get_dtype(), + tensor_args.input.get_layout(), + tensor_args.input.device(), + operation_attributes.memory_config); +}; + +std::tuple +MorehAbsPowOperation::invoke( + const Tensor& input, + const float p, + const std::optional& output, + const std::optional& memory_config, + const std::optional& compute_kernel_config) { + const operation_attributes_t operation_attributes{ + p, + memory_config.value_or(input.memory_config()), + init_device_compute_kernel_config(input.device()->arch(), compute_kernel_config, MathFidelity::HiFi4)}; + const tensor_args_t tensor_args{input, output}; + + return {operation_attributes, tensor_args}; +} +} // namespace ttnn::operations::moreh::moreh_abs_pow diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_abs_pow/device/moreh_abs_pow_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_abs_pow/device/moreh_abs_pow_device_operation.hpp new file mode 100644 index 00000000000..586c82d16c1 --- /dev/null +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_abs_pow/device/moreh_abs_pow_device_operation.hpp @@ -0,0 +1,76 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include + +#include "ttnn/decorators.hpp" +#include "ttnn/operations/core/compute_kernel/compute_kernel_config.hpp" +#include "ttnn/tensor/types.hpp" + +#define MOREH_ABS_POW_FACTORY_H(name) \ + struct name { \ + struct shared_variables_t { \ + KernelHandle reader_kernels_id; \ + KernelHandle writer_kernels_id; \ + std::size_t num_cores_to_be_used; \ + std::size_t num_cores_y; \ + }; \ + \ + using cached_program_t = ttnn::device_operation::CachedProgram; \ + \ + static cached_program_t create( \ + const operation_attributes_t& operation_attributes, \ + const tensor_args_t& tensor_args, \ + tensor_return_value_t& output_tensor); \ + \ + static void override_runtime_arguments( \ + cached_program_t& cached_program, \ + const operation_attributes_t& operation_attributes, \ + const tensor_args_t& tensor_args, \ + tensor_return_value_t& output_tensor); \ + }; + +namespace ttnn::operations::moreh::moreh_abs_pow { + +std::tuple get_floored_p_and_decimal_and_p_is_negative(float p); + +struct MorehAbsPowOperation { + struct operation_attributes_t { + const float p; + + const MemoryConfig memory_config; + const DeviceComputeKernelConfig compute_kernel_config; + }; + struct tensor_args_t { + const Tensor& input; + const std::optional& output; + }; + + using shape_return_value_t = Shape; + using tensor_return_value_t = Tensor; + + MOREH_ABS_POW_FACTORY_H(MorehAbsPowFactory) + + using program_factory_t = std::variant; + static program_factory_t select_program_factory(const operation_attributes_t&, const tensor_args_t&); + static void validate_on_program_cache_miss(const operation_attributes_t&, const tensor_args_t&); + static void validate_on_program_cache_hit(const operation_attributes_t&, const tensor_args_t&); + static shape_return_value_t compute_output_shapes(const operation_attributes_t&, const tensor_args_t&); + static tensor_return_value_t create_output_tensors(const operation_attributes_t&, const 
tensor_args_t&); + static std::tuple invoke( + const Tensor& input, + const float p, + const std::optional& output, + const std::optional& memory_config, + const std::optional& compute_kernel_config); +}; + +} // namespace ttnn::operations::moreh::moreh_abs_pow + +namespace ttnn::prim { +constexpr auto moreh_abs_pow = ttnn:: + register_operation<"ttnn::prim::moreh_abs_pow", ttnn::operations::moreh::moreh_abs_pow::MorehAbsPowOperation>(); +} // namespace ttnn::prim diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_abs_pow/device/moreh_abs_pow_program_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_abs_pow/device/moreh_abs_pow_program_factory.cpp new file mode 100644 index 00000000000..a5fd5035567 --- /dev/null +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_abs_pow/device/moreh_abs_pow_program_factory.cpp @@ -0,0 +1,205 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include "moreh_abs_pow_device_operation.hpp" +#include "tt_metal/common/work_split.hpp" +#include "ttnn/operations/moreh/moreh_helper_functions.hpp" + +namespace ttnn::operations::moreh::moreh_abs_pow { +MorehAbsPowOperation::MorehAbsPowFactory::cached_program_t MorehAbsPowOperation::MorehAbsPowFactory::create( + const operation_attributes_t &operation_attributes, + const tensor_args_t &tensor_args, + tensor_return_value_t &output) { + const auto &input = tensor_args.input; + const auto p = operation_attributes.p; + //////////////////////////////////////////////////////////////////////////// + // Device Setup + //////////////////////////////////////////////////////////////////////////// + auto device = input.device(); + auto program = CreateProgram(); + + //////////////////////////////////////////////////////////////////////////// + // Parameters Setup + //////////////////////////////////////////////////////////////////////////// + const auto input_shape = input.get_legacy_shape(); + const auto input_rank = input_shape.rank(); + + const auto H = 
input_shape[-2]; + const auto W = input_shape[-1]; + + const auto Ht = H / tt::constants::TILE_HEIGHT; + const auto Wt = W / tt::constants::TILE_WIDTH; + + const auto num_units = input.volume() / H / W * Ht; + + const auto origin_w = input_shape.without_padding()[input_rank - 1]; + + auto [floored_p, decimal, p_is_negative] = get_floored_p_and_decimal_and_p_is_negative(p); + //////////////////////////////////////////////////////////////////////////// + // Core Setup + //////////////////////////////////////////////////////////////////////////// + auto grid = device->compute_with_storage_grid_size(); + const auto num_cores_y = grid.y; + + auto arch = input.device()->arch(); + auto [math_fidelity, math_approx_mode, fp32_dest_acc_en, packer_l1_acc, dst_full_sync_en] = + get_compute_kernel_config_args(arch, operation_attributes.compute_kernel_config); + + const auto + [num_cores_to_be_used, + all_cores, + core_group_1, + core_group_2, + num_units_per_core_group_1, + num_units_per_core_group_2] = tt::tt_metal::split_work_to_cores(grid, num_units); + + //////////////////////////////////////////////////////////////////////////// + // CircularBuffer Setup + //////////////////////////////////////////////////////////////////////////// + const auto cb_data_format = tt::tt_metal::datatype_to_dataformat_converter(input.get_dtype()); + const auto intermed_data_format = fp32_dest_acc_en ? 
tt::DataFormat::Float32 : cb_data_format; + + const uint32_t in0_t{1}; // input + const uint32_t in1_t{1}; // one + const uint32_t in2_t{1}; // recip_p_decimal + const uint32_t in3_t{1}; // mask_w + + const uint32_t out0_t{1}; // output + + const uint32_t im0_t{1}; // |x| + const uint32_t im1_t{1}; // log(|x|) + const uint32_t im2_t{1}; // exp(log(|x|) * decimal) + const uint32_t im3_t{1}; // |x|^p + + CreateCircularBuffer( + program, + all_cores, + cb_data_format, + { + {tt::CB::c_in0, in0_t}, // input + {tt::CB::c_in1, in1_t}, // one + {tt::CB::c_in2, in2_t}, // recip_p_decimal + {tt::CB::c_in3, in3_t}, // mask_w + {tt::CB::c_out0, out0_t}, // output + {tt::CB::c_intermed0, im0_t, intermed_data_format}, + {tt::CB::c_intermed1, im1_t, intermed_data_format}, + {tt::CB::c_intermed2, im2_t, intermed_data_format}, + {tt::CB::c_intermed3, im3_t, intermed_data_format}, + }); + + //////////////////////////////////////////////////////////////////////////// + // DataMovementKernel SetUp + //////////////////////////////////////////////////////////////////////////// + const auto reader_kernel_file = + "ttnn/cpp/ttnn/operations/moreh/moreh_abs_pow/device/kernels/" + "reader_moreh_abs_pow.cpp"; + const auto writer_kernel_file = + "ttnn/cpp/ttnn/operations/moreh/moreh_abs_pow/device/kernels/" + "writer_moreh_abs_pow.cpp"; + + const auto reader_kernels_id = CreateReadKernel(program, reader_kernel_file, all_cores); + const auto writer_kernels_id = CreateWriteKernel(program, writer_kernel_file, all_cores); + + //////////////////////////////////////////////////////////////////////////// + // ComputeKernel SetUp + //////////////////////////////////////////////////////////////////////////// + std::map compute_defines{}; + + const auto compute_kernel_file = + "ttnn/cpp/ttnn/operations/moreh/moreh_abs_pow/device/kernels/" + "moreh_abs_pow_kernel.cpp"; + + const auto compute_kernels_id_1 = CreateComputeKernel( + program, + compute_kernel_file, + {core_group_1, 
num_units_per_core_group_1}, + compute_defines, + math_fidelity, + fp32_dest_acc_en, + math_approx_mode); + + KernelHandle compute_kernels_id_2{0}; + if (!core_group_2.ranges().empty()) { + compute_kernels_id_2 = CreateComputeKernel( + program, + compute_kernel_file, + {core_group_2, num_units_per_core_group_2}, + compute_defines, + math_fidelity, + fp32_dest_acc_en, + math_approx_mode); + } + + //////////////////////////////////////////////////////////////////////////// + // RuntimeArgs SetUp + //////////////////////////////////////////////////////////////////////////// + for (uint32_t i = 0, tile_offset = 0; i < num_cores_to_be_used; ++i) { + CoreCoord core = {i / num_cores_y, i % num_cores_y}; + + uint32_t num_units_per_core; + KernelHandle compute_kernel_id; + if (core_group_1.contains(core)) { + num_units_per_core = num_units_per_core_group_1; + compute_kernel_id = compute_kernels_id_1; + } else if (core_group_2.contains(core)) { + num_units_per_core = num_units_per_core_group_2; + compute_kernel_id = compute_kernels_id_2; + } else { + TT_THROW("Core not in specified core ranges."); + } + + // reader + const std::vector reader_runtime_args{ + input.buffer()->address(), + static_cast(is_dram(input)), + *reinterpret_cast(&decimal), + num_units_per_core, + Wt, + tile_offset, + origin_w}; + SetRuntimeArgs(program, reader_kernels_id, core, reader_runtime_args); + + // writer + const std::vector writer_runtime_args{ + output.buffer()->address(), static_cast(is_dram(output)), num_units_per_core, Wt, tile_offset}; + SetRuntimeArgs(program, writer_kernels_id, core, writer_runtime_args); + + // compute + const std::vector compute_runtime_args{ + num_units_per_core, Wt, origin_w, floored_p, static_cast(p_is_negative)}; + SetRuntimeArgs(program, compute_kernel_id, core, compute_runtime_args); + + tile_offset += num_units_per_core * Wt; + } + + return {std::move(program), {reader_kernels_id, writer_kernels_id, num_cores_to_be_used, num_cores_y}}; +} + +void 
MorehAbsPowOperation::MorehAbsPowFactory::override_runtime_arguments( + cached_program_t &cached_program, + const operation_attributes_t &operation_attributes, + const tensor_args_t &tensor_args, + tensor_return_value_t &output) { + auto &program = cached_program.program; + auto &reader_kernels_id = cached_program.shared_variables.reader_kernels_id; + auto &writer_kernels_id = cached_program.shared_variables.writer_kernels_id; + auto &num_cores_to_be_used = cached_program.shared_variables.num_cores_to_be_used; + auto &num_cores_y = cached_program.shared_variables.num_cores_y; + + for (uint32_t icore = 0; icore < num_cores_to_be_used; icore++) { + CoreCoord core = {icore / num_cores_y, icore % num_cores_y}; + // readers + { + auto &runtime_args = GetRuntimeArgs(program, reader_kernels_id, core); + runtime_args[0] = tensor_args.input.buffer()->address(); + } + + // writer + { + auto &runtime_args = GetRuntimeArgs(program, writer_kernels_id, core); + runtime_args[0] = output.buffer()->address(); + } + } +} +} // namespace ttnn::operations::moreh::moreh_abs_pow diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_abs_pow/moreh_abs_pow.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_abs_pow/moreh_abs_pow.cpp new file mode 100644 index 00000000000..fe5e8bc58e8 --- /dev/null +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_abs_pow/moreh_abs_pow.cpp @@ -0,0 +1,18 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include "moreh_abs_pow.hpp" + +#include "ttnn/operations/moreh/moreh_abs_pow/device/moreh_abs_pow_device_operation.hpp" + +namespace ttnn::operations::moreh::moreh_abs_pow { +Tensor MorehAbsPow::invoke( + const Tensor& input, + const float p, + const std::optional& output, + const std::optional& memory_config, + const std::optional& compute_kernel_config) { + return ttnn::prim::moreh_abs_pow(input, p, output, memory_config, compute_kernel_config); +} +} // namespace ttnn::operations::moreh::moreh_abs_pow diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_abs_pow/moreh_abs_pow.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_abs_pow/moreh_abs_pow.hpp new file mode 100644 index 00000000000..22eede6bf85 --- /dev/null +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_abs_pow/moreh_abs_pow.hpp @@ -0,0 +1,25 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "ttnn/decorators.hpp" +#include "ttnn/operations/core/compute_kernel/compute_kernel_config.hpp" + +namespace ttnn::operations::moreh::moreh_abs_pow { +struct MorehAbsPow { + static Tensor invoke( + const Tensor& input, + const float p, + const std::optional& output, + const std::optional& memory_config, + const std::optional& compute_kernel_config); +}; +} // namespace ttnn::operations::moreh::moreh_abs_pow + +namespace ttnn { +constexpr auto moreh_abs_pow = ttnn::register_operation_with_auto_launch_op< + "ttnn::moreh_abs_pow", + ttnn::operations::moreh::moreh_abs_pow::MorehAbsPow>(); +} diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_abs_pow/moreh_abs_pow_pybind.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_abs_pow/moreh_abs_pow_pybind.cpp new file mode 100644 index 00000000000..40048c2042d --- /dev/null +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_abs_pow/moreh_abs_pow_pybind.cpp @@ -0,0 +1,25 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include "moreh_abs_pow_pybind.hpp" + +#include "moreh_abs_pow.hpp" +#include "pybind11/decorators.hpp" + +namespace ttnn::operations::moreh::moreh_abs_pow { +void bind_moreh_abs_pow_operation(py::module &module) { + bind_registered_operation( + module, + ttnn::moreh_abs_pow, + "Moreh Pow Operation", + ttnn::pybind_arguments_t{ + py::arg("input"), + py::arg("p"), + py::kw_only(), + py::arg("output") = std::nullopt, + py::arg("memory_config") = std::nullopt, + py::arg("compute_kernel_config") = std::nullopt, + }); +} +} // namespace ttnn::operations::moreh::moreh_abs_pow diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_abs_pow/moreh_abs_pow_pybind.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_abs_pow/moreh_abs_pow_pybind.hpp new file mode 100644 index 00000000000..44c0aed18e1 --- /dev/null +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_abs_pow/moreh_abs_pow_pybind.hpp @@ -0,0 +1,13 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "pybind11/pybind_fwd.hpp" + +namespace py = pybind11; + +namespace ttnn::operations::moreh::moreh_abs_pow { +void bind_moreh_abs_pow_operation(py::module& module); +} // namespace ttnn::operations::moreh::moreh_abs_pow diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_adam/device/kernels/moreh_adam.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_adam/device/kernels/moreh_adam.cpp index 25c9a09cfca..2fed5e3c98a 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_adam/device/kernels/moreh_adam.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_adam/device/kernels/moreh_adam.cpp @@ -23,31 +23,31 @@ void MAIN { uint32_t step = get_arg_val(0); constexpr uint32_t per_core_tile_cnt = get_compile_time_arg_val(0); - constexpr auto cb_param_in = tt::CB::c_in0; - constexpr auto cb_grad_in = tt::CB::c_in1; - constexpr auto cb_exp_avg_in = tt::CB::c_in2; - constexpr auto cb_exp_avg_sq_in = tt::CB::c_in3; + constexpr auto cb_param_in = 
tt::CBIndex::c_0; + constexpr auto cb_grad_in = tt::CBIndex::c_1; + constexpr auto cb_exp_avg_in = tt::CBIndex::c_2; + constexpr auto cb_exp_avg_sq_in = tt::CBIndex::c_3; #ifdef AMSGRAD - constexpr auto cb_max_exp_avg_sq_in = tt::CB::c_in4; + constexpr auto cb_max_exp_avg_sq_in = tt::CBIndex::c_4; #endif // lr, beta1, beta2, eps, weight_decay - constexpr auto cb_scalar_args = tt::CB::c_in5; - constexpr auto cb_one = tt::CB::c_in6; - constexpr auto cb_param_out = tt::CB::c_out0; - constexpr auto cb_exp_avg_out = tt::CB::c_out1; - constexpr auto cb_exp_avg_sq_out = tt::CB::c_out2; + constexpr auto cb_scalar_args = tt::CBIndex::c_5; + constexpr auto cb_one = tt::CBIndex::c_6; + constexpr auto cb_param_out = tt::CBIndex::c_16; + constexpr auto cb_exp_avg_out = tt::CBIndex::c_17; + constexpr auto cb_exp_avg_sq_out = tt::CBIndex::c_18; #ifdef AMSGRAD - constexpr auto cb_max_exp_avg_sq_out = tt::CB::c_out3; + constexpr auto cb_max_exp_avg_sq_out = tt::CBIndex::c_19; #endif - constexpr auto tmp_cb_grad = tt::CB::c_intermed0; - constexpr auto tmp_cb_exp_avg = tt::CB::c_intermed1; - constexpr auto tmp_cb_exp_avg_sq = tt::CB::c_intermed2; + constexpr auto tmp_cb_grad = tt::CBIndex::c_24; + constexpr auto tmp_cb_exp_avg = tt::CBIndex::c_25; + constexpr auto tmp_cb_exp_avg_sq = tt::CBIndex::c_26; #ifdef AMSGRAD - constexpr auto tmp_cb_max_exp_avg_sq = tt::CB::c_intermed3; + constexpr auto tmp_cb_max_exp_avg_sq = tt::CBIndex::c_27; #endif - constexpr auto cb_tmp1 = tt::CB::c_intermed6; - constexpr auto cb_tmp2 = tt::CB::c_intermed7; + constexpr auto cb_tmp1 = tt::CBIndex::c_30; + constexpr auto cb_tmp2 = tt::CBIndex::c_31; constexpr uint32_t dst0 = 0; constexpr uint32_t dst1 = 1; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_adam/device/kernels/reader_moreh_adam.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_adam/device/kernels/reader_moreh_adam.cpp index 7955f7193a2..9fc5c3bbe23 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_adam/device/kernels/reader_moreh_adam.cpp +++ 
b/ttnn/cpp/ttnn/operations/moreh/moreh_adam/device/kernels/reader_moreh_adam.cpp @@ -41,14 +41,14 @@ void kernel_main() { const auto num_tiles_per_core = get_arg_val(12); const auto start_id = get_arg_val(13); - constexpr uint32_t cb_id_param = tt::CB::c_in0; - constexpr uint32_t cb_id_grad = tt::CB::c_in1; - constexpr uint32_t cb_id_exp_avg = tt::CB::c_in2; - constexpr uint32_t cb_id_exp_avg_sq = tt::CB::c_in3; + constexpr uint32_t cb_id_param = tt::CBIndex::c_0; + constexpr uint32_t cb_id_grad = tt::CBIndex::c_1; + constexpr uint32_t cb_id_exp_avg = tt::CBIndex::c_2; + constexpr uint32_t cb_id_exp_avg_sq = tt::CBIndex::c_3; // lr, beta1, beta2, eps, weight_decay - constexpr uint32_t cb_scalar_args = tt::CB::c_in5; - constexpr uint32_t cb_id_one = tt::CB::c_in6; + constexpr uint32_t cb_scalar_args = tt::CBIndex::c_5; + constexpr uint32_t cb_id_one = tt::CBIndex::c_6; const uint32_t param_tile_bytes = get_tile_size(cb_id_param); const auto param_data_format = get_dataformat(cb_id_param); @@ -83,7 +83,7 @@ void kernel_main() { .data_format = exp_avg_sq_data_format}; #ifdef AMSGRAD - constexpr uint32_t cb_id_max_exp_avg_sq = tt::CB::c_in4; + constexpr uint32_t cb_id_max_exp_avg_sq = tt::CBIndex::c_4; const auto max_exp_avg_sq_addr = get_arg_val(4); const uint32_t max_exp_avg_sq_tile_bytes = get_tile_size(cb_id_max_exp_avg_sq); const auto max_exp_avg_sq_data_format = get_dataformat(cb_id_max_exp_avg_sq); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_adam/device/kernels/writer_moreh_adam.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_adam/device/kernels/writer_moreh_adam.cpp index f681ea769be..dad08b4eef7 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_adam/device/kernels/writer_moreh_adam.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_adam/device/kernels/writer_moreh_adam.cpp @@ -12,9 +12,9 @@ void kernel_main() { const auto num_tiles_per_core = get_arg_val(4); const auto start_id = get_arg_val(5); - constexpr uint32_t cb_id_param = tt::CB::c_out0; - constexpr 
uint32_t cb_id_exp_avg = tt::CB::c_out1; - constexpr uint32_t cb_id_exp_avg_sq = tt::CB::c_out2; + constexpr uint32_t cb_id_param = tt::CBIndex::c_16; + constexpr uint32_t cb_id_exp_avg = tt::CBIndex::c_17; + constexpr uint32_t cb_id_exp_avg_sq = tt::CBIndex::c_18; const uint32_t param_tile_bytes = get_tile_size(cb_id_param); const auto param_data_format = get_dataformat(cb_id_param); @@ -42,7 +42,7 @@ void kernel_main() { .bank_base_address = exp_avg_sq_addr, .page_size = exp_avg_sq_tile_bytes, .data_format = exp_avg_sq_data_format}; #ifdef AMSGRAD - constexpr uint32_t cb_id_max_exp_avg_sq = tt::CB::c_out3; + constexpr uint32_t cb_id_max_exp_avg_sq = tt::CBIndex::c_19; const auto max_exp_avg_sq_addr = get_arg_val(3); const uint32_t max_exp_avg_sq_tile_bytes = get_tile_size(cb_id_max_exp_avg_sq); const auto max_exp_avg_sq_data_format = get_dataformat(cb_id_max_exp_avg_sq); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_adam/device/moreh_adam_program_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_adam/device/moreh_adam_program_factory.cpp index 17206902fcf..fc64989864e 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_adam/device/moreh_adam_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_adam/device/moreh_adam_program_factory.cpp @@ -66,27 +66,27 @@ MorehAdamOperation::ProgramFactory::cached_program_t MorehAdamOperation::Program all_cores, data_format, { - {tt::CB::c_in0, 1}, // param_in - {tt::CB::c_in1, 1}, // grad - {tt::CB::c_in2, 1}, // exp_avg_in - {tt::CB::c_in3, 1}, // exp_avg_sq_in - {tt::CB::c_in4, 1}, // max_exp_avg_sq_in (optional) - {tt::CB::c_in5, 5, intermed_cb_format}, // lr, beta1, beta2, eps, weight_decay - {tt::CB::c_in6, 1, intermed_cb_format}, // 1.0f - - {tt::CB::c_intermed0, 1, intermed_cb_format}, // tmp_grad - {tt::CB::c_intermed1, 1, intermed_cb_format}, // tmp_exp_avg - {tt::CB::c_intermed2, 1, intermed_cb_format}, // tmp_exp_avg_sq - {tt::CB::c_intermed3, 1, intermed_cb_format}, // tmp_max_exp_avg_sq - 
{tt::CB::c_intermed4, 1, intermed_cb_format}, // - {tt::CB::c_intermed5, 1, intermed_cb_format}, // - {tt::CB::c_intermed6, 1, intermed_cb_format}, // tmp1 - {tt::CB::c_intermed7, 1, intermed_cb_format}, // tmp2 - - {tt::CB::c_out0, 1}, // param_out - {tt::CB::c_out1, 1}, // exp_avg_out - {tt::CB::c_out2, 1}, // exp_avg_sq_out - {tt::CB::c_out3, 1}, // max_exp_avg_sq_out (optional) + {tt::CBIndex::c_0, 1}, // param_in + {tt::CBIndex::c_1, 1}, // grad + {tt::CBIndex::c_2, 1}, // exp_avg_in + {tt::CBIndex::c_3, 1}, // exp_avg_sq_in + {tt::CBIndex::c_4, 1}, // max_exp_avg_sq_in (optional) + {tt::CBIndex::c_5, 5, intermed_cb_format}, // lr, beta1, beta2, eps, weight_decay + {tt::CBIndex::c_6, 1, intermed_cb_format}, // 1.0f + + {tt::CBIndex::c_24, 1, intermed_cb_format}, // tmp_grad + {tt::CBIndex::c_25, 1, intermed_cb_format}, // tmp_exp_avg + {tt::CBIndex::c_26, 1, intermed_cb_format}, // tmp_exp_avg_sq + {tt::CBIndex::c_27, 1, intermed_cb_format}, // tmp_max_exp_avg_sq + {tt::CBIndex::c_28, 1, intermed_cb_format}, // + {tt::CBIndex::c_29, 1, intermed_cb_format}, // + {tt::CBIndex::c_30, 1, intermed_cb_format}, // tmp1 + {tt::CBIndex::c_31, 1, intermed_cb_format}, // tmp2 + + {tt::CBIndex::c_16, 1}, // param_out + {tt::CBIndex::c_17, 1}, // exp_avg_out + {tt::CBIndex::c_18, 1}, // exp_avg_sq_out + {tt::CBIndex::c_19, 1}, // max_exp_avg_sq_out (optional) }); //////////////////////////////////////////////////////////////////////////// diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_adam/moreh_adam.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_adam/moreh_adam.cpp index a22063a68a7..207f284ac73 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_adam/moreh_adam.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_adam/moreh_adam.cpp @@ -7,6 +7,8 @@ #include "ttnn/operations/moreh/moreh_adam/device/moreh_adam_device_operation.hpp" #include "ttnn/run_operation.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::moreh::moreh_adam { std::vector> MorehAdam::invoke( const 
Tensor& param_in, diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_adamw/device/kernels/moreh_adamw.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_adamw/device/kernels/moreh_adamw.cpp index af9b50bd465..4d20af27664 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_adamw/device/kernels/moreh_adamw.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_adamw/device/kernels/moreh_adamw.cpp @@ -17,33 +17,33 @@ void MAIN { uint32_t step = get_arg_val(0); constexpr uint32_t per_core_tile_cnt = get_compile_time_arg_val(0); - constexpr auto cb_param_in = tt::CB::c_in0; - constexpr auto cb_grad_in = tt::CB::c_in1; - constexpr auto cb_exp_avg_in = tt::CB::c_in2; - constexpr auto cb_exp_avg_sq_in = tt::CB::c_in3; + constexpr auto cb_param_in = tt::CBIndex::c_0; + constexpr auto cb_grad_in = tt::CBIndex::c_1; + constexpr auto cb_exp_avg_in = tt::CBIndex::c_2; + constexpr auto cb_exp_avg_sq_in = tt::CBIndex::c_3; #ifdef AMSGRAD - constexpr auto cb_max_exp_avg_sq_in = tt::CB::c_in4; + constexpr auto cb_max_exp_avg_sq_in = tt::CBIndex::c_4; #endif // lr, beta1, beta2, eps, weight_decay - constexpr auto cb_scalar_args = tt::CB::c_in5; - constexpr auto cb_one = tt::CB::c_in6; - constexpr auto cb_param_out = tt::CB::c_out0; - constexpr auto cb_exp_avg_out = tt::CB::c_out1; - constexpr auto cb_exp_avg_sq_out = tt::CB::c_out2; + constexpr auto cb_scalar_args = tt::CBIndex::c_5; + constexpr auto cb_one = tt::CBIndex::c_6; + constexpr auto cb_param_out = tt::CBIndex::c_16; + constexpr auto cb_exp_avg_out = tt::CBIndex::c_17; + constexpr auto cb_exp_avg_sq_out = tt::CBIndex::c_18; #ifdef AMSGRAD - constexpr auto cb_max_exp_avg_sq_out = tt::CB::c_out3; + constexpr auto cb_max_exp_avg_sq_out = tt::CBIndex::c_19; #endif - constexpr auto tmp_cb_param = tt::CB::c_intermed0; - constexpr auto tmp_cb_exp_avg = tt::CB::c_intermed1; - constexpr auto tmp_cb_exp_avg_sq = tt::CB::c_intermed2; + constexpr auto tmp_cb_param = tt::CBIndex::c_24; + constexpr auto tmp_cb_exp_avg = tt::CBIndex::c_25; + constexpr auto 
tmp_cb_exp_avg_sq = tt::CBIndex::c_26; #ifdef AMSGRAD - constexpr auto tmp_cb_max_exp_avg_sq = tt::CB::c_intermed3; + constexpr auto tmp_cb_max_exp_avg_sq = tt::CBIndex::c_27; #endif - constexpr auto cb_beta1_exponent = tt::CB::c_intermed4; - constexpr auto cb_beta2_exponent = tt::CB::c_intermed5; - constexpr auto cb_tmp1 = tt::CB::c_intermed6; - constexpr auto cb_tmp2 = tt::CB::c_intermed7; + constexpr auto cb_beta1_exponent = tt::CBIndex::c_28; + constexpr auto cb_beta2_exponent = tt::CBIndex::c_29; + constexpr auto cb_tmp1 = tt::CBIndex::c_30; + constexpr auto cb_tmp2 = tt::CBIndex::c_31; constexpr uint32_t dst0 = 0; constexpr uint32_t dst1 = 1; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_adamw/device/kernels/reader_moreh_adamw.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_adamw/device/kernels/reader_moreh_adamw.cpp index d8c62072147..581e0e5e894 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_adamw/device/kernels/reader_moreh_adamw.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_adamw/device/kernels/reader_moreh_adamw.cpp @@ -28,17 +28,17 @@ void kernel_main() { const auto num_tiles_per_core = get_arg_val(i++); const auto start_id = get_arg_val(i++); - constexpr uint32_t cb_id_param = tt::CB::c_in0; - constexpr uint32_t cb_id_grad = tt::CB::c_in1; - constexpr uint32_t cb_id_exp_avg = tt::CB::c_in2; - constexpr uint32_t cb_id_exp_avg_sq = tt::CB::c_in3; + constexpr uint32_t cb_id_param = tt::CBIndex::c_0; + constexpr uint32_t cb_id_grad = tt::CBIndex::c_1; + constexpr uint32_t cb_id_exp_avg = tt::CBIndex::c_2; + constexpr uint32_t cb_id_exp_avg_sq = tt::CBIndex::c_3; // lr, beta1, beta2, eps, weight_decay - constexpr uint32_t cb_scalar_args = tt::CB::c_in5; - constexpr uint32_t cb_id_one = tt::CB::c_in6; + constexpr uint32_t cb_scalar_args = tt::CBIndex::c_5; + constexpr uint32_t cb_id_one = tt::CBIndex::c_6; - constexpr uint32_t cb_beta1_exponent = tt::CB::c_intermed4; - constexpr uint32_t cb_beta2_exponent = tt::CB::c_intermed5; + constexpr uint32_t 
cb_beta1_exponent = tt::CBIndex::c_28; + constexpr uint32_t cb_beta2_exponent = tt::CBIndex::c_29; const uint32_t param_tile_bytes = get_tile_size(cb_id_param); const auto param_data_format = get_dataformat(cb_id_param); @@ -73,7 +73,7 @@ void kernel_main() { .data_format = exp_avg_sq_data_format}; #ifdef AMSGRAD - constexpr uint32_t cb_id_max_exp_avg_sq = tt::CB::c_in4; + constexpr uint32_t cb_id_max_exp_avg_sq = tt::CBIndex::c_4; const uint32_t max_exp_avg_sq_tile_bytes = get_tile_size(cb_id_max_exp_avg_sq); const auto max_exp_avg_sq_data_format = get_dataformat(cb_id_max_exp_avg_sq); const InterleavedAddrGenFast max_exp_avg_sq_addrg = { diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_adamw/device/kernels/writer_moreh_adamw.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_adamw/device/kernels/writer_moreh_adamw.cpp index ae5cb4477a4..b0b0d3c14e6 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_adamw/device/kernels/writer_moreh_adamw.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_adamw/device/kernels/writer_moreh_adamw.cpp @@ -14,9 +14,9 @@ void kernel_main() { const auto num_tiles_per_core = get_arg_val(i++); const auto start_id = get_arg_val(i++); - constexpr uint32_t cb_id_param = tt::CB::c_out0; - constexpr uint32_t cb_id_exp_avg = tt::CB::c_out1; - constexpr uint32_t cb_id_exp_avg_sq = tt::CB::c_out2; + constexpr uint32_t cb_id_param = tt::CBIndex::c_16; + constexpr uint32_t cb_id_exp_avg = tt::CBIndex::c_17; + constexpr uint32_t cb_id_exp_avg_sq = tt::CBIndex::c_18; const uint32_t param_tile_bytes = get_tile_size(cb_id_param); const auto param_data_format = get_dataformat(cb_id_param); @@ -44,7 +44,7 @@ void kernel_main() { .bank_base_address = exp_avg_sq_addr, .page_size = exp_avg_sq_tile_bytes, .data_format = exp_avg_sq_data_format}; #ifdef AMSGRAD - constexpr uint32_t cb_id_max_exp_avg_sq = tt::CB::c_out3; + constexpr uint32_t cb_id_max_exp_avg_sq = tt::CBIndex::c_19; const uint32_t max_exp_avg_sq_tile_bytes = get_tile_size(cb_id_max_exp_avg_sq); const auto 
max_exp_avg_sq_data_format = get_dataformat(cb_id_max_exp_avg_sq); const InterleavedAddrGenFast max_exp_avg_sq_addrg = { diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_adamw/device/multi_core_program_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_adamw/device/multi_core_program_factory.cpp index 2bcd605d6a9..8940419ba5f 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_adamw/device/multi_core_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_adamw/device/multi_core_program_factory.cpp @@ -69,27 +69,27 @@ MorehAdamWDeviceOperation::MultiCore::cached_program_t MorehAdamWDeviceOperation all_cores, data_format, { - {CB::c_in0, 1}, // param_in - {CB::c_in1, 1}, // grad - {CB::c_in2, 1}, // exp_avg_in - {CB::c_in3, 1}, // exp_avg_sq_in - {CB::c_in4, 1}, // max_exp_avg_sq_in (optional) - {CB::c_in5, 5, intermed_cb_format}, // lr, beta1, beta2, eps, weight_decay - {CB::c_in6, 1, intermed_cb_format}, // 1.0f - - {CB::c_intermed0, 1, intermed_cb_format}, // tmp_grad - {CB::c_intermed1, 1, intermed_cb_format}, // tmp_exp_avg - {CB::c_intermed2, 1, intermed_cb_format}, // tmp_exp_avg_sq - {CB::c_intermed3, 1, intermed_cb_format}, // tmp_max_exp_avg_sq - {CB::c_intermed4, 1, intermed_cb_format}, // beta1_exponent - {CB::c_intermed5, 1, intermed_cb_format}, // beta2_exponent - {CB::c_intermed6, 1, intermed_cb_format}, // tmp1 - {CB::c_intermed7, 1, intermed_cb_format}, // tmp2 - - {CB::c_out0, 1}, // param_out - {CB::c_out1, 1}, // exp_avg_out - {CB::c_out2, 1}, // exp_avg_sq_out - {CB::c_out3, 1}, // max_exp_avg_sq_out (optional) + {CBIndex::c_0, 1}, // param_in + {CBIndex::c_1, 1}, // grad + {CBIndex::c_2, 1}, // exp_avg_in + {CBIndex::c_3, 1}, // exp_avg_sq_in + {CBIndex::c_4, 1}, // max_exp_avg_sq_in (optional) + {CBIndex::c_5, 5, intermed_cb_format}, // lr, beta1, beta2, eps, weight_decay + {CBIndex::c_6, 1, intermed_cb_format}, // 1.0f + + {CBIndex::c_24, 1, intermed_cb_format}, // tmp_grad + {CBIndex::c_25, 1, intermed_cb_format}, // tmp_exp_avg + 
{CBIndex::c_26, 1, intermed_cb_format}, // tmp_exp_avg_sq + {CBIndex::c_27, 1, intermed_cb_format}, // tmp_max_exp_avg_sq + {CBIndex::c_28, 1, intermed_cb_format}, // beta1_exponent + {CBIndex::c_29, 1, intermed_cb_format}, // beta2_exponent + {CBIndex::c_30, 1, intermed_cb_format}, // tmp1 + {CBIndex::c_31, 1, intermed_cb_format}, // tmp2 + + {CBIndex::c_16, 1}, // param_out + {CBIndex::c_17, 1}, // exp_avg_out + {CBIndex::c_18, 1}, // exp_avg_sq_out + {CBIndex::c_19, 1}, // max_exp_avg_sq_out (optional) }); //////////////////////////////////////////////////////////////////////////// diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_adamw/moreh_adamw.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_adamw/moreh_adamw.cpp index 958d30574f6..40d94dd5488 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_adamw/moreh_adamw.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_adamw/moreh_adamw.cpp @@ -6,6 +6,8 @@ #include "ttnn/operations/moreh/moreh_adamw/device/moreh_adamw_device_operation.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::moreh::moreh_adamw { std::vector> MorehAdamw::invoke( diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_arange/device/kernels/writer_moreh_arange.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_arange/device/kernels/writer_moreh_arange.cpp index 3b4118ccccb..da5057cb47e 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_arange/device/kernels/writer_moreh_arange.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_arange/device/kernels/writer_moreh_arange.cpp @@ -15,7 +15,7 @@ void kernel_main() { uint32_t start = get_arg_val(3); uint32_t step = get_arg_val(4); - constexpr uint32_t cb_out = tt::CB::c_out0; + constexpr uint32_t cb_out = tt::CBIndex::c_16; constexpr bool dst_is_dram = get_compile_time_arg_val(0) == 1; uint32_t num_bytes_per_tile = get_tile_size(cb_out); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_arange/device/kernels/writer_moreh_arange_rm.cpp 
b/ttnn/cpp/ttnn/operations/moreh/moreh_arange/device/kernels/writer_moreh_arange_rm.cpp index 16756f34102..450c163fbeb 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_arange/device/kernels/writer_moreh_arange_rm.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_arange/device/kernels/writer_moreh_arange_rm.cpp @@ -16,7 +16,7 @@ void kernel_main() { uint32_t step = get_arg_val(4); uint32_t element_size = get_arg_val(5); - constexpr uint32_t cb_out = tt::CB::c_out0; + constexpr uint32_t cb_out = tt::CBIndex::c_16; constexpr bool dst_is_dram = get_compile_time_arg_val(0) == 1; uint32_t num_bytes_per_tile = TILE_WIDTH * element_size; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_arange/device/moreh_arange_program_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_arange/device/moreh_arange_program_factory.cpp index 73403bee5ee..4292a7ed111 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_arange/device/moreh_arange_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_arange/device/moreh_arange_program_factory.cpp @@ -31,7 +31,7 @@ MorehArangeOperation::ProgramFactory::cached_program_t MorehArangeOperation::Pro all_cores, tt::tt_metal::datatype_to_dataformat_converter(dtype), { - {tt::CB::c_out0, 1}, + {tt::CBIndex::c_16, 1}, }); // Create write kernel diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_clip_grad_norm/moreh_clip_grad_norm_step1/device/moreh_clip_grad_norm_step1_program_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_clip_grad_norm/moreh_clip_grad_norm_step1/device/moreh_clip_grad_norm_step1_program_factory.cpp index d51e19e546f..39e368b693f 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_clip_grad_norm/moreh_clip_grad_norm_step1/device/moreh_clip_grad_norm_step1_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_clip_grad_norm/moreh_clip_grad_norm_step1/device/moreh_clip_grad_norm_step1_program_factory.cpp @@ -93,17 +93,17 @@ MorehClipGradNormStep1Operation::ProgramFactory::create( core_group_1, cb_data_format, { - 
{tt::CB::c_in0, in0_t}, // input(==x) - {tt::CB::c_in1, in1_t}, // one - {tt::CB::c_in2, in2_t}, // decimal - {tt::CB::c_in3, in3_t}, // mask_h_w - {tt::CB::c_out0, out0_t}, // output(==y) - {tt::CB::c_intermed0, im0_t}, // |x| - {tt::CB::c_intermed1, im1_t}, // |x|^p - {tt::CB::c_intermed2, im2_t}, // Add[|x|^p * exp(log(|x|) * decimal)] - {tt::CB::c_intermed3, im3_t}, // log(|x|) - {tt::CB::c_intermed4, im4_t}, // exp(log(|x|) * decimal) - {tt::CB::c_intermed5, im5_t}, // |x|^p * exp(log(|x|) * decimal) + {tt::CBIndex::c_0, in0_t}, // input(==x) + {tt::CBIndex::c_1, in1_t}, // one + {tt::CBIndex::c_2, in2_t}, // decimal + {tt::CBIndex::c_3, in3_t}, // mask_h_w + {tt::CBIndex::c_16, out0_t}, // output(==y) + {tt::CBIndex::c_24, im0_t}, // |x| + {tt::CBIndex::c_25, im1_t}, // |x|^p + {tt::CBIndex::c_26, im2_t}, // Add[|x|^p * exp(log(|x|) * decimal)] + {tt::CBIndex::c_27, im3_t}, // log(|x|) + {tt::CBIndex::c_28, im4_t}, // exp(log(|x|) * decimal) + {tt::CBIndex::c_29, im5_t}, // |x|^p * exp(log(|x|) * decimal) }); //////////////////////////////////////////////////////////////////////////// diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_clip_grad_norm/moreh_clip_grad_norm_step2/device/moreh_clip_grad_norm_step2_program_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_clip_grad_norm/moreh_clip_grad_norm_step2/device/moreh_clip_grad_norm_step2_program_factory.cpp index 03427d636ed..31dae574b1b 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_clip_grad_norm/moreh_clip_grad_norm_step2/device/moreh_clip_grad_norm_step2_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_clip_grad_norm/moreh_clip_grad_norm_step2/device/moreh_clip_grad_norm_step2_program_factory.cpp @@ -68,13 +68,13 @@ MorehClipGradNormStep2Operation::ProgramFactory::create( single_core, cb_data_format, { - {tt::CB::c_in0, in0_t}, // input(==tmp_pow_sum) - {tt::CB::c_in1, in1_t}, // decimal - {tt::CB::c_out0, out0_t}, // output(==total_norm) - {tt::CB::c_intermed0, im0_t}, // 
Sum[tmp_pow_sum](==x) - {tt::CB::c_intermed1, im1_t}, // x^p - {tt::CB::c_intermed2, im2_t}, // log(x) - {tt::CB::c_intermed3, im3_t}, // exp(log(x) * decimal) + {tt::CBIndex::c_0, in0_t}, // input(==tmp_pow_sum) + {tt::CBIndex::c_1, in1_t}, // decimal + {tt::CBIndex::c_16, out0_t}, // output(==total_norm) + {tt::CBIndex::c_24, im0_t}, // Sum[tmp_pow_sum](==x) + {tt::CBIndex::c_25, im1_t}, // x^p + {tt::CBIndex::c_26, im2_t}, // log(x) + {tt::CBIndex::c_27, im3_t}, // exp(log(x) * decimal) }); //////////////////////////////////////////////////////////////////////////// diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_clip_grad_norm/moreh_clip_grad_norm_step3/device/moreh_clip_grad_norm_step3_program_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_clip_grad_norm/moreh_clip_grad_norm_step3/device/moreh_clip_grad_norm_step3_program_factory.cpp index 4b20990ce63..f78976e6c6e 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_clip_grad_norm/moreh_clip_grad_norm_step3/device/moreh_clip_grad_norm_step3_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_clip_grad_norm/moreh_clip_grad_norm_step3/device/moreh_clip_grad_norm_step3_program_factory.cpp @@ -73,9 +73,9 @@ MorehClipGradNormStep3Operation::ProgramFactory::create( core_group_1, cb_data_format, { - {tt::CB::c_in0, in0_t}, // input(inplace) - {tt::CB::c_in1, in1_t}, // clip_coef_clamped - {tt::CB::c_out0, out0_t}, // output(inplace) + {tt::CBIndex::c_0, in0_t}, // input(inplace) + {tt::CBIndex::c_1, in1_t}, // clip_coef_clamped + {tt::CBIndex::c_16, out0_t}, // output(inplace) }); //////////////////////////////////////////////////////////////////////////// diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_cumsum/device/kernels/moreh_cumsum_nc.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_cumsum/device/kernels/moreh_cumsum_nc.cpp index 093a64af336..147d94dd1ad 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_cumsum/device/kernels/moreh_cumsum_nc.cpp +++ 
b/ttnn/cpp/ttnn/operations/moreh/moreh_cumsum/device/kernels/moreh_cumsum_nc.cpp @@ -15,10 +15,10 @@ void MAIN { const auto num_tiles_to_cumsum = get_arg_val(0); const auto num_output_tiles_per_core = get_arg_val(1); - constexpr auto cb_in0 = tt::CB::c_in0; - constexpr auto cb_in1 = tt::CB::c_in1; - constexpr auto cb_out0 = tt::CB::c_out0; - constexpr auto cb_intermed0 = tt::CB::c_intermed0; + constexpr auto cb_in0 = tt::CBIndex::c_0; + constexpr auto cb_in1 = tt::CBIndex::c_1; + constexpr auto cb_out0 = tt::CBIndex::c_16; + constexpr auto cb_intermed0 = tt::CBIndex::c_24; constexpr uint32_t onetile = 1; constexpr uint32_t dst0 = 0; constexpr uint32_t first_tile = 0; @@ -51,7 +51,7 @@ void MAIN { ACQ(); cb_wait_front(cb_intermed0, onetile); copy_tile_to_dst_init_short(); - copy_tile(tt::CB::c_intermed0, first_tile, dst0); + copy_tile(tt::CBIndex::c_24, first_tile, dst0); cb_reserve_back(cb_out0, onetile); pack_tile(dst0, cb_out0); cb_push_back(cb_out0, onetile); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_cumsum/device/kernels/utils.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_cumsum/device/kernels/utils.hpp deleted file mode 100644 index c50d0254e34..00000000000 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_cumsum/device/kernels/utils.hpp +++ /dev/null @@ -1,298 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#include - -#include "dataflow_api.h" - -void fill_cb_with_value(uint32_t cb_id, uint32_t value, int32_t num_of_elems = 1024) { - cb_reserve_back(cb_id, 1); - auto ptr = reinterpret_cast(get_write_ptr(cb_id)); - for (int j = 0; j < num_of_elems; j++) { - ptr[j] = uint16_t(value >> 16); - } - cb_push_back(cb_id, 1); -} - -void generate_mask_h_w(uint32_t cb_mask_h_w, uint32_t mask_h, uint32_t mask_w, uint32_t single_tile_size = 2048) { - union { - float f; - uint32_t u; - } one; - one.f = 1.0f; - union { - float f; - uint32_t u; - } zero; - zero.f = 0.0f; - - const auto u16_one = uint16_t(one.u >> 16); - const auto u16_zero = uint16_t(zero.u >> 16); - - cb_reserve_back(cb_mask_h_w, 2); - - // mask_h - // first tile ptr - auto mask_h_ptr = reinterpret_cast(get_write_ptr(cb_mask_h_w)); - for (uint32_t w = 0; w < 16; w++) { - // sub tile 0 - { - uint32_t mask_h_0 = mask_h; - if (mask_h_0 >= 16) { - mask_h_0 = 16; - } - uint32_t h = 0; - for (; h < mask_h_0; h++) { - mask_h_ptr[h * 16 + w] = u16_one; - } - for (; h < 16; h++) { - mask_h_ptr[h * 16 + w] = u16_zero; - } - } - - // sub tile 1 - { - uint32_t mask_h_0 = mask_h; - if (mask_h_0 >= 16) { - mask_h_0 = 16; - } - uint32_t h = 0; - for (; h < mask_h_0; h++) { - mask_h_ptr[h * 16 + w + 256] = u16_one; - } - for (; h < 16; h++) { - mask_h_ptr[h * 16 + w + 256] = u16_zero; - } - } - - // sub tile 2 - { - uint32_t mask_h_1 = (mask_h < 16) ? 0 : mask_h - 16; - uint32_t h = 0; - for (; h < mask_h_1; h++) { - mask_h_ptr[h * 16 + w + 512] = u16_one; - } - for (; h < 16; h++) { - mask_h_ptr[h * 16 + w + 512] = u16_zero; - } - } - - // sub tile 3 - { - uint32_t mask_h_1 = (mask_h < 16) ? 
0 : mask_h - 16; - uint32_t h = 0; - for (; h < mask_h_1; h++) { - mask_h_ptr[h * 16 + w + 768] = u16_one; - } - for (; h < 16; h++) { - mask_h_ptr[h * 16 + w + 768] = u16_zero; - } - } - } - - // mask_w - // second tile ptr - auto mask_w_ptr = reinterpret_cast(get_write_ptr(cb_mask_h_w) + single_tile_size); - for (uint32_t h = 0; h < 16; h++) { - // sub tile 0 - { - uint32_t mask_w_0 = mask_w; - if (mask_w_0 >= 16) { - mask_w_0 = 16; - } - uint32_t w = 0; - for (; w < mask_w_0; w++) { - mask_w_ptr[h * 16 + w] = u16_one; - } - for (; w < 16; w++) { - mask_w_ptr[h * 16 + w] = u16_zero; - } - } - - // sub tile 1 - { - uint32_t mask_w_1 = (mask_w < 16) ? 0 : mask_w - 16; - uint32_t w = 0; - for (; w < mask_w_1; w++) { - mask_w_ptr[h * 16 + w + 256] = u16_one; - } - for (; w < 16; w++) { - mask_w_ptr[h * 16 + w + 256] = u16_zero; - } - } - - // sub tile 2 - { - uint32_t mask_w_0 = mask_w; - if (mask_w_0 >= 16) { - mask_w_0 = 16; - } - uint32_t w = 0; - for (; w < mask_w_0; w++) { - mask_w_ptr[h * 16 + w + 512] = u16_one; - } - for (; w < 16; w++) { - mask_w_ptr[h * 16 + w + 512] = u16_zero; - } - } - - // sub tile 3 - { - uint32_t mask_w_1 = (mask_w < 16) ? 
0 : mask_w - 16; - uint32_t w = 0; - for (; w < mask_w_1; w++) { - mask_w_ptr[h * 16 + w + 768] = u16_one; - } - for (; w < 16; w++) { - mask_w_ptr[h * 16 + w + 768] = u16_zero; - } - } - } - - cb_push_back(cb_mask_h_w, 2); -} - -void generate_mask_w(uint32_t cb_mask, uint32_t mask_w) { - union { - float f; - uint32_t u; - } one; - one.f = 1.0f; - union { - float f; - uint32_t u; - } zero; - zero.f = 0.0f; - - cb_reserve_back(cb_mask, 1); - auto ptr = reinterpret_cast(get_write_ptr(cb_mask)); - - for (uint32_t h = 0; h < 16; h++) { - // sub tile 0 - { - uint32_t mask_w_0 = mask_w; - if (mask_w_0 >= 16) - mask_w_0 = 16; - uint32_t w = 0; - for (; w < mask_w_0; w++) { - ptr[h * 16 + w] = uint16_t(one.u >> 16); - } - for (; w < 16; w++) { - ptr[h * 16 + w] = uint16_t(zero.u >> 16); - } - } - - // sub tile 1 - { - uint32_t mask_w_1 = (mask_w < 16) ? 0 : mask_w - 16; - uint32_t w = 0; - for (; w < mask_w_1; w++) { - ptr[h * 16 + w + 256] = uint16_t(one.u >> 16); - } - for (; w < 16; w++) { - ptr[h * 16 + w + 256] = uint16_t(zero.u >> 16); - } - } - - // sub tile 2 - { - uint32_t mask_w_0 = mask_w; - if (mask_w_0 >= 16) - mask_w_0 = 16; - uint32_t w = 0; - for (; w < mask_w_0; w++) { - ptr[h * 16 + w + 512] = uint16_t(one.u >> 16); - } - for (; w < 16; w++) { - ptr[h * 16 + w + 512] = uint16_t(zero.u >> 16); - } - } - - // sub tile 3 - { - uint32_t mask_w_1 = (mask_w < 16) ? 
0 : mask_w - 16; - uint32_t w = 0; - for (; w < mask_w_1; w++) { - ptr[h * 16 + w + 768] = uint16_t(one.u >> 16); - } - for (; w < 16; w++) { - ptr[h * 16 + w + 768] = uint16_t(zero.u >> 16); - } - } - } - - cb_push_back(cb_mask, 1); -} - -void generate_mask_h(uint32_t cb_mask, uint32_t mask_h) { - union { - float f; - uint32_t u; - } one; - one.f = 1.0f; - union { - float f; - uint32_t u; - } zero; - zero.f = 0.0f; - - cb_reserve_back(cb_mask, 1); - auto ptr = reinterpret_cast(get_write_ptr(cb_mask)); - - for (uint32_t w = 0; w < 16; w++) { - // sub tile 0 - { - uint32_t mask_h_0 = mask_h; - if (mask_h_0 >= 16) - mask_h_0 = 16; - uint32_t h = 0; - for (; h < mask_h_0; h++) { - ptr[h * 16 + w] = uint16_t(one.u >> 16); - } - for (; h < 16; h++) { - ptr[h * 16 + w] = uint16_t(zero.u >> 16); - } - } - - // sub tile 1 - { - uint32_t mask_h_0 = mask_h; - if (mask_h_0 >= 16) - mask_h_0 = 16; - uint32_t h = 0; - for (; h < mask_h_0; h++) { - ptr[h * 16 + w + 256] = uint16_t(one.u >> 16); - } - for (; h < 16; h++) { - ptr[h * 16 + w + 256] = uint16_t(zero.u >> 16); - } - } - - // sub tile 2 - { - uint32_t mask_h_1 = (mask_h < 16) ? 0 : mask_h - 16; - uint32_t h = 0; - for (; h < mask_h_1; h++) { - ptr[h * 16 + w + 512] = uint16_t(one.u >> 16); - } - for (; h < 16; h++) { - ptr[h * 16 + w + 512] = uint16_t(zero.u >> 16); - } - } - - // sub tile 3 - { - uint32_t mask_h_1 = (mask_h < 16) ? 
0 : mask_h - 16; - uint32_t h = 0; - for (; h < mask_h_1; h++) { - ptr[h * 16 + w + 768] = uint16_t(one.u >> 16); - } - for (; h < 16; h++) { - ptr[h * 16 + w + 768] = uint16_t(zero.u >> 16); - } - } - } - - cb_push_back(cb_mask, 1); -} diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_cumsum/device/moreh_cumsum_program_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_cumsum/device/moreh_cumsum_program_factory.cpp index 0ab3778c95a..69c1317d5d9 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_cumsum/device/moreh_cumsum_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_cumsum/device/moreh_cumsum_program_factory.cpp @@ -82,10 +82,10 @@ MorehCumsumDeviceOperation::ProgramFactory::cached_program_t MorehCumsumDeviceOp all_cores, cb_data_format, { - {tt::CB::c_in0, in0_t}, // input - {tt::CB::c_in1, in1_t}, // zero - {tt::CB::c_intermed0, intermed0_t}, // accumulated sum - {tt::CB::c_out0, out0_t}, // output + {tt::CBIndex::c_0, in0_t}, // input + {tt::CBIndex::c_1, in1_t}, // zero + {tt::CBIndex::c_24, intermed0_t}, // accumulated sum + {tt::CBIndex::c_16, out0_t}, // output }); //////////////////////////////////////////////////////////////////////////// diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_dot/device/kernels/moreh_dot.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_dot/device/kernels/moreh_dot.cpp index a9c726d9bd7..8ae2db0400f 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_dot/device/kernels/moreh_dot.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_dot/device/kernels/moreh_dot.cpp @@ -15,51 +15,51 @@ namespace NAMESPACE { void MAIN { constexpr int onetile = 1; uint32_t per_core_block_cnt = get_arg_val(0); - binary_op_init_common(tt::CB::c_in0, tt::CB::c_in1); + binary_op_init_common(tt::CBIndex::c_0, tt::CBIndex::c_1); bool enable_reload = false; for (uint32_t block = 0; block < per_core_block_cnt; ++block) { bool last_out = block == (per_core_block_cnt - 1); // elemwise-mul ACQ(); - cb_wait_front(tt::CB::c_in0, onetile); - 
cb_wait_front(tt::CB::c_in1, onetile); + cb_wait_front(tt::CBIndex::c_0, onetile); + cb_wait_front(tt::CBIndex::c_1, onetile); - cb_reserve_back(tt::CB::c_intermed0, onetile); + cb_reserve_back(tt::CBIndex::c_24, onetile); mul_tiles_init(); // dst0 = c_in0 x c_in1 - mul_tiles(tt::CB::c_in0, tt::CB::c_in1, 0, 0, 0); + mul_tiles(tt::CBIndex::c_0, tt::CBIndex::c_1, 0, 0, 0); // c_intermed0 = pack(dst0) - pack_tile(0, tt::CB::c_intermed0); - cb_push_back(tt::CB::c_intermed0, onetile); + pack_tile(0, tt::CBIndex::c_24); + cb_push_back(tt::CBIndex::c_24, onetile); - cb_pop_front(tt::CB::c_in0, onetile); - cb_pop_front(tt::CB::c_in1, onetile); + cb_pop_front(tt::CBIndex::c_0, onetile); + cb_pop_front(tt::CBIndex::c_1, onetile); REL(); // reduce-w ACQ(); if (enable_reload) { - cb_wait_front(tt::CB::c_intermed1, onetile); + cb_wait_front(tt::CBIndex::c_25, onetile); copy_tile_to_dst_init_short(); - copy_tile(tt::CB::c_intermed1, 0, 0); - cb_pop_front(tt::CB::c_intermed1, onetile); + copy_tile(tt::CBIndex::c_25, 0, 0); + cb_pop_front(tt::CBIndex::c_25, onetile); } - cb_wait_front(tt::CB::c_intermed0, onetile); + cb_wait_front(tt::CBIndex::c_24, onetile); reduce_init_delta(); - reduce_tile(tt::CB::c_intermed0, tt::CB::c_in2, 0, 0, 0); - cb_pop_front(tt::CB::c_intermed0, onetile); + reduce_tile(tt::CBIndex::c_24, tt::CBIndex::c_2, 0, 0, 0); + cb_pop_front(tt::CBIndex::c_24, onetile); reduce_revert_delta(); if (last_out) { - cb_reserve_back(tt::CB::c_out0, onetile); - pack_tile(0, tt::CB::c_out0); - cb_push_back(tt::CB::c_out0, onetile); + cb_reserve_back(tt::CBIndex::c_16, onetile); + pack_tile(0, tt::CBIndex::c_16); + cb_push_back(tt::CBIndex::c_16, onetile); } else { - cb_reserve_back(tt::CB::c_intermed1, onetile); - pack_tile(0, tt::CB::c_intermed1); - cb_push_back(tt::CB::c_intermed1, onetile); + cb_reserve_back(tt::CBIndex::c_25, onetile); + pack_tile(0, tt::CBIndex::c_25); + cb_push_back(tt::CBIndex::c_25, onetile); } REL(); enable_reload = true; diff --git 
a/ttnn/cpp/ttnn/operations/moreh/moreh_dot/device/moreh_dot_program_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_dot/device/moreh_dot_program_factory.cpp index 78a7ee2664a..01cc72f838a 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_dot/device/moreh_dot_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_dot/device/moreh_dot_program_factory.cpp @@ -58,12 +58,12 @@ MorehDotOperation::SingleCore::cached_program_t MorehDotOperation::SingleCore::c std::set{CoreRange(core, core)}, cb_data_format, { - {CB::c_in0, in0_t}, - {CB::c_in1, in1_t}, - {CB::c_in2, in2_t}, - {CB::c_out0, out0_t}, - {CB::c_intermed0, im0_t}, - {CB::c_intermed1, im1_t}, + {CBIndex::c_0, in0_t}, + {CBIndex::c_1, in1_t}, + {CBIndex::c_2, in2_t}, + {CBIndex::c_16, out0_t}, + {CBIndex::c_24, im0_t}, + {CBIndex::c_25, im1_t}, }); std::vector reader_compile_time_args = { @@ -72,7 +72,7 @@ MorehDotOperation::SingleCore::cached_program_t MorehDotOperation::SingleCore::c *reinterpret_cast(&scaler)}; std::vector writer_compile_time_args = { - (std::uint32_t)CB::c_out0, (std::uint32_t)is_dram(dst_buffer)}; + (std::uint32_t)CBIndex::c_16, (std::uint32_t)is_dram(dst_buffer)}; const auto reader_kernel_file = "ttnn/cpp/ttnn/operations/moreh/moreh_dot/device/kernels/reader_moreh_dot.cpp"; const auto writer_kernel_file = "ttnn/cpp/ttnn/operations/moreh/moreh_dot/device/kernels/writer_moreh_dot.cpp"; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_dot_backward/device/kernels/moreh_dot_backward.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_dot_backward/device/kernels/moreh_dot_backward.cpp index a8c44b23929..397b8470db8 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_dot_backward/device/kernels/moreh_dot_backward.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_dot_backward/device/kernels/moreh_dot_backward.cpp @@ -14,26 +14,26 @@ void MAIN { uint32_t has_other_grad = get_arg_val(1); uint32_t per_core_block_cnt = get_arg_val(2); - init_bcast(tt::CB::c_in2, tt::CB::c_in0); - 
cb_wait_front(tt::CB::c_in0, onetile); + init_bcast(tt::CBIndex::c_2, tt::CBIndex::c_0); + cb_wait_front(tt::CBIndex::c_0, onetile); for (uint32_t block = 0; block < per_core_block_cnt; ++block) { if (has_input_grad) { - cb_wait_front(tt::CB::c_in2, onetile); + cb_wait_front(tt::CBIndex::c_2, onetile); ACQ(); - mul_tiles_bcast(tt::CB::c_in2, tt::CB::c_in0, 0, 0, 0); - pack_tile(0, tt::CB::c_out0); - cb_push_back(tt::CB::c_out0, onetile); - cb_pop_front(tt::CB::c_in2, onetile); + mul_tiles_bcast(tt::CBIndex::c_2, tt::CBIndex::c_0, 0, 0, 0); + pack_tile(0, tt::CBIndex::c_16); + cb_push_back(tt::CBIndex::c_16, onetile); + cb_pop_front(tt::CBIndex::c_2, onetile); REL(); } if (has_other_grad) { - cb_wait_front(tt::CB::c_in1, onetile); + cb_wait_front(tt::CBIndex::c_1, onetile); ACQ(); - mul_tiles_bcast(tt::CB::c_in1, tt::CB::c_in0, 0, 0, 0); - pack_tile(0, tt::CB::c_out1); - cb_push_back(tt::CB::c_out1, onetile); - cb_pop_front(tt::CB::c_in1, onetile); + mul_tiles_bcast(tt::CBIndex::c_1, tt::CBIndex::c_0, 0, 0, 0); + pack_tile(0, tt::CBIndex::c_17); + cb_push_back(tt::CBIndex::c_17, onetile); + cb_pop_front(tt::CBIndex::c_1, onetile); REL(); } } diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_dot_backward/device/moreh_dot_backward_program_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_dot_backward/device/moreh_dot_backward_program_factory.cpp index 9c79145f8a8..4f2faf246e5 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_dot_backward/device/moreh_dot_backward_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_dot_backward/device/moreh_dot_backward_program_factory.cpp @@ -53,11 +53,11 @@ MorehDotBackwardOperation::SingleCore::cached_program_t MorehDotBackwardOperatio std::set{CoreRange(core, core)}, cb_data_format, { - {CB::c_in0, in0_t}, - {CB::c_in1, in1_t}, - {CB::c_in2, in2_t}, - {CB::c_out0, out0_t}, - {CB::c_out1, out1_t}, + {CBIndex::c_0, in0_t}, + {CBIndex::c_1, in1_t}, + {CBIndex::c_2, in2_t}, + {CBIndex::c_16, out0_t}, + {CBIndex::c_17, 
out1_t}, }); bool has_input_grad = input_grad.has_value(); bool has_other_grad = other_grad.has_value(); @@ -89,8 +89,8 @@ MorehDotBackwardOperation::SingleCore::cached_program_t MorehDotBackwardOperatio } std::vector writer_compile_time_args = { - (std::uint32_t)CB::c_out0, - (std::uint32_t)CB::c_out1, + (std::uint32_t)CBIndex::c_16, + (std::uint32_t)CBIndex::c_17, (std::uint32_t)dst0_is_dram, (std::uint32_t)dst1_is_dram, }; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_dot_backward/moreh_dot_backward.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_dot_backward/moreh_dot_backward.cpp index 9f30eccc970..e6163aceebb 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_dot_backward/moreh_dot_backward.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_dot_backward/moreh_dot_backward.cpp @@ -6,6 +6,8 @@ #include "ttnn/operations/moreh/moreh_dot_backward/device/moreh_dot_backward_device_operation.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::moreh::moreh_dot_backward { std::vector> MorehDotBackward::invoke( const Tensor &output_grad, diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_fold/device/fold_program_factory_rm.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_fold/device/fold_program_factory_rm.cpp index 453bcded4b1..291304d5689 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_fold/device/fold_program_factory_rm.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_fold/device/fold_program_factory_rm.cpp @@ -74,8 +74,8 @@ MorehFoldOperation::ProgramFactory::cached_program_t MorehFoldOperation::Program uint32_t aligned_input_cb_page_size = round_up_to_mul32(input_cb_page_size); uint32_t aligned_output_cb_page_size = round_up_to_mul32(output_cb_page_size); - uint32_t input_cb_index = tt::CB::c_in0; // input - uint32_t output_cb_index = tt::CB::c_out0; // ouput + uint32_t input_cb_index = tt::CBIndex::c_0; // input + uint32_t output_cb_index = tt::CBIndex::c_16; // ouput CircularBufferConfig input_cb_config = CircularBufferConfig(aligned_input_cb_page_size * 2, 
{{input_cb_index, data_format}}) diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_kernels/reader_moreh_getitem.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_kernels/reader_moreh_getitem.cpp index 4090cf199ee..8e4589df5a3 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_kernels/reader_moreh_getitem.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_kernels/reader_moreh_getitem.cpp @@ -53,12 +53,12 @@ void kernel_main() { uint32_t num_sticks = get_arg_val(i++); uint32_t stick_size = get_arg_val(i++); - constexpr auto cb_in0 = tt::CB::c_in0; - constexpr auto cb_in1 = tt::CB::c_in1; - constexpr auto cb_in2 = tt::CB::c_in2; - constexpr auto cb_in3 = tt::CB::c_in3; - constexpr auto cb_in4 = tt::CB::c_in4; - constexpr auto cb_in5 = tt::CB::c_in5; + constexpr auto cb_in0 = tt::CBIndex::c_0; + constexpr auto cb_in1 = tt::CBIndex::c_1; + constexpr auto cb_in2 = tt::CBIndex::c_2; + constexpr auto cb_in3 = tt::CBIndex::c_3; + constexpr auto cb_in4 = tt::CBIndex::c_4; + constexpr auto cb_in5 = tt::CBIndex::c_5; constexpr bool in_is_dram = get_compile_time_arg_val(0) == 1; constexpr bool index0_is_dram = get_compile_time_arg_val(1) == 1; @@ -88,7 +88,7 @@ void kernel_main() { index4_is_defined, }; - tt::CB index_cbs[5] = { + tt::CBIndex index_cbs[5] = { cb_in1, cb_in2, cb_in3, @@ -141,7 +141,7 @@ void kernel_main() { if (index_is_defined[dim]) { // read index tensor - tt::CB idx_cb = index_cbs[dim]; + tt::CBIndex idx_cb = index_cbs[dim]; cb_reserve_back(idx_cb, 1); uint32_t index_l1_addr = get_write_ptr(idx_cb); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_kernels/writer_moreh_getitem.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_kernels/writer_moreh_getitem.cpp index 62914e520ae..c940f7652dc 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_kernels/writer_moreh_getitem.cpp +++ 
b/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_kernels/writer_moreh_getitem.cpp @@ -16,7 +16,7 @@ void kernel_main() { uint32_t start_id = get_arg_val(i++); uint32_t num_sticks = get_arg_val(i++); - constexpr uint32_t cb_id_out = tt::CB::c_in0; + constexpr uint32_t cb_id_out = tt::CBIndex::c_0; constexpr bool dst_is_dram = get_compile_time_arg_val(0) == 1; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_rm_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_rm_factory.cpp index 266896792e8..f2cdf652e77 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_rm_factory.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_rm_factory.cpp @@ -96,7 +96,7 @@ MorehGetItemOperation::MorehGetItemRmFactory::cached_program_t MorehGetItemOpera auto index_cb_data_format = datatype_to_dataformat_converter(index_tensors[0].get_dtype()); auto output_cb_data_format = datatype_to_dataformat_converter(output.get_dtype()); - auto src_cb_index = CB::c_in0; + auto src_cb_index = CBIndex::c_0; auto rounded_input_page_size = round_up_to_mul32(input_unit_size); auto cb_src0_config = CircularBufferConfig(rounded_input_page_size, {{src_cb_index, src_cb_data_format}}) .set_page_size(src_cb_index, rounded_input_page_size); @@ -106,14 +106,14 @@ MorehGetItemOperation::MorehGetItemRmFactory::cached_program_t MorehGetItemOpera if (!index_info[dim].is_defined) continue; - auto src1_cb_index = CB::c_in1 + dim; + auto src1_cb_index = CBIndex::c_1 + dim; auto index_page_size = round_up_to_mul32(index_info[dim].unit_size); auto cb_index_config = CircularBufferConfig(index_page_size, {{src1_cb_index, index_cb_data_format}}) .set_page_size(src1_cb_index, index_page_size); auto cb_src1 = CreateCircularBuffer(program, all_cores, cb_index_config); } - auto out_cb_index = CB::c_out0; + auto out_cb_index = CBIndex::c_16; auto rounded_output_page_size = 
round_up_to_mul32(input_unit_size); auto cb_out0_config = CircularBufferConfig(rounded_input_page_size, {{out_cb_index, output_cb_data_format}}) .set_page_size(out_cb_index, rounded_input_page_size); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_tilized_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_tilized_factory.cpp index b9313d8e3e7..6928805d042 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_tilized_factory.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_tilized_factory.cpp @@ -124,7 +124,7 @@ MorehGetItemOperation::MorehGetItemTilizedFactory::create( auto index_cb_data_format = datatype_to_dataformat_converter(index_tensors[0].get_dtype()); auto output_cb_data_format = datatype_to_dataformat_converter(output.get_dtype()); - auto src_cb_index = CB::c_in0; + auto src_cb_index = CBIndex::c_0; auto rounded_input_page_size = round_up_to_mul32(input_unit_size); auto cb_src0_config = CircularBufferConfig(rounded_input_page_size, {{src_cb_index, src_cb_data_format}}) .set_page_size(src_cb_index, rounded_input_page_size); @@ -134,20 +134,20 @@ MorehGetItemOperation::MorehGetItemTilizedFactory::create( if (!index_info[dim].is_defined) continue; - auto src1_cb_index = CB::c_in1 + dim; + auto src1_cb_index = CBIndex::c_1 + dim; auto index_page_size = 1024 * 4; auto cb_index_config = CircularBufferConfig(index_page_size, {{src1_cb_index, index_cb_data_format}}) .set_page_size(src1_cb_index, index_page_size); auto cb_src1 = CreateCircularBuffer(program, all_cores, cb_index_config); } - auto out_cb0_index = CB::c_out0; + auto out_cb0_index = CBIndex::c_16; auto rounded_output_page_size = round_up_to_mul32(output_unit_size); auto cb_out0_config = CircularBufferConfig(rounded_output_page_size, {{out_cb0_index, output_cb_data_format}}) .set_page_size(out_cb0_index, rounded_output_page_size); auto cb_out0 = CreateCircularBuffer(program, all_cores, 
cb_out0_config); - auto out_cb1_index = CB::c_out1; + auto out_cb1_index = CBIndex::c_17; auto cb_out1_config = CircularBufferConfig(rounded_output_page_size, {{out_cb1_index, output_cb_data_format}}) .set_page_size(out_cb1_index, rounded_output_page_size); auto cb_out1 = CreateCircularBuffer(program, all_cores, cb_out1_config); @@ -357,7 +357,7 @@ MorehGetItemOperation::MorehGetItemTilizedFactory::create( auto index_cb_data_format = datatype_to_dataformat_converter(index_tensors[0].get_dtype()); auto output_cb_data_format = datatype_to_dataformat_converter(output.get_dtype()); - auto src_cb_index = CB::c_in0; + auto src_cb_index = CBIndex::c_0; auto rounded_input_page_size = round_up_to_mul32(input_unit_size); auto cb_src0_config = CircularBufferConfig(rounded_input_page_size, {{src_cb_index, src_cb_data_format}}) .set_page_size(src_cb_index, rounded_input_page_size); @@ -367,7 +367,7 @@ MorehGetItemOperation::MorehGetItemTilizedFactory::create( if (!index_info[dim].is_defined) continue; - auto src1_cb_index = CB::c_in1 + dim; + auto src1_cb_index = CBIndex::c_1 + dim; // auto index_page_size = round_up_to_mul32(index_info[dim].unit_size); auto index_page_size = 1024 * 4; auto cb_index_config = CircularBufferConfig(index_page_size, {{src1_cb_index, index_cb_data_format}}) @@ -375,7 +375,7 @@ MorehGetItemOperation::MorehGetItemTilizedFactory::create( auto cb_src1 = CreateCircularBuffer(program, all_cores, cb_index_config); } - auto out_cb_index = CB::c_out0; + auto out_cb_index = CBIndex::c_16; auto rounded_output_page_size = round_up_to_mul32(input_unit_size); auto cb_out0_config = CircularBufferConfig(rounded_input_page_size, {{out_cb_index, output_cb_data_format}}) .set_page_size(out_cb_index, rounded_input_page_size); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_tilized_kernels/common.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_tilized_kernels/common.hpp index c559f653345..edcf0643deb 100644 --- 
a/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_tilized_kernels/common.hpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_tilized_kernels/common.hpp @@ -2,6 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 +#pragma once + #define TILE_HEIGHT 32 #define TILE_WIDTH 32 #define FACE_WIDTH 16 diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_tilized_kernels/reader_moreh_getitem_tilize.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_tilized_kernels/reader_moreh_getitem_tilize.cpp index 516b85e8a2e..dc1de33cdc5 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_tilized_kernels/reader_moreh_getitem_tilize.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_tilized_kernels/reader_moreh_getitem_tilize.cpp @@ -64,12 +64,12 @@ void kernel_main() { uint32_t stick_size = get_arg_val(i++); uint32_t element_size = get_arg_val(i++); - constexpr auto cb_in0 = tt::CB::c_in0; - constexpr auto cb_in1 = tt::CB::c_in1; - constexpr auto cb_in2 = tt::CB::c_in2; - constexpr auto cb_in3 = tt::CB::c_in3; - constexpr auto cb_in4 = tt::CB::c_in4; - constexpr auto cb_in5 = tt::CB::c_in5; + constexpr auto cb_in0 = tt::CBIndex::c_0; + constexpr auto cb_in1 = tt::CBIndex::c_1; + constexpr auto cb_in2 = tt::CBIndex::c_2; + constexpr auto cb_in3 = tt::CBIndex::c_3; + constexpr auto cb_in4 = tt::CBIndex::c_4; + constexpr auto cb_in5 = tt::CBIndex::c_5; constexpr bool in_is_dram = get_compile_time_arg_val(0) == 1; constexpr bool index0_is_dram = get_compile_time_arg_val(1) == 1; @@ -97,7 +97,7 @@ void kernel_main() { index4_is_defined, }; - tt::CB index_cbs[5] = { + tt::CBIndex index_cbs[5] = { cb_in1, cb_in2, cb_in3, @@ -145,7 +145,7 @@ void kernel_main() { uint32_t input_stick_idx_stride = input_stick_idx_strides[dim]; if (index_is_defined[dim]) { - tt::CB idx_cb = index_cbs[dim]; + tt::CBIndex idx_cb = index_cbs[dim]; cb_reserve_back(idx_cb, 1); 
uint32_t index_l1_addr = get_write_ptr(idx_cb); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_tilized_kernels/reader_moreh_getitem_tilize_w.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_tilized_kernels/reader_moreh_getitem_tilize_w.cpp index 6b53d417944..e3f36211368 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_tilized_kernels/reader_moreh_getitem_tilize_w.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_tilized_kernels/reader_moreh_getitem_tilize_w.cpp @@ -64,12 +64,12 @@ void kernel_main() { uint32_t num_elements_per_alignment = get_arg_val(i++); uint32_t num_alignment_width = get_arg_val(i++); - constexpr auto cb_in0 = tt::CB::c_in0; - constexpr auto cb_in1 = tt::CB::c_in1; - constexpr auto cb_in2 = tt::CB::c_in2; - constexpr auto cb_in3 = tt::CB::c_in3; - constexpr auto cb_in4 = tt::CB::c_in4; - constexpr auto cb_in5 = tt::CB::c_in5; + constexpr auto cb_in0 = tt::CBIndex::c_0; + constexpr auto cb_in1 = tt::CBIndex::c_1; + constexpr auto cb_in2 = tt::CBIndex::c_2; + constexpr auto cb_in3 = tt::CBIndex::c_3; + constexpr auto cb_in4 = tt::CBIndex::c_4; + constexpr auto cb_in5 = tt::CBIndex::c_5; constexpr bool in_is_dram = get_compile_time_arg_val(0) == 1; constexpr bool index0_is_dram = get_compile_time_arg_val(1) == 1; @@ -97,7 +97,7 @@ void kernel_main() { index4_is_defined, }; - tt::CB index_cbs[5] = { + tt::CBIndex index_cbs[5] = { cb_in1, cb_in2, cb_in3, @@ -154,7 +154,7 @@ void kernel_main() { if (index_is_defined[dim]) { // read index tensor - tt::CB idx_cb = index_cbs[dim]; + tt::CBIndex idx_cb = index_cbs[dim]; cb_reserve_back(idx_cb, 1); uint32_t index_l1_addr = get_write_ptr(idx_cb); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_tilized_kernels/writer_moreh_getitem_tilize.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_tilized_kernels/writer_moreh_getitem_tilize.cpp index 
40fa6eb6486..6b73b5d4a03 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_tilized_kernels/writer_moreh_getitem_tilize.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_tilized_kernels/writer_moreh_getitem_tilize.cpp @@ -27,7 +27,7 @@ void kernel_main() { uint32_t stick_size = get_arg_val(i++); uint32_t element_size = get_arg_val(i++); - constexpr uint32_t cb_id_out = tt::CB::c_in0; + constexpr uint32_t cb_id_out = tt::CBIndex::c_0; constexpr bool dst_is_dram = get_compile_time_arg_val(0) == 1; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_tilized_kernels/writer_moreh_getitem_tilize_w.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_tilized_kernels/writer_moreh_getitem_tilize_w.cpp index 728d8ece8a8..a167dbc098e 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_tilized_kernels/writer_moreh_getitem_tilize_w.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_getitem/device/moreh_getitem_tilized_kernels/writer_moreh_getitem_tilize_w.cpp @@ -29,8 +29,8 @@ void kernel_main() { uint32_t num_elements_per_alignment = get_arg_val(i++); uint32_t num_alignment_width = get_arg_val(i++); - constexpr uint32_t cb_id_out0 = tt::CB::c_in0; - constexpr uint32_t cb_id_out1 = tt::CB::c_out1; + constexpr uint32_t cb_id_out0 = tt::CBIndex::c_0; + constexpr uint32_t cb_id_out1 = tt::CBIndex::c_17; constexpr bool dst_is_dram = get_compile_time_arg_val(0) == 1; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_group_norm/device/moreh_group_norm_program_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_group_norm/device/moreh_group_norm_program_factory.cpp index 84fc468f179..fa050b08a9a 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_group_norm/device/moreh_group_norm_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_group_norm/device/moreh_group_norm_program_factory.cpp @@ -152,24 +152,24 @@ 
MorehGroupNormOperation::MorehGroupNormFactory::cached_program_t MorehGroupNormO all_cores, cb_data_format, { - {CB::c_in0, in0_t}, // input - {CB::c_in1, in1_t}, // scaler - {CB::c_in2, in2_t}, // eps - {CB::c_in3, in3_t}, // gamma - {CB::c_in4, in4_t}, // beta - {CB::c_in5, in5_t}, // mask_h - {CB::c_in6, in6_t}, // mask_w - {CB::c_out0, out0_t}, // output - {CB::c_out1, out1_t}, // mean - {CB::c_out2, out2_t}, // rstd - {CB::c_intermed0, im0_t}, // E[x] - {CB::c_intermed1, im1_t}, // x - E[x] - {CB::c_intermed2, im2_t}, // (x - E[x])^2 - {CB::c_intermed3, im3_t}, // Sum[(x - E[x])^2] - {CB::c_intermed4, im4_t}, // E[(x - E[x])^2] = Var[x] - {CB::c_intermed5, im5_t}, // 1.0/(sqrt(Var[x] + eps)) - {CB::c_intermed6, im6_t}, // y * gamm + beta - {CB::c_intermed7, im7_t}, // Sum[x] + {CBIndex::c_0, in0_t}, // input + {CBIndex::c_1, in1_t}, // scaler + {CBIndex::c_2, in2_t}, // eps + {CBIndex::c_3, in3_t}, // gamma + {CBIndex::c_4, in4_t}, // beta + {CBIndex::c_5, in5_t}, // mask_h + {CBIndex::c_6, in6_t}, // mask_w + {CBIndex::c_16, out0_t}, // output + {CBIndex::c_17, out1_t}, // mean + {CBIndex::c_18, out2_t}, // rstd + {CBIndex::c_24, im0_t}, // E[x] + {CBIndex::c_25, im1_t}, // x - E[x] + {CBIndex::c_26, im2_t}, // (x - E[x])^2 + {CBIndex::c_27, im3_t}, // Sum[(x - E[x])^2] + {CBIndex::c_28, im4_t}, // E[(x - E[x])^2] = Var[x] + {CBIndex::c_29, im5_t}, // 1.0/(sqrt(Var[x] + eps)) + {CBIndex::c_30, im6_t}, // y * gamm + beta + {CBIndex::c_31, im7_t}, // Sum[x] }); //////////////////////////////////////////////////////////////////////////// diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_group_norm/moreh_group_norm.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_group_norm/moreh_group_norm.cpp index 2efc14400c8..d3f181199c6 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_group_norm/moreh_group_norm.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_group_norm/moreh_group_norm.cpp @@ -6,6 +6,8 @@ #include "device/moreh_group_norm_device_operation.hpp" +using namespace 
tt::tt_metal; + namespace ttnn::operations::moreh::moreh_group_norm { std::vector> MorehGroupNorm::invoke( const Tensor& input, diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_group_norm_backward/device/gamma_beta_grad/moreh_group_norm_backward_gamma_beta_grad_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_group_norm_backward/device/gamma_beta_grad/moreh_group_norm_backward_gamma_beta_grad_factory.cpp index 16f5d8375c6..3e847d13532 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_group_norm_backward/device/gamma_beta_grad/moreh_group_norm_backward_gamma_beta_grad_factory.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_group_norm_backward/device/gamma_beta_grad/moreh_group_norm_backward_gamma_beta_grad_factory.cpp @@ -110,21 +110,21 @@ MorehGroupNormBackwardGammaBetaGradOperation::MorehGroupNormBackwardGammaBetaGra all_cores, cb_data_format, { - {CB::c_in0, in0_t}, // output_grad(==dy) - {CB::c_in1, in1_t}, // input(==x) - {CB::c_in2, in2_t}, // mean - {CB::c_in3, in3_t}, // rstd - {CB::c_in4, in4_t}, // one - {CB::c_in5, in5_t}, // mask_h - {CB::c_in6, in6_t}, // mask_w - {CB::c_out0, out0_t}, // gamma_grad(==dgamma) - {CB::c_out1, out1_t}, // beta_grad(==dbeta) - {CB::c_intermed0, im0_t}, // output(==y) - {CB::c_intermed1, im1_t}, // y * dy - {CB::c_intermed2, im2_t}, // Add[dy] - {CB::c_intermed3, im3_t}, // Add[y * dy] - {CB::c_intermed4, im4_t}, // x - mean - {CB::c_intermed5, im5_t}, // dycopy + {CBIndex::c_0, in0_t}, // output_grad(==dy) + {CBIndex::c_1, in1_t}, // input(==x) + {CBIndex::c_2, in2_t}, // mean + {CBIndex::c_3, in3_t}, // rstd + {CBIndex::c_4, in4_t}, // one + {CBIndex::c_5, in5_t}, // mask_h + {CBIndex::c_6, in6_t}, // mask_w + {CBIndex::c_16, out0_t}, // gamma_grad(==dgamma) + {CBIndex::c_17, out1_t}, // beta_grad(==dbeta) + {CBIndex::c_24, im0_t}, // output(==y) + {CBIndex::c_25, im1_t}, // y * dy + {CBIndex::c_26, im2_t}, // Add[dy] + {CBIndex::c_27, im3_t}, // Add[y * dy] + {CBIndex::c_28, im4_t}, // x - mean + {CBIndex::c_29, im5_t}, 
// dycopy }); //////////////////////////////////////////////////////////////////////////// diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_group_norm_backward/device/input_grad/moreh_group_norm_backward_input_grad_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_group_norm_backward/device/input_grad/moreh_group_norm_backward_input_grad_factory.cpp index f4b474e55a0..c3bb0eda301 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_group_norm_backward/device/input_grad/moreh_group_norm_backward_input_grad_factory.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_group_norm_backward/device/input_grad/moreh_group_norm_backward_input_grad_factory.cpp @@ -125,23 +125,23 @@ MorehGroupNormBackwardInputGradOperation::MorehGroupNormBackwardInputGradFactory all_cores, cb_data_format, { - {CB::c_in0, in0_t}, // output_grad - {CB::c_in1, in1_t}, // input - {CB::c_in2, in2_t}, // mean - {CB::c_in3, in3_t}, // rstd - {CB::c_in4, in4_t}, // one - {CB::c_in5, in5_t}, // inner_size(==n) - {CB::c_in6, in6_t}, - {CB::c_in7, in7_t}, - {CB::c_out0, out0_t}, // input_grad - {CB::c_intermed0, im0_t}, - {CB::c_intermed1, im1_t}, - {CB::c_intermed2, im2_t}, - {CB::c_intermed3, im3_t}, - {CB::c_intermed4, im4_t}, - {CB::c_intermed5, im5_t}, - {CB::c_intermed6, im6_t}, - {CB::c_intermed7, im7_t}, + {CBIndex::c_0, in0_t}, // output_grad + {CBIndex::c_1, in1_t}, // input + {CBIndex::c_2, in2_t}, // mean + {CBIndex::c_3, in3_t}, // rstd + {CBIndex::c_4, in4_t}, // one + {CBIndex::c_5, in5_t}, // inner_size(==n) + {CBIndex::c_6, in6_t}, + {CBIndex::c_7, in7_t}, + {CBIndex::c_16, out0_t}, // input_grad + {CBIndex::c_24, im0_t}, + {CBIndex::c_25, im1_t}, + {CBIndex::c_26, im2_t}, + {CBIndex::c_27, im3_t}, + {CBIndex::c_28, im4_t}, + {CBIndex::c_29, im5_t}, + {CBIndex::c_30, im6_t}, + {CBIndex::c_31, im7_t}, }); //////////////////////////////////////////////////////////////////////////// diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_group_norm_backward/moreh_group_norm_backward.cpp 
b/ttnn/cpp/ttnn/operations/moreh/moreh_group_norm_backward/moreh_group_norm_backward.cpp index e0f05e63a7b..20054ba297d 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_group_norm_backward/moreh_group_norm_backward.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_group_norm_backward/moreh_group_norm_backward.cpp @@ -7,6 +7,8 @@ #include "device/gamma_beta_grad/moreh_group_norm_backward_gamma_beta_grad_device_operation.hpp" #include "device/input_grad/moreh_group_norm_backward_input_grad_device_operation.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::moreh::moreh_group_norm_backward { std::vector> MorehGroupNormBackward::invoke( const Tensor& output_grad, diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm/device/kernels/moreh_layer_norm_large_kernel.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm/device/kernels/moreh_layer_norm_large_kernel.cpp index bfa667c8898..9e2db506593 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm/device/kernels/moreh_layer_norm_large_kernel.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm/device/kernels/moreh_layer_norm_large_kernel.cpp @@ -22,28 +22,28 @@ void MAIN { constexpr bool is_lastdim_layernorm = get_compile_time_arg_val(9) == 1; constexpr bool is_groupnorm = get_compile_time_arg_val(10) == 1; - binary_op_init_common(tt::CB::c_in0, tt::CB::c_in0); - - constexpr auto cb_x = tt::CB::c_in0; // input - constexpr auto cb_scaler = tt::CB::c_in1; // scaler - constexpr auto cb_eps = tt::CB::c_in2; // epsilon - constexpr auto cb_gamma = tt::CB::c_in3; // gamma - constexpr auto cb_beta = tt::CB::c_in4; // beta - constexpr auto cb_mask_h = tt::CB::c_in5; // mask_h - constexpr auto cb_mask_w = tt::CB::c_in6; // mask_w - - constexpr auto cb_out = tt::CB::c_out0; // output - constexpr auto cb_mean = tt::CB::c_out1; // mean - constexpr auto cb_rstd = tt::CB::c_out2; // rstd - - constexpr auto cb_ex = tt::CB::c_intermed0; // E[x] - constexpr auto cb_xmm = tt::CB::c_intermed1; // x - E[x] - 
constexpr auto cb_xmm2 = tt::CB::c_intermed2; // (x - E[x])^2 - constexpr auto cb_xmm2sum = tt::CB::c_intermed3; // Sum[(x - E[x])^2] - constexpr auto cb_var = tt::CB::c_intermed4; // E[(x - E[x])^2] = Var[x] - constexpr auto cb_recip_std = tt::CB::c_intermed5; // 1.0/(sqrt(Var[x] + eps)) - constexpr auto cb_gamma_beta = tt::CB::c_intermed6; // p * gamm + beta - constexpr auto cb_xsum = tt::CB::c_intermed7; // Sum[x] + binary_op_init_common(tt::CBIndex::c_0, tt::CBIndex::c_0); + + constexpr auto cb_x = tt::CBIndex::c_0; // input + constexpr auto cb_scaler = tt::CBIndex::c_1; // scaler + constexpr auto cb_eps = tt::CBIndex::c_2; // epsilon + constexpr auto cb_gamma = tt::CBIndex::c_3; // gamma + constexpr auto cb_beta = tt::CBIndex::c_4; // beta + constexpr auto cb_mask_h = tt::CBIndex::c_5; // mask_h + constexpr auto cb_mask_w = tt::CBIndex::c_6; // mask_w + + constexpr auto cb_out = tt::CBIndex::c_16; // output + constexpr auto cb_mean = tt::CBIndex::c_17; // mean + constexpr auto cb_rstd = tt::CBIndex::c_18; // rstd + + constexpr auto cb_ex = tt::CBIndex::c_24; // E[x] + constexpr auto cb_xmm = tt::CBIndex::c_25; // x - E[x] + constexpr auto cb_xmm2 = tt::CBIndex::c_26; // (x - E[x])^2 + constexpr auto cb_xmm2sum = tt::CBIndex::c_27; // Sum[(x - E[x])^2] + constexpr auto cb_var = tt::CBIndex::c_28; // E[(x - E[x])^2] = Var[x] + constexpr auto cb_recip_std = tt::CBIndex::c_29; // 1.0/(sqrt(Var[x] + eps)) + constexpr auto cb_gamma_beta = tt::CBIndex::c_30; // p * gamm + beta + constexpr auto cb_xsum = tt::CBIndex::c_31; // Sum[x] constexpr uint32_t onetile = 1; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm/device/kernels/moreh_layer_norm_small_kernel.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm/device/kernels/moreh_layer_norm_small_kernel.cpp index c49fc5e641d..4b4fbe009ce 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm/device/kernels/moreh_layer_norm_small_kernel.cpp +++ 
b/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm/device/kernels/moreh_layer_norm_small_kernel.cpp @@ -22,28 +22,28 @@ void MAIN { constexpr bool is_lastdim_layernorm = get_compile_time_arg_val(9) == 1; constexpr bool is_groupnorm = get_compile_time_arg_val(10) == 1; - binary_op_init_common(tt::CB::c_in0, tt::CB::c_in0); - - constexpr auto cb_x = tt::CB::c_in0; // input - constexpr auto cb_scaler = tt::CB::c_in1; // scaler - constexpr auto cb_eps = tt::CB::c_in2; // epsilon - constexpr auto cb_gamma = tt::CB::c_in3; // gamma - constexpr auto cb_beta = tt::CB::c_in4; // beta - constexpr auto cb_mask_h = tt::CB::c_in5; // mask_h - constexpr auto cb_mask_w = tt::CB::c_in6; // mask_w - - constexpr auto cb_out = tt::CB::c_out0; // output - constexpr auto cb_mean = tt::CB::c_out1; // mean - constexpr auto cb_rstd = tt::CB::c_out2; // rstd - - constexpr auto cb_ex = tt::CB::c_intermed0; // E[x] - constexpr auto cb_xmm = tt::CB::c_intermed1; // x - E[x] - constexpr auto cb_xmm2 = tt::CB::c_intermed2; // (x - E[x])^2 - constexpr auto cb_xmm2sum = tt::CB::c_intermed3; // Sum[(x - E[x])^2] - constexpr auto cb_var = tt::CB::c_intermed4; // E[(x - E[x])^2] = Var[x] - constexpr auto cb_recip_std = tt::CB::c_intermed5; // 1.0/(sqrt(Var[x] + eps)) - constexpr auto cb_gamma_beta = tt::CB::c_intermed6; // p * gamm + beta - constexpr auto cb_xsum = tt::CB::c_intermed7; // Sum[x] + binary_op_init_common(tt::CBIndex::c_0, tt::CBIndex::c_0); + + constexpr auto cb_x = tt::CBIndex::c_0; // input + constexpr auto cb_scaler = tt::CBIndex::c_1; // scaler + constexpr auto cb_eps = tt::CBIndex::c_2; // epsilon + constexpr auto cb_gamma = tt::CBIndex::c_3; // gamma + constexpr auto cb_beta = tt::CBIndex::c_4; // beta + constexpr auto cb_mask_h = tt::CBIndex::c_5; // mask_h + constexpr auto cb_mask_w = tt::CBIndex::c_6; // mask_w + + constexpr auto cb_out = tt::CBIndex::c_16; // output + constexpr auto cb_mean = tt::CBIndex::c_17; // mean + constexpr auto cb_rstd = tt::CBIndex::c_18; // rstd + + 
constexpr auto cb_ex = tt::CBIndex::c_24; // E[x] + constexpr auto cb_xmm = tt::CBIndex::c_25; // x - E[x] + constexpr auto cb_xmm2 = tt::CBIndex::c_26; // (x - E[x])^2 + constexpr auto cb_xmm2sum = tt::CBIndex::c_27; // Sum[(x - E[x])^2] + constexpr auto cb_var = tt::CBIndex::c_28; // E[(x - E[x])^2] = Var[x] + constexpr auto cb_recip_std = tt::CBIndex::c_29; // 1.0/(sqrt(Var[x] + eps)) + constexpr auto cb_gamma_beta = tt::CBIndex::c_30; // p * gamm + beta + constexpr auto cb_xsum = tt::CBIndex::c_31; // Sum[x] constexpr uint32_t onetile = 1; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm/device/kernels/reader_moreh_layer_norm_large.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm/device/kernels/reader_moreh_layer_norm_large.cpp index 527575640b2..c56f60fc5ac 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm/device/kernels/reader_moreh_layer_norm_large.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm/device/kernels/reader_moreh_layer_norm_large.cpp @@ -17,13 +17,13 @@ void kernel_main() { const auto mask_h = get_arg_val(i++); const auto mask_w = get_arg_val(i++); - constexpr uint32_t cb_id_input = tt::CB::c_in0; - constexpr uint32_t cb_id_scaler = tt::CB::c_in1; - constexpr uint32_t cb_id_eps = tt::CB::c_in2; - constexpr uint32_t cb_id_gamma = tt::CB::c_in3; - constexpr uint32_t cb_id_beta = tt::CB::c_in4; - constexpr uint32_t cb_id_mask_h = tt::CB::c_in5; - constexpr uint32_t cb_id_mask_w = tt::CB::c_in6; + constexpr uint32_t cb_id_input = tt::CBIndex::c_0; + constexpr uint32_t cb_id_scaler = tt::CBIndex::c_1; + constexpr uint32_t cb_id_eps = tt::CBIndex::c_2; + constexpr uint32_t cb_id_gamma = tt::CBIndex::c_3; + constexpr uint32_t cb_id_beta = tt::CBIndex::c_4; + constexpr uint32_t cb_id_mask_h = tt::CBIndex::c_5; + constexpr uint32_t cb_id_mask_w = tt::CBIndex::c_6; const uint32_t input_tile_bytes = get_tile_size(cb_id_input); const auto input_data_format = get_dataformat(cb_id_input); diff --git 
a/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm/device/kernels/reader_moreh_layer_norm_small.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm/device/kernels/reader_moreh_layer_norm_small.cpp index 4c6f6f56a18..d2652a3c188 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm/device/kernels/reader_moreh_layer_norm_small.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm/device/kernels/reader_moreh_layer_norm_small.cpp @@ -17,13 +17,13 @@ void kernel_main() { const auto mask_h = get_arg_val(i++); const auto mask_w = get_arg_val(i++); - constexpr uint32_t cb_id_input = tt::CB::c_in0; - constexpr uint32_t cb_id_scaler = tt::CB::c_in1; - constexpr uint32_t cb_id_eps = tt::CB::c_in2; - constexpr uint32_t cb_id_gamma = tt::CB::c_in3; - constexpr uint32_t cb_id_beta = tt::CB::c_in4; - constexpr uint32_t cb_id_mask_h = tt::CB::c_in5; - constexpr uint32_t cb_id_mask_w = tt::CB::c_in6; + constexpr uint32_t cb_id_input = tt::CBIndex::c_0; + constexpr uint32_t cb_id_scaler = tt::CBIndex::c_1; + constexpr uint32_t cb_id_eps = tt::CBIndex::c_2; + constexpr uint32_t cb_id_gamma = tt::CBIndex::c_3; + constexpr uint32_t cb_id_beta = tt::CBIndex::c_4; + constexpr uint32_t cb_id_mask_h = tt::CBIndex::c_5; + constexpr uint32_t cb_id_mask_w = tt::CBIndex::c_6; const uint32_t input_tile_bytes = get_tile_size(cb_id_input); const auto input_data_format = get_dataformat(cb_id_input); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm/device/kernels/writer_moreh_layer_norm.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm/device/kernels/writer_moreh_layer_norm.cpp index 60f7d8cd004..c0f979b7f27 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm/device/kernels/writer_moreh_layer_norm.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm/device/kernels/writer_moreh_layer_norm.cpp @@ -102,9 +102,9 @@ void kernel_main() { constexpr bool rstd_has_value = get_compile_time_arg_val(4) == 1; constexpr uint32_t block_size = get_compile_time_arg_val(5); 
- constexpr uint32_t cb_id_output = tt::CB::c_out0; - constexpr uint32_t cb_id_mean = tt::CB::c_out1; - constexpr uint32_t cb_id_rstd = tt::CB::c_out2; + constexpr uint32_t cb_id_output = tt::CBIndex::c_16; + constexpr uint32_t cb_id_mean = tt::CBIndex::c_17; + constexpr uint32_t cb_id_rstd = tt::CBIndex::c_18; // output const uint32_t output_tile_bytes = get_tile_size(cb_id_output); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm/device/moreh_layer_norm_program_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm/device/moreh_layer_norm_program_factory.cpp index e18beaaf846..ee36024948e 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm/device/moreh_layer_norm_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm/device/moreh_layer_norm_program_factory.cpp @@ -173,24 +173,24 @@ MorehLayerNormOperation::ProgramFactory::cached_program_t MorehLayerNormOperatio all_cores, cb_data_format, { - {tt::CB::c_in0, in0_t}, // input - {tt::CB::c_in1, in1_t}, // scaler - {tt::CB::c_in2, in2_t}, // epsilon - {tt::CB::c_in3, in3_t}, // gamma - {tt::CB::c_in4, in4_t}, // beta - {tt::CB::c_in5, in5_t}, // mask_h - {tt::CB::c_in6, in6_t}, // mask_w - {tt::CB::c_out0, out0_t}, // output - {tt::CB::c_out1, out1_t}, // mean - {tt::CB::c_out2, out2_t}, // rstd - {tt::CB::c_intermed0, im0_t, intermed_cb_format}, // E[x] - {tt::CB::c_intermed1, im1_t, intermed_cb_format}, // x - E[x] - {tt::CB::c_intermed2, im2_t, intermed_cb_format}, // (x - E[x])^2 - {tt::CB::c_intermed3, im3_t, intermed_cb_format}, // Sum[(x - E[x])^2] - {tt::CB::c_intermed4, im4_t, intermed_cb_format}, // E[(x - E[x])^2] = Var[x] - {tt::CB::c_intermed5, im5_t, intermed_cb_format}, // 1.0/(sqrt(Var[x] + eps)) - {tt::CB::c_intermed6, im6_t, intermed_cb_format}, // y * gamm + beta - {tt::CB::c_intermed7, im7_t, intermed_cb_format}, // Sum[x] + {tt::CBIndex::c_0, in0_t}, // input + {tt::CBIndex::c_1, in1_t}, // scaler + {tt::CBIndex::c_2, in2_t}, // epsilon + 
{tt::CBIndex::c_3, in3_t}, // gamma + {tt::CBIndex::c_4, in4_t}, // beta + {tt::CBIndex::c_5, in5_t}, // mask_h + {tt::CBIndex::c_6, in6_t}, // mask_w + {tt::CBIndex::c_16, out0_t}, // output + {tt::CBIndex::c_17, out1_t}, // mean + {tt::CBIndex::c_18, out2_t}, // rstd + {tt::CBIndex::c_24, im0_t, intermed_cb_format}, // E[x] + {tt::CBIndex::c_25, im1_t, intermed_cb_format}, // x - E[x] + {tt::CBIndex::c_26, im2_t, intermed_cb_format}, // (x - E[x])^2 + {tt::CBIndex::c_27, im3_t, intermed_cb_format}, // Sum[(x - E[x])^2] + {tt::CBIndex::c_28, im4_t, intermed_cb_format}, // E[(x - E[x])^2] = Var[x] + {tt::CBIndex::c_29, im5_t, intermed_cb_format}, // 1.0/(sqrt(Var[x] + eps)) + {tt::CBIndex::c_30, im6_t, intermed_cb_format}, // y * gamm + beta + {tt::CBIndex::c_31, im7_t, intermed_cb_format}, // Sum[x] }); //////////////////////////////////////////////////////////////////////////// diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm/moreh_layer_norm.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm/moreh_layer_norm.cpp index d27c789a098..eca8a179878 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm/moreh_layer_norm.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm/moreh_layer_norm.cpp @@ -6,6 +6,8 @@ #include "ttnn/operations/moreh/moreh_layer_norm/device/moreh_layer_norm_device_operation.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::moreh::moreh_layer_norm { std::vector> MorehLayerNorm::invoke( const Tensor& input, diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm_backward/device/kernels/moreh_layer_norm_backward_gamma_beta_grad_kernel.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm_backward/device/kernels/moreh_layer_norm_backward_gamma_beta_grad_kernel.cpp index d028f9c26a3..18dd55f88df 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm_backward/device/kernels/moreh_layer_norm_backward_gamma_beta_grad_kernel.cpp +++ 
b/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm_backward/device/kernels/moreh_layer_norm_backward_gamma_beta_grad_kernel.cpp @@ -16,26 +16,26 @@ void MAIN { constexpr bool is_lastdim_layernorm = get_compile_time_arg_val(7) == 1; constexpr bool is_groupnorm = get_compile_time_arg_val(8) == 1; - constexpr auto cb_dy = tt::CB::c_in0; // output_grad(==dy) - constexpr auto cb_x = tt::CB::c_in1; // input(==x) - constexpr auto cb_mean = tt::CB::c_in2; // mean - constexpr auto cb_rstd = tt::CB::c_in3; // rstd - constexpr auto cb_scaler = tt::CB::c_in4; // scaler - constexpr auto cb_mask_h = tt::CB::c_in5; // mask_h - constexpr auto cb_mask_w = tt::CB::c_in6; // mask_w + constexpr auto cb_dy = tt::CBIndex::c_0; // output_grad(==dy) + constexpr auto cb_x = tt::CBIndex::c_1; // input(==x) + constexpr auto cb_mean = tt::CBIndex::c_2; // mean + constexpr auto cb_rstd = tt::CBIndex::c_3; // rstd + constexpr auto cb_scaler = tt::CBIndex::c_4; // scaler + constexpr auto cb_mask_h = tt::CBIndex::c_5; // mask_h + constexpr auto cb_mask_w = tt::CBIndex::c_6; // mask_w // Sum[y * dy] - constexpr auto cb_dgamma = tt::CB::c_out0; // gamma_grad(==dgamma) + constexpr auto cb_dgamma = tt::CBIndex::c_16; // gamma_grad(==dgamma) // Sum[dy] - constexpr auto cb_dbeta = tt::CB::c_out1; // beta_grad(==dbeta) + constexpr auto cb_dbeta = tt::CBIndex::c_17; // beta_grad(==dbeta) // y = (x - mean) * rstd - constexpr auto cb_y = tt::CB::c_intermed0; // output(==y) - constexpr auto cb_ydy = tt::CB::c_intermed1; // y * dy - constexpr auto cb_dyadd = tt::CB::c_intermed2; // Add[dy] - constexpr auto cb_ydyadd = tt::CB::c_intermed3; // Add[y * dy] - constexpr auto cb_xmm = tt::CB::c_intermed4; // x - mean - constexpr auto cb_dycopy = tt::CB::c_intermed5; // dycopy + constexpr auto cb_y = tt::CBIndex::c_24; // output(==y) + constexpr auto cb_ydy = tt::CBIndex::c_25; // y * dy + constexpr auto cb_dyadd = tt::CBIndex::c_26; // Add[dy] + constexpr auto cb_ydyadd = tt::CBIndex::c_27; // Add[y * dy] + 
constexpr auto cb_xmm = tt::CBIndex::c_28; // x - mean + constexpr auto cb_dycopy = tt::CBIndex::c_29; // dycopy constexpr uint32_t onetile = 1; @@ -54,7 +54,8 @@ void MAIN { constexpr uint32_t HtWt = Ht * Wt; - binary_op_init_common(tt::CB::c_in0, tt::CB::c_in0); + constexpr auto cb_out_init = gamma_grad_has_value ? cb_dgamma : cb_dbeta; + binary_op_init_common(tt::CBIndex::c_0, tt::CBIndex::c_0, cb_out_init); cb_wait_front(cb_scaler, onetile); // comes from the reader diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm_backward/device/kernels/moreh_layer_norm_backward_input_grad_large_kernel.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm_backward/device/kernels/moreh_layer_norm_backward_input_grad_large_kernel.cpp index 3b4fb2b1df5..c28dca031bd 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm_backward/device/kernels/moreh_layer_norm_backward_input_grad_large_kernel.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm_backward/device/kernels/moreh_layer_norm_backward_input_grad_large_kernel.cpp @@ -18,29 +18,29 @@ void MAIN { constexpr bool is_lastdim_layernorm = get_compile_time_arg_val(5) == 1; constexpr bool is_groupnorm = get_compile_time_arg_val(6) == 1; - binary_op_init_common(tt::CB::c_in1, tt::CB::c_in2); + binary_op_init_common(tt::CBIndex::c_1, tt::CBIndex::c_2); - constexpr auto cb_dy = tt::CB::c_in0; // output_grad(==dy) - constexpr auto cb_x = tt::CB::c_in1; // input(==x) - constexpr auto cb_mean = tt::CB::c_in2; // mean - constexpr auto cb_rstd = tt::CB::c_in3; // rstd - constexpr auto cb_scaler = tt::CB::c_in4; // scaler - constexpr auto cb_n_recip_n = tt::CB::c_in5; // n_recip_n - constexpr auto cb_gamma = tt::CB::c_in6; // gamma - constexpr auto cb_mask_h_w = tt::CB::c_in7; // mask_h_w + constexpr auto cb_dy = tt::CBIndex::c_0; // output_grad(==dy) + constexpr auto cb_x = tt::CBIndex::c_1; // input(==x) + constexpr auto cb_mean = tt::CBIndex::c_2; // mean + constexpr auto cb_rstd = tt::CBIndex::c_3; // rstd + 
constexpr auto cb_scaler = tt::CBIndex::c_4; // scaler + constexpr auto cb_n_recip_n = tt::CBIndex::c_5; // n_recip_n + constexpr auto cb_gamma = tt::CBIndex::c_6; // gamma + constexpr auto cb_mask_h_w = tt::CBIndex::c_7; // mask_h_w // ((n * dy - Sum[dy]) - (y * Sum[y * dy])) * (rstd / n) - constexpr auto cb_dx = tt::CB::c_out0; // input_grad(==dx) + constexpr auto cb_dx = tt::CBIndex::c_16; // input_grad(==dx) // y = (x - mean) * rstd - constexpr auto cb_dycopy = tt::CB::c_intermed0; // copy output_grad(==dycopy) - constexpr auto cb_y = tt::CB::c_intermed1; // output(==y) - constexpr auto cb_dysum = tt::CB::c_intermed2; // Sum[dy] - constexpr auto cb_ydysum = tt::CB::c_intermed3; // Sum[y * dy] - - constexpr auto cb_tmp1 = tt::CB::c_intermed4; // tmp1 - constexpr auto cb_tmp2 = tt::CB::c_intermed5; // tmp2 - constexpr auto cb_tmp3 = tt::CB::c_intermed6; // tmp3 + constexpr auto cb_dycopy = tt::CBIndex::c_24; // copy output_grad(==dycopy) + constexpr auto cb_y = tt::CBIndex::c_25; // output(==y) + constexpr auto cb_dysum = tt::CBIndex::c_26; // Sum[dy] + constexpr auto cb_ydysum = tt::CBIndex::c_27; // Sum[y * dy] + + constexpr auto cb_tmp1 = tt::CBIndex::c_28; // tmp1 + constexpr auto cb_tmp2 = tt::CBIndex::c_29; // tmp2 + constexpr auto cb_tmp3 = tt::CBIndex::c_30; // tmp3 constexpr uint32_t onetile = 1; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm_backward/device/kernels/moreh_layer_norm_backward_input_grad_small_kernel.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm_backward/device/kernels/moreh_layer_norm_backward_input_grad_small_kernel.cpp index aa2729982b4..d5732570eea 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm_backward/device/kernels/moreh_layer_norm_backward_input_grad_small_kernel.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm_backward/device/kernels/moreh_layer_norm_backward_input_grad_small_kernel.cpp @@ -18,30 +18,30 @@ void MAIN { constexpr bool is_lastdim_layernorm = get_compile_time_arg_val(5) == 
1; constexpr bool is_groupnorm = get_compile_time_arg_val(6) == 1; - binary_op_init_common(tt::CB::c_in1, tt::CB::c_in2); + binary_op_init_common(tt::CBIndex::c_1, tt::CBIndex::c_2); - constexpr auto cb_dy = tt::CB::c_in0; // output_grad(==dy) - constexpr auto cb_x = tt::CB::c_in1; // input(==x) - constexpr auto cb_mean = tt::CB::c_in2; // mean - constexpr auto cb_rstd = tt::CB::c_in3; // rstd - constexpr auto cb_scaler = tt::CB::c_in4; // scaler - constexpr auto cb_n_recip_n = tt::CB::c_in5; // n_recip_n - constexpr auto cb_gamma = tt::CB::c_in6; // gamma - constexpr auto cb_mask_h_w = tt::CB::c_in7; // mask_h_w + constexpr auto cb_dy = tt::CBIndex::c_0; // output_grad(==dy) + constexpr auto cb_x = tt::CBIndex::c_1; // input(==x) + constexpr auto cb_mean = tt::CBIndex::c_2; // mean + constexpr auto cb_rstd = tt::CBIndex::c_3; // rstd + constexpr auto cb_scaler = tt::CBIndex::c_4; // scaler + constexpr auto cb_n_recip_n = tt::CBIndex::c_5; // n_recip_n + constexpr auto cb_gamma = tt::CBIndex::c_6; // gamma + constexpr auto cb_mask_h_w = tt::CBIndex::c_7; // mask_h_w // dx = ((n * dy - Sum[dy]) - (y * Sum[y * dy])) * (rstd / n) - constexpr auto cb_dx = tt::CB::c_out0; // input_grad(==dx) + constexpr auto cb_dx = tt::CBIndex::c_16; // input_grad(==dx) // y = (x - mean) * rstd - constexpr auto cb_dycopy = tt::CB::c_intermed0; // copy output_grad(==dycopy) - constexpr auto cb_y = tt::CB::c_intermed1; // output(==y) - constexpr auto cb_dysum = tt::CB::c_intermed2; // Sum[dy] - constexpr auto cb_ydysum = tt::CB::c_intermed3; // Sum[y * dy] - constexpr auto cb_recip_nrstd = tt::CB::c_intermed4; // rstd / n - - constexpr auto cb_tmp1 = tt::CB::c_intermed5; // tmp1 - constexpr auto cb_tmp2 = tt::CB::c_intermed6; // tmp2 - constexpr auto cb_tmp3 = tt::CB::c_intermed7; // tmp3 + constexpr auto cb_dycopy = tt::CBIndex::c_24; // copy output_grad(==dycopy) + constexpr auto cb_y = tt::CBIndex::c_25; // output(==y) + constexpr auto cb_dysum = tt::CBIndex::c_26; // Sum[dy] + 
constexpr auto cb_ydysum = tt::CBIndex::c_27; // Sum[y * dy] + constexpr auto cb_recip_nrstd = tt::CBIndex::c_28; // rstd / n + + constexpr auto cb_tmp1 = tt::CBIndex::c_29; // tmp1 + constexpr auto cb_tmp2 = tt::CBIndex::c_30; // tmp2 + constexpr auto cb_tmp3 = tt::CBIndex::c_31; // tmp3 constexpr uint32_t onetile = 1; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm_backward/device/moreh_layer_norm_backward_gamma_beta_grad_program_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm_backward/device/moreh_layer_norm_backward_gamma_beta_grad_program_factory.cpp index 6f020289c6a..8d8095f3938 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm_backward/device/moreh_layer_norm_backward_gamma_beta_grad_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm_backward/device/moreh_layer_norm_backward_gamma_beta_grad_program_factory.cpp @@ -118,20 +118,20 @@ MorehLayerNormBackwardGammaBetaGradOperation::ProgramFactory::create( all_cores, cb_data_format, { - {tt::CB::c_in0, in0_t}, // output_grad(==dy) - {tt::CB::c_in1, in1_t}, // input(==x) - {tt::CB::c_in2, in2_t}, // mean - {tt::CB::c_in3, in3_t}, // rstd - {tt::CB::c_in4, in4_t}, // scaler - {tt::CB::c_in5, in5_t}, // mask_h - {tt::CB::c_out0, out0_t}, // gamma_grad(==dgamma) - {tt::CB::c_out1, out1_t}, // beta_grad(==dbeta) - {tt::CB::c_intermed0, im0_t, intermed_cb_format}, // output(==y) - {tt::CB::c_intermed1, im1_t, intermed_cb_format}, // y * dy - {tt::CB::c_intermed2, im2_t, intermed_cb_format}, // Add[dy] - {tt::CB::c_intermed3, im3_t, intermed_cb_format}, // Add[y * dy] - {tt::CB::c_intermed4, im4_t, intermed_cb_format}, // x - mean - {tt::CB::c_intermed5, im5_t, intermed_cb_format}, // dycopy + {tt::CBIndex::c_0, in0_t}, // output_grad(==dy) + {tt::CBIndex::c_1, in1_t}, // input(==x) + {tt::CBIndex::c_2, in2_t}, // mean + {tt::CBIndex::c_3, in3_t}, // rstd + {tt::CBIndex::c_4, in4_t}, // scaler + {tt::CBIndex::c_5, in5_t}, // mask_h + {tt::CBIndex::c_16, 
out0_t}, // gamma_grad(==dgamma) + {tt::CBIndex::c_17, out1_t}, // beta_grad(==dbeta) + {tt::CBIndex::c_24, im0_t, intermed_cb_format}, // output(==y) + {tt::CBIndex::c_25, im1_t, intermed_cb_format}, // y * dy + {tt::CBIndex::c_26, im2_t, intermed_cb_format}, // Add[dy] + {tt::CBIndex::c_27, im3_t, intermed_cb_format}, // Add[y * dy] + {tt::CBIndex::c_28, im4_t, intermed_cb_format}, // x - mean + {tt::CBIndex::c_29, im5_t, intermed_cb_format}, // dycopy }); //////////////////////////////////////////////////////////////////////////// diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm_backward/device/moreh_layer_norm_backward_input_grad_program_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm_backward/device/moreh_layer_norm_backward_input_grad_program_factory.cpp index 9beb28cbbb2..e20c4ac037c 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm_backward/device/moreh_layer_norm_backward_input_grad_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm_backward/device/moreh_layer_norm_backward_input_grad_program_factory.cpp @@ -134,23 +134,23 @@ MorehLayerNormBackwardInputGradOperation::ProgramFactory::create( all_cores, cb_data_format, { - {tt::CB::c_in0, in0_t}, // output_grad(==dy) - {tt::CB::c_in1, in1_t}, // input(==x) - {tt::CB::c_in2, in2_t}, // mean - {tt::CB::c_in3, in3_t}, // rstd - {tt::CB::c_in4, in4_t}, // scaler - {tt::CB::c_in5, in5_t}, // n_recip_n - {tt::CB::c_in6, in6_t}, // gamma - {tt::CB::c_in7, in7_t}, // mask_h_w - {tt::CB::c_out0, out0_t}, // input_grad(==dx) - {tt::CB::c_intermed0, im0_t, intermed_cb_format}, // copy output_grad(==dy or dy * gamma) - {tt::CB::c_intermed1, im1_t, intermed_cb_format}, // output(==y) - {tt::CB::c_intermed2, im2_t, intermed_cb_format}, // Sum[dy] - {tt::CB::c_intermed3, im3_t, intermed_cb_format}, // Sum[y * dy] - {tt::CB::c_intermed4, im4_t, intermed_cb_format}, // rstd / n - {tt::CB::c_intermed5, im5_t, intermed_cb_format}, - {tt::CB::c_intermed6, im6_t, 
intermed_cb_format}, - {tt::CB::c_intermed7, im7_t, intermed_cb_format}, + {tt::CBIndex::c_0, in0_t}, // output_grad(==dy) + {tt::CBIndex::c_1, in1_t}, // input(==x) + {tt::CBIndex::c_2, in2_t}, // mean + {tt::CBIndex::c_3, in3_t}, // rstd + {tt::CBIndex::c_4, in4_t}, // scaler + {tt::CBIndex::c_5, in5_t}, // n_recip_n + {tt::CBIndex::c_6, in6_t}, // gamma + {tt::CBIndex::c_7, in7_t}, // mask_h_w + {tt::CBIndex::c_16, out0_t}, // input_grad(==dx) + {tt::CBIndex::c_24, im0_t, intermed_cb_format}, // copy output_grad(==dy or dy * gamma) + {tt::CBIndex::c_25, im1_t, intermed_cb_format}, // output(==y) + {tt::CBIndex::c_26, im2_t, intermed_cb_format}, // Sum[dy] + {tt::CBIndex::c_27, im3_t, intermed_cb_format}, // Sum[y * dy] + {tt::CBIndex::c_28, im4_t, intermed_cb_format}, // rstd / n + {tt::CBIndex::c_29, im5_t, intermed_cb_format}, + {tt::CBIndex::c_30, im6_t, intermed_cb_format}, + {tt::CBIndex::c_31, im7_t, intermed_cb_format}, }); //////////////////////////////////////////////////////////////////////////// diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm_backward/moreh_layer_norm_backward.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm_backward/moreh_layer_norm_backward.cpp index 148be4c24e9..45164ae3f85 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm_backward/moreh_layer_norm_backward.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_layer_norm_backward/moreh_layer_norm_backward.cpp @@ -7,6 +7,8 @@ #include "device/moreh_layer_norm_backward_gamma_beta_grad_device_operation.hpp" #include "device/moreh_layer_norm_backward_input_grad_device_operation.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::moreh::moreh_layer_norm_backward { std::vector> moreh_layer_norm_backward_gamma_beta_grad( const Tensor& output_grad, diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_linear_backward/device/kernels/moreh_bias_backward_multi_core_h.cpp 
b/ttnn/cpp/ttnn/operations/moreh/moreh_linear_backward/device/kernels/moreh_bias_backward_multi_core_h.cpp index ff47e158651..2e1b363963a 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_linear_backward/device/kernels/moreh_bias_backward_multi_core_h.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_linear_backward/device/kernels/moreh_bias_backward_multi_core_h.cpp @@ -14,12 +14,12 @@ void MAIN { const bool do_mask_h = (arg_fetcher.get_next_arg_val() == 1); const bool do_mask_w = (arg_fetcher.get_next_arg_val() == 1); - constexpr auto cb_in0 = tt::CB::c_in0; - constexpr auto cb_scaler = tt::CB::c_in1; - constexpr auto cb_mask_h_w = tt::CB::c_in2; - constexpr auto cb_intermed0 = tt::CB::c_intermed0; - constexpr auto cb_intermed1 = tt::CB::c_intermed1; - constexpr auto cb_out0 = tt::CB::c_out0; + constexpr auto cb_in0 = tt::CBIndex::c_0; + constexpr auto cb_scaler = tt::CBIndex::c_1; + constexpr auto cb_mask_h_w = tt::CBIndex::c_2; + constexpr auto cb_intermed0 = tt::CBIndex::c_24; + constexpr auto cb_intermed1 = tt::CBIndex::c_25; + constexpr auto cb_out0 = tt::CBIndex::c_16; constexpr uint32_t dst0 = 0; constexpr uint32_t dst1 = 1; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_linear_backward/device/kernels/moreh_bias_backward_single_core_hw.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_linear_backward/device/kernels/moreh_bias_backward_single_core_hw.cpp index 2ab4eefc5fe..2d5a8a8f1ca 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_linear_backward/device/kernels/moreh_bias_backward_single_core_hw.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_linear_backward/device/kernels/moreh_bias_backward_single_core_hw.cpp @@ -13,12 +13,12 @@ void MAIN { const bool do_mask_h = (arg_fetcher.get_next_arg_val() == 1); const bool do_mask_w = (arg_fetcher.get_next_arg_val() == 1); - constexpr auto cb_in0 = tt::CB::c_in0; - constexpr auto cb_scaler = tt::CB::c_in1; - constexpr auto cb_mask_h_w = tt::CB::c_in2; - constexpr auto cb_intermed0 = tt::CB::c_intermed0; - constexpr auto 
cb_intermed1 = tt::CB::c_intermed1; - constexpr auto cb_out0 = tt::CB::c_out0; + constexpr auto cb_in0 = tt::CBIndex::c_0; + constexpr auto cb_scaler = tt::CBIndex::c_1; + constexpr auto cb_mask_h_w = tt::CBIndex::c_2; + constexpr auto cb_intermed0 = tt::CBIndex::c_24; + constexpr auto cb_intermed1 = tt::CBIndex::c_25; + constexpr auto cb_out0 = tt::CBIndex::c_16; constexpr uint32_t dst0 = 0; constexpr uint32_t dst1 = 1; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_linear_backward/device/moreh_linear_backward_multi_core_program_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_linear_backward/device/moreh_linear_backward_multi_core_program_factory.cpp index 597e9ba21de..2f3397d884f 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_linear_backward/device/moreh_linear_backward_multi_core_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_linear_backward/device/moreh_linear_backward_multi_core_program_factory.cpp @@ -75,12 +75,12 @@ MorehBiasAddBackwardOperation::MultiCoreProgramFactory::create( program, all_cores, cb_data_format, - {{CB::c_in0, in0_t}, // output_grad - {CB::c_in1, in1_t}, // scaler - {CB::c_in2, in2_t}, // mask_h_w - {CB::c_out0, out0_t}, // bias_grad - {CB::c_intermed0, im0_t}, - {CB::c_intermed1, im1_t, (fp32_dest_acc_en) ? tt::DataFormat::Float32 : cb_data_format}}); + {{CBIndex::c_0, in0_t}, // output_grad + {CBIndex::c_1, in1_t}, // scaler + {CBIndex::c_2, in2_t}, // mask_h_w + {CBIndex::c_16, out0_t}, // bias_grad + {CBIndex::c_24, im0_t}, + {CBIndex::c_25, im1_t, (fp32_dest_acc_en) ? 
tt::DataFormat::Float32 : cb_data_format}}); //////////////////////////////////////////////////////////////////////////// // DataMovementKernel SetUp @@ -113,7 +113,7 @@ MorehBiasAddBackwardOperation::MultiCoreProgramFactory::create( std::vector unpack_to_dest_mode(NUM_CIRCULAR_BUFFERS, UnpackToDestMode::Default); if (fp32_dest_acc_en) { compute_defines["FP32_DEST_ACC_EN"] = "1"; - unpack_to_dest_mode[tt::CB::c_intermed1] = UnpackToDestMode::UnpackToDestFp32; + unpack_to_dest_mode[tt::CBIndex::c_25] = UnpackToDestMode::UnpackToDestFp32; } const auto compute_kernel_file = "ttnn/cpp/ttnn/operations/moreh/moreh_linear_backward/device/kernels/moreh_bias_backward_multi_core_h.cpp"; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_linear_backward/device/moreh_linear_backward_single_core_program_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_linear_backward/device/moreh_linear_backward_single_core_program_factory.cpp index 5f47d621149..f0f36ab5c0d 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_linear_backward/device/moreh_linear_backward_single_core_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_linear_backward/device/moreh_linear_backward_single_core_program_factory.cpp @@ -69,12 +69,12 @@ MorehBiasAddBackwardOperation::SingleCoreProgramFactory::create( program, std::set{CoreRange(core, core)}, cb_data_format, - {{CB::c_in0, in0_t}, // output_grad - {CB::c_in1, in1_t}, // scaler - {CB::c_in2, in2_t}, // mask_h_w - {CB::c_out0, out0_t}, // bias_grad - {CB::c_intermed0, im0_t}, - {CB::c_intermed1, im1_t, (fp32_dest_acc_en) ? tt::DataFormat::Float32 : cb_data_format}}); + {{CBIndex::c_0, in0_t}, // output_grad + {CBIndex::c_1, in1_t}, // scaler + {CBIndex::c_2, in2_t}, // mask_h_w + {CBIndex::c_16, out0_t}, // bias_grad + {CBIndex::c_24, im0_t}, + {CBIndex::c_25, im1_t, (fp32_dest_acc_en) ? 
tt::DataFormat::Float32 : cb_data_format}}); //////////////////////////////////////////////////////////////////////////// // DataMovementKernel SetUp diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_matmul/device/kernels/moreh_matmul.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_matmul/device/kernels/moreh_matmul.cpp index f998666c3b4..c69b0e0867a 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_matmul/device/kernels/moreh_matmul.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_matmul/device/kernels/moreh_matmul.cpp @@ -18,16 +18,16 @@ constexpr uint32_t num_mask_tiles = 3; constexpr uint32_t MASK_TILE_H_IDX = 0; constexpr uint32_t MASK_TILE_W_IDX = 1; constexpr uint32_t MASK_TILE_HW_IDX = 2; -constexpr uint32_t cb_in0 = tt::CB::c_in0; -constexpr uint32_t cb_in1 = tt::CB::c_in1; -constexpr uint32_t cb_in2 = tt::CB::c_in2; -constexpr uint32_t cb_in3 = tt::CB::c_in3; -constexpr uint32_t bias_cb_id = tt::CB::c_in4; -constexpr uint32_t cb_out0 = tt::CB::c_out0; -constexpr uint32_t cb_intermed0 = tt::CB::c_intermed0; -constexpr uint32_t cb_intermed1 = tt::CB::c_intermed1; -constexpr uint32_t cb_intermed2 = tt::CB::c_intermed2; -constexpr uint32_t cb_intermed3 = tt::CB::c_intermed3; +constexpr uint32_t cb_in0 = tt::CBIndex::c_0; +constexpr uint32_t cb_in1 = tt::CBIndex::c_1; +constexpr uint32_t cb_in2 = tt::CBIndex::c_2; +constexpr uint32_t cb_in3 = tt::CBIndex::c_3; +constexpr uint32_t bias_cb_id = tt::CBIndex::c_4; +constexpr uint32_t cb_out0 = tt::CBIndex::c_16; +constexpr uint32_t cb_intermed0 = tt::CBIndex::c_24; +constexpr uint32_t cb_intermed1 = tt::CBIndex::c_25; +constexpr uint32_t cb_intermed2 = tt::CBIndex::c_26; +constexpr uint32_t cb_intermed3 = tt::CBIndex::c_27; //////////////////// // inline functions diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_matmul/device/moreh_matmul_program_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_matmul/device/moreh_matmul_program_factory.cpp index db913983e01..ca87a1cb856 100644 --- 
a/ttnn/cpp/ttnn/operations/moreh/moreh_matmul/device/moreh_matmul_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_matmul/device/moreh_matmul_program_factory.cpp @@ -288,16 +288,16 @@ MorehMatmulOperation::MultiCoreProgramFactory::cached_program_t MorehMatmulOpera all_cores, cb_data_format, { - {tt::CB::c_in0, in0_t}, - {tt::CB::c_in1, in1_t}, - {tt::CB::c_in2, in2_t}, - {tt::CB::c_in3, in3_t}, - {tt::CB::c_in4, in4_t}, - {tt::CB::c_intermed0, im0_t, (fp32_dest_acc_en) ? tt::DataFormat::Float32 : cb_data_format}, - {tt::CB::c_intermed1, im1_t}, - {tt::CB::c_intermed2, im2_t}, - {tt::CB::c_intermed3, im3_t, (fp32_dest_acc_en) ? tt::DataFormat::Float32 : cb_data_format}, - {tt::CB::c_out0, out0_t}, + {tt::CBIndex::c_0, in0_t}, + {tt::CBIndex::c_1, in1_t}, + {tt::CBIndex::c_2, in2_t}, + {tt::CBIndex::c_3, in3_t}, + {tt::CBIndex::c_4, in4_t}, + {tt::CBIndex::c_24, im0_t, (fp32_dest_acc_en) ? tt::DataFormat::Float32 : cb_data_format}, + {tt::CBIndex::c_25, im1_t}, + {tt::CBIndex::c_26, im2_t}, + {tt::CBIndex::c_27, im3_t, (fp32_dest_acc_en) ? 
tt::DataFormat::Float32 : cb_data_format}, + {tt::CBIndex::c_16, out0_t}, }); //////////////////////////////////////////////////////////////////////////// @@ -375,7 +375,7 @@ MorehMatmulOperation::MultiCoreProgramFactory::cached_program_t MorehMatmulOpera std::vector unpack_to_dest_mode(NUM_CIRCULAR_BUFFERS, UnpackToDestMode::Default); if (fp32_dest_acc_en) { compute_defines["FP32_DEST_ACC_EN"] = "1"; - unpack_to_dest_mode[tt::CB::c_intermed0] = UnpackToDestMode::UnpackToDestFp32; + unpack_to_dest_mode[tt::CBIndex::c_24] = UnpackToDestMode::UnpackToDestFp32; } const auto compute_kernel_1_id = CreateComputeKernel( diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/kernels/moreh_mean_h.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/kernels/moreh_mean_h.cpp index 46948f1797f..3621b5eb836 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/kernels/moreh_mean_h.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/kernels/moreh_mean_h.cpp @@ -17,12 +17,12 @@ void MAIN { uint32_t NC = get_compile_time_arg_val(2); constexpr uint32_t origin_H = get_compile_time_arg_val(3); - auto cb_input = tt::CB::c_in0; - constexpr auto cb_scaler = tt::CB::c_in2; - constexpr auto cb_mask_h = tt::CB::c_in3; - constexpr auto cb_accum_dst = tt::CB::c_intermed0; - constexpr auto cb_masked_input = tt::CB::c_intermed1; - constexpr auto cb_out = tt::CB::c_out0; + auto cb_input = tt::CBIndex::c_0; + constexpr auto cb_scaler = tt::CBIndex::c_2; + constexpr auto cb_mask_h = tt::CBIndex::c_3; + constexpr auto cb_accum_dst = tt::CBIndex::c_24; + constexpr auto cb_masked_input = tt::CBIndex::c_25; + constexpr auto cb_out = tt::CBIndex::c_16; constexpr bool do_mask_h = (origin_H % TILE_HEIGHT) != 0; binary_op_init_common(cb_input, cb_input); @@ -43,7 +43,7 @@ void MAIN { // tiles are expected to be coming in in NCWH order (H-contiguous) // reducing in W means out[0][w] = sum(h=0..H-1, in[h][w]) // in this case we just sequentially add to accumulator all the H-tiles in 
a column - cb_input = tt::CB::c_in0; + cb_input = tt::CBIndex::c_0; bool is_h_single_tile = (Ht == 1); if (!is_h_single_tile) { diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/kernels/moreh_mean_nc.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/kernels/moreh_mean_nc.cpp index 30cafa9ac02..82f063774cc 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/kernels/moreh_mean_nc.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/kernels/moreh_mean_nc.cpp @@ -14,17 +14,17 @@ void MAIN { const auto num_input_tiles = get_arg_val(0); const auto num_output_tiles = get_arg_val(1); - constexpr auto cb_in0 = tt::CB::c_in0; - constexpr auto cb_in1 = tt::CB::c_in1; - constexpr auto cb_scalar = tt::CB::c_in2; - constexpr auto cb_out0 = tt::CB::c_out0; - constexpr auto cb_intermed0 = tt::CB::c_intermed0; + constexpr auto cb_in0 = tt::CBIndex::c_0; + constexpr auto cb_in1 = tt::CBIndex::c_1; + constexpr auto cb_scalar = tt::CBIndex::c_2; + constexpr auto cb_out0 = tt::CBIndex::c_16; + constexpr auto cb_intermed0 = tt::CBIndex::c_24; constexpr uint32_t onetile = 1; constexpr uint32_t dst0 = 0; constexpr uint32_t dst1 = 1; constexpr uint32_t first_tile = 0; - binary_op_init_common(tt::CB::c_in0, tt::CB::c_in1); + binary_op_init_common(tt::CBIndex::c_0, tt::CBIndex::c_1); cb_wait_front(cb_in1, onetile); cb_wait_front(cb_scalar, 1); // scalar tile from the reader diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/kernels/moreh_mean_w.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/kernels/moreh_mean_w.cpp index c71978bf468..63068e6a094 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/kernels/moreh_mean_w.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/kernels/moreh_mean_w.cpp @@ -18,12 +18,12 @@ void MAIN { uint32_t NC = get_compile_time_arg_val(2); constexpr uint32_t origin_W = get_compile_time_arg_val(3); - auto cb_input = tt::CB::c_in0; - constexpr auto cb_scaler = tt::CB::c_in2; - constexpr auto 
cb_mask_w = tt::CB::c_in3; - constexpr auto cb_accum_dst = tt::CB::c_intermed0; - constexpr auto cb_masked_input = tt::CB::c_intermed1; - constexpr auto cb_out = tt::CB::c_out0; + auto cb_input = tt::CBIndex::c_0; + constexpr auto cb_scaler = tt::CBIndex::c_2; + constexpr auto cb_mask_w = tt::CBIndex::c_3; + constexpr auto cb_accum_dst = tt::CBIndex::c_24; + constexpr auto cb_masked_input = tt::CBIndex::c_25; + constexpr auto cb_out = tt::CBIndex::c_16; constexpr bool do_mask_w = (origin_W % TILE_WIDTH) != 0; binary_op_init_common(cb_input, cb_input); @@ -43,7 +43,7 @@ void MAIN { // tiles are expected to be coming in in NCHW order (W-contiguous) // reducing in W means out[h][0] = sum(w=0..W-1, in[h][w]) // in this case we just sequentially add to accumulator all the W-tiles in a row - cb_input = tt::CB::c_in0; + cb_input = tt::CBIndex::c_0; bool is_w_single_tile = (Wt == 1); if (!is_w_single_tile) { tile_regs_acquire(); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/kernels/reader_moreh_mean_h.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/kernels/reader_moreh_mean_h.cpp index 08bacfd6c40..c3555c328f0 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/kernels/reader_moreh_mean_h.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/kernels/reader_moreh_mean_h.cpp @@ -17,7 +17,7 @@ void kernel_main() { constexpr uint32_t Wt = get_compile_time_arg_val(2); constexpr uint32_t HtWt = get_compile_time_arg_val(3); - constexpr uint32_t cb_id_in0 = tt::CB::c_in0; + constexpr uint32_t cb_id_in0 = tt::CBIndex::c_0; // ublocks size defined in tiles constexpr uint32_t onetile = 1; @@ -25,7 +25,7 @@ void kernel_main() { const DataFormat data_format = get_dataformat(cb_id_in0); #ifdef REDUCE_SCALER - constexpr uint32_t cb_id_in2 = tt::CB::c_in2; + constexpr uint32_t cb_id_in2 = tt::CBIndex::c_2; constexpr uint32_t scaler = get_compile_time_arg_val(4); cb_reserve_back(cb_id_in2, 1); constexpr uint32_t num_zeros_reads = 2048 / MEM_ZEROS_SIZE; 
@@ -52,7 +52,7 @@ void kernel_main() { cb_push_back(cb_id_in2, 1); #endif - constexpr uint32_t cb_id_mask_h = tt::CB::c_in3; + constexpr uint32_t cb_id_mask_h = tt::CBIndex::c_3; #ifdef DO_MASK_H generate_mask_h(cb_id_mask_h, mask_h); #endif diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/kernels/reader_moreh_mean_nc.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/kernels/reader_moreh_mean_nc.cpp index e019826d34b..f6ca2f0d3e2 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/kernels/reader_moreh_mean_nc.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/kernels/reader_moreh_mean_nc.cpp @@ -17,9 +17,9 @@ void kernel_main() { const auto inner_size = get_arg_val(i++); constexpr uint32_t onetile = 1; - constexpr uint32_t cb_id_in0 = tt::CB::c_in0; - constexpr uint32_t cb_id_in1 = tt::CB::c_in1; - constexpr uint32_t cb_id_in2 = tt::CB::c_in2; + constexpr uint32_t cb_id_in0 = tt::CBIndex::c_0; + constexpr uint32_t cb_id_in1 = tt::CBIndex::c_1; + constexpr uint32_t cb_id_in2 = tt::CBIndex::c_2; union { float f; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/kernels/reader_moreh_mean_w.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/kernels/reader_moreh_mean_w.cpp index 934d5a0e42b..575820eb7b4 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/kernels/reader_moreh_mean_w.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/kernels/reader_moreh_mean_w.cpp @@ -13,15 +13,15 @@ void kernel_main() { constexpr bool src_is_dram = get_compile_time_arg_val(0) == 1; constexpr uint32_t scaler = get_compile_time_arg_val(1); - constexpr uint32_t cb_id_in2 = tt::CB::c_in2; + constexpr uint32_t cb_id_in2 = tt::CBIndex::c_2; generate_mm_scaler(cb_id_in2, scaler); - constexpr uint32_t cb_id_mask_w = tt::CB::c_in3; + constexpr uint32_t cb_id_mask_w = tt::CBIndex::c_3; #ifdef DO_MASK_W generate_mask_w(cb_id_mask_w, mask_w); #endif - constexpr uint32_t cb_id_in0 = tt::CB::c_in0; + constexpr uint32_t cb_id_in0 = 
tt::CBIndex::c_0; // ublocks size defined in tiles constexpr uint32_t onetile = 1; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/moreh_mean_h_program_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/moreh_mean_h_program_factory.cpp index a771dd65e9d..c1691e88272 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/moreh_mean_h_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/moreh_mean_h_program_factory.cpp @@ -68,12 +68,12 @@ MorehMeanOperation::MorehMeanHFactory::cached_program_t MorehMeanOperation::More all_cores, data_format, { - {CB::c_in0, num_input_tiles}, // input - {CB::c_in2, 1}, // scaler - {CB::c_in3, 1}, // mask - {CB::c_intermed0, 1, fp32_dest_acc_en_data_format}, // - {CB::c_intermed1, 1}, // - {CB::c_out0, 1}, // output + {CBIndex::c_0, num_input_tiles}, // input + {CBIndex::c_2, 1}, // scaler + {CBIndex::c_3, 1}, // mask + {CBIndex::c_24, 1, fp32_dest_acc_en_data_format}, // + {CBIndex::c_25, 1}, // + {CBIndex::c_16, 1}, // output }); float scaler = 1.0f / origin_H; @@ -95,7 +95,7 @@ MorehMeanOperation::MorehMeanHFactory::cached_program_t MorehMeanOperation::More reader_defines); std::vector writer_compile_time_args = { - static_cast(CB::c_out0), static_cast(is_dram(output))}; + static_cast(CBIndex::c_16), static_cast(is_dram(output))}; const auto writer_kernel_id = CreateWriteKernel( program, @@ -113,7 +113,7 @@ MorehMeanOperation::MorehMeanHFactory::cached_program_t MorehMeanOperation::More std::vector unpack_to_dest_mode(NUM_CIRCULAR_BUFFERS, UnpackToDestMode::Default); if (fp32_dest_acc_en) { compute_defines["FP32_DEST_ACC_EN"] = 1; - unpack_to_dest_mode[tt::CB::c_intermed0] = UnpackToDestMode::UnpackToDestFp32; + unpack_to_dest_mode[tt::CBIndex::c_24] = UnpackToDestMode::UnpackToDestFp32; } std::vector compute_kernel_args_group_1 = { Ht, // Ht diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/moreh_mean_nc_program_factory.cpp 
b/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/moreh_mean_nc_program_factory.cpp index 7e74d0a4dca..127f4b7d36a 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/moreh_mean_nc_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/moreh_mean_nc_program_factory.cpp @@ -83,11 +83,11 @@ MorehMeanOperation::MorehMeanNCFactory::cached_program_t MorehMeanOperation::Mor all_cores, cb_data_format, { - {CB::c_in0, 2}, // input - {CB::c_in1, 1}, // zero - {CB::c_in2, 1}, // scaler - {CB::c_intermed0, 1}, // accumulated mean - {CB::c_out0, 2}, // output + {CBIndex::c_0, 2}, // input + {CBIndex::c_1, 1}, // zero + {CBIndex::c_2, 1}, // scaler + {CBIndex::c_24, 1}, // accumulated mean + {CBIndex::c_16, 2}, // output }); //////////////////////////////////////////////////////////////////////////// diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/moreh_mean_w_program_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/moreh_mean_w_program_factory.cpp index 053f536391c..150a4991330 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/moreh_mean_w_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_mean/device/moreh_mean_w_program_factory.cpp @@ -68,12 +68,12 @@ MorehMeanOperation::MorehMeanWFactory::cached_program_t MorehMeanOperation::More all_cores, data_format, { - {CB::c_in0, num_input_tiles}, // input - {CB::c_in2, 1}, // scalar - {CB::c_in3, 1}, // mask - {CB::c_intermed0, 1, fp32_dest_acc_en_data_format}, // - {CB::c_intermed1, 1}, // - {CB::c_out0, 1}, // output + {CBIndex::c_0, num_input_tiles}, // input + {CBIndex::c_2, 1}, // scalar + {CBIndex::c_3, 1}, // mask + {CBIndex::c_24, 1, fp32_dest_acc_en_data_format}, // + {CBIndex::c_25, 1}, // + {CBIndex::c_16, 1}, // output }); float scaler = 1.0f / origin_W; @@ -83,7 +83,7 @@ MorehMeanOperation::MorehMeanWFactory::cached_program_t MorehMeanOperation::More std::vector reader_compile_time_args = {static_cast(is_dram(input)), packed_scaler_value}; 
std::vector writer_compile_time_args = { - static_cast(CB::c_out0), static_cast(is_dram(output))}; + static_cast(CBIndex::c_16), static_cast(is_dram(output))}; std::map reader_defines{}; if (do_mask_w) { diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_mean_backward/device/kernels/moreh_mean_backward.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_mean_backward/device/kernels/moreh_mean_backward.cpp index e8d405c8365..50d3d5c8786 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_mean_backward/device/kernels/moreh_mean_backward.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_mean_backward/device/kernels/moreh_mean_backward.cpp @@ -16,15 +16,15 @@ void MAIN { constexpr bool wt_need_bcast = (get_compile_time_arg_val(1) == 1); constexpr bool ht_need_bcast = (get_compile_time_arg_val(2) == 1); - constexpr auto cb_in0 = tt::CB::c_in0; // input - constexpr auto cb_in1 = tt::CB::c_in1; // zero tile - constexpr auto cb_scalar = tt::CB::c_in2; - constexpr auto cb_out0 = tt::CB::c_out0; - constexpr auto cb_intermed0 = tt::CB::c_intermed0; + constexpr auto cb_in0 = tt::CBIndex::c_0; // input + constexpr auto cb_in1 = tt::CBIndex::c_1; // zero tile + constexpr auto cb_scalar = tt::CBIndex::c_2; + constexpr auto cb_out0 = tt::CBIndex::c_16; + constexpr auto cb_intermed0 = tt::CBIndex::c_24; constexpr uint32_t onetile = 1; constexpr uint32_t dst0 = 0; - binary_op_init_common(tt::CB::c_in0, tt::CB::c_in1); + binary_op_init_common(tt::CBIndex::c_0, tt::CBIndex::c_1); cb_wait_front(cb_in1, onetile); for (uint32_t i = 0; i < num_output_tiles; i++) { tile_regs_acquire(); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_mean_backward/device/kernels/reader_moreh_mean_backward.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_mean_backward/device/kernels/reader_moreh_mean_backward.cpp index 1bd4f6eb22d..000e52db1cb 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_mean_backward/device/kernels/reader_moreh_mean_backward.cpp +++ 
b/ttnn/cpp/ttnn/operations/moreh/moreh_mean_backward/device/kernels/reader_moreh_mean_backward.cpp @@ -68,9 +68,9 @@ void kernel_main() { } constexpr uint32_t onetile = 1; - constexpr uint32_t cb_id_in0 = tt::CB::c_in0; - constexpr uint32_t cb_id_in1 = tt::CB::c_in1; - constexpr uint32_t cb_id_in2 = tt::CB::c_in2; + constexpr uint32_t cb_id_in0 = tt::CBIndex::c_0; + constexpr uint32_t cb_id_in1 = tt::CBIndex::c_1; + constexpr uint32_t cb_id_in2 = tt::CBIndex::c_2; // zero tile union { diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_mean_backward/device/kernels/writer_moreh_mean_backward.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_mean_backward/device/kernels/writer_moreh_mean_backward.cpp index 1481598985e..cdd7fbce66e 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_mean_backward/device/kernels/writer_moreh_mean_backward.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_mean_backward/device/kernels/writer_moreh_mean_backward.cpp @@ -14,7 +14,7 @@ void kernel_main() { const auto num_tiles = arg_fetcher.get_next_arg_val(); const auto start_id = arg_fetcher.get_next_arg_val(); - constexpr uint32_t cb_id_out = tt::CB::c_out0; + constexpr uint32_t cb_id_out = tt::CBIndex::c_16; constexpr uint32_t onetile = 1; uint32_t input_grad_tile_bytes = get_tile_size(cb_id_out); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_mean_backward/device/moreh_mean_backward_program_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_mean_backward/device/moreh_mean_backward_program_factory.cpp index 440fcf7ccb7..e1a7080d64c 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_mean_backward/device/moreh_mean_backward_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_mean_backward/device/moreh_mean_backward_program_factory.cpp @@ -12,6 +12,8 @@ #include "ttnn/operations/reduction/generic/device/common.hpp" #include "ttnn/operations/reduction/generic/device/reduce_op.hpp" +using namespace tt::tt_metal; + void get_tensor_dim(ttnn::SmallVector &dim, const tt::tt_metal::LegacyShape 
&shape) { const auto rank = shape.rank(); for (auto i = 0; i < rank; ++i) { @@ -124,11 +126,11 @@ MorehMeanBackwardOperation::MorehMeanBackwardFactory::create( all_cores, cb_data_format, { - {tt::CB::c_in0, 2}, // input - {tt::CB::c_in1, 1}, // zero - {tt::CB::c_in2, 1}, // scalar - {tt::CB::c_intermed0, 1}, - {tt::CB::c_out0, 2}, // output + {tt::CBIndex::c_0, 2}, // input + {tt::CBIndex::c_1, 1}, // zero + {tt::CBIndex::c_2, 1}, // scalar + {tt::CBIndex::c_24, 1}, + {tt::CBIndex::c_16, 2}, // output }); //////////////////////////////////////////////////////////////////////////// diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step1/device/kernels/reader_moreh_nll_loss_step1.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step1/device/kernels/reader_moreh_nll_loss_step1.cpp index bc141ff8f44..ec2b103d983 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step1/device/kernels/reader_moreh_nll_loss_step1.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step1/device/kernels/reader_moreh_nll_loss_step1.cpp @@ -17,10 +17,10 @@ void kernel_main() { auto element_size = get_arg_val(i++); auto target_element_size = get_arg_val(i++); - constexpr uint32_t cb_target = tt::CB::c_in0; - constexpr uint32_t cb_weight = tt::CB::c_in1; + constexpr uint32_t cb_target = tt::CBIndex::c_0; + constexpr uint32_t cb_weight = tt::CBIndex::c_1; - constexpr uint32_t cb_output = tt::CB::c_out0; + constexpr uint32_t cb_output = tt::CBIndex::c_16; // ublocks size defined in tiles const uint32_t target_tile_bytes = get_tile_size(cb_target); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step1/device/kernels/reader_moreh_nll_loss_step1_large.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step1/device/kernels/reader_moreh_nll_loss_step1_large.cpp index 1f0ccc88cc5..00fb316bfdb 100644 --- 
a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step1/device/kernels/reader_moreh_nll_loss_step1_large.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step1/device/kernels/reader_moreh_nll_loss_step1_large.cpp @@ -17,10 +17,10 @@ void kernel_main() { auto element_size = get_arg_val(i++); auto target_element_size = get_arg_val(i++); - constexpr uint32_t cb_target = tt::CB::c_in0; - constexpr uint32_t cb_weight = tt::CB::c_in1; + constexpr uint32_t cb_target = tt::CBIndex::c_0; + constexpr uint32_t cb_weight = tt::CBIndex::c_1; - constexpr uint32_t cb_output = tt::CB::c_out0; + constexpr uint32_t cb_output = tt::CBIndex::c_16; // ublocks size defined in tiles const uint32_t target_tile_bytes = get_tile_size(cb_target); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step1/device/kernels/writer_moreh_nll_loss_step1.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step1/device/kernels/writer_moreh_nll_loss_step1.cpp index 04004b9b6b4..db627353207 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step1/device/kernels/writer_moreh_nll_loss_step1.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step1/device/kernels/writer_moreh_nll_loss_step1.cpp @@ -9,7 +9,7 @@ void kernel_main() { uint32_t num_units_per_core = get_arg_val(1); uint32_t start_id = get_arg_val(2); - constexpr uint32_t cb_output = tt::CB::c_out0; + constexpr uint32_t cb_output = tt::CBIndex::c_16; const uint32_t output_tile_bytes = get_tile_size(cb_output); const auto output_data_format = get_dataformat(cb_output); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step1/device/moreh_nll_loss_step1_program_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step1/device/moreh_nll_loss_step1_program_factory.cpp index a6e198568a8..4f4d3fa9c73 100644 --- 
a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step1/device/moreh_nll_loss_step1_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step1/device/moreh_nll_loss_step1_program_factory.cpp @@ -80,10 +80,10 @@ MorehNllLossStep1DeviceOperation::Factory::cached_program_t MorehNllLossStep1Dev all_cores, data_format, { - {CB::c_in0, 1, tt::DataFormat::Int32}, // target - {CB::c_in1, 1}, // weight - {CB::c_intermed0, 1, intermed_data_format}, // tmp_weight - {CB::c_out0, 1}, // output + {CBIndex::c_0, 1, tt::DataFormat::Int32}, // target + {CBIndex::c_1, 1}, // weight + {CBIndex::c_24, 1, intermed_data_format}, // tmp_weight + {CBIndex::c_16, 1}, // output }); } else { CreateCircularBuffer( @@ -91,10 +91,10 @@ MorehNllLossStep1DeviceOperation::Factory::cached_program_t MorehNllLossStep1Dev all_cores, data_format, { - {CB::c_in0, 1, tt::DataFormat::Int32}, // target - {CB::c_in1, weight_num_tile}, // weight - {CB::c_intermed0, 1, intermed_data_format}, // tmp_weight - {CB::c_out0, 1}, // output + {CBIndex::c_0, 1, tt::DataFormat::Int32}, // target + {CBIndex::c_1, weight_num_tile}, // weight + {CBIndex::c_24, 1, intermed_data_format}, // tmp_weight + {CBIndex::c_16, 1}, // output }); } diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step2/device/kernels/moreh_nll_loss_step2_kernel.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step2/device/kernels/moreh_nll_loss_step2_kernel.cpp index e4534c71aaf..7d4359c5041 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step2/device/kernels/moreh_nll_loss_step2_kernel.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step2/device/kernels/moreh_nll_loss_step2_kernel.cpp @@ -10,16 +10,16 @@ namespace NAMESPACE { void MAIN { constexpr uint32_t per_core_tile_cnt = get_compile_time_arg_val(0); - constexpr uint32_t cb_weight = tt::CB::c_in2; - constexpr uint32_t cb_divisor = tt::CB::c_in3; + constexpr 
uint32_t cb_weight = tt::CBIndex::c_2; + constexpr uint32_t cb_divisor = tt::CBIndex::c_3; - constexpr uint32_t cb_tmp_weight = tt::CB::c_intermed0; - constexpr uint32_t cb_tmp_input = tt::CB::c_intermed1; - constexpr uint32_t cb_tmp1 = tt::CB::c_intermed2; - constexpr uint32_t cb_divisor_recip = tt::CB::c_intermed3; // 1/divisor - constexpr uint32_t cb_tmp3 = tt::CB::c_intermed4; + constexpr uint32_t cb_tmp_weight = tt::CBIndex::c_24; + constexpr uint32_t cb_tmp_input = tt::CBIndex::c_25; + constexpr uint32_t cb_tmp1 = tt::CBIndex::c_26; + constexpr uint32_t cb_divisor_recip = tt::CBIndex::c_27; // 1/divisor + constexpr uint32_t cb_tmp3 = tt::CBIndex::c_28; - constexpr uint32_t cb_output = tt::CB::c_out0; + constexpr uint32_t cb_output = tt::CBIndex::c_16; constexpr uint32_t dst0 = 0; constexpr uint32_t onetile = 1; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step2/device/kernels/reader_moreh_nll_loss_step2_2d.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step2/device/kernels/reader_moreh_nll_loss_step2_2d.cpp index e6adbec7529..ae136b1e1d0 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step2/device/kernels/reader_moreh_nll_loss_step2_2d.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step2/device/kernels/reader_moreh_nll_loss_step2_2d.cpp @@ -17,15 +17,15 @@ void kernel_main() { auto C = get_arg_val(i++); auto element_size = get_arg_val(i++); - constexpr uint32_t cb_input = tt::CB::c_in0; - constexpr uint32_t cb_target = tt::CB::c_in1; - constexpr uint32_t cb_weight = tt::CB::c_in2; - constexpr uint32_t cb_divisor = tt::CB::c_in3; + constexpr uint32_t cb_input = tt::CBIndex::c_0; + constexpr uint32_t cb_target = tt::CBIndex::c_1; + constexpr uint32_t cb_weight = tt::CBIndex::c_2; + constexpr uint32_t cb_divisor = tt::CBIndex::c_3; - constexpr uint32_t cb_tmp_weight = tt::CB::c_intermed0; - constexpr uint32_t cb_tmp_input = tt::CB::c_intermed1; + constexpr uint32_t 
cb_tmp_weight = tt::CBIndex::c_24; + constexpr uint32_t cb_tmp_input = tt::CBIndex::c_25; - constexpr uint32_t cb_output = tt::CB::c_out0; + constexpr uint32_t cb_output = tt::CBIndex::c_16; // ublocks size defined in tiles const uint32_t input_tile_bytes = get_tile_size(cb_input); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step2/device/kernels/reader_moreh_nll_loss_step2_3d.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step2/device/kernels/reader_moreh_nll_loss_step2_3d.cpp index a0ee0bbdbea..601c843543a 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step2/device/kernels/reader_moreh_nll_loss_step2_3d.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step2/device/kernels/reader_moreh_nll_loss_step2_3d.cpp @@ -18,15 +18,15 @@ void kernel_main() { auto W = get_arg_val(i++); auto element_size = get_arg_val(i++); - constexpr uint32_t cb_input = tt::CB::c_in0; - constexpr uint32_t cb_target = tt::CB::c_in1; - constexpr uint32_t cb_weight = tt::CB::c_in2; - constexpr uint32_t cb_divisor = tt::CB::c_in3; + constexpr uint32_t cb_input = tt::CBIndex::c_0; + constexpr uint32_t cb_target = tt::CBIndex::c_1; + constexpr uint32_t cb_weight = tt::CBIndex::c_2; + constexpr uint32_t cb_divisor = tt::CBIndex::c_3; - constexpr uint32_t cb_tmp_weight = tt::CB::c_intermed0; - constexpr uint32_t cb_tmp_input = tt::CB::c_intermed1; + constexpr uint32_t cb_tmp_weight = tt::CBIndex::c_24; + constexpr uint32_t cb_tmp_input = tt::CBIndex::c_25; - constexpr uint32_t cb_output = tt::CB::c_out0; + constexpr uint32_t cb_output = tt::CBIndex::c_16; // ublocks size defined in tiles const uint32_t input_tile_bytes = get_tile_size(cb_input); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step2/device/kernels/reader_moreh_nll_loss_step2_4d.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step2/device/kernels/reader_moreh_nll_loss_step2_4d.cpp index 
0dae1d69769..eb1f194dc72 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step2/device/kernels/reader_moreh_nll_loss_step2_4d.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step2/device/kernels/reader_moreh_nll_loss_step2_4d.cpp @@ -20,15 +20,15 @@ void kernel_main() { auto weight_num_tile = get_arg_val(i++); auto element_size = get_arg_val(i++); - constexpr uint32_t cb_input = tt::CB::c_in0; - constexpr uint32_t cb_target = tt::CB::c_in1; - constexpr uint32_t cb_weight = tt::CB::c_in2; - constexpr uint32_t cb_divisor = tt::CB::c_in3; + constexpr uint32_t cb_input = tt::CBIndex::c_0; + constexpr uint32_t cb_target = tt::CBIndex::c_1; + constexpr uint32_t cb_weight = tt::CBIndex::c_2; + constexpr uint32_t cb_divisor = tt::CBIndex::c_3; - constexpr uint32_t cb_tmp_weight = tt::CB::c_intermed0; - constexpr uint32_t cb_tmp_input = tt::CB::c_intermed1; + constexpr uint32_t cb_tmp_weight = tt::CBIndex::c_24; + constexpr uint32_t cb_tmp_input = tt::CBIndex::c_25; - constexpr uint32_t cb_output = tt::CB::c_out0; + constexpr uint32_t cb_output = tt::CBIndex::c_16; // ublocks size defined in tiles const uint32_t input_tile_bytes = get_tile_size(cb_input); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step2/device/kernels/writer_moreh_nll_loss_step2_2d.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step2/device/kernels/writer_moreh_nll_loss_step2_2d.cpp index ac0140d7df7..bd71edf703a 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step2/device/kernels/writer_moreh_nll_loss_step2_2d.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step2/device/kernels/writer_moreh_nll_loss_step2_2d.cpp @@ -9,7 +9,7 @@ void kernel_main() { auto num_tiles_per_core = get_arg_val(1); auto start_id = get_arg_val(2); - constexpr uint32_t cb_output = tt::CB::c_out0; + constexpr uint32_t cb_output = tt::CBIndex::c_16; const uint32_t output_tile_bytes = 
get_tile_size(cb_output); const auto output_data_format = get_dataformat(cb_output); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step2/device/kernels/writer_moreh_nll_loss_step2_3d.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step2/device/kernels/writer_moreh_nll_loss_step2_3d.cpp index e475e42140c..b781db3bee0 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step2/device/kernels/writer_moreh_nll_loss_step2_3d.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step2/device/kernels/writer_moreh_nll_loss_step2_3d.cpp @@ -13,7 +13,7 @@ void kernel_main() { auto W = get_arg_val(i++); auto element_size = get_arg_val(i++); - constexpr uint32_t cb_output = tt::CB::c_out0; + constexpr uint32_t cb_output = tt::CBIndex::c_16; constexpr bool output_is_dram = get_compile_time_arg_val(0) == 1; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step2/device/kernels/writer_moreh_nll_loss_step2_4d.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step2/device/kernels/writer_moreh_nll_loss_step2_4d.cpp index 1b55208d801..9379cae27be 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step2/device/kernels/writer_moreh_nll_loss_step2_4d.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step2/device/kernels/writer_moreh_nll_loss_step2_4d.cpp @@ -11,7 +11,7 @@ void kernel_main() { auto num_tiles_per_core = get_arg_val(i++); auto start_id = get_arg_val(i++); - constexpr uint32_t cb_output = tt::CB::c_out0; + constexpr uint32_t cb_output = tt::CBIndex::c_16; const auto output_data_format = get_dataformat(cb_output); const uint32_t output_tile_bytes = get_tile_size(cb_output); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step2/device/moreh_nll_loss_step2_device_operation.cpp 
b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step2/device/moreh_nll_loss_step2_device_operation.cpp index 61657a2dd7a..0f18410fd00 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step2/device/moreh_nll_loss_step2_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step2/device/moreh_nll_loss_step2_device_operation.cpp @@ -4,6 +4,8 @@ #include "moreh_nll_loss_step2_device_operation.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::moreh::moreh_nll_loss_step2 { MorehNllLossStep2DeviceOperation::program_factory_t MorehNllLossStep2DeviceOperation::select_program_factory( diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step2/device/moreh_nll_loss_step2_program_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step2/device/moreh_nll_loss_step2_program_factory.cpp index ec156a48c89..f3af30ab7f5 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step2/device/moreh_nll_loss_step2_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss/moreh_nll_loss_step2/device/moreh_nll_loss_step2_program_factory.cpp @@ -61,16 +61,16 @@ MorehNllLossStep2DeviceOperation::Factory::cached_program_t moreh_nll_loss_step2 all_cores, data_format, { - {CB::c_in0, 1}, // input - {CB::c_in1, 1, tt::DataFormat::Int32}, // target - {CB::c_in2, static_cast(weight_has_value ? 1 : 0)}, // weight - {CB::c_in3, static_cast(divisor_has_value ? 
1 : 0)}, // divisor - {CB::c_intermed0, 1, fp32_dest_acc_en_data_format}, // tmp_weight to reduce - {CB::c_intermed1, 1, fp32_dest_acc_en_data_format}, // tmp_input to reduce - {CB::c_intermed2, 1, fp32_dest_acc_en_data_format}, // tmp1 - {CB::c_intermed3, 1, fp32_dest_acc_en_data_format}, // tmp2 - {CB::c_intermed4, 1, fp32_dest_acc_en_data_format}, // tmp3 - {CB::c_out0, 1}, // output + {CBIndex::c_0, 1}, // input + {CBIndex::c_1, 1, tt::DataFormat::Int32}, // target + {CBIndex::c_2, static_cast(weight_has_value ? 1 : 0)}, // weight + {CBIndex::c_3, static_cast(divisor_has_value ? 1 : 0)}, // divisor + {CBIndex::c_24, 1, fp32_dest_acc_en_data_format}, // tmp_weight to reduce + {CBIndex::c_25, 1, fp32_dest_acc_en_data_format}, // tmp_input to reduce + {CBIndex::c_26, 1, fp32_dest_acc_en_data_format}, // tmp1 + {CBIndex::c_27, 1, fp32_dest_acc_en_data_format}, // tmp2 + {CBIndex::c_28, 1, fp32_dest_acc_en_data_format}, // tmp3 + {CBIndex::c_16, 1}, // output }); // create read/wrtie kernel @@ -240,16 +240,16 @@ MorehNllLossStep2DeviceOperation::Factory::cached_program_t moreh_nll_loss_step2 all_cores, data_format, { - {CB::c_in0, 1}, // input - {CB::c_in1, 1, tt::DataFormat::Int32}, // target - {CB::c_in2, static_cast(weight_has_value ? 1 : 0)}, // weight - {CB::c_in3, static_cast(divisor_has_value ? 1 : 0)}, // divisor - {CB::c_intermed0, 1, fp32_dest_acc_en_data_format}, // tmp_weight to reduce - {CB::c_intermed1, 1, fp32_dest_acc_en_data_format}, // tmp_input to reduce - {CB::c_intermed2, 1, fp32_dest_acc_en_data_format}, // tmp1 - {CB::c_intermed3, 1, fp32_dest_acc_en_data_format}, // tmp2 - {CB::c_intermed4, 1, fp32_dest_acc_en_data_format}, // tmp3 - {CB::c_out0, 1}, // output + {CBIndex::c_0, 1}, // input + {CBIndex::c_1, 1, tt::DataFormat::Int32}, // target + {CBIndex::c_2, static_cast(weight_has_value ? 1 : 0)}, // weight + {CBIndex::c_3, static_cast(divisor_has_value ? 
1 : 0)}, // divisor + {CBIndex::c_24, 1, fp32_dest_acc_en_data_format}, // tmp_weight to reduce + {CBIndex::c_25, 1, fp32_dest_acc_en_data_format}, // tmp_input to reduce + {CBIndex::c_26, 1, fp32_dest_acc_en_data_format}, // tmp1 + {CBIndex::c_27, 1, fp32_dest_acc_en_data_format}, // tmp2 + {CBIndex::c_28, 1, fp32_dest_acc_en_data_format}, // tmp3 + {CBIndex::c_16, 1}, // output }); // create read/wrtie kernel @@ -429,16 +429,16 @@ MorehNllLossStep2DeviceOperation::Factory::cached_program_t moreh_nll_loss_step2 all_cores, data_format, { - {CB::c_in0, 1}, // input - {CB::c_in1, 1, tt::DataFormat::Int32}, // target - {CB::c_in2, static_cast(weight_has_value ? weight_num_tile : 0)}, // weight - {CB::c_in3, static_cast(divisor_has_value ? 1 : 0)}, // divisor - {CB::c_intermed0, 1, fp32_dest_acc_en_data_format}, // tmp_weight to reduce - {CB::c_intermed1, 1, fp32_dest_acc_en_data_format}, // tmp_input to reduce - {CB::c_intermed2, 1, fp32_dest_acc_en_data_format}, // tmp1 - {CB::c_intermed3, 1, fp32_dest_acc_en_data_format}, // tmp2 - {CB::c_intermed4, 1, fp32_dest_acc_en_data_format}, // tmp3 - {CB::c_out0, 1}, // output + {CBIndex::c_0, 1}, // input + {CBIndex::c_1, 1, tt::DataFormat::Int32}, // target + {CBIndex::c_2, static_cast(weight_has_value ? weight_num_tile : 0)}, // weight + {CBIndex::c_3, static_cast(divisor_has_value ? 
1 : 0)}, // divisor + {CBIndex::c_24, 1, fp32_dest_acc_en_data_format}, // tmp_weight to reduce + {CBIndex::c_25, 1, fp32_dest_acc_en_data_format}, // tmp_input to reduce + {CBIndex::c_26, 1, fp32_dest_acc_en_data_format}, // tmp1 + {CBIndex::c_27, 1, fp32_dest_acc_en_data_format}, // tmp2 + {CBIndex::c_28, 1, fp32_dest_acc_en_data_format}, // tmp3 + {CBIndex::c_16, 1}, // output }); // create read/wrtie kernel diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss_backward/device/kernels/moreh_nll_loss_backward_kernel.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss_backward/device/kernels/moreh_nll_loss_backward_kernel.cpp index a6edea97a6d..74e6d030de4 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss_backward/device/kernels/moreh_nll_loss_backward_kernel.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss_backward/device/kernels/moreh_nll_loss_backward_kernel.cpp @@ -13,17 +13,17 @@ void MAIN { const uint32_t tile_offset = get_arg_val(1); - constexpr uint32_t cb_divisor = tt::CB::c_in3; - constexpr uint32_t cb_output_grad = tt::CB::c_in0; - constexpr uint32_t cb_tmp_weight = tt::CB::c_intermed0; - constexpr uint32_t cb_tmp1 = tt::CB::c_intermed1; - constexpr uint32_t cb_tmp2 = tt::CB::c_intermed2; - constexpr uint32_t cb_input_grad = tt::CB::c_out0; + constexpr uint32_t cb_divisor = tt::CBIndex::c_3; + constexpr uint32_t cb_output_grad = tt::CBIndex::c_0; + constexpr uint32_t cb_tmp_weight = tt::CBIndex::c_24; + constexpr uint32_t cb_tmp1 = tt::CBIndex::c_25; + constexpr uint32_t cb_tmp2 = tt::CBIndex::c_26; + constexpr uint32_t cb_input_grad = tt::CBIndex::c_16; constexpr uint32_t dst0 = 0; constexpr uint32_t onetile = 1; - init_sfpu(cb_output_grad); + init_sfpu(cb_output_grad, tt::CBIndex::c_16); #if defined(DIVISOR) cb_wait_front(cb_divisor, onetile); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss_backward/device/kernels/reader_moreh_nll_loss_backward_2d.cpp 
b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss_backward/device/kernels/reader_moreh_nll_loss_backward_2d.cpp index 4b1c5136b4d..5521fe3cddd 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss_backward/device/kernels/reader_moreh_nll_loss_backward_2d.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss_backward/device/kernels/reader_moreh_nll_loss_backward_2d.cpp @@ -17,12 +17,12 @@ void kernel_main() { auto weight_num_tile = get_arg_val(i++); auto element_size = get_arg_val(i++); - constexpr uint32_t cb_output_grad = tt::CB::c_in0; - constexpr uint32_t cb_target = tt::CB::c_in1; - constexpr uint32_t cb_weight = tt::CB::c_in2; - constexpr uint32_t cb_divisor = tt::CB::c_in3; + constexpr uint32_t cb_output_grad = tt::CBIndex::c_0; + constexpr uint32_t cb_target = tt::CBIndex::c_1; + constexpr uint32_t cb_weight = tt::CBIndex::c_2; + constexpr uint32_t cb_divisor = tt::CBIndex::c_3; - constexpr uint32_t cb_tmp_weight = tt::CB::c_intermed0; + constexpr uint32_t cb_tmp_weight = tt::CBIndex::c_24; // ublocks size defined in tiles const uint32_t weight_tile_bytes = get_tile_size(cb_weight); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss_backward/device/kernels/reader_moreh_nll_loss_backward_3d.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss_backward/device/kernels/reader_moreh_nll_loss_backward_3d.cpp index c0c3e92f002..915c05deefc 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss_backward/device/kernels/reader_moreh_nll_loss_backward_3d.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss_backward/device/kernels/reader_moreh_nll_loss_backward_3d.cpp @@ -18,12 +18,12 @@ void kernel_main() { auto weight_num_tile = get_arg_val(i++); auto element_size = get_arg_val(i++); - constexpr uint32_t cb_output_grad = tt::CB::c_in0; - constexpr uint32_t cb_target = tt::CB::c_in1; - constexpr uint32_t cb_weight = tt::CB::c_in2; - constexpr uint32_t cb_divisor = tt::CB::c_in3; + constexpr uint32_t cb_output_grad = tt::CBIndex::c_0; + constexpr 
uint32_t cb_target = tt::CBIndex::c_1; + constexpr uint32_t cb_weight = tt::CBIndex::c_2; + constexpr uint32_t cb_divisor = tt::CBIndex::c_3; - constexpr uint32_t cb_tmp_weight = tt::CB::c_intermed0; + constexpr uint32_t cb_tmp_weight = tt::CBIndex::c_24; // ublocks size defined in tiles const uint32_t weight_tile_bytes = get_tile_size(cb_weight); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss_backward/device/kernels/reader_moreh_nll_loss_backward_4d.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss_backward/device/kernels/reader_moreh_nll_loss_backward_4d.cpp index 7871aeb2d21..1b52d70f85f 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss_backward/device/kernels/reader_moreh_nll_loss_backward_4d.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss_backward/device/kernels/reader_moreh_nll_loss_backward_4d.cpp @@ -19,12 +19,12 @@ void kernel_main() { auto weight_num_tile = get_arg_val(i++); auto element_size = get_arg_val(i++); - constexpr uint32_t cb_output_grad = tt::CB::c_in0; - constexpr uint32_t cb_target = tt::CB::c_in1; - constexpr uint32_t cb_weight = tt::CB::c_in2; - constexpr uint32_t cb_divisor = tt::CB::c_in3; + constexpr uint32_t cb_output_grad = tt::CBIndex::c_0; + constexpr uint32_t cb_target = tt::CBIndex::c_1; + constexpr uint32_t cb_weight = tt::CBIndex::c_2; + constexpr uint32_t cb_divisor = tt::CBIndex::c_3; - constexpr uint32_t cb_tmp_weight = tt::CB::c_intermed0; + constexpr uint32_t cb_tmp_weight = tt::CBIndex::c_24; // ublocks size defined in tiles const uint32_t weight_tile_bytes = get_tile_size(cb_weight); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss_backward/device/kernels/writer_moreh_nll_loss_backward.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss_backward/device/kernels/writer_moreh_nll_loss_backward.cpp index 0669f261dcd..dc48a1e727d 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss_backward/device/kernels/writer_moreh_nll_loss_backward.cpp +++ 
b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss_backward/device/kernels/writer_moreh_nll_loss_backward.cpp @@ -10,7 +10,7 @@ void kernel_main() { auto num_tiles_per_core = get_arg_val(i++); auto start_id = get_arg_val(i++); - constexpr uint32_t cb_input_grad = tt::CB::c_out0; + constexpr uint32_t cb_input_grad = tt::CBIndex::c_16; const uint32_t input_grad_tile_bytes = get_tile_size(cb_input_grad); const auto input_grad_data_format = get_dataformat(cb_input_grad); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss_backward/device/moreh_nll_loss_backward_program_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss_backward/device/moreh_nll_loss_backward_program_factory.cpp index 2e321f93b53..2b42688b316 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss_backward/device/moreh_nll_loss_backward_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss_backward/device/moreh_nll_loss_backward_program_factory.cpp @@ -59,14 +59,14 @@ MorehNllLossBackwardDeviceOperation::Factory::cached_program_t moreh_nll_loss_ba all_cores, data_format, { - {tt::CB::c_in0, 1}, // output_grad - {tt::CB::c_in1, 1, tt::DataFormat::Int32}, // target - {tt::CB::c_in2, static_cast(weight_has_value ? weight_num_tile : 0)}, // weight - {tt::CB::c_in3, static_cast(divisor_has_value ? 1 : 0)}, // divisor - {tt::CB::c_intermed0, 1, fp32_dest_acc_en_data_format}, // tmp_weight - {tt::CB::c_intermed1, 1, fp32_dest_acc_en_data_format}, // tmp1 - {tt::CB::c_intermed2, 1, fp32_dest_acc_en_data_format}, // tmp2 - {tt::CB::c_out0, 1}, // input_grad + {tt::CBIndex::c_0, 1}, // output_grad + {tt::CBIndex::c_1, 1, tt::DataFormat::Int32}, // target + {tt::CBIndex::c_2, static_cast(weight_has_value ? weight_num_tile : 0)}, // weight + {tt::CBIndex::c_3, static_cast(divisor_has_value ? 
1 : 0)}, // divisor + {tt::CBIndex::c_24, 1, fp32_dest_acc_en_data_format}, // tmp_weight + {tt::CBIndex::c_25, 1, fp32_dest_acc_en_data_format}, // tmp1 + {tt::CBIndex::c_26, 1, fp32_dest_acc_en_data_format}, // tmp2 + {tt::CBIndex::c_16, 1}, // input_grad }); // create read/wrtie kernel @@ -235,14 +235,14 @@ MorehNllLossBackwardDeviceOperation::Factory::cached_program_t moreh_nll_loss_ba all_cores, data_format, { - {tt::CB::c_in0, 1}, // output_grad - {tt::CB::c_in1, 1, tt::DataFormat::Int32}, // target - {tt::CB::c_in2, static_cast(weight_has_value ? weight_num_tile : 0)}, // weight - {tt::CB::c_in3, static_cast(divisor_has_value ? 1 : 0)}, // divisor - {tt::CB::c_intermed0, 1, fp32_dest_acc_en_data_format}, // tmp_weight - {tt::CB::c_intermed1, 1, fp32_dest_acc_en_data_format}, // tmp1 - {tt::CB::c_intermed2, 1, fp32_dest_acc_en_data_format}, // tmp2 - {tt::CB::c_out0, 1}, // input_grad + {tt::CBIndex::c_0, 1}, // output_grad + {tt::CBIndex::c_1, 1, tt::DataFormat::Int32}, // target + {tt::CBIndex::c_2, static_cast(weight_has_value ? weight_num_tile : 0)}, // weight + {tt::CBIndex::c_3, static_cast(divisor_has_value ? 1 : 0)}, // divisor + {tt::CBIndex::c_24, 1, fp32_dest_acc_en_data_format}, // tmp_weight + {tt::CBIndex::c_25, 1, fp32_dest_acc_en_data_format}, // tmp1 + {tt::CBIndex::c_26, 1, fp32_dest_acc_en_data_format}, // tmp2 + {tt::CBIndex::c_16, 1}, // input_grad }); // create read/wrtie kernel @@ -409,14 +409,14 @@ MorehNllLossBackwardDeviceOperation::Factory::cached_program_t moreh_nll_loss_ba all_cores, data_format, { - {tt::CB::c_in0, 1}, // output_grad - {tt::CB::c_in1, 1, tt::DataFormat::Int32}, // target - {tt::CB::c_in2, static_cast(weight_has_value ? weight_num_tile : 0)}, // weight - {tt::CB::c_in3, static_cast(divisor_has_value ? 
1 : 0)}, // divisor - {tt::CB::c_intermed0, 1, fp32_dest_acc_en_data_format}, // tmp_weight - {tt::CB::c_intermed1, 1, fp32_dest_acc_en_data_format}, // tmp1 - {tt::CB::c_intermed2, 1, fp32_dest_acc_en_data_format}, // tmp2 - {tt::CB::c_out0, 1}, // input_grad + {tt::CBIndex::c_0, 1}, // output_grad + {tt::CBIndex::c_1, 1, tt::DataFormat::Int32}, // target + {tt::CBIndex::c_2, static_cast(weight_has_value ? weight_num_tile : 0)}, // weight + {tt::CBIndex::c_3, static_cast(divisor_has_value ? 1 : 0)}, // divisor + {tt::CBIndex::c_24, 1, fp32_dest_acc_en_data_format}, // tmp_weight + {tt::CBIndex::c_25, 1, fp32_dest_acc_en_data_format}, // tmp1 + {tt::CBIndex::c_26, 1, fp32_dest_acc_en_data_format}, // tmp2 + {tt::CBIndex::c_16, 1}, // input_grad }); // create read/wrtie kernel diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss_unreduced_backward/device/kernels/reader_moreh_nll_loss_unreduced_backward_2d.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss_unreduced_backward/device/kernels/reader_moreh_nll_loss_unreduced_backward_2d.cpp index 3f37d8d66b8..8974a9fad4d 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss_unreduced_backward/device/kernels/reader_moreh_nll_loss_unreduced_backward_2d.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss_unreduced_backward/device/kernels/reader_moreh_nll_loss_unreduced_backward_2d.cpp @@ -16,11 +16,11 @@ void kernel_main() { auto C = get_arg_val(i++); auto Ct = get_arg_val(i++); - constexpr uint32_t cb_target = tt::CB::c_in0; - constexpr uint32_t cb_output_grad = tt::CB::c_in1; - constexpr uint32_t cb_weight = tt::CB::c_in2; + constexpr uint32_t cb_target = tt::CBIndex::c_0; + constexpr uint32_t cb_output_grad = tt::CBIndex::c_1; + constexpr uint32_t cb_weight = tt::CBIndex::c_2; - constexpr uint32_t cb_input_grad = tt::CB::c_out0; + constexpr uint32_t cb_input_grad = tt::CBIndex::c_16; // ublocks size defined in tiles const uint32_t target_tile_bytes = get_tile_size(cb_target); diff --git 
a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss_unreduced_backward/device/kernels/reader_moreh_nll_loss_unreduced_backward_3d.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss_unreduced_backward/device/kernels/reader_moreh_nll_loss_unreduced_backward_3d.cpp index 90ce7c4c09e..c9a196c7302 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss_unreduced_backward/device/kernels/reader_moreh_nll_loss_unreduced_backward_3d.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss_unreduced_backward/device/kernels/reader_moreh_nll_loss_unreduced_backward_3d.cpp @@ -16,11 +16,11 @@ void kernel_main() { auto Ct = get_arg_val(i++); auto Wt = get_arg_val(i++); - constexpr uint32_t cb_target = tt::CB::c_in0; - constexpr uint32_t cb_output_grad = tt::CB::c_in1; - constexpr uint32_t cb_weight = tt::CB::c_in2; + constexpr uint32_t cb_target = tt::CBIndex::c_0; + constexpr uint32_t cb_output_grad = tt::CBIndex::c_1; + constexpr uint32_t cb_weight = tt::CBIndex::c_2; - constexpr uint32_t cb_input_grad = tt::CB::c_out0; + constexpr uint32_t cb_input_grad = tt::CBIndex::c_16; // ublocks size defined in tiles const uint32_t target_tile_bytes = get_tile_size(cb_target); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss_unreduced_backward/device/kernels/reader_moreh_nll_loss_unreduced_backward_4d.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss_unreduced_backward/device/kernels/reader_moreh_nll_loss_unreduced_backward_4d.cpp index 7eaaa6565d2..dfae0dedd82 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss_unreduced_backward/device/kernels/reader_moreh_nll_loss_unreduced_backward_4d.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss_unreduced_backward/device/kernels/reader_moreh_nll_loss_unreduced_backward_4d.cpp @@ -16,11 +16,11 @@ void kernel_main() { auto C = get_arg_val(i++); auto Ct = get_arg_val(i++); - constexpr uint32_t cb_target = tt::CB::c_in0; - constexpr uint32_t cb_output_grad = tt::CB::c_in1; - constexpr uint32_t cb_weight = tt::CB::c_in2; + 
constexpr uint32_t cb_target = tt::CBIndex::c_0; + constexpr uint32_t cb_output_grad = tt::CBIndex::c_1; + constexpr uint32_t cb_weight = tt::CBIndex::c_2; - constexpr uint32_t cb_input_grad = tt::CB::c_out0; + constexpr uint32_t cb_input_grad = tt::CBIndex::c_16; // ublocks size defined in tiles const uint32_t target_tile_bytes = get_tile_size(cb_target); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss_unreduced_backward/device/kernels/writer_moreh_nll_loss_unreduced_backward.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss_unreduced_backward/device/kernels/writer_moreh_nll_loss_unreduced_backward.cpp index 08b9968e36e..1711c8362b2 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss_unreduced_backward/device/kernels/writer_moreh_nll_loss_unreduced_backward.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss_unreduced_backward/device/kernels/writer_moreh_nll_loss_unreduced_backward.cpp @@ -10,7 +10,7 @@ void kernel_main() { auto num_tiles_per_core = get_arg_val(i++); auto start_id = get_arg_val(i++); - constexpr uint32_t cb_input_grad = tt::CB::c_out0; + constexpr uint32_t cb_input_grad = tt::CBIndex::c_16; const uint32_t input_grad_tile_bytes = get_tile_size(cb_input_grad); const auto input_grad_data_format = get_dataformat(cb_input_grad); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss_unreduced_backward/device/moreh_nll_loss_unreduced_backward_program_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss_unreduced_backward/device/moreh_nll_loss_unreduced_backward_program_factory.cpp index 60772b3af2c..a649123e763 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss_unreduced_backward/device/moreh_nll_loss_unreduced_backward_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_nll_loss_unreduced_backward/device/moreh_nll_loss_unreduced_backward_program_factory.cpp @@ -55,10 +55,10 @@ MorehNllLossUnreducedBackwardDeviceOperation::Factory::cached_program_t moreh_nl all_cores, data_format, { - {tt::CB::c_in0, 
1, tt::DataFormat::Int32}, // target - {tt::CB::c_in1, Nt}, // output_grad - {tt::CB::c_in2, static_cast(weight_has_value ? Ct : 0)}, // weight - {tt::CB::c_out0, 1}, // input_grad + {tt::CBIndex::c_0, 1, tt::DataFormat::Int32}, // target + {tt::CBIndex::c_1, Nt}, // output_grad + {tt::CBIndex::c_2, static_cast(weight_has_value ? Ct : 0)}, // weight + {tt::CBIndex::c_16, 1}, // input_grad }); // create read/wrtie kernel @@ -183,10 +183,10 @@ MorehNllLossUnreducedBackwardDeviceOperation::Factory::cached_program_t moreh_nl all_cores, data_format, { - {tt::CB::c_in0, 1, tt::DataFormat::Int32}, // target - {tt::CB::c_in1, 1}, // output_grad - {tt::CB::c_in2, static_cast(weight_has_value ? Ct : 0)}, // weight - {tt::CB::c_out0, 1}, // input_grad + {tt::CBIndex::c_0, 1, tt::DataFormat::Int32}, // target + {tt::CBIndex::c_1, 1}, // output_grad + {tt::CBIndex::c_2, static_cast(weight_has_value ? Ct : 0)}, // weight + {tt::CBIndex::c_16, 1}, // input_grad }); // create read/wrtie kernel @@ -310,10 +310,10 @@ MorehNllLossUnreducedBackwardDeviceOperation::Factory::cached_program_t moreh_nl all_cores, data_format, { - {tt::CB::c_in0, 1, tt::DataFormat::Int32}, // target - {tt::CB::c_in1, 1}, // output_grad - {tt::CB::c_in2, static_cast(weight_has_value ? Ct : 0)}, // weight - {tt::CB::c_out0, 1}, // input_grad + {tt::CBIndex::c_0, 1, tt::DataFormat::Int32}, // target + {tt::CBIndex::c_1, 1}, // output_grad + {tt::CBIndex::c_2, static_cast(weight_has_value ? 
Ct : 0)}, // weight + {tt::CBIndex::c_16, 1}, // input_grad }); // create read/wrtie kernel diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_device_operation.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_device_operation.cpp index ff20005b8d1..77e8b07b481 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_device_operation.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_device_operation.cpp @@ -108,12 +108,13 @@ MorehNormOperation::program_factory_t MorehNormOperation::select_program_factory const operation_attributes_t& operation_attributes, const tensor_args_t& tensor_args) { const auto dim = operation_attributes.dim; const auto input_rank = tensor_args.input.get_legacy_shape().rank(); + auto INF = std::numeric_limits::infinity(); if (dim == input_rank - 1) - return ProgramFactoryW{}; + return ProgramFactoryWOther{}; else if (dim == input_rank - 2) - return ProgramFactoryH{}; + return ProgramFactoryHOther{}; else - return ProgramFactoryOther{}; + return ProgramFactoryNCOther{}; } void MorehNormOperation::validate_on_program_cache_miss( diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_device_operation.hpp b/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_device_operation.hpp index ffa38825da0..8c83cbe8900 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_device_operation.hpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_device_operation.hpp @@ -48,11 +48,11 @@ struct MorehNormOperation { using shape_return_value_t = Shape; using tensor_return_value_t = Tensor; - DEFINE_PROGRAM_FACTORY(ProgramFactoryW) - DEFINE_PROGRAM_FACTORY(ProgramFactoryH) - DEFINE_PROGRAM_FACTORY(ProgramFactoryOther) + DEFINE_PROGRAM_FACTORY(ProgramFactoryWOther) + DEFINE_PROGRAM_FACTORY(ProgramFactoryHOther) + DEFINE_PROGRAM_FACTORY(ProgramFactoryNCOther) - using program_factory_t = std::variant; + using program_factory_t = 
std::variant; static void validate_inputs(const operation_attributes_t&, const tensor_args_t&); static program_factory_t select_program_factory(const operation_attributes_t&, const tensor_args_t&); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_h/kernels/moreh_norm_h_kernel.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_h/kernels/moreh_norm_h_kernel.cpp index 33918102522..d1559dac39b 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_h/kernels/moreh_norm_h_kernel.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_h/kernels/moreh_norm_h_kernel.cpp @@ -15,17 +15,17 @@ void MAIN { const auto recip_p = get_arg_val(i++); const bool recip_p_is_negative = get_arg_val(i++) == 1; - std::uint8_t input_id{tt::CB::c_in0}; + std::uint8_t input_id{tt::CBIndex::c_0}; const auto cb_x = input_id++; // input const auto cb_one = input_id++; // one const auto cb_decimal = input_id++; // decimal const auto cb_recip_p_decimal = input_id++; // recip_p_decimal const auto cb_mask_h = input_id++; // mask_h - std::uint8_t output_id{tt::CB::c_out0}; + std::uint8_t output_id{tt::CBIndex::c_16}; const auto cb_y = output_id++; // output - std::uint8_t intermed_id{tt::CB::c_intermed0}; + std::uint8_t intermed_id{tt::CBIndex::c_24}; const auto cb_tmp0 = intermed_id++; const auto cb_tmp1 = intermed_id++; const auto cb_tmp2 = intermed_id++; @@ -46,7 +46,7 @@ void MAIN { constexpr uint32_t dst0 = 0; constexpr uint32_t dst1 = 1; - binary_op_init_common(tt::CB::c_in0, tt::CB::c_in0); + binary_op_init_common(tt::CBIndex::c_0, tt::CBIndex::c_0); cb_wait_front(cb_one, onetile); // comes from the reader cb_wait_front(cb_decimal, onetile); // comes from the reader diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_other/kernels/moreh_norm_other_kernel.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_other/kernels/moreh_norm_other_kernel.cpp index f864c9b51ea..a8f26205e5e 100644 --- 
a/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_other/kernels/moreh_norm_other_kernel.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_other/kernels/moreh_norm_other_kernel.cpp @@ -14,16 +14,16 @@ void MAIN { const auto recip_p = get_arg_val(i++); const bool recip_p_is_negative = get_arg_val(i++) == 1; - std::uint8_t input_id{tt::CB::c_in0}; + std::uint8_t input_id{tt::CBIndex::c_0}; const auto cb_x = input_id++; // input const auto cb_one = input_id++; // one const auto cb_decimal = input_id++; // decimal const auto cb_recip_p_decimal = input_id++; // recip_p_decimal - std::uint8_t output_id{tt::CB::c_out0}; + std::uint8_t output_id{tt::CBIndex::c_16}; const auto cb_y = output_id++; // output - std::uint8_t intermed_id{tt::CB::c_intermed0}; + std::uint8_t intermed_id{tt::CBIndex::c_24}; const auto cb_tmp0 = intermed_id++; const auto cb_tmp1 = intermed_id++; const auto cb_tmp2 = intermed_id++; @@ -42,7 +42,7 @@ void MAIN { constexpr uint32_t dst0 = 0; constexpr uint32_t dst1 = 1; - binary_op_init_common(tt::CB::c_in0, tt::CB::c_in0); + binary_op_init_common(tt::CBIndex::c_0, tt::CBIndex::c_0); cb_wait_front(cb_one, onetile); // comes from the reader cb_wait_front(cb_decimal, onetile); // comes from the reader diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_w/kernels/moreh_norm_w_kernel.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_w/kernels/moreh_norm_w_kernel.cpp index 144774be26f..9212060930a 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_w/kernels/moreh_norm_w_kernel.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_w/kernels/moreh_norm_w_kernel.cpp @@ -15,17 +15,17 @@ void MAIN { const auto recip_p = get_arg_val(i++); const bool recip_p_is_negative = get_arg_val(i++) == 1; - std::uint8_t input_id{tt::CB::c_in0}; + std::uint8_t input_id{tt::CBIndex::c_0}; const auto cb_x = input_id++; // input const auto cb_one = input_id++; // one const 
auto cb_decimal = input_id++; // decimal const auto cb_recip_p_decimal = input_id++; // recip_p_decimal const auto cb_mask_w = input_id++; // mask_w - std::uint8_t output_id{tt::CB::c_out0}; + std::uint8_t output_id{tt::CBIndex::c_16}; const auto cb_y = output_id++; // output - std::uint8_t intermed_id{tt::CB::c_intermed0}; + std::uint8_t intermed_id{tt::CBIndex::c_24}; const auto cb_tmp0 = intermed_id++; const auto cb_tmp1 = intermed_id++; const auto cb_tmp2 = intermed_id++; @@ -46,7 +46,7 @@ void MAIN { constexpr uint32_t dst0 = 0; constexpr uint32_t dst1 = 1; - binary_op_init_common(tt::CB::c_in0, tt::CB::c_in0); + binary_op_init_common(tt::CBIndex::c_0, tt::CBIndex::c_0); cb_wait_front(cb_one, onetile); // comes from the reader cb_wait_front(cb_decimal, onetile); // comes from the reader diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/ord_other/moreh_norm_h/kernels/moreh_norm_h_kernel.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/ord_other/moreh_norm_h/kernels/moreh_norm_h_kernel.cpp new file mode 100644 index 00000000000..dc5646e129c --- /dev/null +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/ord_other/moreh_norm_h/kernels/moreh_norm_h_kernel.cpp @@ -0,0 +1,178 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 +#include "ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/moreh_common.hpp" + +namespace NAMESPACE { +void MAIN { + int i{0}; + const auto num_cols_per_core = get_arg_val(i++); + const auto Ht = get_arg_val(i++); + const auto origin_h = get_arg_val(i++); + + std::uint8_t input_id{tt::CB::c_in0}; + const auto cb_x = input_id++; // input + const auto cb_one = input_id++; // one + const auto cb_mask_h = input_id++; // mask_h + + std::uint8_t output_id{tt::CB::c_out0}; + const auto cb_y = output_id++; // output + + std::uint8_t intermed_id{tt::CB::c_intermed0}; + const auto cb_tmp0 = intermed_id++; + const auto cb_tmp1 = intermed_id++; + const auto cb_tmp2 = intermed_id++; + + const auto cb_val = cb_tmp0; // f(x) + const auto cb_cal = cb_tmp1; // calculate f(x) over dimension + const auto cb_reduce = cb_tmp2; // reduce f(x) + + constexpr uint32_t onetile = 1; + constexpr uint32_t dst0 = 0; + constexpr uint32_t dst1 = 1; + + binary_op_init_common(tt::CB::c_in0, tt::CB::c_in0); + + cb_wait_front(cb_one, onetile); // comes from the reader + + constexpr uint32_t TILE_H = 32; + const bool do_mask_h = (origin_h % TILE_H) != 0; + const auto mask_h = do_mask_h ? 
(origin_h % TILE_H) : TILE_H; + + if (do_mask_h) { + cb_wait_front(cb_mask_h, onetile); // comes from the reader + } + for (uint32_t col_idx = 0; col_idx < num_cols_per_core; ++col_idx) { + for (uint32_t row_idx = 0; row_idx < Ht; ++row_idx) { + // f(x) + tile_regs_acquire(); + cb_wait_front(cb_x, onetile); // comes from the reader + cb_reserve_back(cb_val, onetile); + + copy_tile_init_with_dt(cb_x); + copy_tile(cb_x, 0, dst0); + + if (do_mask_h && (row_idx == Ht - 1)) { + copy_tile_init_with_dt(cb_mask_h); + copy_tile(cb_mask_h, 0, dst1); + + mask_tile_init(); +#ifdef MINUS_INF + mask_posinf_tile(dst0, dst1); +#else + mask_tile(dst0, dst1); +#endif + } +#ifdef IS_ZERO + unary_ne_tile_init(); + unary_ne_tile(dst0, 0); +#else + abs_tile_init(); + abs_tile(dst0); +#endif + +#ifdef MINUS_INF + negative_tile_init(); + negative_tile(dst0); +#endif + tile_regs_commit(); + + tile_regs_wait(); + pack_tile_with_dt(dst0, cb_val); + tile_regs_release(); + + cb_pop_front(cb_x, onetile); + cb_push_back(cb_val, onetile); + + // calculate f(x) over dimension + if (row_idx == 0) { + tile_regs_acquire(); + cb_wait_front(cb_val, onetile); + cb_reserve_back(cb_cal, onetile); + + copy_tile_init_with_dt(cb_val); + copy_tile(cb_val, 0, dst0); + tile_regs_commit(); + + tile_regs_wait(); + pack_tile_with_dt(dst0, cb_cal); + tile_regs_release(); + + cb_pop_front(cb_val, onetile); + cb_push_back(cb_cal, onetile); + + } else { + tile_regs_acquire(); + cb_wait_front(cb_val, onetile); + cb_wait_front(cb_cal, onetile); + cb_reserve_back(cb_cal, onetile); +#ifdef IS_ZERO + add_tiles_init_with_dt(cb_val, cb_cal); + add_tiles(cb_val, cb_cal, 0, 0, dst0); +#else + copy_tile_init_with_dt(cb_val); + copy_tile(cb_val, 0, dst0); + + copy_tile_init_with_dt(cb_cal); + copy_tile(cb_cal, 0, dst1); + + max_tile_init(); + max_tile(dst0, dst1); +#endif + tile_regs_commit(); + + tile_regs_wait(); + pack_tile_with_dt(dst0, cb_cal); + tile_regs_release(); + + cb_pop_front(cb_val, onetile); + cb_pop_front(cb_cal, 
onetile); + cb_push_back(cb_cal, onetile); + } + } + // reduce f(x) + + tile_regs_acquire(); + cb_wait_front(cb_cal, onetile); + cb_reserve_back(cb_reduce, onetile); + + reduce_init_delta_with_dt(cb_reduce, cb_cal, cb_one); + reduce_tile(cb_cal, cb_one, 0, 0, dst0); + reduce_revert_delta(cb_reduce); + tile_regs_commit(); + + tile_regs_wait(); + pack_tile_with_dt(dst0, cb_reduce); + tile_regs_release(); + + cb_pop_front(cb_cal, onetile); + cb_push_back(cb_reduce, onetile); + + tile_regs_acquire(); + + cb_wait_front(cb_reduce, onetile); + cb_reserve_back(cb_y, onetile); + + copy_tile_init_with_dt(cb_reduce); + copy_tile(cb_reduce, 0, dst0); +#ifdef MINUS_INF + negative_tile_init(); + negative_tile(dst0); +#endif + tile_regs_commit(); + + tile_regs_wait(); + pack_tile_with_dt(dst0, cb_y); + tile_regs_release(); + + cb_pop_front(cb_reduce, onetile); + cb_push_back(cb_y, onetile); + } + + cb_pop_front(cb_one, onetile); + if (do_mask_h) { + cb_pop_front(cb_mask_h, onetile); + } + +} // void MAIN +} // namespace NAMESPACE diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_h/kernels/reader_moreh_norm_h.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/ord_other/moreh_norm_h/kernels/reader_moreh_norm_h.cpp similarity index 88% rename from ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_h/kernels/reader_moreh_norm_h.cpp rename to ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/ord_other/moreh_norm_h/kernels/reader_moreh_norm_h.cpp index af505064b3e..7ec747a04aa 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_h/kernels/reader_moreh_norm_h.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/ord_other/moreh_norm_h/kernels/reader_moreh_norm_h.cpp @@ -8,8 +8,6 @@ void kernel_main() { int i{0}; const auto input_addr = get_arg_val(i++); const bool input_is_dram = get_arg_val(i++) == 1; - const auto decimal = get_arg_val(i++); - const auto recip_p_decimal = get_arg_val(i++); const auto num_cols_per_core = 
get_arg_val(i++); const auto tile_offset = get_arg_val(i++); const auto Ht = get_arg_val(i++); @@ -19,8 +17,6 @@ void kernel_main() { uint32_t cb_id{0}; const auto cb_id_input = cb_id++; const auto cb_id_one = cb_id++; - const auto cb_id_decimal = cb_id++; - const auto cb_id_recip_p_decimal = cb_id++; const auto cb_id_mask_h = cb_id++; const uint32_t input_tile_bytes = get_tile_size(cb_id_input); @@ -35,8 +31,6 @@ void kernel_main() { Scalar one; one.f = 1.0f; fill_cb_with_value(cb_id_one, one.u); - fill_cb_with_value(cb_id_decimal, decimal); - fill_cb_with_value(cb_id_recip_p_decimal, recip_p_decimal); constexpr uint32_t TILE_H = 32; const bool do_mask_h = (origin_h % TILE_H) != 0; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_h/kernels/writer_moreh_norm_h.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/ord_other/moreh_norm_h/kernels/writer_moreh_norm_h.cpp similarity index 100% rename from ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_h/kernels/writer_moreh_norm_h.cpp rename to ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/ord_other/moreh_norm_h/kernels/writer_moreh_norm_h.cpp diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/ord_other/moreh_norm_nc/kernels/moreh_norm_nc_kernel.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/ord_other/moreh_norm_nc/kernels/moreh_norm_nc_kernel.cpp new file mode 100644 index 00000000000..fc4ad06d691 --- /dev/null +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/ord_other/moreh_norm_nc/kernels/moreh_norm_nc_kernel.cpp @@ -0,0 +1,135 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 +#include "ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/moreh_common.hpp" + +namespace NAMESPACE { +void MAIN { + int i{0}; + const auto num_output_tiles_per_core = get_arg_val(i++); + const auto num_reduced_tiles_along_dim = get_arg_val(i++); + + std::uint8_t input_id{tt::CB::c_in0}; + const auto cb_x = input_id++; // input + const auto cb_one = input_id++; // one + + std::uint8_t output_id{tt::CB::c_out0}; + const auto cb_y = output_id++; // output + + std::uint8_t intermed_id{tt::CB::c_intermed0}; + const auto cb_tmp0 = intermed_id++; + const auto cb_tmp1 = intermed_id++; + + const auto cb_val = cb_tmp0; // f(x) + const auto cb_cal = cb_tmp1; // calculate f(x) over dimensions + + constexpr uint32_t onetile = 1; + constexpr uint32_t dst0 = 0; + constexpr uint32_t dst1 = 1; + + binary_op_init_common(tt::CB::c_in0, tt::CB::c_in0); + + cb_wait_front(cb_one, onetile); // comes from the reader + + for (uint32_t outer_idx = 0; outer_idx < num_output_tiles_per_core; ++outer_idx) { + for (uint32_t inner_idx = 0; inner_idx < num_reduced_tiles_along_dim; ++inner_idx) { + // x != 0 + tile_regs_acquire(); + cb_wait_front(cb_x, onetile); // comes from the reader + cb_reserve_back(cb_val, onetile); + + copy_tile_init_with_dt(cb_x); + copy_tile(cb_x, 0, dst0); +#ifdef IS_ZERO + unary_ne_tile_init(); + unary_ne_tile(dst0, 0); +#else + abs_tile_init(); + abs_tile(dst0); +#endif + +#ifdef MINUS_INF + negative_tile_init(); + negative_tile(dst0); +#endif + tile_regs_commit(); + + tile_regs_wait(); + pack_tile_with_dt(dst0, cb_val); + tile_regs_release(); + + cb_pop_front(cb_x, onetile); + cb_push_back(cb_val, onetile); + + // Add(x != 0) + if (inner_idx == 0) { + tile_regs_acquire(); + cb_wait_front(cb_val, onetile); + cb_reserve_back(cb_cal, onetile); + + copy_tile_init_with_dt(cb_val); + copy_tile(cb_val, 0, dst0); + tile_regs_commit(); + + tile_regs_wait(); + pack_tile_with_dt(dst0, cb_cal); + tile_regs_release(); + + 
cb_pop_front(cb_val, onetile); + cb_push_back(cb_cal, onetile); + + } else { + tile_regs_acquire(); + cb_wait_front(cb_val, onetile); + cb_wait_front(cb_cal, onetile); + cb_reserve_back(cb_cal, onetile); +#ifdef IS_ZERO + add_tiles_init_with_dt(cb_val, cb_cal); + add_tiles(cb_val, cb_cal, 0, 0, dst0); +#else + copy_tile_init_with_dt(cb_val); + copy_tile(cb_val, 0, dst0); + + copy_tile_init_with_dt(cb_cal); + copy_tile(cb_cal, 0, dst1); + + max_tile_init(); + max_tile(dst0, dst1); +#endif + tile_regs_commit(); + + tile_regs_wait(); + pack_tile_with_dt(dst0, cb_cal); + tile_regs_release(); + + cb_pop_front(cb_val, onetile); + cb_pop_front(cb_cal, onetile); + cb_push_back(cb_cal, onetile); + } + } + + // Compute cb_y + tile_regs_acquire(); + + cb_wait_front(cb_cal, onetile); + cb_reserve_back(cb_y, onetile); + + copy_tile_init_with_dt(cb_cal); + copy_tile(cb_cal, 0, dst0); +#ifdef MINUS_INF + negative_tile_init(); + negative_tile(dst0); +#endif + tile_regs_commit(); + + tile_regs_wait(); + pack_tile_with_dt(dst0, cb_y); + tile_regs_release(); + + cb_pop_front(cb_cal, onetile); + cb_push_back(cb_y, onetile); + } + cb_pop_front(cb_one, onetile); + +} // void MAIN +} // namespace NAMESPACE diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_other/kernels/reader_moreh_norm_other.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/ord_other/moreh_norm_nc/kernels/reader_moreh_norm_nc.cpp similarity index 87% rename from ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_other/kernels/reader_moreh_norm_other.cpp rename to ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/ord_other/moreh_norm_nc/kernels/reader_moreh_norm_nc.cpp index f4d331b595c..d395c4ad46f 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_other/kernels/reader_moreh_norm_other.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/ord_other/moreh_norm_nc/kernels/reader_moreh_norm_nc.cpp @@ -8,8 +8,6 @@ void kernel_main() { int i{0}; const auto 
input_addr = get_arg_val(i++); const bool input_is_dram = get_arg_val(i++) == 1; - const auto decimal = get_arg_val(i++); - const auto recip_p_decimal = get_arg_val(i++); const auto num_output_tiles_per_core = get_arg_val(i++); const auto tile_offset = get_arg_val(i++); const auto outer_stride = get_arg_val(i++); @@ -19,8 +17,6 @@ void kernel_main() { uint32_t cb_id{0}; const auto cb_id_input = cb_id++; const auto cb_id_one = cb_id++; - const auto cb_id_decimal = cb_id++; - const auto cb_id_recip_p_decimal = cb_id++; const uint32_t input_tile_bytes = get_tile_size(cb_id_input); const auto input_data_format = get_dataformat(cb_id_input); @@ -34,8 +30,6 @@ void kernel_main() { Scalar one; one.f = 1.0f; fill_cb_with_value(cb_id_one, one.u); - fill_cb_with_value(cb_id_decimal, decimal); - fill_cb_with_value(cb_id_recip_p_decimal, recip_p_decimal); const auto input_l1_write_ptr = get_write_ptr(cb_id_input); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_other/kernels/writer_moreh_norm_other.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/ord_other/moreh_norm_nc/kernels/writer_moreh_norm_nc.cpp similarity index 100% rename from ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_other/kernels/writer_moreh_norm_other.cpp rename to ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/ord_other/moreh_norm_nc/kernels/writer_moreh_norm_nc.cpp diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_program_factory_h.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/ord_other/moreh_norm_program_factory_h_other.cpp similarity index 73% rename from ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_program_factory_h.cpp rename to ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/ord_other/moreh_norm_program_factory_h_other.cpp index 539dc87e425..0ea602953d2 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_program_factory_h.cpp +++ 
b/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/ord_other/moreh_norm_program_factory_h_other.cpp @@ -2,13 +2,13 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "moreh_norm_device_operation.hpp" #include "tt_metal/common/work_split.hpp" +#include "ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_device_operation.hpp" #include "ttnn/operations/moreh/moreh_helper_functions.hpp" namespace ttnn::operations::moreh::moreh_norm { -MorehNormOperation::ProgramFactoryH::cached_program_t MorehNormOperation::ProgramFactoryH::create( +MorehNormOperation::ProgramFactoryHOther::cached_program_t MorehNormOperation::ProgramFactoryHOther::create( const operation_attributes_t& operation_attributes, const tensor_args_t& tensor_args, tensor_return_value_t& output) { @@ -36,10 +36,6 @@ MorehNormOperation::ProgramFactoryH::cached_program_t MorehNormOperation::Progra const auto origin_h = input_shape.without_padding()[-2]; - auto [floored_p, decimal, p_is_negative] = get_floored_p_and_decimal_and_p_is_negative(p); - auto [floored_recip_p, recip_p_decimal, recip_p_is_negative] = - get_floored_p_and_decimal_and_p_is_negative(1.0f / p); - //////////////////////////////////////////////////////////////////////////// // Core Setup //////////////////////////////////////////////////////////////////////////// @@ -66,48 +62,36 @@ MorehNormOperation::ProgramFactoryH::cached_program_t MorehNormOperation::Progra const uint32_t in0_t{1}; // input const uint32_t in1_t{1}; // one - const uint32_t in2_t{1}; // decimal - const uint32_t in3_t{1}; // recip_p_decimal - const uint32_t in4_t{1}; // mask_h + const uint32_t in2_t{1}; // mask_h const uint32_t out0_t{1}; // output - const uint32_t im0_t{1}; // |x| - const uint32_t im1_t{1}; // log(|x|) - const uint32_t im2_t{1}; // exp(log(|x|) * decimal) - const uint32_t im3_t{1}; // |x|^p - const uint32_t im4_t{1}; // |x|^p * exp(log(|x|) * decimal) == |x + decimal|^p - const uint32_t im5_t{1}; // Add(|x + decimal|^p) - const uint32_t 
im6_t{1}; // Sum(|x + decimal|^p) + const uint32_t im0_t{1}; // f(x) + const uint32_t im1_t{1}; // calculate f(x) over dimension + const uint32_t im2_t{1}; // reduce f(x) CreateCircularBuffer( program, all_cores, cb_data_format, { - {tt::CB::c_in0, in0_t}, // input - {tt::CB::c_in1, in1_t}, // one - {tt::CB::c_in2, in2_t}, // decimal - {tt::CB::c_in3, in3_t}, // recip_p_decimal - {tt::CB::c_in4, in4_t}, // mask_h - {tt::CB::c_out0, out0_t}, // output - {tt::CB::c_intermed0, im0_t, intermed_data_format}, - {tt::CB::c_intermed1, im1_t, intermed_data_format}, - {tt::CB::c_intermed2, im2_t, intermed_data_format}, - {tt::CB::c_intermed3, im3_t, intermed_data_format}, - {tt::CB::c_intermed4, im4_t, intermed_data_format}, - {tt::CB::c_intermed5, im5_t, intermed_data_format}, - {tt::CB::c_intermed6, im6_t, intermed_data_format}, + {tt::CBIndex::c_0, in0_t}, // input + {tt::CBIndex::c_1, in1_t}, // one + {tt::CBIndex::c_2, in2_t}, // mask_h + {tt::CBIndex::c_16, out0_t}, // output + {tt::CBIndex::c_24, im0_t, intermed_data_format}, + {tt::CBIndex::c_25, im1_t, intermed_data_format}, + {tt::CBIndex::c_26, im2_t, intermed_data_format}, }); //////////////////////////////////////////////////////////////////////////// // DataMovementKernel SetUp //////////////////////////////////////////////////////////////////////////// const auto reader_kernel_file = - "ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_h/kernels/" + "ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/ord_other/moreh_norm_h/kernels/" "reader_moreh_norm_h.cpp"; const auto writer_kernel_file = - "ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_h/kernels/" + "ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/ord_other/moreh_norm_h/kernels/" "writer_moreh_norm_h.cpp"; const auto reader_kernels_id = CreateReadKernel(program, reader_kernel_file, all_cores); @@ -117,11 +101,18 @@ MorehNormOperation::ProgramFactoryH::cached_program_t MorehNormOperation::Progra // ComputeKernel SetUp 
//////////////////////////////////////////////////////////////////////////// std::map compute_defines{}; - compute_defines["REDUCE_OP"] = "PoolType::SUM"; compute_defines["REDUCE_DIM"] = "ReduceDim::REDUCE_COL"; + if (p == 0.0) { + compute_defines["REDUCE_OP"] = "PoolType::SUM"; + compute_defines["IS_ZERO"] = "1"; + } else { + compute_defines["REDUCE_OP"] = "PoolType::MAX"; + if (p == -std::numeric_limits::infinity()) + compute_defines["MINUS_INF"] = "1"; + } const auto compute_kernel_file = - "ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_h/kernels/" + "ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/ord_other/moreh_norm_h/kernels/" "moreh_norm_h_kernel.cpp"; const auto compute_kernels_id_1 = CreateComputeKernel( @@ -162,13 +153,10 @@ MorehNormOperation::ProgramFactoryH::cached_program_t MorehNormOperation::Progra } else { TT_THROW("Core not in specified core ranges."); } - // reader const std::vector reader_runtime_args{ input.buffer()->address(), static_cast(is_dram(input)), - *reinterpret_cast(&decimal), - *reinterpret_cast(&recip_p_decimal), num_cols_per_core, tile_offset, Ht, @@ -178,21 +166,11 @@ MorehNormOperation::ProgramFactoryH::cached_program_t MorehNormOperation::Progra // writer const std::vector writer_runtime_args{ - output.buffer()->address(), - static_cast(is_dram(output)), - num_cols_per_core, - tile_offset}; + output.buffer()->address(), static_cast(is_dram(output)), num_cols_per_core, tile_offset}; SetRuntimeArgs(program, writer_kernels_id, core, writer_runtime_args); // compute - const std::vector compute_runtime_args{ - num_cols_per_core, - Ht, - origin_h, - floored_p, - static_cast(p_is_negative), - floored_recip_p, - static_cast(recip_p_is_negative)}; + const std::vector compute_runtime_args{num_cols_per_core, Ht, origin_h}; SetRuntimeArgs(program, compute_kernel_id, core, compute_runtime_args); tile_offset += num_cols_per_core; @@ -201,7 +179,7 @@ MorehNormOperation::ProgramFactoryH::cached_program_t 
MorehNormOperation::Progra return {std::move(program), {reader_kernels_id, writer_kernels_id, num_cores_to_be_used, num_cores_y}}; } -void MorehNormOperation::ProgramFactoryH::override_runtime_arguments( +void MorehNormOperation::ProgramFactoryHOther::override_runtime_arguments( cached_program_t& cached_program, const operation_attributes_t& operation_attributes, const tensor_args_t& tensor_args, diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_program_factory_other.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/ord_other/moreh_norm_program_factory_nc_other.cpp similarity index 76% rename from ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_program_factory_other.cpp rename to ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/ord_other/moreh_norm_program_factory_nc_other.cpp index 2dc4a3a71d5..67db142f326 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_program_factory_other.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/ord_other/moreh_norm_program_factory_nc_other.cpp @@ -2,19 +2,19 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "moreh_norm_device_operation.hpp" #include "tt_metal/common/work_split.hpp" +#include "ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_device_operation.hpp" #include "ttnn/operations/moreh/moreh_helper_functions.hpp" namespace ttnn::operations::moreh::moreh_norm { -MorehNormOperation::ProgramFactoryOther::cached_program_t MorehNormOperation::ProgramFactoryOther::create( +MorehNormOperation::ProgramFactoryNCOther::cached_program_t MorehNormOperation::ProgramFactoryNCOther::create( const operation_attributes_t& operation_attributes, const tensor_args_t& tensor_args, tensor_return_value_t& output) { const auto& input = tensor_args.input; - const auto p = operation_attributes.p; const auto dim = operation_attributes.dim; + const auto p = operation_attributes.p; //////////////////////////////////////////////////////////////////////////// // Device 
Setup //////////////////////////////////////////////////////////////////////////// @@ -48,10 +48,6 @@ MorehNormOperation::ProgramFactoryOther::cached_program_t MorehNormOperation::Pr } num_inner_tiles /= tt::constants::TILE_HW; - auto [floored_p, decimal, p_is_negative] = get_floored_p_and_decimal_and_p_is_negative(p); - auto [floored_recip_p, recip_p_decimal, recip_p_is_negative] = - get_floored_p_and_decimal_and_p_is_negative(1.0f / p); - //////////////////////////////////////////////////////////////////////////// // Core Setup //////////////////////////////////////////////////////////////////////////// @@ -78,45 +74,33 @@ MorehNormOperation::ProgramFactoryOther::cached_program_t MorehNormOperation::Pr const uint32_t in0_t{1}; // input const uint32_t in1_t{1}; // one - const uint32_t in2_t{1}; // decimal - const uint32_t in3_t{1}; // recip_p_decimal const uint32_t out0_t{1}; // output - const uint32_t im0_t{1}; // |x| - const uint32_t im1_t{1}; // log(|x|) - const uint32_t im2_t{1}; // exp(log(|x|) * decimal) - const uint32_t im3_t{1}; // |x|^p - const uint32_t im4_t{1}; // |x|^p * exp(log(|x|) * decimal) == |x + decimal|^p - const uint32_t im5_t{1}; // Add(|x + decimal|^p) + const uint32_t im0_t{1}; // f(x) + const uint32_t im1_t{1}; // calculate f(x) over dimensions CreateCircularBuffer( program, all_cores, cb_data_format, { - {tt::CB::c_in0, in0_t}, // input - {tt::CB::c_in1, in1_t}, // one - {tt::CB::c_in2, in2_t}, // decimal - {tt::CB::c_in3, in3_t}, // recip_p_decimal - {tt::CB::c_out0, out0_t}, // output - {tt::CB::c_intermed0, im0_t, intermed_data_format}, - {tt::CB::c_intermed1, im1_t, intermed_data_format}, - {tt::CB::c_intermed2, im2_t, intermed_data_format}, - {tt::CB::c_intermed3, im3_t, intermed_data_format}, - {tt::CB::c_intermed4, im4_t, intermed_data_format}, - {tt::CB::c_intermed5, im5_t, intermed_data_format}, + {tt::CBIndex::c_0, in0_t}, // input + {tt::CBIndex::c_1, in1_t}, // one + {tt::CBIndex::c_16, out0_t}, // output + {tt::CBIndex::c_24, 
im0_t, intermed_data_format}, + {tt::CBIndex::c_25, im1_t, intermed_data_format}, }); //////////////////////////////////////////////////////////////////////////// // DataMovementKernel SetUp //////////////////////////////////////////////////////////////////////////// const auto reader_kernel_file = - "ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_other/kernels/" - "reader_moreh_norm_other.cpp"; + "ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/ord_other/moreh_norm_nc/kernels/" + "reader_moreh_norm_nc.cpp"; const auto writer_kernel_file = - "ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_other/kernels/" - "writer_moreh_norm_other.cpp"; + "ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/ord_other/moreh_norm_nc/kernels/" + "writer_moreh_norm_nc.cpp"; const auto reader_kernels_id = CreateReadKernel(program, reader_kernel_file, all_cores); const auto writer_kernels_id = CreateWriteKernel(program, writer_kernel_file, all_cores); @@ -125,10 +109,16 @@ MorehNormOperation::ProgramFactoryOther::cached_program_t MorehNormOperation::Pr // ComputeKernel SetUp //////////////////////////////////////////////////////////////////////////// std::map compute_defines{}; + if (p == 0.0) { + compute_defines["IS_ZERO"] = "1"; + } else { + if (p == -std::numeric_limits::infinity()) + compute_defines["MINUS_INF"] = "1"; + } const auto compute_kernel_file = - "ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_other/kernels/" - "moreh_norm_other_kernel.cpp"; + "ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/ord_other/moreh_norm_nc/kernels/" + "moreh_norm_nc_kernel.cpp"; const auto compute_kernels_id_1 = CreateComputeKernel( program, @@ -168,13 +158,10 @@ MorehNormOperation::ProgramFactoryOther::cached_program_t MorehNormOperation::Pr } else { TT_THROW("Core not in specified core ranges."); } - // reader const std::vector reader_runtime_args{ input.buffer()->address(), static_cast(is_dram(input)), - *reinterpret_cast(&decimal), - 
*reinterpret_cast(&recip_p_decimal), num_output_tiles_per_core, tile_offset, outer_stride, @@ -184,20 +171,14 @@ MorehNormOperation::ProgramFactoryOther::cached_program_t MorehNormOperation::Pr // writer const std::vector writer_runtime_args{ - output.buffer()->address(), - static_cast(is_dram(output)), - num_output_tiles_per_core, - tile_offset}; + output.buffer()->address(), static_cast(is_dram(output)), num_output_tiles_per_core, tile_offset}; SetRuntimeArgs(program, writer_kernels_id, core, writer_runtime_args); // compute const std::vector compute_runtime_args{ num_output_tiles_per_core, num_reduced_tiles_along_dim, - floored_p, - static_cast(p_is_negative), - floored_recip_p, - static_cast(recip_p_is_negative)}; + }; SetRuntimeArgs(program, compute_kernel_id, core, compute_runtime_args); tile_offset += num_output_tiles_per_core; @@ -206,7 +187,7 @@ MorehNormOperation::ProgramFactoryOther::cached_program_t MorehNormOperation::Pr return {std::move(program), {reader_kernels_id, writer_kernels_id, num_cores_to_be_used, num_cores_y}}; } -void MorehNormOperation::ProgramFactoryOther::override_runtime_arguments( +void MorehNormOperation::ProgramFactoryNCOther::override_runtime_arguments( cached_program_t& cached_program, const operation_attributes_t& operation_attributes, const tensor_args_t& tensor_args, diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_program_factory_w.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/ord_other/moreh_norm_program_factory_w_other.cpp similarity index 74% rename from ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_program_factory_w.cpp rename to ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/ord_other/moreh_norm_program_factory_w_other.cpp index 460af4e7801..27047884d93 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_program_factory_w.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/ord_other/moreh_norm_program_factory_w_other.cpp @@ -2,13 +2,15 @@ // // 
SPDX-License-Identifier: Apache-2.0 -#include "moreh_norm_device_operation.hpp" +#include + #include "tt_metal/common/work_split.hpp" +#include "ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_device_operation.hpp" #include "ttnn/operations/moreh/moreh_helper_functions.hpp" namespace ttnn::operations::moreh::moreh_norm { -MorehNormOperation::ProgramFactoryW::cached_program_t MorehNormOperation::ProgramFactoryW::create( +MorehNormOperation::ProgramFactoryWOther::cached_program_t MorehNormOperation::ProgramFactoryWOther::create( const operation_attributes_t& operation_attributes, const tensor_args_t& tensor_args, tensor_return_value_t& output) { @@ -36,10 +38,6 @@ MorehNormOperation::ProgramFactoryW::cached_program_t MorehNormOperation::Progra const auto origin_w = input_shape.without_padding()[input_rank - 1]; - auto [floored_p, decimal, p_is_negative] = get_floored_p_and_decimal_and_p_is_negative(p); - auto [floored_recip_p, recip_p_decimal, recip_p_is_negative] = - get_floored_p_and_decimal_and_p_is_negative(1.0f / p); - //////////////////////////////////////////////////////////////////////////// // Core Setup //////////////////////////////////////////////////////////////////////////// @@ -66,48 +64,36 @@ MorehNormOperation::ProgramFactoryW::cached_program_t MorehNormOperation::Progra const uint32_t in0_t{1}; // input const uint32_t in1_t{1}; // one - const uint32_t in2_t{1}; // decimal - const uint32_t in3_t{1}; // recip_p_decimal - const uint32_t in4_t{1}; // mask_w + const uint32_t in2_t{1}; // mask_w const uint32_t out0_t{1}; // output - const uint32_t im0_t{1}; // |x| - const uint32_t im1_t{1}; // log(|x|) - const uint32_t im2_t{1}; // exp(log(|x|) * decimal) - const uint32_t im3_t{1}; // |x|^p - const uint32_t im4_t{1}; // |x|^p * exp(log(|x|) * decimal) == |x + decimal|^p - const uint32_t im5_t{1}; // Add(|x + decimal|^p) - const uint32_t im6_t{1}; // Sum(|x + decimal|^p) + const uint32_t im0_t{1}; // f(x) + const uint32_t im1_t{1}; // calculate 
f(x) over dimension + const uint32_t im2_t{1}; // reduce f(x) CreateCircularBuffer( program, all_cores, cb_data_format, { - {tt::CB::c_in0, in0_t}, // input - {tt::CB::c_in1, in1_t}, // one - {tt::CB::c_in2, in2_t}, // decimal - {tt::CB::c_in3, in3_t}, // recip_p_decimal - {tt::CB::c_in4, in4_t}, // mask_w - {tt::CB::c_out0, out0_t}, // output - {tt::CB::c_intermed0, im0_t, intermed_data_format}, - {tt::CB::c_intermed1, im1_t, intermed_data_format}, - {tt::CB::c_intermed2, im2_t, intermed_data_format}, - {tt::CB::c_intermed3, im3_t, intermed_data_format}, - {tt::CB::c_intermed4, im4_t, intermed_data_format}, - {tt::CB::c_intermed5, im5_t, intermed_data_format}, - {tt::CB::c_intermed6, im6_t, intermed_data_format}, + {tt::CBIndex::c_0, in0_t}, // input + {tt::CBIndex::c_1, in1_t}, // one + {tt::CBIndex::c_2, in2_t}, // mask_w + {tt::CBIndex::c_16, out0_t}, // output + {tt::CBIndex::c_24, im0_t, intermed_data_format}, + {tt::CBIndex::c_25, im1_t, intermed_data_format}, + {tt::CBIndex::c_26, im2_t, intermed_data_format}, }); //////////////////////////////////////////////////////////////////////////// // DataMovementKernel SetUp //////////////////////////////////////////////////////////////////////////// const auto reader_kernel_file = - "ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_w/kernels/" + "ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/ord_other/moreh_norm_w/kernels/" "reader_moreh_norm_w.cpp"; const auto writer_kernel_file = - "ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_w/kernels/" + "ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/ord_other/moreh_norm_w/kernels/" "writer_moreh_norm_w.cpp"; const auto reader_kernels_id = CreateReadKernel(program, reader_kernel_file, all_cores); @@ -117,11 +103,19 @@ MorehNormOperation::ProgramFactoryW::cached_program_t MorehNormOperation::Progra // ComputeKernel SetUp //////////////////////////////////////////////////////////////////////////// std::map compute_defines{}; - 
compute_defines["REDUCE_OP"] = "PoolType::SUM"; + compute_defines["REDUCE_DIM"] = "ReduceDim::REDUCE_ROW"; + if (p == 0.0) { + compute_defines["REDUCE_OP"] = "PoolType::SUM"; + compute_defines["IS_ZERO"] = "1"; + } else { + compute_defines["REDUCE_OP"] = "PoolType::MAX"; + if (p == -std::numeric_limits::infinity()) + compute_defines["MINUS_INF"] = "1"; + } const auto compute_kernel_file = - "ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_w/kernels/" + "ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/ord_other/moreh_norm_w/kernels/" "moreh_norm_w_kernel.cpp"; const auto compute_kernels_id_1 = CreateComputeKernel( @@ -162,13 +156,10 @@ MorehNormOperation::ProgramFactoryW::cached_program_t MorehNormOperation::Progra } else { TT_THROW("Core not in specified core ranges."); } - // reader const std::vector reader_runtime_args{ input.buffer()->address(), static_cast(is_dram(input)), - *reinterpret_cast(&decimal), - *reinterpret_cast(&recip_p_decimal), num_units_per_core, Wt, tile_offset, @@ -177,11 +168,7 @@ MorehNormOperation::ProgramFactoryW::cached_program_t MorehNormOperation::Progra // writer const std::vector writer_runtime_args{ - output.buffer()->address(), - static_cast(is_dram(output)), - num_units_per_core, - Wt, - tile_offset}; + output.buffer()->address(), static_cast(is_dram(output)), num_units_per_core, Wt, tile_offset}; SetRuntimeArgs(program, writer_kernels_id, core, writer_runtime_args); // compute @@ -189,10 +176,7 @@ MorehNormOperation::ProgramFactoryW::cached_program_t MorehNormOperation::Progra num_units_per_core, Wt, origin_w, - floored_p, - static_cast(p_is_negative), - floored_recip_p, - static_cast(recip_p_is_negative)}; + }; SetRuntimeArgs(program, compute_kernel_id, core, compute_runtime_args); tile_offset += num_units_per_core * Wt; @@ -201,7 +185,7 @@ MorehNormOperation::ProgramFactoryW::cached_program_t MorehNormOperation::Progra return {std::move(program), {reader_kernels_id, writer_kernels_id, num_cores_to_be_used, 
num_cores_y}}; } -void MorehNormOperation::ProgramFactoryW::override_runtime_arguments( +void MorehNormOperation::ProgramFactoryWOther::override_runtime_arguments( cached_program_t& cached_program, const operation_attributes_t& operation_attributes, const tensor_args_t& tensor_args, diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/ord_other/moreh_norm_w/kernels/moreh_norm_w_kernel.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/ord_other/moreh_norm_w/kernels/moreh_norm_w_kernel.cpp new file mode 100644 index 00000000000..a0825fa33f2 --- /dev/null +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/ord_other/moreh_norm_w/kernels/moreh_norm_w_kernel.cpp @@ -0,0 +1,176 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 +#include "ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/moreh_common.hpp" + +namespace NAMESPACE { +void MAIN { + int i{0}; + const auto num_rows_per_core = get_arg_val(i++); + const auto Wt = get_arg_val(i++); + const auto origin_w = get_arg_val(i++); + + std::uint8_t input_id{tt::CB::c_in0}; + const auto cb_x = input_id++; // input + const auto cb_one = input_id++; // one + const auto cb_mask_w = input_id++; // mask_w + + std::uint8_t output_id{tt::CB::c_out0}; + const auto cb_y = output_id++; // output + + std::uint8_t intermed_id{tt::CB::c_intermed0}; + const auto cb_tmp0 = intermed_id++; + const auto cb_tmp1 = intermed_id++; + const auto cb_tmp2 = intermed_id++; + + const auto cb_val = cb_tmp0; // f(x) + const auto cb_cal = cb_tmp1; // calculate f(x) over dimension + const auto cb_reduce = cb_tmp2; // reduce f(x) + + constexpr uint32_t onetile = 1; + constexpr uint32_t dst0 = 0; + constexpr uint32_t dst1 = 1; + + binary_op_init_common(tt::CB::c_in0, tt::CB::c_in0); + + cb_wait_front(cb_one, onetile); // comes from the reader + + constexpr uint32_t TILE_W = 32; + const bool do_mask_w = (origin_w % TILE_W) != 0; + const auto mask_w = do_mask_w ? 
(origin_w % TILE_W) : TILE_W; + + if (do_mask_w) { + cb_wait_front(cb_mask_w, onetile); // comes from the reader + } + + for (uint32_t row_idx = 0; row_idx < num_rows_per_core; ++row_idx) { + for (uint32_t col_idx = 0; col_idx < Wt; ++col_idx) { + // f(x) + tile_regs_acquire(); + cb_wait_front(cb_x, onetile); // comes from the reader + cb_reserve_back(cb_val, onetile); + + copy_tile_init_with_dt(cb_x); + copy_tile(cb_x, 0, dst0); + + if (do_mask_w && (col_idx == Wt - 1)) { + copy_tile_init_with_dt(cb_mask_w); + copy_tile(cb_mask_w, 0, dst1); + mask_tile_init(); +#ifdef MINUS_INF + mask_posinf_tile(dst0, dst1); +#else + mask_tile(dst0, dst1); +#endif + } +#ifdef IS_ZERO + unary_ne_tile_init(); + unary_ne_tile(dst0, 0); +#else + abs_tile_init(); + abs_tile(dst0); +#endif + +#ifdef MINUS_INF + negative_tile_init(); + negative_tile(dst0); +#endif + tile_regs_commit(); + + tile_regs_wait(); + pack_tile_with_dt(dst0, cb_val); + tile_regs_release(); + + cb_pop_front(cb_x, onetile); + cb_push_back(cb_val, onetile); + + // calculate f(x) over dimension + if (col_idx == 0) { + tile_regs_acquire(); + cb_wait_front(cb_val, onetile); + cb_reserve_back(cb_cal, onetile); + + copy_tile_init_with_dt(cb_val); + copy_tile(cb_val, 0, dst0); + tile_regs_commit(); + + tile_regs_wait(); + pack_tile_with_dt(dst0, cb_cal); + tile_regs_release(); + + cb_pop_front(cb_val, onetile); + cb_push_back(cb_cal, onetile); + } else { + tile_regs_acquire(); + cb_wait_front(cb_val, onetile); + cb_wait_front(cb_cal, onetile); + cb_reserve_back(cb_cal, onetile); +#ifdef IS_ZERO + add_tiles_init_with_dt(cb_val, cb_cal); + add_tiles(cb_val, cb_cal, 0, 0, dst0); +#else + copy_tile_init_with_dt(cb_val); + copy_tile(cb_val, 0, dst0); + + copy_tile_init_with_dt(cb_cal); + copy_tile(cb_cal, 0, dst1); + + max_tile_init(); + max_tile(dst0, dst1); +#endif + tile_regs_commit(); + + tile_regs_wait(); + pack_tile_with_dt(dst0, cb_cal); + tile_regs_release(); + + cb_pop_front(cb_val, onetile); + cb_pop_front(cb_cal, 
onetile); + cb_push_back(cb_cal, onetile); + } + } + // reduce f(x) + tile_regs_acquire(); + cb_wait_front(cb_cal, onetile); + cb_reserve_back(cb_reduce, onetile); + + reduce_init_delta_with_dt(cb_reduce, cb_cal, cb_one); + reduce_tile(cb_cal, cb_one, 0, 0, dst0); + reduce_revert_delta(cb_reduce); + tile_regs_commit(); + + tile_regs_wait(); + pack_tile_with_dt(dst0, cb_reduce); + tile_regs_release(); + + cb_pop_front(cb_cal, onetile); + cb_push_back(cb_reduce, onetile); + + tile_regs_acquire(); + + cb_wait_front(cb_reduce, onetile); + cb_reserve_back(cb_y, onetile); + + copy_tile_init_with_dt(cb_reduce); + copy_tile(cb_reduce, 0, dst0); +#ifdef MINUS_INF + negative_tile_init(); + negative_tile(dst0); +#endif + tile_regs_commit(); + + tile_regs_wait(); + pack_tile_with_dt(dst0, cb_y); + tile_regs_release(); + + cb_pop_front(cb_reduce, onetile); + cb_push_back(cb_y, onetile); + } + + cb_pop_front(cb_one, onetile); + if (do_mask_w) { + cb_pop_front(cb_mask_w, onetile); + } + +} // void MAIN +} // namespace NAMESPACE diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_w/kernels/reader_moreh_norm_w.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/ord_other/moreh_norm_w/kernels/reader_moreh_norm_w.cpp similarity index 87% rename from ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_w/kernels/reader_moreh_norm_w.cpp rename to ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/ord_other/moreh_norm_w/kernels/reader_moreh_norm_w.cpp index 6c52d933cae..158e7a598b4 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_w/kernels/reader_moreh_norm_w.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/ord_other/moreh_norm_w/kernels/reader_moreh_norm_w.cpp @@ -8,8 +8,6 @@ void kernel_main() { int i{0}; const auto input_addr = get_arg_val(i++); const bool input_is_dram = get_arg_val(i++) == 1; - const auto decimal = get_arg_val(i++); - const auto recip_p_decimal = get_arg_val(i++); const auto num_rows_per_core = 
get_arg_val(i++); const auto Wt = get_arg_val(i++); const auto tile_offset = get_arg_val(i++); @@ -18,8 +16,6 @@ void kernel_main() { uint32_t cb_id{0}; const auto cb_id_input = cb_id++; const auto cb_id_one = cb_id++; - const auto cb_id_decimal = cb_id++; - const auto cb_id_recip_p_decimal = cb_id++; const auto cb_id_mask_w = cb_id++; const uint32_t input_tile_bytes = get_tile_size(cb_id_input); @@ -34,8 +30,6 @@ void kernel_main() { Scalar one; one.f = 1.0f; fill_cb_with_value(cb_id_one, one.u); - fill_cb_with_value(cb_id_decimal, decimal); - fill_cb_with_value(cb_id_recip_p_decimal, recip_p_decimal); constexpr uint32_t TILE_W = 32; const bool do_mask_w = (origin_w % TILE_W) != 0; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_w/kernels/writer_moreh_norm_w.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/ord_other/moreh_norm_w/kernels/writer_moreh_norm_w.cpp similarity index 100% rename from ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/moreh_norm_w/kernels/writer_moreh_norm_w.cpp rename to ttnn/cpp/ttnn/operations/moreh/moreh_norm/device/ord_other/moreh_norm_w/kernels/writer_moreh_norm_w.cpp diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_norm/moreh_norm.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_norm/moreh_norm.cpp index 19f40d6dc71..68b6688b77d 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_norm/moreh_norm.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_norm/moreh_norm.cpp @@ -5,6 +5,8 @@ #include "moreh_norm.hpp" #include "device/moreh_norm_device_operation.hpp" +#include "ttnn/cpp/ttnn/operations/moreh/moreh_abs_pow/moreh_abs_pow.hpp" +#include "ttnn/cpp/ttnn/operations/moreh/moreh_sum/moreh_sum.hpp" namespace ttnn::operations::moreh::moreh_norm { Tensor MorehNorm::invoke( @@ -20,8 +22,16 @@ Tensor MorehNorm::invoke( std::iota(dims.begin(), dims.end(), 0); dim = std::make_optional(dims); } - if (auto single_dim = std::get_if(&dim.value())) - return ttnn::prim::moreh_norm(input, p, *single_dim, keepdim, output, 
memory_config, compute_kernel_config); + auto INF = std::numeric_limits::infinity(); + if (auto single_dim = std::get_if(&dim.value())) { + if (p == 0.0 || p == INF || p == -INF) { + return ttnn::prim::moreh_norm(input, p, *single_dim, keepdim, output, memory_config, compute_kernel_config); + } + auto tmp_output = ttnn::moreh_abs_pow(input, p, std::nullopt, memory_config, compute_kernel_config); + tmp_output = + ttnn::moreh_sum(tmp_output, *single_dim, keepdim, std::nullopt, memory_config, compute_kernel_config); + return ttnn::moreh_abs_pow(tmp_output, 1.0f / p, output, memory_config, compute_kernel_config); + } auto dims = std::get>(dim.value()); if (dims.empty()) { @@ -29,16 +39,35 @@ Tensor MorehNorm::invoke( std::iota(all_dims.begin(), all_dims.end(), 0); dims = all_dims; } - if (dims.size() == 1) - return ttnn::prim::moreh_norm(input, p, dims[0], keepdim, output, memory_config, compute_kernel_config); - + if (dims.size() == 1) { + if (p == 0.0 || p == INF || p == -INF) { + return ttnn::prim::moreh_norm(input, p, dims[0], keepdim, output, memory_config, compute_kernel_config); + } + auto tmp_output = ttnn::moreh_abs_pow(input, p, std::nullopt, memory_config, compute_kernel_config); + tmp_output = ttnn::moreh_sum(tmp_output, dims[0], keepdim, std::nullopt, memory_config, compute_kernel_config); + return ttnn::moreh_abs_pow(tmp_output, 1.0f / p, output, memory_config, compute_kernel_config); + } std::sort(dims.begin(), dims.end(), std::greater()); - auto tmp_output = - ttnn::prim::moreh_norm(input, p, dims.front(), keepdim, std::nullopt, memory_config, compute_kernel_config); - using idx_t = decltype(dims.size()); - for (idx_t idx = 1; idx < dims.size() - 1; ++idx) - tmp_output = ttnn::prim::moreh_norm( - tmp_output, p, dims[idx], keepdim, std::nullopt, memory_config, compute_kernel_config); - return ttnn::prim::moreh_norm(tmp_output, p, dims.back(), keepdim, output, memory_config, compute_kernel_config); + + if (p == 0) { + auto tmp_output = + 
ttnn::prim::moreh_norm(input, p, dims.front(), keepdim, std::nullopt, memory_config, compute_kernel_config); + dims.erase(dims.begin()); + return ttnn::moreh_sum(tmp_output, dims, keepdim, output, memory_config, compute_kernel_config); + } else if (p == INF || p == -INF) { + auto tmp_output = + ttnn::prim::moreh_norm(input, p, dims.front(), keepdim, std::nullopt, memory_config, compute_kernel_config); + using idx_t = decltype(dims.size()); + for (idx_t idx = 1; idx < dims.size() - 1; ++idx) { + tmp_output = ttnn::prim::moreh_norm( + tmp_output, p, dims[idx], keepdim, std::nullopt, memory_config, compute_kernel_config); + } + return ttnn::prim::moreh_norm( + tmp_output, p, dims.back(), keepdim, output, memory_config, compute_kernel_config); + } else { + auto tmp_output = ttnn::moreh_abs_pow(input, p, std::nullopt, memory_config, compute_kernel_config); + tmp_output = ttnn::moreh_sum(tmp_output, dims, keepdim, std::nullopt, memory_config, compute_kernel_config); + return ttnn::moreh_abs_pow(tmp_output, 1.0f / p, output, memory_config, compute_kernel_config); + } } } // namespace ttnn::operations::moreh::moreh_norm diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_norm_backward/device/kernels/moreh_norm_backward_kernel.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_norm_backward/device/kernels/moreh_norm_backward_kernel.cpp index e4d2af6549d..2a6f2914df4 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_norm_backward/device/kernels/moreh_norm_backward_kernel.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_norm_backward/device/kernels/moreh_norm_backward_kernel.cpp @@ -19,16 +19,16 @@ void MAIN { const auto p_minus_one = get_arg_val(i++); const bool p_minus_one_is_negative = get_arg_val(i++) == 1; - std::uint8_t input_id{tt::CB::c_in0}; + std::uint8_t input_id{tt::CBIndex::c_0}; const auto cb_x = input_id++; // input(==x) const auto cb_y = input_id++; // output(==y) const auto cb_dy = input_id++; // output_grad(==dy) const auto cb_decimal = input_id++; // decimal - 
std::uint8_t output_id{tt::CB::c_out0}; + std::uint8_t output_id{tt::CBIndex::c_16}; const auto cb_dx = output_id++; // input_grad(==dx) - std::uint8_t intermed_id{tt::CB::c_intermed0}; + std::uint8_t intermed_id{tt::CBIndex::c_24}; const auto cb_tmp0 = intermed_id++; const auto cb_tmp1 = intermed_id++; const auto cb_tmp2 = intermed_id++; @@ -48,7 +48,7 @@ void MAIN { constexpr uint32_t onetile = 1; constexpr uint32_t dst0 = 0; - binary_op_init_common(tt::CB::c_in0, tt::CB::c_in0); + binary_op_init_common(tt::CBIndex::c_0, tt::CBIndex::c_0); cb_wait_front(cb_decimal, onetile); // comes from the reader for (uint32_t idx = 0; idx < num_input_tiles_per_core; ++idx) { diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_norm_backward/device/moreh_norm_backward_program_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_norm_backward/device/moreh_norm_backward_program_factory.cpp index 9dfe8d90704..ff3acb2ed7b 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_norm_backward/device/moreh_norm_backward_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_norm_backward/device/moreh_norm_backward_program_factory.cpp @@ -140,19 +140,19 @@ MorehNormBackwardOperation::ProgramFactory::cached_program_t MorehNormBackwardOp all_cores, cb_data_format, { - {tt::CB::c_in0, in0_t}, // input - {tt::CB::c_in1, in1_t}, // output - {tt::CB::c_in2, in2_t}, // output_grad - {tt::CB::c_in3, in3_t}, // decimal - {tt::CB::c_out0, out0_t}, // input_grad - {tt::CB::c_intermed0, im0_t, intermed_data_format}, - {tt::CB::c_intermed1, im1_t, intermed_data_format}, - {tt::CB::c_intermed2, im2_t, intermed_data_format}, - {tt::CB::c_intermed3, im3_t, intermed_data_format}, - {tt::CB::c_intermed4, im4_t, intermed_data_format}, - {tt::CB::c_intermed5, im5_t, intermed_data_format}, - {tt::CB::c_intermed6, im6_t, intermed_data_format}, - {tt::CB::c_intermed7, im7_t, intermed_data_format}, + {tt::CBIndex::c_0, in0_t}, // input + {tt::CBIndex::c_1, in1_t}, // output + {tt::CBIndex::c_2, in2_t}, // 
output_grad + {tt::CBIndex::c_3, in3_t}, // decimal + {tt::CBIndex::c_16, out0_t}, // input_grad + {tt::CBIndex::c_24, im0_t, intermed_data_format}, + {tt::CBIndex::c_25, im1_t, intermed_data_format}, + {tt::CBIndex::c_26, im2_t, intermed_data_format}, + {tt::CBIndex::c_27, im3_t, intermed_data_format}, + {tt::CBIndex::c_28, im4_t, intermed_data_format}, + {tt::CBIndex::c_29, im5_t, intermed_data_format}, + {tt::CBIndex::c_30, im6_t, intermed_data_format}, + {tt::CBIndex::c_31, im7_t, intermed_data_format}, }); //////////////////////////////////////////////////////////////////////////// diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_pybind.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_pybind.cpp index 5d19227e880..baa8f7d5c2d 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_pybind.cpp @@ -4,6 +4,7 @@ #include "moreh_pybind.hpp" +#include "ttnn/operations/moreh/moreh_abs_pow/moreh_abs_pow_pybind.hpp" #include "ttnn/operations/moreh/moreh_adam/moreh_adam_pybind.hpp" #include "ttnn/operations/moreh/moreh_adamw/moreh_adamw_pybind.hpp" #include "ttnn/operations/moreh/moreh_arange/moreh_arange_pybind.hpp" @@ -38,6 +39,7 @@ namespace ttnn::operations::moreh { void bind_moreh_operations(py::module &module) { + moreh_abs_pow::bind_moreh_abs_pow_operation(module); moreh_adam::bind_moreh_adam_operation(module); moreh_adamw::bind_moreh_adamw_operation(module); moreh_arange::bind_moreh_arange_operation(module); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_sgd/device/kernels/moreh_sgd.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_sgd/device/kernels/moreh_sgd.cpp index 2ad5c12efaa..b368e41e2a5 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_sgd/device/kernels/moreh_sgd.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_sgd/device/kernels/moreh_sgd.cpp @@ -7,18 +7,18 @@ namespace NAMESPACE { void MAIN { - constexpr auto cb_param_in = tt::CB::c_in0; - constexpr auto cb_grad = tt::CB::c_in1; - constexpr auto cb_momentum_in = 
tt::CB::c_in2; - - constexpr auto cb_param_out = tt::CB::c_out0; - constexpr auto cb_momentum_out = tt::CB::c_out1; - - constexpr auto cb_scalar_args = tt::CB::c_intermed0; - constexpr auto cb_tmp1 = tt::CB::c_intermed1; - constexpr auto cb_tmp2 = tt::CB::c_intermed2; - constexpr auto cb_tmp3 = tt::CB::c_intermed3; - constexpr auto cb_tmp4 = tt::CB::c_intermed4; + constexpr auto cb_param_in = tt::CBIndex::c_0; + constexpr auto cb_grad = tt::CBIndex::c_1; + constexpr auto cb_momentum_in = tt::CBIndex::c_2; + + constexpr auto cb_param_out = tt::CBIndex::c_16; + constexpr auto cb_momentum_out = tt::CBIndex::c_17; + + constexpr auto cb_scalar_args = tt::CBIndex::c_24; + constexpr auto cb_tmp1 = tt::CBIndex::c_25; + constexpr auto cb_tmp2 = tt::CBIndex::c_26; + constexpr auto cb_tmp3 = tt::CBIndex::c_27; + constexpr auto cb_tmp4 = tt::CBIndex::c_28; constexpr uint32_t lr_tile = 0; constexpr uint32_t momentum_tile = 1; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_sgd/device/kernels/reader_moreh_sgd.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_sgd/device/kernels/reader_moreh_sgd.cpp index 30fcf3320ee..4a909486d4d 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_sgd/device/kernels/reader_moreh_sgd.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_sgd/device/kernels/reader_moreh_sgd.cpp @@ -18,11 +18,11 @@ void kernel_main() { uint32_t weight_decay = get_arg_val(i); i++; uint32_t one = get_arg_val(i); i++; - constexpr auto cb_param_in = tt::CB::c_in0; - constexpr auto cb_grad = tt::CB::c_in1; - constexpr auto cb_momentum_in = tt::CB::c_in2; + constexpr auto cb_param_in = tt::CBIndex::c_0; + constexpr auto cb_grad = tt::CBIndex::c_1; + constexpr auto cb_momentum_in = tt::CBIndex::c_2; - constexpr auto cb_scalar_args = tt::CB::c_intermed0; + constexpr auto cb_scalar_args = tt::CBIndex::c_24; // ublocks size defined in tiles constexpr uint32_t onetile = 1; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_sgd/device/kernels/writer_moreh_sgd.cpp 
b/ttnn/cpp/ttnn/operations/moreh/moreh_sgd/device/kernels/writer_moreh_sgd.cpp index 868613387fa..b7a9b345956 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_sgd/device/kernels/writer_moreh_sgd.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_sgd/device/kernels/writer_moreh_sgd.cpp @@ -12,8 +12,8 @@ void kernel_main() { uint32_t num_tiles = get_arg_val(i); i++; uint32_t tile_offset = get_arg_val(i); i++; - constexpr auto cb_param_out = tt::CB::c_out0; - constexpr auto cb_momentum_out = tt::CB::c_out1; + constexpr auto cb_param_out = tt::CBIndex::c_16; + constexpr auto cb_momentum_out = tt::CBIndex::c_17; constexpr uint32_t onetile = 1; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_sgd/device/moreh_sgd_program_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_sgd/device/moreh_sgd_program_factory.cpp index bbeb2b56a42..0cb404dee89 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_sgd/device/moreh_sgd_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_sgd/device/moreh_sgd_program_factory.cpp @@ -68,19 +68,19 @@ MorehSgdOperation::ProgramFactory::cached_program_t MorehSgdOperation::ProgramFa all_cores, data_format, { - {tt::CB::c_in0, 2}, // param_in - {tt::CB::c_in1, 2}, // grad - {tt::CB::c_in2, 2}, // momentum_in - {tt::CB::c_out0, 2}, // param_out - {tt::CB::c_out1, 2}, // momentum_out + {tt::CBIndex::c_0, 2}, // param_in + {tt::CBIndex::c_1, 2}, // grad + {tt::CBIndex::c_2, 2}, // momentum_in + {tt::CBIndex::c_16, 2}, // param_out + {tt::CBIndex::c_17, 2}, // momentum_out - {tt::CB::c_intermed0, + {tt::CBIndex::c_24, 5, intermed_cb_format}, // cb_scalar_args (lr, momentum, dampening, weight_decay, one) - {tt::CB::c_intermed1, 1, intermed_cb_format}, // - {tt::CB::c_intermed2, 1, intermed_cb_format}, // - {tt::CB::c_intermed3, 1, intermed_cb_format}, // - {tt::CB::c_intermed4, 1, intermed_cb_format}, // + {tt::CBIndex::c_25, 1, intermed_cb_format}, // + {tt::CBIndex::c_26, 1, intermed_cb_format}, // + {tt::CBIndex::c_27, 1, intermed_cb_format}, // 
+ {tt::CBIndex::c_28, 1, intermed_cb_format}, // }); //////////////////////////////////////////////////////////////////////////// diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_sgd/moreh_sgd.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_sgd/moreh_sgd.cpp index ce0997d1e5a..ea5bf16567c 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_sgd/moreh_sgd.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_sgd/moreh_sgd.cpp @@ -6,6 +6,8 @@ #include "ttnn/operations/moreh/moreh_sgd/device/moreh_sgd_device_operation.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::moreh::moreh_sgd { std::vector> MorehSgd::invoke( const Tensor& param_in, diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/kernels/moreh_softmax_c_large.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/kernels/moreh_softmax_c_large.cpp index 45881421e12..85badfc447a 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/kernels/moreh_softmax_c_large.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/kernels/moreh_softmax_c_large.cpp @@ -11,13 +11,13 @@ namespace NAMESPACE { void MAIN { - constexpr auto cb_in0 = tt::CB::c_in0; - constexpr auto cb_out0 = tt::CB::c_out0; - constexpr auto cb_exps = tt::CB::c_intermed0; - constexpr auto cb_recipsumexps = tt::CB::c_intermed1; - constexpr auto cb_add = tt::CB::c_intermed2; - constexpr auto cb_max = tt::CB::c_intermed3; - constexpr auto cb_tmp = tt::CB::c_intermed4; + constexpr auto cb_in0 = tt::CBIndex::c_0; + constexpr auto cb_out0 = tt::CBIndex::c_16; + constexpr auto cb_exps = tt::CBIndex::c_24; + constexpr auto cb_recipsumexps = tt::CBIndex::c_25; + constexpr auto cb_add = tt::CBIndex::c_26; + constexpr auto cb_max = tt::CBIndex::c_27; + constexpr auto cb_tmp = tt::CBIndex::c_28; constexpr uint32_t onetile = 1; constexpr int dst0 = 0; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/kernels/moreh_softmax_h.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/kernels/moreh_softmax_h.cpp 
index 563ea519371..df58e14630c 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/kernels/moreh_softmax_h.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/kernels/moreh_softmax_h.cpp @@ -11,15 +11,15 @@ namespace NAMESPACE { void MAIN { - constexpr auto cb_in0 = tt::CB::c_in0; - constexpr auto cb_mask = tt::CB::c_in1; - constexpr auto cb_bcast_scaler = tt::CB::c_in2; - constexpr auto cb_out0 = tt::CB::c_out0; - constexpr auto cb_exps = tt::CB::c_intermed0; - constexpr auto cb_recipsumexps = tt::CB::c_intermed1; - constexpr auto cb_max = tt::CB::c_intermed2; - constexpr auto cb_x_m_max = tt::CB::c_intermed3; - constexpr auto cb_tmp = tt::CB::c_intermed4; + constexpr auto cb_in0 = tt::CBIndex::c_0; + constexpr auto cb_mask = tt::CBIndex::c_1; + constexpr auto cb_bcast_scaler = tt::CBIndex::c_2; + constexpr auto cb_out0 = tt::CBIndex::c_16; + constexpr auto cb_exps = tt::CBIndex::c_24; + constexpr auto cb_recipsumexps = tt::CBIndex::c_25; + constexpr auto cb_max = tt::CBIndex::c_26; + constexpr auto cb_x_m_max = tt::CBIndex::c_27; + constexpr auto cb_tmp = tt::CBIndex::c_28; constexpr int dst0 = 0; constexpr int dst1 = 1; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/kernels/moreh_softmax_h_large.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/kernels/moreh_softmax_h_large.cpp index 127b8c82be1..2863552a2cb 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/kernels/moreh_softmax_h_large.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/kernels/moreh_softmax_h_large.cpp @@ -11,15 +11,15 @@ namespace NAMESPACE { void MAIN { - constexpr auto cb_in0 = tt::CB::c_in0; - constexpr auto cb_mask = tt::CB::c_in1; - constexpr auto cb_bcast_scaler = tt::CB::c_in2; - constexpr auto cb_out0 = tt::CB::c_out0; - constexpr auto cb_exps = tt::CB::c_intermed0; - constexpr auto cb_recipsumexps = tt::CB::c_intermed1; - constexpr auto cb_add = tt::CB::c_intermed2; - constexpr auto cb_max = 
tt::CB::c_intermed3; - constexpr auto cb_tmp = tt::CB::c_intermed4; + constexpr auto cb_in0 = tt::CBIndex::c_0; + constexpr auto cb_mask = tt::CBIndex::c_1; + constexpr auto cb_bcast_scaler = tt::CBIndex::c_2; + constexpr auto cb_out0 = tt::CBIndex::c_16; + constexpr auto cb_exps = tt::CBIndex::c_24; + constexpr auto cb_recipsumexps = tt::CBIndex::c_25; + constexpr auto cb_add = tt::CBIndex::c_26; + constexpr auto cb_max = tt::CBIndex::c_27; + constexpr auto cb_tmp = tt::CBIndex::c_28; binary_op_init_common(cb_in0, cb_bcast_scaler); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/kernels/moreh_softmax_w.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/kernels/moreh_softmax_w.cpp index 81e5bf74be8..5cb9d57d2b3 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/kernels/moreh_softmax_w.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/kernels/moreh_softmax_w.cpp @@ -12,15 +12,15 @@ namespace NAMESPACE { void MAIN { - constexpr auto cb_in0 = tt::CB::c_in0; - constexpr auto cb_mask = tt::CB::c_in1; - constexpr auto cb_bcast_scaler = tt::CB::c_in2; - constexpr auto cb_out0 = tt::CB::c_out0; - constexpr auto cb_exps = tt::CB::c_intermed0; - constexpr auto cb_recipsumexps = tt::CB::c_intermed1; - constexpr auto cb_max = tt::CB::c_intermed2; - constexpr auto cb_x_m_max = tt::CB::c_intermed3; - constexpr auto cb_tmp = tt::CB::c_intermed4; + constexpr auto cb_in0 = tt::CBIndex::c_0; + constexpr auto cb_mask = tt::CBIndex::c_1; + constexpr auto cb_bcast_scaler = tt::CBIndex::c_2; + constexpr auto cb_out0 = tt::CBIndex::c_16; + constexpr auto cb_exps = tt::CBIndex::c_24; + constexpr auto cb_recipsumexps = tt::CBIndex::c_25; + constexpr auto cb_max = tt::CBIndex::c_26; + constexpr auto cb_x_m_max = tt::CBIndex::c_27; + constexpr auto cb_tmp = tt::CBIndex::c_28; binary_op_init_common(cb_in0, cb_bcast_scaler); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/kernels/moreh_softmax_w_large.cpp 
b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/kernels/moreh_softmax_w_large.cpp index 90f620ce20f..718de9672b9 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/kernels/moreh_softmax_w_large.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/kernels/moreh_softmax_w_large.cpp @@ -11,15 +11,15 @@ namespace NAMESPACE { void MAIN { - constexpr auto cb_in0 = tt::CB::c_in0; - constexpr auto cb_mask = tt::CB::c_in1; - constexpr auto cb_bcast_scaler = tt::CB::c_in2; - constexpr auto cb_out0 = tt::CB::c_out0; - constexpr auto cb_exps = tt::CB::c_intermed0; - constexpr auto cb_recipsumexps = tt::CB::c_intermed1; - constexpr auto cb_add = tt::CB::c_intermed2; - constexpr auto cb_max = tt::CB::c_intermed3; - constexpr auto cb_tmp = tt::CB::c_intermed4; + constexpr auto cb_in0 = tt::CBIndex::c_0; + constexpr auto cb_mask = tt::CBIndex::c_1; + constexpr auto cb_bcast_scaler = tt::CBIndex::c_2; + constexpr auto cb_out0 = tt::CBIndex::c_16; + constexpr auto cb_exps = tt::CBIndex::c_24; + constexpr auto cb_recipsumexps = tt::CBIndex::c_25; + constexpr auto cb_add = tt::CBIndex::c_26; + constexpr auto cb_max = tt::CBIndex::c_27; + constexpr auto cb_tmp = tt::CBIndex::c_28; binary_op_init_common(cb_in0, cb_bcast_scaler); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/kernels/reader_moreh_softmax_c_large.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/kernels/reader_moreh_softmax_c_large.cpp index 2aa6412b95c..56c5d0d1640 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/kernels/reader_moreh_softmax_c_large.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/kernels/reader_moreh_softmax_c_large.cpp @@ -12,7 +12,7 @@ void kernel_main() { uint32_t inner_size = get_arg_val(4); uint32_t dim_size = get_arg_val(5); - constexpr auto cb_in = tt::CB::c_in0; + constexpr auto cb_in = tt::CBIndex::c_0; uint32_t l1_write_addr_in; diff --git 
a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/kernels/reader_moreh_softmax_h.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/kernels/reader_moreh_softmax_h.cpp index 2c2d074bc0c..ad89e8648b4 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/kernels/reader_moreh_softmax_h.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/kernels/reader_moreh_softmax_h.cpp @@ -13,9 +13,9 @@ void kernel_main() { uint32_t scaler = get_arg_val(5); uint32_t mask_h = get_arg_val(6); - constexpr auto cb_in = tt::CB::c_in0; - constexpr auto cb_mask = tt::CB::c_in1; - constexpr auto cb_scaler = tt::CB::c_in2; + constexpr auto cb_in = tt::CBIndex::c_0; + constexpr auto cb_mask = tt::CBIndex::c_1; + constexpr auto cb_scaler = tt::CBIndex::c_2; uint32_t l1_write_addr_in; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/kernels/reader_moreh_softmax_h_large.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/kernels/reader_moreh_softmax_h_large.cpp index 2cd5a54e485..10e6fac5b21 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/kernels/reader_moreh_softmax_h_large.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/kernels/reader_moreh_softmax_h_large.cpp @@ -13,9 +13,9 @@ void kernel_main() { uint32_t scaler = get_arg_val(5); uint32_t mask_h = get_arg_val(6); - constexpr auto cb_in = tt::CB::c_in0; - constexpr auto cb_mask = tt::CB::c_in1; - constexpr auto cb_scaler = tt::CB::c_in2; + constexpr auto cb_in = tt::CBIndex::c_0; + constexpr auto cb_mask = tt::CBIndex::c_1; + constexpr auto cb_scaler = tt::CBIndex::c_2; uint32_t l1_write_addr_in; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/kernels/reader_moreh_softmax_w.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/kernels/reader_moreh_softmax_w.cpp index 3ea949d6d61..8bce056e512 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/kernels/reader_moreh_softmax_w.cpp +++ 
b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/kernels/reader_moreh_softmax_w.cpp @@ -12,9 +12,9 @@ void kernel_main() { uint32_t scaler = get_arg_val(4); uint32_t mask_w = get_arg_val(5); - constexpr auto cb_in = tt::CB::c_in0; - constexpr auto cb_mask = tt::CB::c_in1; - constexpr auto cb_scaler = tt::CB::c_in2; + constexpr auto cb_in = tt::CBIndex::c_0; + constexpr auto cb_mask = tt::CBIndex::c_1; + constexpr auto cb_scaler = tt::CBIndex::c_2; uint32_t l1_write_addr_in; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/kernels/reader_moreh_softmax_w_large.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/kernels/reader_moreh_softmax_w_large.cpp index 188d8ce6e1d..c933790b474 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/kernels/reader_moreh_softmax_w_large.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/kernels/reader_moreh_softmax_w_large.cpp @@ -12,9 +12,9 @@ void kernel_main() { uint32_t scaler = get_arg_val(4); uint32_t mask_w = get_arg_val(5); - constexpr auto cb_in = tt::CB::c_in0; - constexpr auto cb_mask = tt::CB::c_in1; - constexpr auto cb_scaler = tt::CB::c_in2; + constexpr auto cb_in = tt::CBIndex::c_0; + constexpr auto cb_mask = tt::CBIndex::c_1; + constexpr auto cb_scaler = tt::CBIndex::c_2; uint32_t l1_write_addr_in; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/kernels/writer_moreh_softmax_c_large.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/kernels/writer_moreh_softmax_c_large.cpp index fcec659ae37..aa4202d75ec 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/kernels/writer_moreh_softmax_c_large.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/kernels/writer_moreh_softmax_c_large.cpp @@ -12,7 +12,7 @@ void kernel_main() { uint32_t inner_size = get_arg_val(4); uint32_t dim_size = get_arg_val(5); - constexpr auto cb_out = tt::CB::c_out0; + constexpr auto cb_out = tt::CBIndex::c_16; // ublocks size defined in tiles constexpr 
uint32_t onetile = 1; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/kernels/writer_moreh_softmax_h.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/kernels/writer_moreh_softmax_h.cpp index 78de43d144f..95972415b96 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/kernels/writer_moreh_softmax_h.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/kernels/writer_moreh_softmax_h.cpp @@ -11,7 +11,7 @@ void kernel_main() { uint32_t Ht = get_arg_val(3); uint32_t Wt = get_arg_val(4); - constexpr uint32_t cb_id_out = tt::CB::c_out0; + constexpr uint32_t cb_id_out = tt::CBIndex::c_16; constexpr uint32_t onetile = 1; uint32_t tile_bytes = get_tile_size(cb_id_out); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/kernels/writer_moreh_softmax_h_large.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/kernels/writer_moreh_softmax_h_large.cpp index bbad6708df3..e87fd58eb1d 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/kernels/writer_moreh_softmax_h_large.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/kernels/writer_moreh_softmax_h_large.cpp @@ -11,7 +11,7 @@ void kernel_main() { uint32_t Ht = get_arg_val(3); uint32_t Wt = get_arg_val(4); - constexpr uint32_t cb_id_out = tt::CB::c_out0; + constexpr uint32_t cb_id_out = tt::CBIndex::c_16; constexpr uint32_t onetile = 1; uint32_t tile_bytes = get_tile_size(cb_id_out); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/kernels/writer_moreh_softmax_w.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/kernels/writer_moreh_softmax_w.cpp index 45cf65005e2..47918451c90 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/kernels/writer_moreh_softmax_w.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/kernels/writer_moreh_softmax_w.cpp @@ -10,7 +10,7 @@ void kernel_main() { uint32_t tile_offset = get_arg_val(2); uint32_t Wt = get_arg_val(3); - constexpr uint32_t cb_id_out = 
tt::CB::c_out0; + constexpr uint32_t cb_id_out = tt::CBIndex::c_16; constexpr uint32_t onetile = 1; uint32_t tile_bytes = get_tile_size(cb_id_out); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/kernels/writer_moreh_softmax_w_large.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/kernels/writer_moreh_softmax_w_large.cpp index 883562f2c8f..2203d6cf639 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/kernels/writer_moreh_softmax_w_large.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/kernels/writer_moreh_softmax_w_large.cpp @@ -10,7 +10,7 @@ void kernel_main() { uint32_t tile_offset = get_arg_val(2); uint32_t Wt = get_arg_val(3); - constexpr uint32_t cb_id_out = tt::CB::c_out0; + constexpr uint32_t cb_id_out = tt::CBIndex::c_16; constexpr uint32_t onetile = 1; uint32_t tile_bytes = get_tile_size(cb_id_out); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/softmax_c_large/softmax_c_large.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/softmax_c_large/softmax_c_large.cpp index 66e371f725f..0ed8c09d927 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/softmax_c_large/softmax_c_large.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/softmax_c_large/softmax_c_large.cpp @@ -51,13 +51,13 @@ MorehSoftmaxOperation::MorehSoftmaxCLargeFactory::create( all_cores, data_format, { - {tt::CB::c_in0, 2}, // input - {tt::CB::c_out0, 2}, // output - {tt::CB::c_intermed0, 1, intermed_data_format}, // exp(x) - {tt::CB::c_intermed1, 1, intermed_data_format}, // recips - {tt::CB::c_intermed2, 2, intermed_data_format}, // add - {tt::CB::c_intermed3, 1}, // max - {tt::CB::c_intermed4, 1, intermed_data_format}, // tmp + {tt::CBIndex::c_0, 2}, // input + {tt::CBIndex::c_16, 2}, // output + {tt::CBIndex::c_24, 1, intermed_data_format}, // exp(x) + {tt::CBIndex::c_25, 1, intermed_data_format}, // recips + {tt::CBIndex::c_26, 2, intermed_data_format}, // add + {tt::CBIndex::c_27, 1}, // 
max + {tt::CBIndex::c_28, 1, intermed_data_format}, // tmp }); // create read/wrtie kernel diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/softmax_h_large/softmax_h_large.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/softmax_h_large/softmax_h_large.cpp index 1ab8526bf1c..6332052675e 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/softmax_h_large/softmax_h_large.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/softmax_h_large/softmax_h_large.cpp @@ -50,15 +50,15 @@ MorehSoftmaxOperation::MorehSoftmaxHLargeFactory::create( all_cores, data_format, { - {tt::CB::c_in0, 2}, // input - {tt::CB::c_in1, 1}, // mask - {tt::CB::c_in2, 1}, // scaler - {tt::CB::c_out0, 2}, // output - {tt::CB::c_intermed0, 2, intermed_data_format}, // exp(x) - {tt::CB::c_intermed1, 1, intermed_data_format}, // reduce - {tt::CB::c_intermed2, 1, intermed_data_format}, // syn - {tt::CB::c_intermed3, 1, intermed_data_format}, // max - {tt::CB::c_intermed4, 1, intermed_data_format}, // tmp + {tt::CBIndex::c_0, 2}, // input + {tt::CBIndex::c_1, 1}, // mask + {tt::CBIndex::c_2, 1}, // scaler + {tt::CBIndex::c_16, 2}, // output + {tt::CBIndex::c_24, 2, intermed_data_format}, // exp(x) + {tt::CBIndex::c_25, 1, intermed_data_format}, // reduce + {tt::CBIndex::c_26, 1, intermed_data_format}, // syn + {tt::CBIndex::c_27, 1, intermed_data_format}, // max + {tt::CBIndex::c_28, 1, intermed_data_format}, // tmp }); // create read/wrtie kernel diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/softmax_h_small/softmax_h_small.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/softmax_h_small/softmax_h_small.cpp index 461e7a672d7..b54a134475b 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/softmax_h_small/softmax_h_small.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/softmax_h_small/softmax_h_small.cpp @@ -51,15 +51,15 @@ MorehSoftmaxOperation::MorehSoftmaxHSmallFactory::create( all_cores, data_format, 
{ - {tt::CB::c_in0, Ht}, // input - {tt::CB::c_in1, 1}, // mask - {tt::CB::c_in2, 1}, // scaler - {tt::CB::c_out0, Ht}, // output - {tt::CB::c_intermed0, Ht, intermed_data_format}, // exp(x) - {tt::CB::c_intermed1, 1, intermed_data_format}, // reduce - {tt::CB::c_intermed2, 1, intermed_data_format}, // max - {tt::CB::c_intermed3, Ht, intermed_data_format}, // x - max - {tt::CB::c_intermed4, 1, intermed_data_format} // tmp + {tt::CBIndex::c_0, Ht}, // input + {tt::CBIndex::c_1, 1}, // mask + {tt::CBIndex::c_2, 1}, // scaler + {tt::CBIndex::c_16, Ht}, // output + {tt::CBIndex::c_24, Ht, intermed_data_format}, // exp(x) + {tt::CBIndex::c_25, 1, intermed_data_format}, // reduce + {tt::CBIndex::c_26, 1, intermed_data_format}, // max + {tt::CBIndex::c_27, Ht, intermed_data_format}, // x - max + {tt::CBIndex::c_28, 1, intermed_data_format} // tmp }); // create read/wrtie kernel diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/softmax_w_large/softmax_w_large.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/softmax_w_large/softmax_w_large.cpp index 602229115c2..dab9382bfec 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/softmax_w_large/softmax_w_large.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/softmax_w_large/softmax_w_large.cpp @@ -51,15 +51,15 @@ MorehSoftmaxOperation::MorehSoftmaxWLargeFactory::create( all_cores, data_format, { - {tt::CB::c_in0, 2}, // input - {tt::CB::c_in1, 1}, // mask - {tt::CB::c_in2, 1}, // scaler - {tt::CB::c_out0, 2}, // output - {tt::CB::c_intermed0, 2, intermed_data_format}, // exp(x) - {tt::CB::c_intermed1, 1, intermed_data_format}, // reduce - {tt::CB::c_intermed2, 1, intermed_data_format}, // syn - {tt::CB::c_intermed3, 1, intermed_data_format}, // max - {tt::CB::c_intermed4, 1, intermed_data_format}, // tmp + {tt::CBIndex::c_0, 2}, // input + {tt::CBIndex::c_1, 1}, // mask + {tt::CBIndex::c_2, 1}, // scaler + {tt::CBIndex::c_16, 2}, // output + {tt::CBIndex::c_24, 2, 
intermed_data_format}, // exp(x) + {tt::CBIndex::c_25, 1, intermed_data_format}, // reduce + {tt::CBIndex::c_26, 1, intermed_data_format}, // syn + {tt::CBIndex::c_27, 1, intermed_data_format}, // max + {tt::CBIndex::c_28, 1, intermed_data_format}, // tmp }); // create read/wrtie kernel diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/softmax_w_small/softmax_w_small.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/softmax_w_small/softmax_w_small.cpp index 3d8eed076a9..d5ca67f52a0 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/softmax_w_small/softmax_w_small.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax/device/softmax_w_small/softmax_w_small.cpp @@ -51,15 +51,15 @@ MorehSoftmaxOperation::MorehSoftmaxWSmallFactory::create( all_cores, data_format, { - {tt::CB::c_in0, Wt}, // input - {tt::CB::c_in1, 1}, // mask - {tt::CB::c_in2, 1}, // scaler - {tt::CB::c_out0, Wt}, // output - {tt::CB::c_intermed0, Wt, intermed_data_format}, // exp(x) - {tt::CB::c_intermed1, 1, intermed_data_format}, // reduce - {tt::CB::c_intermed2, 1, intermed_data_format}, // max - {tt::CB::c_intermed3, Wt, intermed_data_format}, // x - max - {tt::CB::c_intermed4, 1, intermed_data_format} // tmp + {tt::CBIndex::c_0, Wt}, // input + {tt::CBIndex::c_1, 1}, // mask + {tt::CBIndex::c_2, 1}, // scaler + {tt::CBIndex::c_16, Wt}, // output + {tt::CBIndex::c_24, Wt, intermed_data_format}, // exp(x) + {tt::CBIndex::c_25, 1, intermed_data_format}, // reduce + {tt::CBIndex::c_26, 1, intermed_data_format}, // max + {tt::CBIndex::c_27, Wt, intermed_data_format}, // x - max + {tt::CBIndex::c_28, 1, intermed_data_format} // tmp }); // create read/wrtie kernel diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/kernels/moreh_softmax_backward_c_large.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/kernels/moreh_softmax_backward_c_large.cpp index 0d72e66f457..f8dd4b1a784 100644 --- 
a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/kernels/moreh_softmax_backward_c_large.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/kernels/moreh_softmax_backward_c_large.cpp @@ -14,13 +14,13 @@ namespace NAMESPACE { void MAIN { constexpr uint32_t onetile = 1; - constexpr auto cb_y = tt::CB::c_in0; - constexpr auto cb_dy = tt::CB::c_in1; - constexpr auto cb_dx = tt::CB::c_out0; + constexpr auto cb_y = tt::CBIndex::c_0; + constexpr auto cb_dy = tt::CBIndex::c_1; + constexpr auto cb_dx = tt::CBIndex::c_16; - constexpr auto cb_ydy = tt::CB::c_intermed0; // y * dy - constexpr auto cb_sum = tt::CB::c_intermed1; - constexpr auto cb_dy_m_sum = tt::CB::c_intermed2; // dy - sum + constexpr auto cb_ydy = tt::CBIndex::c_24; // y * dy + constexpr auto cb_sum = tt::CBIndex::c_25; + constexpr auto cb_dy_m_sum = tt::CBIndex::c_26; // dy - sum uint32_t N = get_compile_time_arg_val(0); uint32_t dim_size = get_compile_time_arg_val(1); @@ -40,11 +40,11 @@ void MAIN { for (uint32_t i = 0; i < dim_size; ++i) { // exp(y) - constexpr auto cb_exp = tt::CB::c_intermed0; + constexpr auto cb_exp = tt::CBIndex::c_24; exp_tile_to_cb(cb_y, cb_exp); // sum * exp(y) - constexpr auto cb_inter2 = tt::CB::c_intermed2; + constexpr auto cb_inter2 = tt::CBIndex::c_26; mul_tiles_to_cb(cb_sum, cb_exp, cb_inter2, 0, 0, /*pop0=*/0, /*pop1=*/1); // dy - sum * exp(y) diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/kernels/moreh_softmax_backward_h.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/kernels/moreh_softmax_backward_h.cpp index 5a6f13aa4af..f873a0c16ae 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/kernels/moreh_softmax_backward_h.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/kernels/moreh_softmax_backward_h.cpp @@ -13,15 +13,15 @@ namespace NAMESPACE { void MAIN { constexpr uint32_t onetile = 1; - constexpr auto cb_y = tt::CB::c_in0; - constexpr auto cb_dy = 
tt::CB::c_in1; - constexpr auto cb_bcast_scaler = tt::CB::c_in2; - constexpr auto cb_mask = tt::CB::c_in3; - constexpr auto cb_dx = tt::CB::c_out0; + constexpr auto cb_y = tt::CBIndex::c_0; + constexpr auto cb_dy = tt::CBIndex::c_1; + constexpr auto cb_bcast_scaler = tt::CBIndex::c_2; + constexpr auto cb_mask = tt::CBIndex::c_3; + constexpr auto cb_dx = tt::CBIndex::c_16; - constexpr auto cb_ydy = tt::CB::c_intermed0; // y * dy - constexpr auto cb_sum = tt::CB::c_intermed1; - constexpr auto cb_inter2 = tt::CB::c_intermed2; + constexpr auto cb_ydy = tt::CBIndex::c_24; // y * dy + constexpr auto cb_sum = tt::CBIndex::c_25; + constexpr auto cb_inter2 = tt::CBIndex::c_26; binary_op_init_common(cb_y, cb_bcast_scaler); @@ -38,14 +38,14 @@ void MAIN { reduce_tile_to_cb( cb_inter2, cb_bcast_scaler, cb_sum, 1, /*pop0=*/1, /*pop=1*/ 0); } else { - constexpr auto cb_inter0 = tt::CB::c_intermed0; + constexpr auto cb_inter0 = tt::CBIndex::c_24; reduce_tile_to_cb( cb_dy, cb_bcast_scaler, cb_inter0, Ht - 1, /*pop0=*/0, /*pop=1*/ 0); - constexpr auto cb_inter1 = tt::CB::c_intermed1; + constexpr auto cb_inter1 = tt::CBIndex::c_25; mask_tile_to_cb(cb_dy, cb_mask, cb_inter1, /*itile=*/Ht - 1, /*mtile=*/0, /*pop=*/0, /*popm=*/0); - constexpr auto cb_inter2 = tt::CB::c_intermed2; + constexpr auto cb_inter2 = tt::CBIndex::c_26; reduce_tile_to_cb( cb_inter1, cb_bcast_scaler, cb_inter2, 1, /*pop0=*/1, /*pop=1*/ 0); @@ -53,7 +53,7 @@ void MAIN { } // dy - sum * exp(y) - constexpr auto cb_exp = tt::CB::c_intermed0; // y * dy + constexpr auto cb_exp = tt::CBIndex::c_24; // y * dy for (uint32_t w = 0; w < Ht; w += onetile) { // exp(y) diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/kernels/moreh_softmax_backward_h_large.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/kernels/moreh_softmax_backward_h_large.cpp index df8473859e1..1bd7eb3c191 100644 --- 
a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/kernels/moreh_softmax_backward_h_large.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/kernels/moreh_softmax_backward_h_large.cpp @@ -13,16 +13,16 @@ namespace NAMESPACE { void MAIN { constexpr uint32_t onetile = 1; - constexpr auto cb_y = tt::CB::c_in0; - constexpr auto cb_dy = tt::CB::c_in1; - constexpr auto cb_bcast_scaler = tt::CB::c_in2; - constexpr auto cb_mask = tt::CB::c_in3; - constexpr auto cb_dx = tt::CB::c_out0; + constexpr auto cb_y = tt::CBIndex::c_0; + constexpr auto cb_dy = tt::CBIndex::c_1; + constexpr auto cb_bcast_scaler = tt::CBIndex::c_2; + constexpr auto cb_mask = tt::CBIndex::c_3; + constexpr auto cb_dx = tt::CBIndex::c_16; - constexpr auto cb_ydy = tt::CB::c_intermed0; // y * dy - constexpr auto cb_sum = tt::CB::c_intermed1; - constexpr auto cb_inter2 = tt::CB::c_intermed2; - constexpr auto cb_add = tt::CB::c_intermed3; + constexpr auto cb_ydy = tt::CBIndex::c_24; // y * dy + constexpr auto cb_sum = tt::CBIndex::c_25; + constexpr auto cb_inter2 = tt::CBIndex::c_26; + constexpr auto cb_add = tt::CBIndex::c_27; binary_op_init_common(cb_y, cb_bcast_scaler); @@ -37,7 +37,7 @@ void MAIN { if (h == 0) { mask_tile_to_cb(cb_dy, cb_mask, cb_add, /*itile=*/0, /*mtile=*/0, /*pop=*/1, /*popm=*/0); } else { - constexpr auto cb_inter0 = tt::CB::c_intermed0; + constexpr auto cb_inter0 = tt::CBIndex::c_24; mask_tile_to_cb(cb_dy, cb_mask, cb_inter0, /*itile=*/0, /*mtile=*/0, /*pop=*/1, /*popm=*/0); add_tiles_to_cb(cb_add, cb_inter0, cb_add); @@ -55,7 +55,7 @@ void MAIN { for (uint32_t h = 0; h < Ht; ++h) { // exp(y) - constexpr auto cb_exp = tt::CB::c_intermed0; + constexpr auto cb_exp = tt::CBIndex::c_24; exp_tile_to_cb(cb_y, cb_exp, 0); // sum * exp(y) diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/kernels/moreh_softmax_backward_w.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/kernels/moreh_softmax_backward_w.cpp index 
77229c3601a..1fc611b0a43 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/kernels/moreh_softmax_backward_w.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/kernels/moreh_softmax_backward_w.cpp @@ -13,15 +13,15 @@ namespace NAMESPACE { void MAIN { constexpr uint32_t onetile = 1; - constexpr auto cb_y = tt::CB::c_in0; - constexpr auto cb_dy = tt::CB::c_in1; - constexpr auto cb_bcast_scaler = tt::CB::c_in2; - constexpr auto cb_mask = tt::CB::c_in3; - constexpr auto cb_dx = tt::CB::c_out0; + constexpr auto cb_y = tt::CBIndex::c_0; + constexpr auto cb_dy = tt::CBIndex::c_1; + constexpr auto cb_bcast_scaler = tt::CBIndex::c_2; + constexpr auto cb_mask = tt::CBIndex::c_3; + constexpr auto cb_dx = tt::CBIndex::c_16; - constexpr auto cb_ydy = tt::CB::c_intermed0; // y * dy - constexpr auto cb_sum = tt::CB::c_intermed1; - constexpr auto cb_inter2 = tt::CB::c_intermed2; + constexpr auto cb_ydy = tt::CBIndex::c_24; // y * dy + constexpr auto cb_sum = tt::CBIndex::c_25; + constexpr auto cb_inter2 = tt::CBIndex::c_26; binary_op_init_common(cb_y, cb_bcast_scaler); @@ -38,14 +38,14 @@ void MAIN { reduce_tile_to_cb( cb_inter2, cb_bcast_scaler, cb_sum, 1, /*pop0=*/1, /*pop=1*/ 0); } else { - constexpr auto cb_inter0 = tt::CB::c_intermed0; + constexpr auto cb_inter0 = tt::CBIndex::c_24; reduce_tile_to_cb( cb_dy, cb_bcast_scaler, cb_inter0, Wt - 1, /*pop0=*/0, /*pop=1*/ 0); - constexpr auto cb_inter1 = tt::CB::c_intermed1; + constexpr auto cb_inter1 = tt::CBIndex::c_25; mask_tile_to_cb(cb_dy, cb_mask, cb_inter1, /*itile=*/Wt - 1, /*mtile=*/0, /*pop=*/0, /*popm=*/0); - constexpr auto cb_inter2 = tt::CB::c_intermed2; + constexpr auto cb_inter2 = tt::CBIndex::c_26; reduce_tile_to_cb( cb_inter1, cb_bcast_scaler, cb_inter2, 1, /*pop0=*/1, /*pop=1*/ 0); @@ -53,7 +53,7 @@ void MAIN { } // dy - sum * exp(y) - constexpr auto cb_exp = tt::CB::c_intermed0; // y * dy + constexpr auto cb_exp = tt::CBIndex::c_24; // y * dy for (uint32_t w = 0; w < Wt; 
w += onetile) { // exp(y) diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/kernels/moreh_softmax_backward_w_large.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/kernels/moreh_softmax_backward_w_large.cpp index 8d5ab1ce109..38ae32f1de9 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/kernels/moreh_softmax_backward_w_large.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/kernels/moreh_softmax_backward_w_large.cpp @@ -13,16 +13,16 @@ namespace NAMESPACE { void MAIN { constexpr uint32_t onetile = 1; - constexpr auto cb_y = tt::CB::c_in0; - constexpr auto cb_dy = tt::CB::c_in1; - constexpr auto cb_bcast_scaler = tt::CB::c_in2; - constexpr auto cb_mask = tt::CB::c_in3; - constexpr auto cb_dx = tt::CB::c_out0; + constexpr auto cb_y = tt::CBIndex::c_0; + constexpr auto cb_dy = tt::CBIndex::c_1; + constexpr auto cb_bcast_scaler = tt::CBIndex::c_2; + constexpr auto cb_mask = tt::CBIndex::c_3; + constexpr auto cb_dx = tt::CBIndex::c_16; - constexpr auto cb_ydy = tt::CB::c_intermed0; // y * dy - constexpr auto cb_sum = tt::CB::c_intermed1; - constexpr auto cb_inter2 = tt::CB::c_intermed2; - constexpr auto cb_add = tt::CB::c_intermed3; + constexpr auto cb_ydy = tt::CBIndex::c_24; // y * dy + constexpr auto cb_sum = tt::CBIndex::c_25; + constexpr auto cb_inter2 = tt::CBIndex::c_26; + constexpr auto cb_add = tt::CBIndex::c_27; binary_op_init_common(cb_y, cb_bcast_scaler); @@ -37,7 +37,7 @@ void MAIN { if (w == 0) { mask_tile_to_cb(cb_dy, cb_mask, cb_add, /*itile=*/0, /*mtile=*/0, /*pop=*/1, /*popm=*/0); } else { - constexpr auto cb_inter0 = tt::CB::c_intermed0; + constexpr auto cb_inter0 = tt::CBIndex::c_24; mask_tile_to_cb(cb_dy, cb_mask, cb_inter0, /*itile=*/0, /*mtile=*/0, /*pop=*/1, /*popm=*/0); add_tiles_to_cb(cb_add, cb_inter0, cb_add); @@ -55,7 +55,7 @@ void MAIN { for (uint32_t w = 0; w < Wt; w += onetile) { // exp(y) - constexpr auto cb_exp = tt::CB::c_intermed0; + constexpr auto 
cb_exp = tt::CBIndex::c_24; exp_tile_to_cb(cb_y, cb_exp, 0); // sum * exp(y) mul_tiles_bcast_cols_to_cb(cb_exp, cb_sum, cb_inter2, 0, 0, /*pop0=*/1, /*pop1=*/0); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/kernels/reader_moreh_softmax_backward_c.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/kernels/reader_moreh_softmax_backward_c.cpp index 30658b39e68..01f98680fee 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/kernels/reader_moreh_softmax_backward_c.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/kernels/reader_moreh_softmax_backward_c.cpp @@ -14,8 +14,8 @@ void kernel_main() { uint32_t inner_size = get_arg_val(5); uint32_t dim_size = get_arg_val(6); - constexpr auto cb_y = tt::CB::c_in0; - constexpr auto cb_dy = tt::CB::c_in1; + constexpr auto cb_y = tt::CBIndex::c_0; + constexpr auto cb_dy = tt::CBIndex::c_1; uint32_t l1_write_addr_in; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/kernels/reader_moreh_softmax_backward_h.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/kernels/reader_moreh_softmax_backward_h.cpp index 962a788dfc8..d57dae337a0 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/kernels/reader_moreh_softmax_backward_h.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/kernels/reader_moreh_softmax_backward_h.cpp @@ -16,10 +16,10 @@ void kernel_main() { uint32_t scaler = get_arg_val(6); uint32_t mask_h = get_arg_val(7); - constexpr auto cb_y = tt::CB::c_in0; - constexpr auto cb_dy = tt::CB::c_in1; - constexpr auto cb_scaler = tt::CB::c_in2; - constexpr auto cb_mask = tt::CB::c_in3; + constexpr auto cb_y = tt::CBIndex::c_0; + constexpr auto cb_dy = tt::CBIndex::c_1; + constexpr auto cb_scaler = tt::CBIndex::c_2; + constexpr auto cb_mask = tt::CBIndex::c_3; uint32_t l1_write_addr_in; diff --git 
a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/kernels/reader_moreh_softmax_backward_h_large.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/kernels/reader_moreh_softmax_backward_h_large.cpp index 91dbd65141b..7714a5018a3 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/kernels/reader_moreh_softmax_backward_h_large.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/kernels/reader_moreh_softmax_backward_h_large.cpp @@ -16,10 +16,10 @@ void kernel_main() { uint32_t scaler = get_arg_val(6); uint32_t mask_h = get_arg_val(7); - constexpr auto cb_y = tt::CB::c_in0; - constexpr auto cb_dy = tt::CB::c_in1; - constexpr auto cb_scaler = tt::CB::c_in2; - constexpr auto cb_mask = tt::CB::c_in3; + constexpr auto cb_y = tt::CBIndex::c_0; + constexpr auto cb_dy = tt::CBIndex::c_1; + constexpr auto cb_scaler = tt::CBIndex::c_2; + constexpr auto cb_mask = tt::CBIndex::c_3; uint32_t l1_write_addr_in; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/kernels/reader_moreh_softmax_backward_w.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/kernels/reader_moreh_softmax_backward_w.cpp index 40d2deb0e17..0333c06c407 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/kernels/reader_moreh_softmax_backward_w.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/kernels/reader_moreh_softmax_backward_w.cpp @@ -15,10 +15,10 @@ void kernel_main() { uint32_t scaler = get_arg_val(5); uint32_t mask_w = get_arg_val(6); - constexpr auto cb_y = tt::CB::c_in0; - constexpr auto cb_dy = tt::CB::c_in1; - constexpr auto cb_scaler = tt::CB::c_in2; - constexpr auto cb_mask = tt::CB::c_in3; + constexpr auto cb_y = tt::CBIndex::c_0; + constexpr auto cb_dy = tt::CBIndex::c_1; + constexpr auto cb_scaler = tt::CBIndex::c_2; + constexpr auto cb_mask = tt::CBIndex::c_3; uint32_t l1_write_addr_in; diff --git 
a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/kernels/reader_moreh_softmax_backward_w_large.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/kernels/reader_moreh_softmax_backward_w_large.cpp index 97660dd014a..0432e196056 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/kernels/reader_moreh_softmax_backward_w_large.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/kernels/reader_moreh_softmax_backward_w_large.cpp @@ -15,10 +15,10 @@ void kernel_main() { uint32_t scaler = get_arg_val(5); uint32_t mask_w = get_arg_val(6); - constexpr auto cb_y = tt::CB::c_in0; - constexpr auto cb_dy = tt::CB::c_in1; - constexpr auto cb_scaler = tt::CB::c_in2; - constexpr auto cb_mask = tt::CB::c_in3; + constexpr auto cb_y = tt::CBIndex::c_0; + constexpr auto cb_dy = tt::CBIndex::c_1; + constexpr auto cb_scaler = tt::CBIndex::c_2; + constexpr auto cb_mask = tt::CBIndex::c_3; uint32_t l1_write_addr_in; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/kernels/writer_moreh_softmax_backward_c.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/kernels/writer_moreh_softmax_backward_c.cpp index fcec659ae37..aa4202d75ec 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/kernels/writer_moreh_softmax_backward_c.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/kernels/writer_moreh_softmax_backward_c.cpp @@ -12,7 +12,7 @@ void kernel_main() { uint32_t inner_size = get_arg_val(4); uint32_t dim_size = get_arg_val(5); - constexpr auto cb_out = tt::CB::c_out0; + constexpr auto cb_out = tt::CBIndex::c_16; // ublocks size defined in tiles constexpr uint32_t onetile = 1; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/kernels/writer_moreh_softmax_backward_h.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/kernels/writer_moreh_softmax_backward_h.cpp index bbad6708df3..e87fd58eb1d 100644 --- 
a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/kernels/writer_moreh_softmax_backward_h.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/kernels/writer_moreh_softmax_backward_h.cpp @@ -11,7 +11,7 @@ void kernel_main() { uint32_t Ht = get_arg_val(3); uint32_t Wt = get_arg_val(4); - constexpr uint32_t cb_id_out = tt::CB::c_out0; + constexpr uint32_t cb_id_out = tt::CBIndex::c_16; constexpr uint32_t onetile = 1; uint32_t tile_bytes = get_tile_size(cb_id_out); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/kernels/writer_moreh_softmax_backward_w.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/kernels/writer_moreh_softmax_backward_w.cpp index 883562f2c8f..2203d6cf639 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/kernels/writer_moreh_softmax_backward_w.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/kernels/writer_moreh_softmax_backward_w.cpp @@ -10,7 +10,7 @@ void kernel_main() { uint32_t tile_offset = get_arg_val(2); uint32_t Wt = get_arg_val(3); - constexpr uint32_t cb_id_out = tt::CB::c_out0; + constexpr uint32_t cb_id_out = tt::CBIndex::c_16; constexpr uint32_t onetile = 1; uint32_t tile_bytes = get_tile_size(cb_id_out); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/kernels/writer_moreh_softmax_h.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/kernels/writer_moreh_softmax_h.cpp index bbad6708df3..e87fd58eb1d 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/kernels/writer_moreh_softmax_h.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/kernels/writer_moreh_softmax_h.cpp @@ -11,7 +11,7 @@ void kernel_main() { uint32_t Ht = get_arg_val(3); uint32_t Wt = get_arg_val(4); - constexpr uint32_t cb_id_out = tt::CB::c_out0; + constexpr uint32_t cb_id_out = tt::CBIndex::c_16; constexpr uint32_t onetile = 1; uint32_t tile_bytes = 
get_tile_size(cb_id_out); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/kernels/writer_moreh_softmax_w.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/kernels/writer_moreh_softmax_w.cpp index 883562f2c8f..2203d6cf639 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/kernels/writer_moreh_softmax_w.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/kernels/writer_moreh_softmax_w.cpp @@ -10,7 +10,7 @@ void kernel_main() { uint32_t tile_offset = get_arg_val(2); uint32_t Wt = get_arg_val(3); - constexpr uint32_t cb_id_out = tt::CB::c_out0; + constexpr uint32_t cb_id_out = tt::CBIndex::c_16; constexpr uint32_t onetile = 1; uint32_t tile_bytes = get_tile_size(cb_id_out); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/softmax_backward_c_large/softmax_backward_c_large.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/softmax_backward_c_large/softmax_backward_c_large.cpp index eb37cee2a25..bda18729b8b 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/softmax_backward_c_large/softmax_backward_c_large.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/softmax_backward_c_large/softmax_backward_c_large.cpp @@ -51,12 +51,12 @@ MorehSoftmaxBackwardOperation::MorehSoftmaxBackwardCLargeFactory::create( all_cores, data_format, { - {tt::CB::c_in0, 2}, // y - {tt::CB::c_in1, 2}, // dy - {tt::CB::c_out0, 2}, // dx - {tt::CB::c_intermed0, 1, fp32_dest_acc_en ? tt::DataFormat::Float32 : data_format}, // y * dy - {tt::CB::c_intermed1, 2, fp32_dest_acc_en ? tt::DataFormat::Float32 : data_format}, // sum(y * dy) - {tt::CB::c_intermed2, 1, fp32_dest_acc_en ? tt::DataFormat::Float32 : data_format}, // dy - sum + {tt::CBIndex::c_0, 2}, // y + {tt::CBIndex::c_1, 2}, // dy + {tt::CBIndex::c_16, 2}, // dx + {tt::CBIndex::c_24, 1, fp32_dest_acc_en ? 
tt::DataFormat::Float32 : data_format}, // y * dy + {tt::CBIndex::c_25, 2, fp32_dest_acc_en ? tt::DataFormat::Float32 : data_format}, // sum(y * dy) + {tt::CBIndex::c_26, 1, fp32_dest_acc_en ? tt::DataFormat::Float32 : data_format}, // dy - sum }); // create read/wrtie kernel diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/softmax_backward_h_large/softmax_backward_h_large.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/softmax_backward_h_large/softmax_backward_h_large.cpp index 1a6dc89cfc2..768100c5262 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/softmax_backward_h_large/softmax_backward_h_large.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/softmax_backward_h_large/softmax_backward_h_large.cpp @@ -51,15 +51,15 @@ MorehSoftmaxBackwardOperation::MorehSoftmaxBackwardHLargeFactory::create( all_cores, data_format, { - {tt::CB::c_in0, 2}, // output - {tt::CB::c_in1, 2}, // output_grad - {tt::CB::c_in2, 1}, // scaler - {tt::CB::c_in3, 1}, // mask - {tt::CB::c_out0, 2}, // input_grad - {tt::CB::c_intermed0, 1, fp32_dest_acc_en ? tt::DataFormat::Float32 : data_format}, // output * output_grad - {tt::CB::c_intermed1, 1, fp32_dest_acc_en ? tt::DataFormat::Float32 : data_format}, // reduce - {tt::CB::c_intermed2, 1, fp32_dest_acc_en ? tt::DataFormat::Float32 : data_format}, // dy - sum - {tt::CB::c_intermed3, + {tt::CBIndex::c_0, 2}, // output + {tt::CBIndex::c_1, 2}, // output_grad + {tt::CBIndex::c_2, 1}, // scaler + {tt::CBIndex::c_3, 1}, // mask + {tt::CBIndex::c_16, 2}, // input_grad + {tt::CBIndex::c_24, 1, fp32_dest_acc_en ? tt::DataFormat::Float32 : data_format}, // output * output_grad + {tt::CBIndex::c_25, 1, fp32_dest_acc_en ? tt::DataFormat::Float32 : data_format}, // reduce + {tt::CBIndex::c_26, 1, fp32_dest_acc_en ? tt::DataFormat::Float32 : data_format}, // dy - sum + {tt::CBIndex::c_27, 2, fp32_dest_acc_en ? 
tt::DataFormat::Float32 : data_format}, // add(output * output_grad) }); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/softmax_backward_h_small/softmax_backward_h_small.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/softmax_backward_h_small/softmax_backward_h_small.cpp index 42ccb904e02..a01c029e2b9 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/softmax_backward_h_small/softmax_backward_h_small.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/softmax_backward_h_small/softmax_backward_h_small.cpp @@ -51,16 +51,16 @@ MorehSoftmaxBackwardOperation::MorehSoftmaxBackwardHSmallFactory::create( all_cores, data_format, { - {tt::CB::c_in0, Ht}, // output - {tt::CB::c_in1, Ht}, // output_grad - {tt::CB::c_in2, 1}, // scaler - {tt::CB::c_in3, 1}, // mask - {tt::CB::c_out0, 2}, // input_grad - {tt::CB::c_intermed0, + {tt::CBIndex::c_0, Ht}, // output + {tt::CBIndex::c_1, Ht}, // output_grad + {tt::CBIndex::c_2, 1}, // scaler + {tt::CBIndex::c_3, 1}, // mask + {tt::CBIndex::c_16, 2}, // input_grad + {tt::CBIndex::c_24, Ht, fp32_dest_acc_en ? tt::DataFormat::Float32 : data_format}, // output * output_grad - {tt::CB::c_intermed1, 1, fp32_dest_acc_en ? tt::DataFormat::Float32 : data_format}, // reduce - {tt::CB::c_intermed2, 1, fp32_dest_acc_en ? tt::DataFormat::Float32 : data_format}, // dy - sum + {tt::CBIndex::c_25, 1, fp32_dest_acc_en ? tt::DataFormat::Float32 : data_format}, // reduce + {tt::CBIndex::c_26, 1, fp32_dest_acc_en ? 
tt::DataFormat::Float32 : data_format}, // dy - sum }); // create read/wrtie kernel diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/softmax_backward_w_large/softmax_backward_w_large.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/softmax_backward_w_large/softmax_backward_w_large.cpp index 8e60f1409e0..d695e55f25a 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/softmax_backward_w_large/softmax_backward_w_large.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/softmax_backward_w_large/softmax_backward_w_large.cpp @@ -51,15 +51,15 @@ MorehSoftmaxBackwardOperation::MorehSoftmaxBackwardWLargeFactory::create( all_cores, data_format, { - {tt::CB::c_in0, 2}, // output - {tt::CB::c_in1, 2}, // output_grad - {tt::CB::c_in2, 1}, // scaler - {tt::CB::c_in3, 1}, // mask - {tt::CB::c_out0, 2}, // input_grad - {tt::CB::c_intermed0, 1, fp32_dest_acc_en ? tt::DataFormat::Float32 : data_format}, // output * output_grad - {tt::CB::c_intermed1, 1, fp32_dest_acc_en ? tt::DataFormat::Float32 : data_format}, // reduce - {tt::CB::c_intermed2, 1, fp32_dest_acc_en ? tt::DataFormat::Float32 : data_format}, // dy - sum - {tt::CB::c_intermed3, + {tt::CBIndex::c_0, 2}, // output + {tt::CBIndex::c_1, 2}, // output_grad + {tt::CBIndex::c_2, 1}, // scaler + {tt::CBIndex::c_3, 1}, // mask + {tt::CBIndex::c_16, 2}, // input_grad + {tt::CBIndex::c_24, 1, fp32_dest_acc_en ? tt::DataFormat::Float32 : data_format}, // output * output_grad + {tt::CBIndex::c_25, 1, fp32_dest_acc_en ? tt::DataFormat::Float32 : data_format}, // reduce + {tt::CBIndex::c_26, 1, fp32_dest_acc_en ? tt::DataFormat::Float32 : data_format}, // dy - sum + {tt::CBIndex::c_27, 2, fp32_dest_acc_en ? 
tt::DataFormat::Float32 : data_format}, // add(output * output_grad) }); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/softmax_backward_w_small/softmax_backward_w_small.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/softmax_backward_w_small/softmax_backward_w_small.cpp index e6ceac634c4..109ad2b7c77 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/softmax_backward_w_small/softmax_backward_w_small.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_softmax_backward/device/softmax_backward_w_small/softmax_backward_w_small.cpp @@ -51,16 +51,16 @@ MorehSoftmaxBackwardOperation::MorehSoftmaxBackwardWSmallFactory::create( all_cores, data_format, { - {tt::CB::c_in0, Wt}, // output - {tt::CB::c_in1, Wt}, // output_grad - {tt::CB::c_in2, 1}, // scaler - {tt::CB::c_in3, 1}, // mask - {tt::CB::c_out0, 2}, // input_grad - {tt::CB::c_intermed0, + {tt::CBIndex::c_0, Wt}, // output + {tt::CBIndex::c_1, Wt}, // output_grad + {tt::CBIndex::c_2, 1}, // scaler + {tt::CBIndex::c_3, 1}, // mask + {tt::CBIndex::c_16, 2}, // input_grad + {tt::CBIndex::c_24, Wt, fp32_dest_acc_en ? tt::DataFormat::Float32 : data_format}, // output * output_grad - {tt::CB::c_intermed1, 1, fp32_dest_acc_en ? tt::DataFormat::Float32 : data_format}, // reduce - {tt::CB::c_intermed2, 1, fp32_dest_acc_en ? tt::DataFormat::Float32 : data_format}, // dy - sum + {tt::CBIndex::c_25, 1, fp32_dest_acc_en ? tt::DataFormat::Float32 : data_format}, // reduce + {tt::CBIndex::c_26, 1, fp32_dest_acc_en ? tt::DataFormat::Float32 : data_format}, // dy - sum }); // create read/wrtie kernel bool y_is_dram = output.buffer()->buffer_type() == tt::tt_metal::BufferType::DRAM ? 
1 : 0; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_int_sum_h_program_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_int_sum_h_program_factory.cpp index 87ae21d0141..459fd208896 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_int_sum_h_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_int_sum_h_program_factory.cpp @@ -85,10 +85,10 @@ MorehSumOperation::MorehSumHIntFactory::cached_program_t MorehSumOperation::More all_cores, cb_data_format, { - {tt::CB::c_in0, in0_t}, // input - {tt::CB::c_in1, in1_t}, // mask - {tt::CB::c_intermed0, intermed0_t}, // accumalated sum - {tt::CB::c_out0, out0_t}, // output + {tt::CBIndex::c_0, in0_t}, // input + {tt::CBIndex::c_1, in1_t}, // mask + {tt::CBIndex::c_24, intermed0_t}, // accumalated sum + {tt::CBIndex::c_16, out0_t}, // output }); //////////////////////////////////////////////////////////////////////////// // DataMovementKernel SetUp diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_int_sum_nc_program_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_int_sum_nc_program_factory.cpp index 0101f199565..2bd878f07cc 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_int_sum_nc_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_int_sum_nc_program_factory.cpp @@ -76,9 +76,9 @@ MorehSumOperation::MorehSumNCIntFactory::cached_program_t MorehSumOperation::Mor all_cores, cb_data_format, { - {tt::CB::c_in0, in0_t}, // input - {tt::CB::c_intermed0, intermed0_t}, // accumulated sum - {tt::CB::c_out0, out0_t}, // output + {tt::CBIndex::c_0, in0_t}, // input + {tt::CBIndex::c_24, intermed0_t}, // accumulated sum + {tt::CBIndex::c_16, out0_t}, // output }); std::vector reader_compile_time_args = {static_cast(is_dram(input))}; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_int_sum_w_program_factory.cpp 
b/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_int_sum_w_program_factory.cpp index eba4fb33fcc..ebd44c81f2f 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_int_sum_w_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_int_sum_w_program_factory.cpp @@ -87,10 +87,10 @@ MorehSumOperation::MorehSumWIntFactory::cached_program_t MorehSumOperation::More all_cores, cb_data_format, { - {tt::CB::c_in0, in0_t}, // input - {tt::CB::c_in1, in1_t}, // mask - {tt::CB::c_intermed0, intermed0_t}, // accumalated sum - {tt::CB::c_out0, out0_t}, // output + {tt::CBIndex::c_0, in0_t}, // input + {tt::CBIndex::c_1, in1_t}, // mask + {tt::CBIndex::c_24, intermed0_t}, // accumalated sum + {tt::CBIndex::c_16, out0_t}, // output }); //////////////////////////////////////////////////////////////////////////// // DataMovementKernel SetUp diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_h_impl_kernels/moreh_int_sum_h.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_h_impl_kernels/moreh_int_sum_h.cpp index a041bdba7cf..cd10aa79d7c 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_h_impl_kernels/moreh_int_sum_h.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_h_impl_kernels/moreh_int_sum_h.cpp @@ -11,10 +11,10 @@ void MAIN { constexpr uint32_t Ht = get_compile_time_arg_val(1); constexpr uint32_t origin_H = get_compile_time_arg_val(2); - auto cb_in0 = tt::CB::c_in0; - constexpr auto cb_mask_h = tt::CB::c_in1; - constexpr auto cb_out0 = tt::CB::c_out0; - constexpr auto cb_intermed0 = tt::CB::c_intermed0; + auto cb_in0 = tt::CBIndex::c_0; + constexpr auto cb_mask_h = tt::CBIndex::c_1; + constexpr auto cb_out0 = tt::CBIndex::c_16; + constexpr auto cb_intermed0 = tt::CBIndex::c_24; constexpr uint32_t TILE_H = 32; constexpr bool do_mask_h = (origin_H % TILE_H) != 0; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_h_impl_kernels/moreh_sum_h.cpp 
b/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_h_impl_kernels/moreh_sum_h.cpp index 1e89db118d4..3dcdc8b6aa5 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_h_impl_kernels/moreh_sum_h.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_h_impl_kernels/moreh_sum_h.cpp @@ -12,12 +12,12 @@ void MAIN { uint32_t NC = get_compile_time_arg_val(2); constexpr uint32_t origin_H = get_compile_time_arg_val(3); - auto cb_input = tt::CB::c_in0; - constexpr auto cb_scaler = tt::CB::c_in2; - constexpr auto cb_mask_h = tt::CB::c_in3; - constexpr auto cb_accum_dst = tt::CB::c_intermed0; - constexpr auto cb_masked_input = tt::CB::c_intermed1; - constexpr auto cb_out = tt::CB::c_out0; + auto cb_input = tt::CBIndex::c_0; + constexpr auto cb_scaler = tt::CBIndex::c_2; + constexpr auto cb_mask_h = tt::CBIndex::c_3; + constexpr auto cb_accum_dst = tt::CBIndex::c_24; + constexpr auto cb_masked_input = tt::CBIndex::c_25; + constexpr auto cb_out = tt::CBIndex::c_16; constexpr uint32_t TILE_H = 32; constexpr bool do_mask_h = (origin_H % TILE_H) != 0; @@ -38,7 +38,7 @@ void MAIN { // tiles are expected to be coming in in NCWH order (H-contiguous) // reducing in W means out[0][w] = sum(h=0..H-1, in[h][w]) // in this case we just sequentially add to accumulator all the H-tiles in a column - cb_input = tt::CB::c_in0; + cb_input = tt::CBIndex::c_0; bool is_h_single_tile = (Ht == 1); if (!is_h_single_tile) { tile_regs_acquire(); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_h_program_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_h_program_factory.cpp index 74fddec33e1..40f63124390 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_h_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_h_program_factory.cpp @@ -80,9 +80,9 @@ MorehSumOperation::MorehSumHFactory::cached_program_t MorehSumOperation::MorehSu string compute_kernel_name = 
"ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_h_impl_kernels/moreh_sum_h.cpp"; - uint32_t src0_cb_index = tt::CB::c_in0; + uint32_t src0_cb_index = tt::CBIndex::c_0; CBHandle cb_src0; - uint32_t src1_cb_index = tt::CB::c_in1; + uint32_t src1_cb_index = tt::CBIndex::c_1; CBHandle cb_src1 = 0; uint32_t num_input_tiles = 2; tt::tt_metal::CircularBufferConfig cb_src0_config = @@ -91,28 +91,28 @@ MorehSumOperation::MorehSumHFactory::cached_program_t MorehSumOperation::MorehSu .set_page_size(src0_cb_index, src0_single_tile_size); cb_src0 = tt::tt_metal::CreateCircularBuffer(program, all_cores, cb_src0_config); - uint32_t scaler_cb_index = tt::CB::c_in2; + uint32_t scaler_cb_index = tt::CBIndex::c_2; tt::tt_metal::CircularBufferConfig cb_scaler_config = tt::tt_metal::CircularBufferConfig(1 * scaler_single_tile_size, {{scaler_cb_index, scaler_cb_data_format}}) .set_page_size(scaler_cb_index, scaler_single_tile_size); auto cb_scaler = tt::tt_metal::CreateCircularBuffer(program, all_cores, cb_scaler_config); tt::tt_metal::CircularBufferConfig cb_mask_h_config = - tt::tt_metal::CircularBufferConfig(mask_h_single_tile_size, {{tt::CB::c_in3, mask_h_cb_data_format}}) - .set_page_size(tt::CB::c_in3, mask_h_single_tile_size); + tt::tt_metal::CircularBufferConfig(mask_h_single_tile_size, {{tt::CBIndex::c_3, mask_h_cb_data_format}}) + .set_page_size(tt::CBIndex::c_3, mask_h_single_tile_size); auto cb_mask_h = tt::tt_metal::CreateCircularBuffer(program, all_cores, cb_mask_h_config); tt::tt_metal::CircularBufferConfig cb_intermed0_config = - tt::tt_metal::CircularBufferConfig(intermed_single_tile_size, {{tt::CB::c_intermed0, intermed_cb_data_format}}) - .set_page_size(tt::CB::c_intermed0, intermed_single_tile_size); + tt::tt_metal::CircularBufferConfig(intermed_single_tile_size, {{tt::CBIndex::c_24, intermed_cb_data_format}}) + .set_page_size(tt::CBIndex::c_24, intermed_single_tile_size); auto cb_intermed0 = tt::tt_metal::CreateCircularBuffer(program, all_cores, 
cb_intermed0_config); tt::tt_metal::CircularBufferConfig cb_intermed1_config = - tt::tt_metal::CircularBufferConfig(intermed_single_tile_size, {{tt::CB::c_intermed1, intermed1_cb_data_format}}) - .set_page_size(tt::CB::c_intermed1, intermed_single_tile_size); + tt::tt_metal::CircularBufferConfig(intermed_single_tile_size, {{tt::CBIndex::c_25, intermed1_cb_data_format}}) + .set_page_size(tt::CBIndex::c_25, intermed_single_tile_size); auto cb_intermed1 = tt::tt_metal::CreateCircularBuffer(program, all_cores, cb_intermed1_config); - uint32_t output_cb_index = tt::CB::c_out0; // output operands start at index 16 + uint32_t output_cb_index = tt::CBIndex::c_16; // output operands start at index 16 CBHandle cb_output; uint32_t num_output_tiles = 2; tt::tt_metal::CircularBufferConfig cb_output_config = @@ -162,7 +162,7 @@ MorehSumOperation::MorehSumHFactory::cached_program_t MorehSumOperation::MorehSu std::vector unpack_to_dest_mode(NUM_CIRCULAR_BUFFERS, UnpackToDestMode::Default); if (fp32_dest_acc_en) { - unpack_to_dest_mode[tt::CB::c_intermed0] = UnpackToDestMode::UnpackToDestFp32; + unpack_to_dest_mode[tt::CBIndex::c_24] = UnpackToDestMode::UnpackToDestFp32; } auto reduce_compute_kernel_group_1_id = tt::tt_metal::CreateKernel( program, diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_nc_impl_kernels/moreh_int_sum_nc.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_nc_impl_kernels/moreh_int_sum_nc.cpp index 5f5e9a58b32..cc496e6891d 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_nc_impl_kernels/moreh_int_sum_nc.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_nc_impl_kernels/moreh_int_sum_nc.cpp @@ -11,9 +11,9 @@ void MAIN { constexpr uint32_t num_output_tiles = get_compile_time_arg_val(0); constexpr uint32_t num_input_tiles = get_compile_time_arg_val(1); - constexpr auto cb_in0 = tt::CB::c_in0; - constexpr auto cb_out0 = tt::CB::c_out0; - constexpr auto cb_intermed0 = tt::CB::c_intermed0; + 
constexpr auto cb_in0 = tt::CBIndex::c_0; + constexpr auto cb_out0 = tt::CBIndex::c_16; + constexpr auto cb_intermed0 = tt::CBIndex::c_24; constexpr int onetile = 1; constexpr int idx0 = 0; constexpr int dst0 = 0; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_nc_impl_kernels/moreh_sum_nc.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_nc_impl_kernels/moreh_sum_nc.cpp index b4a35226e8c..9af0e3630ec 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_nc_impl_kernels/moreh_sum_nc.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_nc_impl_kernels/moreh_sum_nc.cpp @@ -10,9 +10,9 @@ void MAIN { constexpr uint32_t num_output_tiles = get_compile_time_arg_val(0); constexpr uint32_t num_input_tiles = get_compile_time_arg_val(1); - constexpr auto cb_in0 = tt::CB::c_in0; - constexpr auto cb_in1 = tt::CB::c_in1; - constexpr auto cb_out0 = tt::CB::c_out0; + constexpr auto cb_in0 = tt::CBIndex::c_0; + constexpr auto cb_in1 = tt::CBIndex::c_1; + constexpr auto cb_out0 = tt::CBIndex::c_16; constexpr uint32_t onetile = 1; constexpr uint32_t dst0 = 0; constexpr uint32_t dst1 = 1; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_nc_impl_kernels/moreh_sum_nc_gs.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_nc_impl_kernels/moreh_sum_nc_gs.cpp index 9de6be93814..5d0973385de 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_nc_impl_kernels/moreh_sum_nc_gs.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_nc_impl_kernels/moreh_sum_nc_gs.cpp @@ -10,10 +10,10 @@ void MAIN { constexpr uint32_t num_output_tiles = get_compile_time_arg_val(0); constexpr uint32_t num_input_tiles = get_compile_time_arg_val(1); - constexpr auto cb_in0 = tt::CB::c_in0; - constexpr auto cb_in1 = tt::CB::c_in1; - constexpr auto cb_out0 = tt::CB::c_out0; - constexpr auto cb_intermed0 = tt::CB::c_intermed0; + constexpr auto cb_in0 = tt::CBIndex::c_0; + constexpr auto 
cb_in1 = tt::CBIndex::c_1; + constexpr auto cb_out0 = tt::CBIndex::c_16; + constexpr auto cb_intermed0 = tt::CBIndex::c_24; constexpr uint32_t onetile = 1; constexpr uint32_t dst0 = 0; constexpr uint32_t dst1 = 1; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_nc_program_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_nc_program_factory.cpp index a2afebfabb7..263368cb2d9 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_nc_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_nc_program_factory.cpp @@ -74,10 +74,10 @@ MorehSumOperation::MorehSumNCFactory::cached_program_t MorehSumOperation::MorehS all_cores, cb_data_format, { - {tt::CB::c_in0, in0_t}, // input - {tt::CB::c_in1, in1_t}, // zero - {tt::CB::c_intermed0, intermed0_t, (fp32_dest_acc_en) ? tt::DataFormat::Float32 : cb_data_format}, - {tt::CB::c_out0, out0_t}, // output + {tt::CBIndex::c_0, in0_t}, // input + {tt::CBIndex::c_1, in1_t}, // zero + {tt::CBIndex::c_24, intermed0_t, (fp32_dest_acc_en) ? 
tt::DataFormat::Float32 : cb_data_format}, + {tt::CBIndex::c_16, out0_t}, // output }); //////////////////////////////////////////////////////////////////////////// // DataMovementKernel SetUp diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_w_impl_kernels/moreh_int_sum_w.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_w_impl_kernels/moreh_int_sum_w.cpp index 2795b5676cc..03ccff8088c 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_w_impl_kernels/moreh_int_sum_w.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_w_impl_kernels/moreh_int_sum_w.cpp @@ -12,10 +12,10 @@ void MAIN { constexpr uint32_t Wt = get_compile_time_arg_val(1); constexpr uint32_t origin_W = get_compile_time_arg_val(2); - constexpr auto cb_in0 = tt::CB::c_in0; - constexpr auto cb_mask_w = tt::CB::c_in1; - constexpr auto cb_intermed0 = tt::CB::c_intermed0; - constexpr auto cb_out0 = tt::CB::c_out0; + constexpr auto cb_in0 = tt::CBIndex::c_0; + constexpr auto cb_mask_w = tt::CBIndex::c_1; + constexpr auto cb_intermed0 = tt::CBIndex::c_24; + constexpr auto cb_out0 = tt::CBIndex::c_16; constexpr uint32_t TILE_W = 32; constexpr bool do_mask_w = (origin_W % TILE_W) != 0; constexpr int onetile = 1; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_w_impl_kernels/moreh_sum_w.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_w_impl_kernels/moreh_sum_w.cpp index 7a1be1b2a95..7ea6b595229 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_w_impl_kernels/moreh_sum_w.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_w_impl_kernels/moreh_sum_w.cpp @@ -12,12 +12,12 @@ void MAIN { uint32_t NC = get_compile_time_arg_val(2); constexpr uint32_t origin_W = get_compile_time_arg_val(3); - auto cb_input = tt::CB::c_in0; - constexpr auto cb_scaler = tt::CB::c_in2; - constexpr auto cb_mask_w = tt::CB::c_in3; - constexpr auto cb_accum_dst = tt::CB::c_intermed0; - constexpr 
auto cb_masked_input = tt::CB::c_intermed1; - constexpr auto cb_out = tt::CB::c_out0; + auto cb_input = tt::CBIndex::c_0; + constexpr auto cb_scaler = tt::CBIndex::c_2; + constexpr auto cb_mask_w = tt::CBIndex::c_3; + constexpr auto cb_accum_dst = tt::CBIndex::c_24; + constexpr auto cb_masked_input = tt::CBIndex::c_25; + constexpr auto cb_out = tt::CBIndex::c_16; constexpr uint32_t TILE_W = 32; constexpr bool do_mask_w = (origin_W % TILE_W) != 0; @@ -38,7 +38,7 @@ void MAIN { // tiles are expected to be coming in in NCHW order (W-contiguous) // reducing in W means out[h][0] = sum(w=0..W-1, in[h][w]) // in this case we just sequentially add to accumulator all the W-tiles in a row - cb_input = tt::CB::c_in0; + cb_input = tt::CBIndex::c_0; bool is_w_single_tile = (Wt == 1); if (!is_w_single_tile) { tile_regs_acquire(); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_w_program_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_w_program_factory.cpp index d6a63e5754b..5c9c0820700 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_w_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_sum/device/moreh_sum_w_program_factory.cpp @@ -90,26 +90,26 @@ MorehSumOperation::MorehSumWFactory::cached_program_t MorehSumOperation::MorehSu tt::tt_metal::CircularBufferConfig cb_scaler_config = tt::tt_metal::CircularBufferConfig( - num_input_tiles * scaler_single_tile_size, {{tt::CB::c_in2, scaler_cb_data_format}}) - .set_page_size(tt::CB::c_in2, scaler_single_tile_size); + num_input_tiles * scaler_single_tile_size, {{tt::CBIndex::c_2, scaler_cb_data_format}}) + .set_page_size(tt::CBIndex::c_2, scaler_single_tile_size); auto cb_scaler = tt::tt_metal::CreateCircularBuffer(program, all_cores, cb_scaler_config); tt::tt_metal::CircularBufferConfig cb_mask_w_config = - tt::tt_metal::CircularBufferConfig(mask_w_single_tile_size, {{tt::CB::c_in3, mask_w_cb_data_format}}) - .set_page_size(tt::CB::c_in3, 
mask_w_single_tile_size); + tt::tt_metal::CircularBufferConfig(mask_w_single_tile_size, {{tt::CBIndex::c_3, mask_w_cb_data_format}}) + .set_page_size(tt::CBIndex::c_3, mask_w_single_tile_size); auto cb_mask_w = tt::tt_metal::CreateCircularBuffer(program, all_cores, cb_mask_w_config); tt::tt_metal::CircularBufferConfig cb_intermed0_config = - tt::tt_metal::CircularBufferConfig(intermed_single_tile_size, {{tt::CB::c_intermed0, intermed_cb_data_format}}) - .set_page_size(tt::CB::c_intermed0, intermed_single_tile_size); + tt::tt_metal::CircularBufferConfig(intermed_single_tile_size, {{tt::CBIndex::c_24, intermed_cb_data_format}}) + .set_page_size(tt::CBIndex::c_24, intermed_single_tile_size); auto cb_intermed0 = tt::tt_metal::CreateCircularBuffer(program, all_cores, cb_intermed0_config); tt::tt_metal::CircularBufferConfig cb_intermed1_config = - tt::tt_metal::CircularBufferConfig(intermed_single_tile_size, {{tt::CB::c_intermed1, intermed1_cb_data_format}}) - .set_page_size(tt::CB::c_intermed1, intermed_single_tile_size); + tt::tt_metal::CircularBufferConfig(intermed_single_tile_size, {{tt::CBIndex::c_25, intermed1_cb_data_format}}) + .set_page_size(tt::CBIndex::c_25, intermed_single_tile_size); auto cb_intermed1 = tt::tt_metal::CreateCircularBuffer(program, all_cores, cb_intermed1_config); - uint32_t output_cb_index = 16; // output operands start at index 16 + uint32_t output_cb_index = tt::CBIndex::c_16; uint32_t num_output_tiles = 2; tt::tt_metal::CircularBufferConfig cb_output_config = tt::tt_metal::CircularBufferConfig( @@ -155,7 +155,7 @@ MorehSumOperation::MorehSumWFactory::cached_program_t MorehSumOperation::MorehSu std::vector unpack_to_dest_mode(NUM_CIRCULAR_BUFFERS, UnpackToDestMode::Default); if (fp32_dest_acc_en) { - unpack_to_dest_mode[tt::CB::c_intermed0] = UnpackToDestMode::UnpackToDestFp32; + unpack_to_dest_mode[tt::CBIndex::c_24] = UnpackToDestMode::UnpackToDestFp32; } auto reduce_compute_kernel_group_1_id = tt::tt_metal::CreateKernel( program, diff 
--git a/ttnn/cpp/ttnn/operations/moreh/moreh_sum_backward/device/kernels/moreh_sum_backward.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_sum_backward/device/kernels/moreh_sum_backward.cpp index 22439434af7..b0d82e5f833 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_sum_backward/device/kernels/moreh_sum_backward.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_sum_backward/device/kernels/moreh_sum_backward.cpp @@ -10,9 +10,9 @@ void MAIN { constexpr bool wt_need_bcast = (get_compile_time_arg_val(1) == 1); constexpr bool ht_need_bcast = (get_compile_time_arg_val(2) == 1); - constexpr auto cb_in0 = tt::CB::c_in0; // input - constexpr auto cb_in1 = tt::CB::c_in1; // zero tile - constexpr auto cb_out0 = tt::CB::c_out0; + constexpr auto cb_in0 = tt::CBIndex::c_0; // input + constexpr auto cb_in1 = tt::CBIndex::c_1; // zero tile + constexpr auto cb_out0 = tt::CBIndex::c_16; constexpr uint32_t onetile = 1; constexpr uint32_t dst0 = 0; diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_sum_backward/device/kernels/writer_moreh_sum_backward.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_sum_backward/device/kernels/writer_moreh_sum_backward.cpp index 1481598985e..cdd7fbce66e 100644 --- a/ttnn/cpp/ttnn/operations/moreh/moreh_sum_backward/device/kernels/writer_moreh_sum_backward.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_sum_backward/device/kernels/writer_moreh_sum_backward.cpp @@ -14,7 +14,7 @@ void kernel_main() { const auto num_tiles = arg_fetcher.get_next_arg_val(); const auto start_id = arg_fetcher.get_next_arg_val(); - constexpr uint32_t cb_id_out = tt::CB::c_out0; + constexpr uint32_t cb_id_out = tt::CBIndex::c_16; constexpr uint32_t onetile = 1; uint32_t input_grad_tile_bytes = get_tile_size(cb_id_out); diff --git a/ttnn/cpp/ttnn/operations/moreh/moreh_sum_backward/device/moreh_sum_backward_program_factory.cpp b/ttnn/cpp/ttnn/operations/moreh/moreh_sum_backward/device/moreh_sum_backward_program_factory.cpp index 5ad68986a50..bdee0939c4d 100644 --- 
a/ttnn/cpp/ttnn/operations/moreh/moreh_sum_backward/device/moreh_sum_backward_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/moreh/moreh_sum_backward/device/moreh_sum_backward_program_factory.cpp @@ -134,9 +134,9 @@ MorehSumBackwardOperation::ProgramFactory::cached_program_t MorehSumBackwardOper all_cores, cb_data_format, { - {tt::CB::c_in0, 2}, // input - {tt::CB::c_in1, 1}, // zero - {tt::CB::c_out0, 2}, // output + {tt::CBIndex::c_0, 2}, // input + {tt::CBIndex::c_1, 1}, // zero + {tt::CBIndex::c_16, 2}, // output }); //////////////////////////////////////////////////////////////////////////// diff --git a/ttnn/cpp/ttnn/operations/normalization/groupnorm/device/groupnorm_op.cpp b/ttnn/cpp/ttnn/operations/normalization/groupnorm/device/groupnorm_op.cpp index 898d3d5babc..61e022bca16 100644 --- a/ttnn/cpp/ttnn/operations/normalization/groupnorm/device/groupnorm_op.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/groupnorm/device/groupnorm_op.cpp @@ -13,6 +13,7 @@ using namespace tt::constants; +using namespace tt::tt_metal; namespace ttnn::operations::normalization { diff --git a/ttnn/cpp/ttnn/operations/normalization/groupnorm/device/kernels/compute/groupnorm_sharded_v2.cpp b/ttnn/cpp/ttnn/operations/normalization/groupnorm/device/kernels/compute/groupnorm_sharded_v2.cpp index fe8006441c9..a7882e436f3 100644 --- a/ttnn/cpp/ttnn/operations/normalization/groupnorm/device/kernels/compute/groupnorm_sharded_v2.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/groupnorm/device/kernels/compute/groupnorm_sharded_v2.cpp @@ -69,34 +69,34 @@ void MAIN { constexpr uint32_t scaler0 = 0; // input cbs - constexpr uint32_t cb_in0 = tt::CB::c_in0; - constexpr uint32_t cb_in = tt::CB::c_intermed5; - constexpr uint32_t cb_scaler = tt::CB::c_in2; - constexpr uint32_t cb_scaler_global = tt::CB::c_in4; - constexpr uint32_t cb_eps = tt::CB::c_in3; - constexpr uint32_t cb_gamma = tt::CB::c_in5; - constexpr uint32_t cb_beta = tt::CB::c_in6; - constexpr uint32_t cb_input_mask = 
tt::CB::c_intermed4; + constexpr uint32_t cb_in0 = tt::CBIndex::c_0; + constexpr uint32_t cb_in = tt::CBIndex::c_29; + constexpr uint32_t cb_scaler = tt::CBIndex::c_2; + constexpr uint32_t cb_scaler_global = tt::CBIndex::c_4; + constexpr uint32_t cb_eps = tt::CBIndex::c_3; + constexpr uint32_t cb_gamma = tt::CBIndex::c_5; + constexpr uint32_t cb_beta = tt::CBIndex::c_6; + constexpr uint32_t cb_input_mask = tt::CBIndex::c_28; // interm cbs - constexpr uint32_t cb_repack = tt::CB::c_intermed2; - constexpr uint32_t cb_repack_out = tt::CB::c_intermed7; - constexpr uint32_t cb_x = tt::CB::c_intermed0; - constexpr uint32_t cb_xmm = tt::CB::c_intermed1; - constexpr uint32_t cb_ex_partial = tt::CB::dataflow0; - constexpr uint32_t cb_ex = tt::CB::dataflow1; - constexpr uint32_t cb_ex_external = tt::CB::dataflow2; - constexpr uint32_t cb_ex_global = num_cores_per_mcast_group == 1 ? cb_ex_partial : tt::CB::dataflow7; - constexpr uint32_t cb_ex2pe = tt::CB::c_intermed3; + constexpr uint32_t cb_repack = tt::CBIndex::c_26; + constexpr uint32_t cb_repack_out = tt::CBIndex::c_31; + constexpr uint32_t cb_x = tt::CBIndex::c_24; + constexpr uint32_t cb_xmm = tt::CBIndex::c_25; + constexpr uint32_t cb_ex_partial = tt::CBIndex::c_8; + constexpr uint32_t cb_ex = tt::CBIndex::c_9; + constexpr uint32_t cb_ex_external = tt::CBIndex::c_10; + constexpr uint32_t cb_ex_global = num_cores_per_mcast_group == 1 ? cb_ex_partial : tt::CBIndex::c_15; + constexpr uint32_t cb_ex2pe = tt::CBIndex::c_27; // interm cbs reuse constexpr uint32_t cb_fusion = cb_xmm; constexpr uint32_t cb_xmm2 = cb_x; // output cb - constexpr uint32_t cb_out0 = tt::CB::c_out0; + constexpr uint32_t cb_out0 = tt::CBIndex::c_16; #ifdef UNTILIZE_OUT - constexpr uint32_t cb_out = tt::CB::c_intermed6; + constexpr uint32_t cb_out = tt::CBIndex::c_30; #else constexpr uint32_t cb_out = (do_gamma or do_beta) ? (((do_gamma and not do_beta) or (not do_gamma and do_beta)) ? 
cb_in : cb_out0) : cb_out0; #endif diff --git a/ttnn/cpp/ttnn/operations/normalization/groupnorm/device/kernels/dataflow/reader_mcast_receiver_unary_sharded_gn_v2.cpp b/ttnn/cpp/ttnn/operations/normalization/groupnorm/device/kernels/dataflow/reader_mcast_receiver_unary_sharded_gn_v2.cpp index 288c710d691..463a2a8bb31 100644 --- a/ttnn/cpp/ttnn/operations/normalization/groupnorm/device/kernels/dataflow/reader_mcast_receiver_unary_sharded_gn_v2.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/groupnorm/device/kernels/dataflow/reader_mcast_receiver_unary_sharded_gn_v2.cpp @@ -25,13 +25,13 @@ void kernel_main() { const uint32_t mcast_sender_noc_x = get_arg_val(0); const uint32_t mcast_sender_noc_y = get_arg_val(1); - constexpr uint32_t cb_ex_partial = tt::CB::dataflow0; // E[x] partial reduce - constexpr uint32_t cb_ex = tt::CB::dataflow1; // E[x] partial reduce - constexpr uint32_t cb_ex_global = tt::CB::dataflow7; // E[x] global reduce - constexpr uint32_t cb_in0 = tt::CB::c_in0; // sharded cb - constexpr uint32_t cb_repack = tt::CB::c_intermed2; - constexpr uint32_t cb_repack_out = tt::CB::c_intermed7; - constexpr uint32_t cb_out0 = tt::CB::c_out0; + constexpr uint32_t cb_ex_partial = tt::CBIndex::c_8; // E[x] partial reduce + constexpr uint32_t cb_ex = tt::CBIndex::c_9; // E[x] partial reduce + constexpr uint32_t cb_ex_global = tt::CBIndex::c_15; // E[x] global reduce + constexpr uint32_t cb_in0 = tt::CBIndex::c_0; // sharded cb + constexpr uint32_t cb_repack = tt::CBIndex::c_26; + constexpr uint32_t cb_repack_out = tt::CBIndex::c_31; + constexpr uint32_t cb_out0 = tt::CBIndex::c_16; const uint32_t single_tile_size_bytes = get_tile_size(cb_ex_partial); // tile size const DataFormat data_format = get_dataformat(cb_ex_partial); // data format diff --git a/ttnn/cpp/ttnn/operations/normalization/groupnorm/device/kernels/dataflow/reader_mcast_sender_unary_sharded_gn_v2.cpp 
b/ttnn/cpp/ttnn/operations/normalization/groupnorm/device/kernels/dataflow/reader_mcast_sender_unary_sharded_gn_v2.cpp index 3430f87e373..2cecc718815 100644 --- a/ttnn/cpp/ttnn/operations/normalization/groupnorm/device/kernels/dataflow/reader_mcast_sender_unary_sharded_gn_v2.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/groupnorm/device/kernels/dataflow/reader_mcast_sender_unary_sharded_gn_v2.cpp @@ -148,13 +148,13 @@ void kernel_main() { *reduce_sender_semaphore_addr_ptr = VALID; volatile tt_l1_ptr uint32_t* reduce_receiver_semaphore_addr_ptr = reinterpret_cast(reduce_receiver_semaphore_addr); - constexpr uint32_t cb_ex_partial = tt::CB::dataflow0; - constexpr uint32_t cb_ex = tt::CB::dataflow1; - constexpr uint32_t cb_ex_external = tt::CB::dataflow2; - constexpr uint32_t cb_in0 = tt::CB::c_in0; // sharded cb - constexpr uint32_t cb_repack = tt::CB::c_intermed2; - constexpr uint32_t cb_repack_out = tt::CB::c_intermed7; - constexpr uint32_t cb_out0 = tt::CB::c_out0; + constexpr uint32_t cb_ex_partial = tt::CBIndex::c_8; + constexpr uint32_t cb_ex = tt::CBIndex::c_9; + constexpr uint32_t cb_ex_external = tt::CBIndex::c_10; + constexpr uint32_t cb_in0 = tt::CBIndex::c_0; // sharded cb + constexpr uint32_t cb_repack = tt::CBIndex::c_26; + constexpr uint32_t cb_repack_out = tt::CBIndex::c_31; + constexpr uint32_t cb_out0 = tt::CBIndex::c_16; const uint32_t single_tile_size_bytes = get_tile_size(cb_ex_partial); const DataFormat data_format = get_dataformat(cb_ex_partial); diff --git a/ttnn/cpp/ttnn/operations/normalization/groupnorm/device/kernels/dataflow/writer_unary_sharded_gn_rm_gb_v2.cpp b/ttnn/cpp/ttnn/operations/normalization/groupnorm/device/kernels/dataflow/writer_unary_sharded_gn_rm_gb_v2.cpp index 362d7751285..3a2772c7901 100644 --- a/ttnn/cpp/ttnn/operations/normalization/groupnorm/device/kernels/dataflow/writer_unary_sharded_gn_rm_gb_v2.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/groupnorm/device/kernels/dataflow/writer_unary_sharded_gn_rm_gb_v2.cpp 
@@ -43,10 +43,10 @@ void kernel_main() { const uint32_t beta_tile_start_id = get_arg_val(7); const uint32_t input_mask_tile_start_id = get_arg_val(8); - constexpr uint32_t cb_gamma = tt::CB::c_in5; - constexpr uint32_t cb_beta = tt::CB::c_in6; - constexpr uint32_t cb_out0 = tt::CB::c_out0; - constexpr uint32_t cb_input_mask = tt::CB::c_intermed4; + constexpr uint32_t cb_gamma = tt::CBIndex::c_5; + constexpr uint32_t cb_beta = tt::CBIndex::c_6; + constexpr uint32_t cb_out0 = tt::CBIndex::c_16; + constexpr uint32_t cb_input_mask = tt::CBIndex::c_28; // constexpr uint32_t block_w = 4; const uint32_t single_tile_size_bytes = get_tile_size(cb_gamma); @@ -74,18 +74,18 @@ void kernel_main() { cb_push_back(cb_input_mask, block_w); if (i == 0 and b == 0) { - constexpr uint32_t cb_in_2 = tt::CB::c_in2; + constexpr uint32_t cb_in_2 = tt::CBIndex::c_2; const uint32_t scalar_w = get_arg_val(1); generate_reduce_scaler(cb_in_2, scalar_w); if constexpr(is_mcast_sender) { - constexpr uint32_t cb_in_4 = tt::CB::c_in4; + constexpr uint32_t cb_in_4 = tt::CBIndex::c_4; const uint32_t scalar_c = get_arg_val(0); generate_reduce_scaler(cb_in_4, scalar_c); } - constexpr uint32_t eps_cb_id = tt::CB::c_in3; + constexpr uint32_t eps_cb_id = tt::CBIndex::c_3; const uint32_t eps = get_arg_val(2); generate_bcast_col_scalar(eps_cb_id, eps); diff --git a/ttnn/cpp/ttnn/operations/normalization/groupnorm/device/multi_core/groupnorm_op_multi_core.cpp b/ttnn/cpp/ttnn/operations/normalization/groupnorm/device/multi_core/groupnorm_op_multi_core.cpp index 1c5758279f9..ea9c14c52c9 100644 --- a/ttnn/cpp/ttnn/operations/normalization/groupnorm/device/multi_core/groupnorm_op_multi_core.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/groupnorm/device/multi_core/groupnorm_op_multi_core.cpp @@ -15,6 +15,7 @@ using uint32_t = std::uint32_t; using namespace tt::constants; +using namespace tt::tt_metal; namespace ttnn::operations::normalization { @@ -646,8 +647,8 @@ operation::ProgramWithCallbacks 
groupnorm_multi_core_sharded( tt::tt_metal::ComputeConfig{.math_fidelity = fidelity, .fp32_dest_acc_en = fp32_dest_acc_en, .math_approx_mode = math_approx_mode, .compile_args = mcast_receiver_compute_compile_time_args, .defines = eltwise_binary_defines} ); // Create circular buffers - uint32_t in0_cb_index = tt::CB::c_in0; - uint32_t output_cb_index = tt::CB::c_out0; + uint32_t in0_cb_index = tt::CBIndex::c_0; + uint32_t output_cb_index = tt::CBIndex::c_16; CBHandle cb_in0; CBHandle cb_output; if (inplace) { @@ -673,56 +674,56 @@ operation::ProgramWithCallbacks groupnorm_multi_core_sharded( } // in - stores tilized input - uint32_t in_cb_index = tt::CB::c_intermed5; + uint32_t in_cb_index = tt::CBIndex::c_29; tt::tt_metal::CircularBufferConfig in_cb_config = tt::tt_metal::CircularBufferConfig(in_CB_size, {{in_cb_index, in_data_format}}) .set_page_size(in_cb_index, in_single_tile_size); auto cb_in = tt::tt_metal::CreateCircularBuffer(program, all_cores, in_cb_config); // out - stores tilized output if (untilize_out) { - uint32_t out_cb_index = tt::CB::c_intermed6; + uint32_t out_cb_index = tt::CBIndex::c_30; tt::tt_metal::CircularBufferConfig out_cb_config = tt::tt_metal::CircularBufferConfig(in_CB_size, {{out_cb_index, in_data_format}}) .set_page_size(out_cb_index, in_single_tile_size); auto cb_out = tt::tt_metal::CreateCircularBuffer(program, all_cores, out_cb_config); } // in2 scaler - for partial Ex - uint32_t in2_cb_index = tt::CB::c_in2; + uint32_t in2_cb_index = tt::CBIndex::c_2; tt::tt_metal::CircularBufferConfig in2_cb_config = tt::tt_metal::CircularBufferConfig(in2_CB_size, {{in2_cb_index, cb_data_format}}) .set_page_size(in2_cb_index, single_tile_size); auto cb_in2 = tt::tt_metal::CreateCircularBuffer(program, all_cores, in2_cb_config); // in3 eps - uint32_t in3_cb_index = tt::CB::c_in3; + uint32_t in3_cb_index = tt::CBIndex::c_3; tt::tt_metal::CircularBufferConfig in3_cb_config = tt::tt_metal::CircularBufferConfig(in3_CB_size, {{in3_cb_index, 
cb_data_format}}) .set_page_size(in3_cb_index, single_tile_size); auto cb_in3 = tt::tt_metal::CreateCircularBuffer(program, all_cores, in3_cb_config); // in4 scaler-c - uint32_t in4_cb_index = tt::CB::c_in4; + uint32_t in4_cb_index = tt::CBIndex::c_4; tt::tt_metal::CircularBufferConfig in4_cb_config = tt::tt_metal::CircularBufferConfig(in2_CB_size, {{in4_cb_index, cb_data_format}}) .set_page_size(in4_cb_index, single_tile_size); auto cb_in4 = tt::tt_metal::CreateCircularBuffer(program, all_cores, in4_cb_config); // gamma if (gamma.has_value()) { - uint32_t in5_cb_index = tt::CB::c_in5; + uint32_t in5_cb_index = tt::CBIndex::c_5; tt::tt_metal::CircularBufferConfig in5_cb_config = tt::tt_metal::CircularBufferConfig(in5_CB_size, {{in5_cb_index, gamma_beta_cb_data_format}}) .set_page_size(in5_cb_index, gamma_beta_single_tile_size); auto cb_in5 = tt::tt_metal::CreateCircularBuffer(program, all_cores, in5_cb_config); } // beta if (beta.has_value()) { - uint32_t in6_cb_index = tt::CB::c_in6; + uint32_t in6_cb_index = tt::CBIndex::c_6; tt::tt_metal::CircularBufferConfig in6_cb_config = tt::tt_metal::CircularBufferConfig(in6_CB_size, {{in6_cb_index, gamma_beta_cb_data_format}}) .set_page_size(in6_cb_index, gamma_beta_single_tile_size); auto cb_in6 = tt::tt_metal::CreateCircularBuffer(program, all_cores, in6_cb_config); } // input mask if (input_mask.has_value()) { - uint32_t in_mask_cb_index = tt::CB::c_intermed4; + uint32_t in_mask_cb_index = tt::CBIndex::c_28; tt::tt_metal::CircularBufferConfig in_mask_cb_config = tt::tt_metal::CircularBufferConfig(in_mask_CB_size, {{in_mask_cb_index, in_mask_cb_data_format}}) .set_page_size(in_mask_cb_index, in_mask_single_tile_size); auto cb_inz = tt::tt_metal::CreateCircularBuffer(program, all_cores, in_mask_cb_config); } if (reader_repack_output) { - uint32_t repack_cb_index = tt::CB::c_intermed2; - uint32_t repack_out_cb_index = tt::CB::c_intermed7; + uint32_t repack_cb_index = tt::CBIndex::c_26; + uint32_t repack_out_cb_index = 
tt::CBIndex::c_31; std::map in0_out0_cb_data_format_spec { {repack_cb_index, in_data_format}, {repack_out_cb_index, in_data_format} @@ -733,28 +734,28 @@ operation::ProgramWithCallbacks groupnorm_multi_core_sharded( auto cb_inz = tt::tt_metal::CreateCircularBuffer(program, all_cores, repack_cb_config); } // x - uint32_t x_cb_index = tt::CB::c_intermed0; + uint32_t x_cb_index = tt::CBIndex::c_24; tt::tt_metal::CircularBufferConfig x_cb_config = tt::tt_metal::CircularBufferConfig(x_CB_size, {{x_cb_index, cb_data_format}}) .set_page_size(x_cb_index, single_tile_size); auto cb_x = tt::tt_metal::CreateCircularBuffer(program, all_cores, x_cb_config); // xmm - uint32_t xmm_cb_index = tt::CB::c_intermed1; + uint32_t xmm_cb_index = tt::CBIndex::c_25; tt::tt_metal::CircularBufferConfig xmm_cb_config = tt::tt_metal::CircularBufferConfig(xmm_CB_size, {{xmm_cb_index, cb_data_format}}) .set_page_size(xmm_cb_index, single_tile_size); auto cb_xmm = tt::tt_metal::CreateCircularBuffer(program, all_cores, xmm_cb_config); // ex_partial - uint32_t ex_cb_partial_index = tt::CB::dataflow0; + uint32_t ex_cb_partial_index = tt::CBIndex::c_8; tt::tt_metal::CircularBufferConfig ex_cb_partial_config = tt::tt_metal::CircularBufferConfig(ex_partial_CB_size, {{ex_cb_partial_index, cb_data_format}}) .set_page_size(ex_cb_partial_index, single_tile_size); auto cb_ex_partial = tt::tt_metal::CreateCircularBuffer(program, all_cores, ex_cb_partial_config); // ex_external - uint32_t ex_cb_external_index = tt::CB::dataflow2; + uint32_t ex_cb_external_index = tt::CBIndex::c_10; tt::tt_metal::CircularBufferConfig ex_cb_external_config = tt::tt_metal::CircularBufferConfig(single_tile_size * num_cores_per_mcast_group, {{ex_cb_external_index, cb_data_format}}) .set_page_size(ex_cb_external_index, single_tile_size); auto cb_ex_external = tt::tt_metal::CreateCircularBuffer(program, all_cores, ex_cb_external_config); // ex_global - uint32_t ex_cb_index = tt::CB::dataflow1; - uint32_t ex_global_cb_index = 
tt::CB::dataflow7; + uint32_t ex_cb_index = tt::CBIndex::c_9; + uint32_t ex_global_cb_index = tt::CBIndex::c_15; std::map ex_global_cb_data_format_spec { {ex_global_cb_index, cb_data_format}, {ex_cb_index, cb_data_format} @@ -765,7 +766,7 @@ operation::ProgramWithCallbacks groupnorm_multi_core_sharded( auto cb_ex_global = tt::tt_metal::CreateCircularBuffer(program, all_cores, ex_global_cb_config); // ex2pe uint32_t cb_ex2pe_index; - cb_ex2pe_index = tt::CB::c_intermed3; + cb_ex2pe_index = tt::CBIndex::c_27; tt::tt_metal::CircularBufferConfig ex2pe_cb_config = tt::tt_metal::CircularBufferConfig(ex2pe_CB_size, {{cb_ex2pe_index, cb_data_format}}) .set_page_size(cb_ex2pe_index, single_tile_size); auto cb_ex2pe = tt::tt_metal::CreateCircularBuffer(program, all_cores, ex2pe_cb_config); diff --git a/ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp b/ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp index c73ff75b0b7..221935a20d3 100644 --- a/ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp @@ -34,29 +34,29 @@ void MAIN { // TODO(AP): check that if DST is indeed zeroed by release_dst (and initially), we can use it as zeroes // Note that the entire W dimension must fit in the intermed0 CB for this kernel to be correct - constexpr auto cb_scaler = tt::CB::c_in2; // single tile generated by the reader - constexpr auto cb_eps = tt::CB::c_in3; // single tile generated by the reader - constexpr auto cb_in = tt::CB::c_in0; // input x or a for fused pre-add (x=a+b) - constexpr auto cb_inb = tt::CB::c_in1; // input b for fused pre-add - constexpr auto cb_out = tt::CB::c_out0; // output - constexpr auto cb_gamma = tt::CB::c_in5; - constexpr auto cb_beta = tt::CB::c_in6; + constexpr auto cb_scaler = tt::CBIndex::c_2; // single tile generated by the reader + constexpr auto cb_eps = 
tt::CBIndex::c_3; // single tile generated by the reader + constexpr auto cb_in = tt::CBIndex::c_0; // input x or a for fused pre-add (x=a+b) + constexpr auto cb_inb = tt::CBIndex::c_1; // input b for fused pre-add + constexpr auto cb_out = tt::CBIndex::c_16; // output + constexpr auto cb_gamma = tt::CBIndex::c_5; + constexpr auto cb_beta = tt::CBIndex::c_6; #if defined RMSNORM and not defined FUSE_PRE_ADD constexpr uint32_t cb_xmm = cb_in; // x minus mean #else - constexpr uint32_t cb_xmm = tt::CB::c_intermed0; // x minus mean + constexpr uint32_t cb_xmm = tt::CBIndex::c_24; // x minus mean #endif - constexpr auto cb_ex = tt::CB::c_intermed1; // E[x] - constexpr auto cb_ex2 = tt::CB::c_intermed2; // E[(x-E[x])^2] - constexpr auto cb_xmm2 = tt::CB::c_intermed3; // xmm^2 - constexpr auto cb_ex2pe = tt::CB::c_intermed4; // E[(x-E[x])^2]+eps - constexpr auto cb_fusion = tt::CB::c_intermed5; // stream gamma/beta + constexpr auto cb_ex = tt::CBIndex::c_25; // E[x] + constexpr auto cb_ex2 = tt::CBIndex::c_26; // E[(x-E[x])^2] + constexpr auto cb_xmm2 = tt::CBIndex::c_27; // xmm^2 + constexpr auto cb_ex2pe = tt::CBIndex::c_28; // E[(x-E[x])^2]+eps + constexpr auto cb_fusion = tt::CBIndex::c_29; // stream gamma/beta constexpr auto scaler0 = 0; #ifdef FUSE_PRE_ADD #ifdef RMSNORM constexpr uint32_t cb_x = cb_xmm; #else - constexpr uint32_t cb_x = tt::CB::c_intermed6; + constexpr uint32_t cb_x = tt::CBIndex::c_30; #endif #else constexpr uint32_t cb_x = cb_in; diff --git a/ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm_sharded.cpp b/ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm_sharded.cpp index 87d9c012c81..b2aff1d2031 100644 --- a/ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm_sharded.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm_sharded.cpp @@ -55,30 +55,30 @@ void MAIN { constexpr uint32_t dst0 = 0; constexpr uint32_t 
scaler0 = 0; - constexpr uint32_t cb_in0 = tt::CB::c_in0; - constexpr uint32_t cb_in1 = tt::CB::c_in1; - constexpr uint32_t cb_scaler = tt::CB::c_in2; - constexpr uint32_t cb_eps = tt::CB::c_in3; - constexpr uint32_t cb_scaler_global = tt::CB::c_in4; - constexpr uint32_t cb_gamma = tt::CB::c_in5; - constexpr uint32_t cb_beta = tt::CB::c_in6; - constexpr uint32_t cb_x = tt::CB::c_intermed0; // x minus mean + constexpr uint32_t cb_in0 = tt::CBIndex::c_0; + constexpr uint32_t cb_in1 = tt::CBIndex::c_1; + constexpr uint32_t cb_scaler = tt::CBIndex::c_2; + constexpr uint32_t cb_eps = tt::CBIndex::c_3; + constexpr uint32_t cb_scaler_global = tt::CBIndex::c_4; + constexpr uint32_t cb_gamma = tt::CBIndex::c_5; + constexpr uint32_t cb_beta = tt::CBIndex::c_6; + constexpr uint32_t cb_x = tt::CBIndex::c_24; // x minus mean #if defined RMSNORM and not defined FUSE_PRE_ADD constexpr uint32_t cb_xmm = cb_in0; // x minus mean #else - constexpr uint32_t cb_xmm = tt::CB::c_intermed1; // x minus mean + constexpr uint32_t cb_xmm = tt::CBIndex::c_25; // x minus mean #endif - constexpr uint32_t cb_ex_partial = tt::CB::dataflow0; // E[x] partial reduce - constexpr uint32_t cb_ex = tt::CB::dataflow1; // E[x] global reduce - constexpr uint32_t cb_ex_external = tt::CB::dataflow2; - constexpr uint32_t cb_ex_partial2 = tt::CB::dataflow3; // E[(x-E[x])^2] partial reduce - constexpr uint32_t cb_ex2 = tt::CB::dataflow4; // E[(x-E[x])^2] global reduce - constexpr uint32_t cb_ex_external2 = tt::CB::dataflow5; - constexpr uint32_t cb_ex_global = tt::CB::dataflow7; // E[x] global reduce + constexpr uint32_t cb_ex_partial = tt::CBIndex::c_8; // E[x] partial reduce + constexpr uint32_t cb_ex = tt::CBIndex::c_9; // E[x] global reduce + constexpr uint32_t cb_ex_external = tt::CBIndex::c_10; + constexpr uint32_t cb_ex_partial2 = tt::CBIndex::c_11; // E[(x-E[x])^2] partial reduce + constexpr uint32_t cb_ex2 = tt::CBIndex::c_12; // E[(x-E[x])^2] global reduce + constexpr uint32_t cb_ex_external2 = 
tt::CBIndex::c_13; + constexpr uint32_t cb_ex_global = tt::CBIndex::c_15; // E[x] global reduce constexpr uint32_t cb_xmm2 = cb_x; // xmm^2 - constexpr uint32_t cb_ex2pe = tt::CB::c_intermed3; // E[(x-E[x])^2]+eps - constexpr uint32_t cb_fusion = tt::CB::c_intermed1; // stream gamma/beta - constexpr uint32_t cb_out = tt::CB::c_out0; + constexpr uint32_t cb_ex2pe = tt::CBIndex::c_27; // E[(x-E[x])^2]+eps + constexpr uint32_t cb_fusion = tt::CBIndex::c_25; // stream gamma/beta + constexpr uint32_t cb_out = tt::CBIndex::c_16; binary_op_init_common(cb_in0, cb_in0, cb_x); diff --git a/ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm_sharded_post_allgather.cpp b/ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm_sharded_post_allgather.cpp index 51771da55a4..a6b31bad1d0 100644 --- a/ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm_sharded_post_allgather.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm_sharded_post_allgather.cpp @@ -57,23 +57,23 @@ void MAIN { constexpr uint32_t dst1 = 1; constexpr uint32_t scaler0 = 0; - constexpr uint32_t cb_in0 = tt::CB::c_in0; - constexpr uint32_t cb_in1 = tt::CB::c_in1; - constexpr uint32_t cb_eps = tt::CB::c_in3; - constexpr uint32_t cb_scaler_global = tt::CB::c_in4; - constexpr uint32_t cb_gamma = tt::CB::c_in5; - constexpr uint32_t cb_beta = tt::CB::c_in6; - - constexpr uint32_t cb_ex = tt::CB::dataflow1; // E[x] global reduce - constexpr uint32_t cb_ex2 = tt::CB::dataflow4; // E[x^2] - constexpr uint32_t cb_stats = tt::CB::c_in7; // E[(x-E[x])^2] global reduce - constexpr uint32_t cb_stats_reduced = tt::CB::c_intermed4; // E[(x-E[x])^2] global reduce - constexpr uint32_t cb_ex_global = tt::CB::dataflow7; // E[x] global reduce - constexpr uint32_t cb_reciprocal = tt::CB::c_intermed3; // [E[x^2]-E[x]^2]+eps - constexpr uint32_t cb_fusion = tt::CB::c_intermed1; // stream gamma/beta - constexpr 
uint32_t cb_out = tt::CB::c_out0; - constexpr uint32_t cb_var = tt::CB::c_intermed2; - constexpr uint32_t cb_ex_sqr = tt::CB::c_intermed0; // E[x]^2 + constexpr uint32_t cb_in0 = tt::CBIndex::c_0; + constexpr uint32_t cb_in1 = tt::CBIndex::c_1; + constexpr uint32_t cb_eps = tt::CBIndex::c_3; + constexpr uint32_t cb_scaler_global = tt::CBIndex::c_4; + constexpr uint32_t cb_gamma = tt::CBIndex::c_5; + constexpr uint32_t cb_beta = tt::CBIndex::c_6; + + constexpr uint32_t cb_ex = tt::CBIndex::c_9; // E[x] global reduce + constexpr uint32_t cb_ex2 = tt::CBIndex::c_12; // E[x^2] + constexpr uint32_t cb_stats = tt::CBIndex::c_7; // E[(x-E[x])^2] global reduce + constexpr uint32_t cb_stats_reduced = tt::CBIndex::c_28; // E[(x-E[x])^2] global reduce + constexpr uint32_t cb_ex_global = tt::CBIndex::c_15; // E[x] global reduce + constexpr uint32_t cb_reciprocal = tt::CBIndex::c_27; // [E[x^2]-E[x]^2]+eps + constexpr uint32_t cb_fusion = tt::CBIndex::c_25; // stream gamma/beta + constexpr uint32_t cb_out = tt::CBIndex::c_16; + constexpr uint32_t cb_var = tt::CBIndex::c_26; + constexpr uint32_t cb_ex_sqr = tt::CBIndex::c_24; // E[x]^2 #ifdef RMSNORM @@ -83,7 +83,7 @@ void MAIN { #else binary_op_init_common(cb_stats, cb_scaler_global, cb_stats_reduced); constexpr uint32_t stats_tiles = 2; - constexpr uint32_t cb_xmm = tt::CB::c_intermed1; // x minus mean + constexpr uint32_t cb_xmm = tt::CBIndex::c_25; // x minus mean #endif diff --git a/ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm_sharded_pre_allgather.cpp b/ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm_sharded_pre_allgather.cpp index 46a45f83d34..09ab434bf2c 100644 --- a/ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm_sharded_pre_allgather.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm_sharded_pre_allgather.cpp @@ -45,18 +45,18 @@ void MAIN { constexpr uint32_t dst1 = 1; 
constexpr uint32_t scaler0 = 0; - constexpr uint32_t cb_in0 = tt::CB::c_in0; - constexpr uint32_t cb_scaler = tt::CB::c_in2; - constexpr uint32_t cb_scaler_global = tt::CB::c_in4; - constexpr uint32_t cb_x = tt::CB::c_intermed0; // x minus mean - constexpr uint32_t cb_ex = tt::CB::dataflow1; // E[x] global reduce + constexpr uint32_t cb_in0 = tt::CBIndex::c_0; + constexpr uint32_t cb_scaler = tt::CBIndex::c_2; + constexpr uint32_t cb_scaler_global = tt::CBIndex::c_4; + constexpr uint32_t cb_x = tt::CBIndex::c_24; // x minus mean + constexpr uint32_t cb_ex = tt::CBIndex::c_9; // E[x] global reduce - constexpr uint32_t cb_ex2 = tt::CB::dataflow4; // E[(x-E[x])^2] global reduce + constexpr uint32_t cb_ex2 = tt::CBIndex::c_12; // E[(x-E[x])^2] global reduce constexpr uint32_t cb_x2 = cb_x; // x^2 - constexpr uint32_t cb_out = tt::CB::c_out0; + constexpr uint32_t cb_out = tt::CBIndex::c_16; - constexpr uint32_t cb_ex_partial2 = tt::CB::dataflow3; // E[x^2] partial reduce - constexpr uint32_t cb_ex_external2 = tt::CB::dataflow5; // E[x^2] partials recieved from other cores + constexpr uint32_t cb_ex_partial2 = tt::CBIndex::c_11; // E[x^2] partial reduce + constexpr uint32_t cb_ex_external2 = tt::CBIndex::c_13; // E[x^2] partials recieved from other cores const uint32_t cb_reduction_out = is_second_stage_reader ? cb_out : cb_ex2; diff --git a/ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_mcast_receiver_unary_sharded_ln.cpp b/ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_mcast_receiver_unary_sharded_ln.cpp index 016dbddda46..dd42f12e448 100644 --- a/ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_mcast_receiver_unary_sharded_ln.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_mcast_receiver_unary_sharded_ln.cpp @@ -36,14 +36,14 @@ void kernel_main() { const uint32_t num_tiles_to_read = is_last_all_to_all_worker ? 
num_tiles_per_worker_last : num_tiles_per_worker; - constexpr uint32_t cb_ex_partial = tt::CB::dataflow0; // E[x] partial reduce - constexpr uint32_t cb_ex = tt::CB::dataflow1; // E[x] global reduce - constexpr uint32_t cb_ex_external = tt::CB::dataflow2; - constexpr uint32_t cb_ex_partial2 = tt::CB::dataflow3; // E[(x-E[x])^2] partial reduce - constexpr uint32_t cb_ex2 = tt::CB::dataflow4; // E[(x-E[x])^2] global reduce - constexpr uint32_t cb_ex_external2 = tt::CB::dataflow5; - constexpr uint32_t cb_ex2pe = tt::CB::c_intermed3; - constexpr uint32_t cb_ex_global = tt::CB::dataflow7; // E[x] global reduce + constexpr uint32_t cb_ex_partial = tt::CBIndex::c_8; // E[x] partial reduce + constexpr uint32_t cb_ex = tt::CBIndex::c_9; // E[x] global reduce + constexpr uint32_t cb_ex_external = tt::CBIndex::c_10; + constexpr uint32_t cb_ex_partial2 = tt::CBIndex::c_11; // E[(x-E[x])^2] partial reduce + constexpr uint32_t cb_ex2 = tt::CBIndex::c_12; // E[(x-E[x])^2] global reduce + constexpr uint32_t cb_ex_external2 = tt::CBIndex::c_13; + constexpr uint32_t cb_ex2pe = tt::CBIndex::c_27; + constexpr uint32_t cb_ex_global = tt::CBIndex::c_15; // E[x] global reduce const uint32_t single_tile_size_bytes = get_tile_size(cb_ex_partial2); // tile size const DataFormat data_format = get_dataformat(cb_ex_partial2); // data format diff --git a/ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_mcast_receiver_unary_sharded_ln_post_allgather.cpp b/ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_mcast_receiver_unary_sharded_ln_post_allgather.cpp index 8d1aba0c6ee..b1e44b13477 100644 --- a/ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_mcast_receiver_unary_sharded_ln_post_allgather.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_mcast_receiver_unary_sharded_ln_post_allgather.cpp @@ -14,7 +14,7 @@ void kernel_main() { constexpr uint32_t block_h = 
get_compile_time_arg_val(3); - constexpr uint32_t cb_ex_global = tt::CB::dataflow7; // [E[x], E[X^2]] global to all cores + constexpr uint32_t cb_ex_global = tt::CBIndex::c_15; // [E[x], E[X^2]] global to all cores #ifdef RMSNORM constexpr uint32_t stats_tiles = 1; diff --git a/ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_mcast_receiver_unary_sharded_ln_pre_allgather.cpp b/ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_mcast_receiver_unary_sharded_ln_pre_allgather.cpp index a0345a65146..c60c8a69198 100644 --- a/ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_mcast_receiver_unary_sharded_ln_pre_allgather.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_mcast_receiver_unary_sharded_ln_pre_allgather.cpp @@ -36,9 +36,9 @@ void kernel_main() { const uint32_t num_tiles_to_read = is_last_all_to_all_worker ? num_tiles_per_worker_last : num_tiles_per_worker; - constexpr uint32_t cb_ex_partial2 = tt::CB::dataflow3; // E[(x-E[x])^2] partial reduce - constexpr uint32_t cb_ex2 = tt::CB::dataflow4; // E[(x-E[x])^2] global reduce - constexpr uint32_t cb_ex_external2 = tt::CB::dataflow5; + constexpr uint32_t cb_ex_partial2 = tt::CBIndex::c_11; // E[(x-E[x])^2] partial reduce + constexpr uint32_t cb_ex2 = tt::CBIndex::c_12; // E[(x-E[x])^2] global reduce + constexpr uint32_t cb_ex_external2 = tt::CBIndex::c_13; const uint32_t single_tile_size_bytes = get_tile_size(cb_ex_partial2); // tile size const DataFormat data_format = get_dataformat(cb_ex_partial2); // data format diff --git a/ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_mcast_sender_unary_sharded_ln.cpp b/ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_mcast_sender_unary_sharded_ln.cpp index a4816737a5c..389568aa8e1 100644 --- 
a/ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_mcast_sender_unary_sharded_ln.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_mcast_sender_unary_sharded_ln.cpp @@ -38,14 +38,14 @@ void kernel_main() { tt_l1_ptr uint32_t * in0_remote_noc_x = (tt_l1_ptr uint32_t*)(get_arg_addr(6)); tt_l1_ptr uint32_t * in0_remote_noc_y = (tt_l1_ptr uint32_t*)(get_arg_addr(6 + num_x)); - constexpr uint32_t cb_ex_partial = tt::CB::dataflow0; - constexpr uint32_t cb_ex = tt::CB::dataflow1; - constexpr uint32_t cb_ex_external = tt::CB::dataflow2; - constexpr uint32_t cb_ex_partial2 = tt::CB::dataflow3; - constexpr uint32_t cb_ex2 = tt::CB::dataflow4; - constexpr uint32_t cb_ex_external2 = tt::CB::dataflow5; - constexpr uint32_t cb_ex2pe = tt::CB::c_intermed3; - constexpr uint32_t cb_ex_global = tt::CB::dataflow7; // E[x] global reduce + constexpr uint32_t cb_ex_partial = tt::CBIndex::c_8; + constexpr uint32_t cb_ex = tt::CBIndex::c_9; + constexpr uint32_t cb_ex_external = tt::CBIndex::c_10; + constexpr uint32_t cb_ex_partial2 = tt::CBIndex::c_11; + constexpr uint32_t cb_ex2 = tt::CBIndex::c_12; + constexpr uint32_t cb_ex_external2 = tt::CBIndex::c_13; + constexpr uint32_t cb_ex2pe = tt::CBIndex::c_27; + constexpr uint32_t cb_ex_global = tt::CBIndex::c_15; // E[x] global reduce const uint32_t single_tile_size_bytes = get_tile_size(cb_ex_partial2); const DataFormat data_format = get_dataformat(cb_ex_partial2); diff --git a/ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_mcast_sender_unary_sharded_ln_post_allgather.cpp b/ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_mcast_sender_unary_sharded_ln_post_allgather.cpp index 3ad63e4e3f3..d4cf7379f95 100644 --- a/ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_mcast_sender_unary_sharded_ln_post_allgather.cpp +++ 
b/ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_mcast_sender_unary_sharded_ln_post_allgather.cpp @@ -22,8 +22,8 @@ void kernel_main() { const uint32_t mcast_dest_noc_end_y = get_arg_val(3); - constexpr uint32_t cb_stats_reduced = tt::CB::c_intermed4; // [E[x], E[x^2]] local to sender - constexpr uint32_t cb_ex_global = tt::CB::dataflow7; // [E[x], E[X^2]] global to all cores + constexpr uint32_t cb_stats_reduced = tt::CBIndex::c_28; // [E[x], E[x^2]] local to sender + constexpr uint32_t cb_ex_global = tt::CBIndex::c_15; // [E[x], E[X^2]] global to all cores const uint64_t multicast_data_noc = get_noc_multicast_addr( mcast_dest_noc_start_x, diff --git a/ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_mcast_sender_unary_sharded_ln_pre_allgather.cpp b/ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_mcast_sender_unary_sharded_ln_pre_allgather.cpp index 1a962997564..577863ff436 100644 --- a/ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_mcast_sender_unary_sharded_ln_pre_allgather.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_mcast_sender_unary_sharded_ln_pre_allgather.cpp @@ -38,10 +38,10 @@ void kernel_main() { tt_l1_ptr uint32_t * in0_remote_noc_y = (tt_l1_ptr uint32_t*)(get_arg_addr(6 + num_x)); - constexpr uint32_t cb_ex_partial2 = tt::CB::dataflow3; - constexpr uint32_t cb_ex2 = tt::CB::dataflow4; - constexpr uint32_t cb_ex_external2 = tt::CB::dataflow5; - constexpr uint32_t cb_ex2_global = tt::CB::dataflow6; // E[x2] global reduce + constexpr uint32_t cb_ex_partial2 = tt::CBIndex::c_11; + constexpr uint32_t cb_ex2 = tt::CBIndex::c_12; + constexpr uint32_t cb_ex_external2 = tt::CBIndex::c_13; + constexpr uint32_t cb_ex2_global = tt::CBIndex::c_14; // E[x2] global reduce const uint32_t single_tile_size_bytes = get_tile_size(cb_ex_partial2); const DataFormat data_format = 
get_dataformat(cb_ex_partial2); diff --git a/ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_sharded_ln.cpp b/ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_sharded_ln.cpp index db34f9abac8..01565164b00 100644 --- a/ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_sharded_ln.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_sharded_ln.cpp @@ -21,17 +21,17 @@ void kernel_main() { const uint32_t gamma_tile_start_id = get_arg_val(5); const uint32_t beta_tile_start_id = get_arg_val(6); - constexpr uint32_t cb_gamma = tt::CB::c_in5; - constexpr uint32_t cb_beta = tt::CB::c_in6; + constexpr uint32_t cb_gamma = tt::CBIndex::c_5; + constexpr uint32_t cb_beta = tt::CBIndex::c_6; { - constexpr uint32_t cb_in_2 = tt::CB::c_in2; + constexpr uint32_t cb_in_2 = tt::CBIndex::c_2; const uint32_t scalar_w = get_arg_val(1); generate_reduce_scaler(cb_in_2, scalar_w); } if constexpr(is_all_to_all_worker) { - constexpr uint32_t cb_in_4 = tt::CB::c_in4; + constexpr uint32_t cb_in_4 = tt::CBIndex::c_4; const uint32_t scalar_c = get_arg_val(0); generate_reduce_scaler(cb_in_4, scalar_c); } diff --git a/ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_sharded_ln_pre_all_gather.cpp b/ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_sharded_ln_pre_all_gather.cpp index 5a3586d9a17..bf2d07037fd 100644 --- a/ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_sharded_ln_pre_all_gather.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_sharded_ln_pre_all_gather.cpp @@ -10,12 +10,12 @@ void kernel_main() { constexpr bool is_all_to_all_worker = get_compile_time_arg_val(0) == 1; - constexpr uint32_t cb_in_2 = tt::CB::c_in2; + constexpr uint32_t cb_in_2 = tt::CBIndex::c_2; const uint32_t 
scalar_w = get_arg_val(1); generate_reduce_scaler(cb_in_2, scalar_w); if constexpr(is_all_to_all_worker) { - constexpr uint32_t cb_in_4 = tt::CB::c_in4; + constexpr uint32_t cb_in_4 = tt::CBIndex::c_4; const uint32_t scalar_c = get_arg_val(0); generate_reduce_scaler(cb_in_4, scalar_c); } diff --git a/ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_sharded_ln_rm_gb.cpp b/ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_sharded_ln_rm_gb.cpp index 54748bb01aa..4ff547d3884 100644 --- a/ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_sharded_ln_rm_gb.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_sharded_ln_rm_gb.cpp @@ -25,19 +25,19 @@ void kernel_main() { const uint32_t gamma_tile_start_id = get_arg_val(5); const uint32_t beta_tile_start_id = get_arg_val(6); - constexpr uint32_t cb_gamma = tt::CB::c_in5; - constexpr uint32_t cb_beta = tt::CB::c_in6; + constexpr uint32_t cb_gamma = tt::CBIndex::c_5; + constexpr uint32_t cb_beta = tt::CBIndex::c_6; // constexpr uint32_t block_w = 4; const uint32_t single_tile_size_bytes = get_tile_size(cb_gamma); { - constexpr uint32_t cb_in_2 = tt::CB::c_in2; + constexpr uint32_t cb_in_2 = tt::CBIndex::c_2; const uint32_t scalar_w = get_arg_val(1); generate_reduce_scaler(cb_in_2, scalar_w); } if constexpr(is_all_to_all_worker) { - constexpr uint32_t cb_in_4 = tt::CB::c_in4; + constexpr uint32_t cb_in_4 = tt::CBIndex::c_4; const uint32_t scalar_c = get_arg_val(0); generate_reduce_scaler(cb_in_4, scalar_c); } diff --git a/ttnn/cpp/ttnn/operations/normalization/layernorm/device/multi_core/layernorm_op_multi_core.cpp b/ttnn/cpp/ttnn/operations/normalization/layernorm/device/multi_core/layernorm_op_multi_core.cpp index e693196da9c..c199e908553 100644 --- a/ttnn/cpp/ttnn/operations/normalization/layernorm/device/multi_core/layernorm_op_multi_core.cpp +++ 
b/ttnn/cpp/ttnn/operations/normalization/layernorm/device/multi_core/layernorm_op_multi_core.cpp @@ -267,40 +267,40 @@ operation::ProgramWithCallbacks layernorm_multi_core( ); // Create circular buffers - CircularBufferConfig cb_src0_config = CircularBufferConfig(in0_t*in_single_tile_size, {{tt::CB::c_in0, in_data_format}}).set_page_size(tt::CB::c_in0, in_single_tile_size); + CircularBufferConfig cb_src0_config = CircularBufferConfig(in0_t*in_single_tile_size, {{tt::CBIndex::c_0, in_data_format}}).set_page_size(tt::CBIndex::c_0, in_single_tile_size); CreateCircularBuffer( program, all_cores, cb_src0_config ); - CircularBufferConfig cb_out0_config = CircularBufferConfig(out0_t*out_single_tile_size, {{tt::CB::c_out0, out_data_format}}).set_page_size(tt::CB::c_out0, out_single_tile_size); + CircularBufferConfig cb_out0_config = CircularBufferConfig(out0_t*out_single_tile_size, {{tt::CBIndex::c_16, out_data_format}}).set_page_size(tt::CBIndex::c_16, out_single_tile_size); CreateCircularBuffer( program, all_cores, cb_out0_config ); if (!rms_norm) { - CircularBufferConfig cb_intermed1_config = CircularBufferConfig(im1_t*single_tile_size, {{tt::CB::c_intermed1, cb_data_format}}).set_page_size(tt::CB::c_intermed1, single_tile_size); + CircularBufferConfig cb_intermed1_config = CircularBufferConfig(im1_t*single_tile_size, {{tt::CBIndex::c_25, cb_data_format}}).set_page_size(tt::CBIndex::c_25, single_tile_size); CreateCircularBuffer( program, all_cores, cb_intermed1_config ); } - CircularBufferConfig cb_in2_config = CircularBufferConfig(in2_t*bfloat16_tile_size, {{tt::CB::c_in2, tt::DataFormat::Float16_b}}).set_page_size(tt::CB::c_in2, bfloat16_tile_size); + CircularBufferConfig cb_in2_config = CircularBufferConfig(in2_t*bfloat16_tile_size, {{tt::CBIndex::c_2, tt::DataFormat::Float16_b}}).set_page_size(tt::CBIndex::c_2, bfloat16_tile_size); CreateCircularBuffer( program, all_cores, cb_in2_config ); - CircularBufferConfig cb_in3_config = 
CircularBufferConfig(in3_t*bfloat16_tile_size, {{tt::CB::c_in3, tt::DataFormat::Float16_b}}).set_page_size(tt::CB::c_in3, bfloat16_tile_size); + CircularBufferConfig cb_in3_config = CircularBufferConfig(in3_t*bfloat16_tile_size, {{tt::CBIndex::c_3, tt::DataFormat::Float16_b}}).set_page_size(tt::CBIndex::c_3, bfloat16_tile_size); CreateCircularBuffer( program, all_cores, cb_in3_config ); - CircularBufferConfig cb_intermed2_config = CircularBufferConfig(im2_t*single_tile_size, {{tt::CB::c_intermed2, cb_data_format}}).set_page_size(tt::CB::c_intermed2, single_tile_size); + CircularBufferConfig cb_intermed2_config = CircularBufferConfig(im2_t*single_tile_size, {{tt::CBIndex::c_26, cb_data_format}}).set_page_size(tt::CBIndex::c_26, single_tile_size); CreateCircularBuffer( program, all_cores, cb_intermed2_config ); if (!(rms_norm && !b.has_value())) { - CircularBufferConfig cb_intermed0_config = CircularBufferConfig(im0_t*single_tile_size, {{tt::CB::c_intermed0, cb_data_format}}).set_page_size(tt::CB::c_intermed0, single_tile_size); + CircularBufferConfig cb_intermed0_config = CircularBufferConfig(im0_t*single_tile_size, {{tt::CBIndex::c_24, cb_data_format}}).set_page_size(tt::CBIndex::c_24, single_tile_size); CreateCircularBuffer( program, all_cores, cb_intermed0_config ); } - CircularBufferConfig c_intermed3_config = CircularBufferConfig(im3_t*single_tile_size, {{tt::CB::c_intermed3, cb_data_format}}).set_page_size(tt::CB::c_intermed3, single_tile_size); + CircularBufferConfig c_intermed3_config = CircularBufferConfig(im3_t*single_tile_size, {{tt::CBIndex::c_27, cb_data_format}}).set_page_size(tt::CBIndex::c_27, single_tile_size); CreateCircularBuffer( program, all_cores, c_intermed3_config ); - CircularBufferConfig c_intermed4_config = CircularBufferConfig(im4_t*single_tile_size, {{tt::CB::c_intermed4, cb_data_format}}).set_page_size(tt::CB::c_intermed4, single_tile_size); + CircularBufferConfig c_intermed4_config = CircularBufferConfig(im4_t*single_tile_size, 
{{tt::CBIndex::c_28, cb_data_format}}).set_page_size(tt::CBIndex::c_28, single_tile_size); CreateCircularBuffer( program, all_cores, c_intermed4_config ); if (gamma.has_value() || beta.has_value()) { - CircularBufferConfig c_intermed5_config = CircularBufferConfig(im5_t*single_tile_size, {{tt::CB::c_intermed5, cb_data_format}}).set_page_size(tt::CB::c_intermed5, single_tile_size); + CircularBufferConfig c_intermed5_config = CircularBufferConfig(im5_t*single_tile_size, {{tt::CBIndex::c_29, cb_data_format}}).set_page_size(tt::CBIndex::c_29, single_tile_size); CreateCircularBuffer( program, all_cores, c_intermed5_config ); } if (gamma.has_value()) { - CircularBufferConfig c_in5_config = CircularBufferConfig(in5_t * gamma_single_tile_size, {{tt::CB::c_in5, gamma_cb_data_format}}) - .set_page_size(tt::CB::c_in5, gamma_single_tile_size); + CircularBufferConfig c_in5_config = CircularBufferConfig(in5_t * gamma_single_tile_size, {{tt::CBIndex::c_5, gamma_cb_data_format}}) + .set_page_size(tt::CBIndex::c_5, gamma_single_tile_size); CreateCircularBuffer( program, all_cores, c_in5_config ); } if (beta.has_value()) { - CircularBufferConfig c_in6_config = CircularBufferConfig(in6_t * beta_single_tile_size, {{tt::CB::c_in6, beta_cb_data_format}}) - .set_page_size(tt::CB::c_in6, beta_single_tile_size); + CircularBufferConfig c_in6_config = CircularBufferConfig(in6_t * beta_single_tile_size, {{tt::CBIndex::c_6, beta_cb_data_format}}) + .set_page_size(tt::CBIndex::c_6, beta_single_tile_size); CreateCircularBuffer( program, all_cores, c_in6_config ); } if (b) { @@ -309,11 +309,11 @@ operation::ProgramWithCallbacks layernorm_multi_core( // if there's no pre-add we use cb_in0 for x, otherwise a is pre-buffered into in0, added into im6, then im6 is used as x // b is buffered into c_in1 if (!rms_norm) { - CircularBufferConfig c_intermed6_config = CircularBufferConfig(im6_t*single_tile_size, {{tt::CB::c_intermed6, cb_data_format}}).set_page_size(tt::CB::c_intermed6, single_tile_size); + 
CircularBufferConfig c_intermed6_config = CircularBufferConfig(im6_t*single_tile_size, {{tt::CBIndex::c_30, cb_data_format}}).set_page_size(tt::CBIndex::c_30, single_tile_size); CreateCircularBuffer( program, all_cores, c_intermed6_config ); } // c_in1 is input buffer for b - CircularBufferConfig c_in1_config = CircularBufferConfig(in1_t*inb_single_tile_size, {{tt::CB::c_in1, inb_data_format}}).set_page_size(tt::CB::c_in1, inb_single_tile_size); + CircularBufferConfig c_in1_config = CircularBufferConfig(in1_t*inb_single_tile_size, {{tt::CBIndex::c_1, inb_data_format}}).set_page_size(tt::CBIndex::c_1, inb_single_tile_size); CreateCircularBuffer( program, all_cores, c_in1_config); } @@ -608,7 +608,7 @@ operation::ProgramWithCallbacks layernorm_multi_core_sharded( } num_blocks_second_stage = num_cores_all_to_all_second_stage; } - // change tt::CB external size + // change tt::CBIndex external size if (use_two_stage_reduce) { ex_external_CB_size = (num_blocks_first_stage + num_blocks_second_stage - 1) * single_tile_size; } @@ -1028,12 +1028,12 @@ operation::ProgramWithCallbacks layernorm_multi_core_sharded( } // Create circular buffers // in0 sharded - uint32_t in0_cb_index = tt::CB::c_in0; + uint32_t in0_cb_index = tt::CBIndex::c_0; tt::tt_metal::CircularBufferConfig in0_cb_config = tt::tt_metal::CircularBufferConfig(in0_CB_size, {{in0_cb_index, in_data_format}}) .set_page_size(in0_cb_index, in_single_tile_size).set_globally_allocated_address(*a.buffer()); auto cb_in0 = tt::tt_metal::CreateCircularBuffer(program, all_cores, in0_cb_config); // in1 sharded - uint32_t in1_cb_index = tt::CB::c_in1; + uint32_t in1_cb_index = tt::CBIndex::c_1; CBHandle cb_in1 = 0; if (b) { tt::tt_metal::CircularBufferConfig in1_cb_config = tt::tt_metal::CircularBufferConfig(in1_CB_size, {{in1_cb_index, in_data_format}}) @@ -1041,86 +1041,86 @@ operation::ProgramWithCallbacks layernorm_multi_core_sharded( cb_in1 = tt::tt_metal::CreateCircularBuffer(program, all_cores, in1_cb_config); } // 
in2 scaler - uint32_t in2_cb_index = tt::CB::c_in2; + uint32_t in2_cb_index = tt::CBIndex::c_2; tt::tt_metal::CircularBufferConfig in2_cb_config = tt::tt_metal::CircularBufferConfig(in2_CB_size, {{in2_cb_index, tt::DataFormat::Float16_b}}) .set_page_size(in2_cb_index, bfloat16_tile_size); auto cb_in2 = tt::tt_metal::CreateCircularBuffer(program, all_cores, in2_cb_config); // in4 scaler-c - uint32_t in4_cb_index = tt::CB::c_in4; + uint32_t in4_cb_index = tt::CBIndex::c_4; tt::tt_metal::CircularBufferConfig in4_cb_config = tt::tt_metal::CircularBufferConfig(in2_CB_size, {{in4_cb_index, tt::DataFormat::Float16_b}}) .set_page_size(in4_cb_index, bfloat16_tile_size); auto cb_in4 = tt::tt_metal::CreateCircularBuffer(program, all_cores, in4_cb_config); // in3 eps - uint32_t in3_cb_index = tt::CB::c_in3; + uint32_t in3_cb_index = tt::CBIndex::c_3; tt::tt_metal::CircularBufferConfig in3_cb_config = tt::tt_metal::CircularBufferConfig(in3_CB_size, {{in3_cb_index, tt::DataFormat::Float16_b}}) .set_page_size(in3_cb_index, bfloat16_tile_size); auto cb_in3 = tt::tt_metal::CreateCircularBuffer(program, all_cores, in3_cb_config); // gamma if (gamma.has_value()) { - uint32_t in5_cb_index = tt::CB::c_in5; + uint32_t in5_cb_index = tt::CBIndex::c_5; tt::tt_metal::CircularBufferConfig in5_cb_config = tt::tt_metal::CircularBufferConfig(in5_CB_size, {{in5_cb_index, gamma_cb_data_format}}) .set_page_size(in5_cb_index, gamma_single_tile_size); auto cb_in5 = tt::tt_metal::CreateCircularBuffer(program, all_cores, in5_cb_config); } // beta if (beta.has_value()) { - uint32_t in6_cb_index = tt::CB::c_in6; + uint32_t in6_cb_index = tt::CBIndex::c_6; tt::tt_metal::CircularBufferConfig in6_cb_config = tt::tt_metal::CircularBufferConfig(in6_CB_size, {{in6_cb_index, beta_cb_data_format}}) .set_page_size(in6_cb_index, beta_single_tile_size); auto cb_in6 = tt::tt_metal::CreateCircularBuffer(program, all_cores, in6_cb_config); } // x uint32_t x_cb_index; - x_cb_index = tt::CB::c_intermed0; + x_cb_index 
= tt::CBIndex::c_24; tt::tt_metal::CircularBufferConfig x_cb_config = tt::tt_metal::CircularBufferConfig(x_CB_size, {{x_cb_index, cb_data_format}}) .set_page_size(x_cb_index, single_tile_size); auto cb_x = tt::tt_metal::CreateCircularBuffer(program, all_cores, x_cb_config); // xmm uint32_t xmm_cb_index; - xmm_cb_index = tt::CB::c_intermed1; + xmm_cb_index = tt::CBIndex::c_25; tt::tt_metal::CircularBufferConfig xmm_cb_config = tt::tt_metal::CircularBufferConfig(xmm_CB_size, {{xmm_cb_index, cb_data_format}}) .set_page_size(xmm_cb_index, single_tile_size); auto cb_xmm = tt::tt_metal::CreateCircularBuffer(program, all_cores, xmm_cb_config); // ex_partial if(!rms_norm) { - uint32_t ex_cb_partial_index = tt::CB::dataflow0; + uint32_t ex_cb_partial_index = tt::CBIndex::c_8; tt::tt_metal::CircularBufferConfig ex_cb_partial_config = tt::tt_metal::CircularBufferConfig(ex_partial_CB_size, {{ex_cb_partial_index, cb_data_format}}) .set_page_size(ex_cb_partial_index, single_tile_size); auto cb_ex_partial = tt::tt_metal::CreateCircularBuffer(program, all_cores, ex_cb_partial_config); // ex - uint32_t ex_cb_index = tt::CB::dataflow1; + uint32_t ex_cb_index = tt::CBIndex::c_9; tt::tt_metal::CircularBufferConfig ex_cb_config = tt::tt_metal::CircularBufferConfig(ex_CB_size, {{ex_cb_index, cb_data_format}}) .set_page_size(ex_cb_index, single_tile_size); auto cb_ex = tt::tt_metal::CreateCircularBuffer(program, all_cores, ex_cb_config); // ex_external - uint32_t ex_cb_external_index = tt::CB::dataflow2; + uint32_t ex_cb_external_index = tt::CBIndex::c_10; tt::tt_metal::CircularBufferConfig ex_cb_external_config = tt::tt_metal::CircularBufferConfig(ex_external_CB_size, {{ex_cb_external_index, cb_data_format}}) .set_page_size(ex_cb_external_index, single_tile_size); auto cb_ex_external = tt::tt_metal::CreateCircularBuffer(program, all_cores, ex_cb_external_config); } // ex_partial2 - uint32_t ex_cb_partial2_index = tt::CB::dataflow3; + uint32_t ex_cb_partial2_index = tt::CBIndex::c_11; 
tt::tt_metal::CircularBufferConfig ex_cb_partial2_config = tt::tt_metal::CircularBufferConfig(ex_partial_CB_size, {{ex_cb_partial2_index, cb_data_format}}) .set_page_size(ex_cb_partial2_index, single_tile_size); auto cb_ex_partial2 = tt::tt_metal::CreateCircularBuffer(program, all_cores, ex_cb_partial2_config); // ex2 - uint32_t ex2_cb_index = tt::CB::dataflow4; + uint32_t ex2_cb_index = tt::CBIndex::c_12; tt::tt_metal::CircularBufferConfig ex2_cb_config = tt::tt_metal::CircularBufferConfig(ex_CB_size, {{ex2_cb_index, cb_data_format}}) .set_page_size(ex2_cb_index, single_tile_size); auto cb_ex2 = tt::tt_metal::CreateCircularBuffer(program, all_cores, ex2_cb_config); // ex_external2 - uint32_t ex_cb_external2_index = tt::CB::dataflow5; + uint32_t ex_cb_external2_index = tt::CBIndex::c_13; tt::tt_metal::CircularBufferConfig ex_cb_external2_config = tt::tt_metal::CircularBufferConfig(ex_external_CB_size, {{ex_cb_external2_index, cb_data_format}}) .set_page_size(ex_cb_external2_index, single_tile_size); auto cb_ex_external2 = tt::tt_metal::CreateCircularBuffer(program, all_cores, ex_cb_external2_config); // ex_global - uint32_t ex_global_cb_index = tt::CB::dataflow7; + uint32_t ex_global_cb_index = tt::CBIndex::c_15; tt::tt_metal::CircularBufferConfig ex_global_cb_config = tt::tt_metal::CircularBufferConfig(ex_global_CB_size, {{ex_global_cb_index, cb_data_format}}) .set_page_size(ex_global_cb_index, single_tile_size); auto cb_ex_global = tt::tt_metal::CreateCircularBuffer(program, all_cores, ex_global_cb_config); // ex2pe uint32_t cb_ex2pe_index; - cb_ex2pe_index = tt::CB::c_intermed3; + cb_ex2pe_index = tt::CBIndex::c_27; tt::tt_metal::CircularBufferConfig ex2pe_cb_config = tt::tt_metal::CircularBufferConfig(ex2pe_CB_size, {{cb_ex2pe_index, cb_data_format}}) .set_page_size(cb_ex2pe_index, single_tile_size); auto cb_ex2pe = tt::tt_metal::CreateCircularBuffer(program, all_cores, ex2pe_cb_config); @@ -1129,25 +1129,25 @@ operation::ProgramWithCallbacks 
layernorm_multi_core_sharded( if (is_post_all_gather){ // cb_stats uint32_t cb_stats_index; - cb_stats_index = tt::CB::c_in7; + cb_stats_index = tt::CBIndex::c_7; tt::tt_metal::CircularBufferConfig stats_cb_config = tt::tt_metal::CircularBufferConfig(stats_cb_size, {{cb_stats_index, cb_data_format}}) .set_page_size(cb_stats_index, single_tile_size).set_globally_allocated_address(*stats.value().buffer()); cb_stats = tt::tt_metal::CreateCircularBuffer(program, sender_cores, stats_cb_config); // cb_stats_reduced uint32_t cb_stats_reduced_index; - cb_stats_reduced_index = tt::CB::c_intermed4; + cb_stats_reduced_index = tt::CBIndex::c_28; tt::tt_metal::CircularBufferConfig stats_reduced_cb_config = tt::tt_metal::CircularBufferConfig(stats_reduced_cb_size, {{cb_stats_reduced_index, cb_data_format}}) .set_page_size(cb_stats_reduced_index, single_tile_size); auto cb_stats_reduced = tt::tt_metal::CreateCircularBuffer(program, sender_cores, stats_reduced_cb_config); // cb_var - uint32_t cb_var_index = tt::CB::c_intermed2; + uint32_t cb_var_index = tt::CBIndex::c_26; tt::tt_metal::CircularBufferConfig cb_var_config = tt::tt_metal::CircularBufferConfig(ex_global_CB_size, {{cb_var_index, cb_data_format}}) .set_page_size(cb_var_index, single_tile_size); auto cb_var_global = tt::tt_metal::CreateCircularBuffer(program, sender_cores, cb_var_config); } // out - uint32_t output_cb_index = tt::CB::c_out0; // output operands start at index 16 + uint32_t output_cb_index = tt::CBIndex::c_16; tt::tt_metal::CircularBufferConfig output_cb_config = tt::tt_metal::CircularBufferConfig(out_CB_size, {{output_cb_index, out_data_format}}) .set_page_size(output_cb_index, out_single_tile_size).set_globally_allocated_address(*output.buffer()); CBHandle cb_output = 0; diff --git a/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/kernels/compute/layernorm_post_allgather.cpp 
b/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/kernels/compute/layernorm_post_allgather.cpp index 945e32ce1e7..48c2ae8763c 100644 --- a/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/kernels/compute/layernorm_post_allgather.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/kernels/compute/layernorm_post_allgather.cpp @@ -40,24 +40,24 @@ void MAIN { constexpr uint32_t onetile = 1; - constexpr uint32_t cb_inp = tt::CB::c_in0; - constexpr uint32_t cb_stats = tt::CB::c_in1; + constexpr uint32_t cb_inp = tt::CBIndex::c_0; + constexpr uint32_t cb_stats = tt::CBIndex::c_1; - constexpr uint32_t cb_eps = tt::CB::c_in4; - constexpr uint32_t cb_reduce = tt::CB::c_in5; + constexpr uint32_t cb_eps = tt::CBIndex::c_4; + constexpr uint32_t cb_reduce = tt::CBIndex::c_5; - constexpr uint32_t cb_out = tt::CB::c_out0; + constexpr uint32_t cb_out = tt::CBIndex::c_16; - constexpr uint32_t cb_stats_reduced = tt::CB::c_intermed0; // [E(x**2), E(x)] - constexpr uint32_t cb_var_eps = tt::CB::c_intermed3; // var + epsilon (or E(x**2) + epsilon) - constexpr uint32_t cb_recip_sqrt_var = tt::CB::c_intermed4; // 1/sqrt(var+eps) - constexpr uint32_t cb_x_normed = tt::CB::c_intermed6; // (x - E(x)) * 1/sqrt(var+eps) or x * 1/sqrt(E(x**2) + eps) + constexpr uint32_t cb_stats_reduced = tt::CBIndex::c_24; // [E(x**2), E(x)] + constexpr uint32_t cb_var_eps = tt::CBIndex::c_27; // var + epsilon (or E(x**2) + epsilon) + constexpr uint32_t cb_recip_sqrt_var = tt::CBIndex::c_28; // 1/sqrt(var+eps) + constexpr uint32_t cb_x_normed = tt::CBIndex::c_30; // (x - E(x)) * 1/sqrt(var+eps) or x * 1/sqrt(E(x**2) + eps) - constexpr uint32_t cb_var = tt::CB::c_intermed2; // E(x**2) - E(x)**2 or E(x**2) + constexpr uint32_t cb_var = tt::CBIndex::c_26; // E(x**2) - E(x)**2 or E(x**2) #ifndef RMSNORM // Layernorm-specific CBs - constexpr uint32_t cb_mean_squared = tt::CB::c_intermed1; // E(x)**2 - constexpr uint32_t cb_x_minus_mean = 
tt::CB::c_intermed5; // x - E(x) + constexpr uint32_t cb_mean_squared = tt::CBIndex::c_25; // E(x)**2 + constexpr uint32_t cb_x_minus_mean = tt::CBIndex::c_29; // x - E(x) constexpr uint32_t cb_norm_x_input = cb_x_minus_mean; constexpr uint32_t stats_tile_stride = 2; @@ -66,11 +66,11 @@ void MAIN { constexpr uint32_t stats_tile_stride = 1; #endif - constexpr uint32_t cb_gamma = tt::CB::c_in2; - constexpr uint32_t cb_beta = tt::CB::c_in3; + constexpr uint32_t cb_gamma = tt::CBIndex::c_2; + constexpr uint32_t cb_beta = tt::CBIndex::c_3; uint32_t cb_times_gamma_out = cb_out; if constexpr(do_gamma and do_beta) { - cb_times_gamma_out = tt::CB::c_intermed7; + cb_times_gamma_out = tt::CBIndex::c_31; } binary_op_init_common(cb_inp, cb_inp, cb_stats_reduced); diff --git a/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/kernels/compute/layernorm_pre_allgather.cpp b/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/kernels/compute/layernorm_pre_allgather.cpp index 935b54affc8..c1546897d86 100644 --- a/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/kernels/compute/layernorm_pre_allgather.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/kernels/compute/layernorm_pre_allgather.cpp @@ -31,12 +31,12 @@ void MAIN { constexpr uint32_t onetile = 1; - constexpr uint32_t cb_inp = tt::CB::c_in0; - constexpr uint32_t cb_reduce = tt::CB::c_in1; + constexpr uint32_t cb_inp = tt::CBIndex::c_0; + constexpr uint32_t cb_reduce = tt::CBIndex::c_1; - constexpr uint32_t cb_out = tt::CB::c_out0; + constexpr uint32_t cb_out = tt::CBIndex::c_16; - constexpr uint32_t cb_x2 = tt::CB::c_intermed0; // x**2 + constexpr uint32_t cb_x2 = tt::CBIndex::c_24; // x**2 cb_wait_front(cb_reduce, 1); // comes from the reader diff --git a/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/kernels/dataflow/reader_unary_interleaved_ln_rm_gb_post_allgather.cpp 
b/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/kernels/dataflow/reader_unary_interleaved_ln_rm_gb_post_allgather.cpp index 035254bd5a4..28669907e9f 100644 --- a/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/kernels/dataflow/reader_unary_interleaved_ln_rm_gb_post_allgather.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/kernels/dataflow/reader_unary_interleaved_ln_rm_gb_post_allgather.cpp @@ -23,12 +23,12 @@ void kernel_main() { const uint32_t beta_addr = get_arg_val(8); const uint32_t stats_addr = get_arg_val(9); - constexpr uint32_t cb_inp = tt::CB::c_in0; - constexpr uint32_t cb_stats = tt::CB::c_in1; - constexpr uint32_t cb_gamma = tt::CB::c_in2; - constexpr uint32_t cb_beta = tt::CB::c_in3; - constexpr uint32_t cb_eps = tt::CB::c_in4; - constexpr uint32_t cb_reduce = tt::CB::c_in5; + constexpr uint32_t cb_inp = tt::CBIndex::c_0; + constexpr uint32_t cb_stats = tt::CBIndex::c_1; + constexpr uint32_t cb_gamma = tt::CBIndex::c_2; + constexpr uint32_t cb_beta = tt::CBIndex::c_3; + constexpr uint32_t cb_eps = tt::CBIndex::c_4; + constexpr uint32_t cb_reduce = tt::CBIndex::c_5; // ublocks size defined in tiles const uint32_t src0_tile_bytes = get_tile_size(cb_inp); diff --git a/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/kernels/dataflow/reader_unary_interleaved_ln_rm_gb_pre_allgather.cpp b/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/kernels/dataflow/reader_unary_interleaved_ln_rm_gb_pre_allgather.cpp index 13a7b1a855a..6cfa04ce8cf 100644 --- a/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/kernels/dataflow/reader_unary_interleaved_ln_rm_gb_pre_allgather.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/kernels/dataflow/reader_unary_interleaved_ln_rm_gb_pre_allgather.cpp @@ -18,8 +18,8 @@ void kernel_main() { const uint32_t Wt = get_arg_val(2); // Width in tiles const uint32_t tile_offset = 
get_arg_val(3); // Tile offset for this core - constexpr uint32_t cb_inp = tt::CB::c_in0; - constexpr uint32_t cb_reduce = tt::CB::c_in1; + constexpr uint32_t cb_inp = tt::CBIndex::c_0; + constexpr uint32_t cb_reduce = tt::CBIndex::c_1; // ublocks size defined in tiles const uint32_t src0_tile_bytes = get_tile_size(cb_inp); diff --git a/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp b/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp index cd8cbe0a184..72337d0bfb2 100644 --- a/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp @@ -16,7 +16,7 @@ void kernel_main() { constexpr bool dst_is_dram = get_compile_time_arg_val(0) == 1; constexpr uint32_t blk = get_compile_time_arg_val(1); // needed for correctness of softmax/LN kernels - constexpr uint32_t cb_out = tt::CB::c_out0; + constexpr uint32_t cb_out = tt::CBIndex::c_16; constexpr uint32_t onetile = 1; const uint32_t tile_bytes = get_tile_size(cb_out); const DataFormat data_format = get_dataformat(cb_out); diff --git a/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/multi_core/layernorm_post_all_gather_op_multi_core.cpp b/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/multi_core/layernorm_post_all_gather_op_multi_core.cpp index 49d190c659d..7df0db77878 100644 --- a/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/multi_core/layernorm_post_all_gather_op_multi_core.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/multi_core/layernorm_post_all_gather_op_multi_core.cpp @@ -286,64 +286,64 @@ operation::ProgramWithCallbacks layernorm_post_allgather_multi_core( 
// Create circular buffers // c_in0 -> a - CircularBufferConfig cb_src0_config = CircularBufferConfig(in0_tiles*in_single_tile_size, {{tt::CB::c_in0, in_data_format}}).set_page_size(tt::CB::c_in0, in_single_tile_size); + CircularBufferConfig cb_src0_config = CircularBufferConfig(in0_tiles*in_single_tile_size, {{tt::CBIndex::c_0, in_data_format}}).set_page_size(tt::CBIndex::c_0, in_single_tile_size); CreateCircularBuffer( program, all_cores, cb_src0_config ); // c_in1 -> stats - CircularBufferConfig cb_stats_config = CircularBufferConfig(in1_tiles*stats_single_tile_size, {{tt::CB::c_in1, stats_data_format}}).set_page_size(tt::CB::c_in1, stats_single_tile_size); + CircularBufferConfig cb_stats_config = CircularBufferConfig(in1_tiles*stats_single_tile_size, {{tt::CBIndex::c_1, stats_data_format}}).set_page_size(tt::CBIndex::c_1, stats_single_tile_size); CreateCircularBuffer( program, all_cores, cb_stats_config ); // c_in2 -> gamma if (gamma.has_value()) { - CircularBufferConfig cb_gamma_config = CircularBufferConfig(in2_tiles*gamma_single_tile_size, {{tt::CB::c_in2, gamma_cb_data_format}}).set_page_size(tt::CB::c_in2, gamma_single_tile_size); + CircularBufferConfig cb_gamma_config = CircularBufferConfig(in2_tiles*gamma_single_tile_size, {{tt::CBIndex::c_2, gamma_cb_data_format}}).set_page_size(tt::CBIndex::c_2, gamma_single_tile_size); CreateCircularBuffer( program, all_cores, cb_gamma_config ); } // c_in3 -> beta if (beta.has_value()) { - CircularBufferConfig cb_beta_config = CircularBufferConfig(in3_tiles*beta_single_tile_size, {{tt::CB::c_in3, beta_cb_data_format}}).set_page_size(tt::CB::c_in3, beta_single_tile_size); + CircularBufferConfig cb_beta_config = CircularBufferConfig(in3_tiles*beta_single_tile_size, {{tt::CBIndex::c_3, beta_cb_data_format}}).set_page_size(tt::CBIndex::c_3, beta_single_tile_size); CreateCircularBuffer( program, all_cores, cb_beta_config ); } // c_in4 -> epsilon - CircularBufferConfig cb_eps_config = 
CircularBufferConfig(in4_tiles*bfloat16_tile_size, {{tt::CB::c_in4, tt::DataFormat::Float16_b}}).set_page_size(tt::CB::c_in4, bfloat16_tile_size); + CircularBufferConfig cb_eps_config = CircularBufferConfig(in4_tiles*bfloat16_tile_size, {{tt::CBIndex::c_4, tt::DataFormat::Float16_b}}).set_page_size(tt::CBIndex::c_4, bfloat16_tile_size); CreateCircularBuffer( program, all_cores, cb_eps_config ); // c_in5 -> reduce scalar - CircularBufferConfig cb_reduce_config = CircularBufferConfig(in5_tiles*bfloat16_tile_size, {{tt::CB::c_in5, tt::DataFormat::Float16_b}}).set_page_size(tt::CB::c_in5, bfloat16_tile_size); + CircularBufferConfig cb_reduce_config = CircularBufferConfig(in5_tiles*bfloat16_tile_size, {{tt::CBIndex::c_5, tt::DataFormat::Float16_b}}).set_page_size(tt::CBIndex::c_5, bfloat16_tile_size); CreateCircularBuffer( program, all_cores, cb_reduce_config ); // LN and RMS shared intermediates // // c_intermed0 -> [mean(x**2), mean(x)] - CircularBufferConfig cb_intermed0_config = CircularBufferConfig(intermed0_tiles*single_tile_size, {{tt::CB::c_intermed0, cb_data_format}}).set_page_size(tt::CB::c_intermed0, single_tile_size); + CircularBufferConfig cb_intermed0_config = CircularBufferConfig(intermed0_tiles*single_tile_size, {{tt::CBIndex::c_24, cb_data_format}}).set_page_size(tt::CBIndex::c_24, single_tile_size); CreateCircularBuffer( program, all_cores, cb_intermed0_config ); // c_intermed2 -> var = mean(x**2) - mean(x)**2 - CircularBufferConfig cb_intermed2_config = CircularBufferConfig(intermed2_tiles*single_tile_size, {{tt::CB::c_intermed2, cb_data_format}}).set_page_size(tt::CB::c_intermed2, single_tile_size); + CircularBufferConfig cb_intermed2_config = CircularBufferConfig(intermed2_tiles*single_tile_size, {{tt::CBIndex::c_26, cb_data_format}}).set_page_size(tt::CBIndex::c_26, single_tile_size); CreateCircularBuffer( program, all_cores, cb_intermed2_config ); // c_intermed3 -> var + epsilon - CircularBufferConfig cb_intermed3_config = 
CircularBufferConfig(intermed3_tiles*single_tile_size, {{tt::CB::c_intermed3, cb_data_format}}).set_page_size(tt::CB::c_intermed3, single_tile_size); + CircularBufferConfig cb_intermed3_config = CircularBufferConfig(intermed3_tiles*single_tile_size, {{tt::CBIndex::c_27, cb_data_format}}).set_page_size(tt::CBIndex::c_27, single_tile_size); CreateCircularBuffer( program, all_cores, cb_intermed3_config ); // c_intermed4 -> 1/sqrt(var + epsilon) - CircularBufferConfig cb_intermed4_config = CircularBufferConfig(intermed4_tiles*single_tile_size, {{tt::CB::c_intermed4, cb_data_format}}).set_page_size(tt::CB::c_intermed4, single_tile_size); + CircularBufferConfig cb_intermed4_config = CircularBufferConfig(intermed4_tiles*single_tile_size, {{tt::CBIndex::c_28, cb_data_format}}).set_page_size(tt::CBIndex::c_28, single_tile_size); CreateCircularBuffer( program, all_cores, cb_intermed4_config ); // c_intermed6 -> (x - mean(x)) * 1/sqrt(var + epsilon) - CircularBufferConfig cb_intermed6_config = CircularBufferConfig(intermed6_tiles*single_tile_size, {{tt::CB::c_intermed6, cb_data_format}}).set_page_size(tt::CB::c_intermed6, single_tile_size); + CircularBufferConfig cb_intermed6_config = CircularBufferConfig(intermed6_tiles*single_tile_size, {{tt::CBIndex::c_30, cb_data_format}}).set_page_size(tt::CBIndex::c_30, single_tile_size); CreateCircularBuffer( program, all_cores, cb_intermed6_config ); // LN-specific intermediates if (!is_rmsnorm) { // c_intermed1 -> mean(x)**2 - CircularBufferConfig cb_intermed1_config = CircularBufferConfig(intermed1_tiles*single_tile_size, {{tt::CB::c_intermed1, cb_data_format}}).set_page_size(tt::CB::c_intermed1, single_tile_size); + CircularBufferConfig cb_intermed1_config = CircularBufferConfig(intermed1_tiles*single_tile_size, {{tt::CBIndex::c_25, cb_data_format}}).set_page_size(tt::CBIndex::c_25, single_tile_size); CreateCircularBuffer( program, all_cores, cb_intermed1_config ); // c_intermed5 -> x - mean(x) - CircularBufferConfig 
cb_intermed5_config = CircularBufferConfig(intermed5_tiles*single_tile_size, {{tt::CB::c_intermed5, cb_data_format}}).set_page_size(tt::CB::c_intermed5, single_tile_size); + CircularBufferConfig cb_intermed5_config = CircularBufferConfig(intermed5_tiles*single_tile_size, {{tt::CBIndex::c_29, cb_data_format}}).set_page_size(tt::CBIndex::c_29, single_tile_size); CreateCircularBuffer( program, all_cores, cb_intermed5_config ); if (beta.has_value()) { // Layernorm has gamma and beta so we need an extra intermediate buffer // c_intermed7 -> (x - mean(x)) * 1/sqrt(var + epsilon) * gamma - CircularBufferConfig cb_intermed7_config = CircularBufferConfig(intermed7_tiles*single_tile_size, {{tt::CB::c_intermed7, cb_data_format}}).set_page_size(tt::CB::c_intermed7, single_tile_size); + CircularBufferConfig cb_intermed7_config = CircularBufferConfig(intermed7_tiles*single_tile_size, {{tt::CBIndex::c_31, cb_data_format}}).set_page_size(tt::CBIndex::c_31, single_tile_size); CreateCircularBuffer( program, all_cores, cb_intermed7_config ); } } - CircularBufferConfig cb_out0_config = CircularBufferConfig(out0_tiles*out_single_tile_size, {{tt::CB::c_out0, out_data_format}}).set_page_size(tt::CB::c_out0, out_single_tile_size); + CircularBufferConfig cb_out0_config = CircularBufferConfig(out0_tiles*out_single_tile_size, {{tt::CBIndex::c_16, out_data_format}}).set_page_size(tt::CBIndex::c_16, out_single_tile_size); CreateCircularBuffer( program, all_cores, cb_out0_config ); // Log all circular buffers with program.circular_buffers_on_corerange(all_cores), which returns std::vector> diff --git a/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/multi_core/layernorm_pre_all_gather_op_multi_core.cpp b/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/multi_core/layernorm_pre_all_gather_op_multi_core.cpp index 2cb5bff6595..90b3a6c3c54 100644 --- 
a/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/multi_core/layernorm_pre_all_gather_op_multi_core.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed/device/multi_core/layernorm_pre_all_gather_op_multi_core.cpp @@ -199,18 +199,18 @@ operation::ProgramWithCallbacks layernorm_pre_allgather_multi_core( // Create circular buffers // c_in0 -> a - CircularBufferConfig cb_src0_config = CircularBufferConfig(in0_tiles*in_single_tile_size, {{tt::CB::c_in0, in_data_format}}).set_page_size(tt::CB::c_in0, in_single_tile_size); + CircularBufferConfig cb_src0_config = CircularBufferConfig(in0_tiles*in_single_tile_size, {{tt::CBIndex::c_0, in_data_format}}).set_page_size(tt::CBIndex::c_0, in_single_tile_size); CreateCircularBuffer( program, all_cores, cb_src0_config ); // c_in1 -> reduce scalar - CircularBufferConfig cb_reduce_config = CircularBufferConfig(in1_tiles*bfloat16_tile_size, {{tt::CB::c_in1, cb_data_format}}).set_page_size(tt::CB::c_in1, bfloat16_tile_size); + CircularBufferConfig cb_reduce_config = CircularBufferConfig(in1_tiles*bfloat16_tile_size, {{tt::CBIndex::c_1, cb_data_format}}).set_page_size(tt::CBIndex::c_1, bfloat16_tile_size); CreateCircularBuffer( program, all_cores, cb_reduce_config ); // LN and RMS shared intermediates // // c_intermed0 -> xˆ2 - CircularBufferConfig cb_intermed0_config = CircularBufferConfig(intermed0_tiles*single_tile_size, {{tt::CB::c_intermed0, cb_data_format}}).set_page_size(tt::CB::c_intermed0, single_tile_size); + CircularBufferConfig cb_intermed0_config = CircularBufferConfig(intermed0_tiles*single_tile_size, {{tt::CBIndex::c_24, cb_data_format}}).set_page_size(tt::CBIndex::c_24, single_tile_size); CreateCircularBuffer( program, all_cores, cb_intermed0_config ); - CircularBufferConfig cb_out0_config = CircularBufferConfig(out0_tiles*out_single_tile_size, {{tt::CB::c_out0, out_data_format}}).set_page_size(tt::CB::c_out0, out_single_tile_size); + CircularBufferConfig cb_out0_config = 
CircularBufferConfig(out0_tiles*out_single_tile_size, {{tt::CBIndex::c_16, out_data_format}}).set_page_size(tt::CBIndex::c_16, out_single_tile_size); CreateCircularBuffer( program, all_cores, cb_out0_config ); // Log all circular buffers with program.circular_buffers_on_corerange(all_cores), which returns std::vector> diff --git a/ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/compute/softmax.cpp b/ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/compute/softmax.cpp index 59b6e492192..213359f9746 100644 --- a/ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/compute/softmax.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/compute/softmax.cpp @@ -75,25 +75,25 @@ void MAIN { const uint32_t ndst = get_arg_val(3); const uint32_t start_ht = get_arg_val(4); const uint32_t mask_padded_data = get_arg_val(5); - binary_op_init_common(tt::CB::c_in0, tt::CB::c_in2, tt::CB::c_intermed0); + binary_op_init_common(tt::CBIndex::c_0, tt::CBIndex::c_2, tt::CBIndex::c_24); constexpr uint32_t onetile = 1; // reserve one tile for zeros on cb_in2 // We only do the reserve for the intermediates once and use pack_tile // So effectively these are used as pre-allocated arrays // Note that the entire W dimension must fit in the intermed0 CB for this kernel to be correct - constexpr auto cb_bcast_scaler = tt::CB::c_in2; - constexpr auto cb_fused_scale = tt::CB::c_in3; - constexpr auto cb_fused_attn = tt::CB::c_in4; - constexpr auto cb_mask_padded = tt::CB::c_in5; - constexpr auto cb_exps = tt::CB::c_intermed0; - constexpr auto cb_scale_mask = tt::CB::c_intermed3; - constexpr auto cb_recipsumexps = tt::CB::c_intermed1; - constexpr auto cb_in0 = tt::CB::c_in0; - constexpr auto cb_out0 = tt::CB::c_out0; + constexpr auto cb_bcast_scaler = tt::CBIndex::c_2; + constexpr auto cb_fused_scale = tt::CBIndex::c_3; + constexpr auto cb_fused_attn = tt::CBIndex::c_4; + constexpr auto cb_mask_padded = tt::CBIndex::c_5; + constexpr auto cb_exps = 
tt::CBIndex::c_24; + constexpr auto cb_scale_mask = tt::CBIndex::c_27; + constexpr auto cb_recipsumexps = tt::CBIndex::c_25; + constexpr auto cb_in0 = tt::CBIndex::c_0; + constexpr auto cb_out0 = tt::CBIndex::c_16; #ifdef NUMERIC_STABLE - constexpr auto cb_max = tt::CB::c_intermed2; - constexpr auto cb_x = tt::CB::c_intermed4; + constexpr auto cb_max = tt::CBIndex::c_26; + constexpr auto cb_x = tt::CBIndex::c_28; #else constexpr auto cb_x = cb_exps; #endif diff --git a/ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/compute/softmax_sharded.cpp b/ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/compute/softmax_sharded.cpp index 3bbee84224c..dcd49c623c1 100644 --- a/ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/compute/softmax_sharded.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/compute/softmax_sharded.cpp @@ -71,19 +71,19 @@ void MAIN { constexpr uint32_t subblock_w = get_compile_time_arg_val(2); constexpr uint32_t num_subblocks_w = get_compile_time_arg_val(3); - binary_op_init_common(tt::CB::c_in0, tt::CB::c_in1, tt::CB::c_intermed0); - - constexpr auto cb_in0 = tt::CB::c_in0; - constexpr auto cb_bcast_scaler = tt::CB::c_in1; - constexpr auto cb_fused_scale = tt::CB::c_in2; - constexpr auto cb_fused_attn = tt::CB::c_in3; - constexpr auto cb_exps = tt::CB::c_intermed0; - constexpr auto cb_recipsumexps = tt::CB::c_intermed1; - constexpr auto cb_scale_mask = tt::CB::c_intermed2; - constexpr auto cb_out0 = tt::CB::c_out0; + binary_op_init_common(tt::CBIndex::c_0, tt::CBIndex::c_1, tt::CBIndex::c_24); + + constexpr auto cb_in0 = tt::CBIndex::c_0; + constexpr auto cb_bcast_scaler = tt::CBIndex::c_1; + constexpr auto cb_fused_scale = tt::CBIndex::c_2; + constexpr auto cb_fused_attn = tt::CBIndex::c_3; + constexpr auto cb_exps = tt::CBIndex::c_24; + constexpr auto cb_recipsumexps = tt::CBIndex::c_25; + constexpr auto cb_scale_mask = tt::CBIndex::c_26; + constexpr auto cb_out0 = tt::CBIndex::c_16; #ifdef 
NUMERIC_STABLE - constexpr auto cb_max = tt::CB::c_intermed3; - constexpr auto cb_x = tt::CB::c_intermed4; + constexpr auto cb_max = tt::CBIndex::c_27; + constexpr auto cb_x = tt::CBIndex::c_28; #else constexpr auto cb_x = cb_exps; #endif diff --git a/ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/readed_unary_sharded_sm_causal_mask_hw_dims.cpp b/ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/readed_unary_sharded_sm_causal_mask_hw_dims.cpp index cd46048c55e..0f4288f5e55 100644 --- a/ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/readed_unary_sharded_sm_causal_mask_hw_dims.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/readed_unary_sharded_sm_causal_mask_hw_dims.cpp @@ -8,7 +8,7 @@ // HW-bcast scale for fused scale-attn-softmax FORCE_INLINE void generate_inv_sqrt_hw_bcast_tile() { - constexpr auto cb_fused_scale = tt::CB::c_in2; + constexpr auto cb_fused_scale = tt::CBIndex::c_2; uint32_t u = get_arg_val(1); cb_reserve_back(cb_fused_scale, 1); auto ptr = reinterpret_cast(get_write_ptr(cb_fused_scale)); @@ -17,7 +17,7 @@ FORCE_INLINE void generate_inv_sqrt_hw_bcast_tile() { } void kernel_main() { - constexpr uint32_t cb_reduce_scaler = tt::CB::c_in1; + constexpr uint32_t cb_reduce_scaler = tt::CBIndex::c_1; const uint32_t reduce_scaler = get_arg_val(0); constexpr uint32_t block_wt = get_compile_time_arg_val(0); @@ -27,7 +27,7 @@ void kernel_main() { const uint32_t mask_start_tile_id = get_arg_val(3); uint32_t mask_num_tiles = get_arg_val(4); - constexpr uint32_t cb_attn = tt::CB::c_in3; + constexpr uint32_t cb_attn = tt::CBIndex::c_3; uint32_t mask_tile_bytes = get_tile_size(cb_attn); const DataFormat mask_data_format = get_dataformat(cb_attn); uint32_t mask_id = mask_start_tile_id; @@ -35,7 +35,7 @@ void kernel_main() { const InterleavedAddrGenFast addr_mask = { .bank_base_address = mask_addr, .page_size = mask_tile_bytes, .data_format = mask_data_format}; - 
constexpr auto cb_fused_scale = tt::CB::c_in2; + constexpr auto cb_fused_scale = tt::CBIndex::c_2; const uint32_t pre_scale = get_arg_val(1); generate_bcast_unary_scalar(cb_fused_scale, pre_scale); diff --git a/ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/reader_unary_interleaved_sm.cpp b/ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/reader_unary_interleaved_sm.cpp index 45ed2cb50fc..2164eaa9a86 100644 --- a/ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/reader_unary_interleaved_sm.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/reader_unary_interleaved_sm.cpp @@ -51,7 +51,7 @@ void kernel_main() { uint32_t ht = start_ht; uint32_t mask_id = start_mask_id; bool read_mask = true; - constexpr auto cb_fused_scale = tt::CB::c_in3; + constexpr auto cb_fused_scale = tt::CBIndex::c_3; const uint32_t pre_scale = get_arg_val(2); generate_bcast_unary_scalar(cb_fused_scale, pre_scale); #endif diff --git a/ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/reader_unary_sharded_sm.cpp b/ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/reader_unary_sharded_sm.cpp index 7b33739f354..f274ad81b0d 100644 --- a/ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/reader_unary_sharded_sm.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/reader_unary_sharded_sm.cpp @@ -8,7 +8,7 @@ // HW-bcast scale for fused scale-attn-softmax FORCE_INLINE void generate_inv_sqrt_hw_bcast_tile() { - constexpr auto cb_fused_scale = tt::CB::c_in2; + constexpr auto cb_fused_scale = tt::CBIndex::c_2; uint32_t u = get_arg_val(1); cb_reserve_back(cb_fused_scale, 1); auto ptr = reinterpret_cast(get_write_ptr(cb_fused_scale)); @@ -18,7 +18,7 @@ FORCE_INLINE void generate_inv_sqrt_hw_bcast_tile() { void kernel_main() { - constexpr uint32_t cb_reduce_scaler = tt::CB::c_in1; + constexpr uint32_t cb_reduce_scaler = 
tt::CBIndex::c_1; const uint32_t reduce_scaler = get_arg_val(0); #if FUSED_SCALE_MASK @@ -27,7 +27,7 @@ void kernel_main() { const uint32_t mask_addr = get_arg_val(2); const uint32_t mask_start_tile_id = get_arg_val(3); - constexpr uint32_t cb_attn = tt::CB::c_in3; + constexpr uint32_t cb_attn = tt::CBIndex::c_3; uint32_t mask_tile_bytes = get_tile_size(cb_attn); const DataFormat mask_data_format = get_dataformat(cb_attn); uint32_t mask_id = mask_start_tile_id; @@ -38,7 +38,7 @@ void kernel_main() { .data_format = mask_data_format }; - constexpr auto cb_fused_scale = tt::CB::c_in2; + constexpr auto cb_fused_scale = tt::CBIndex::c_2; const uint32_t pre_scale = get_arg_val(1); generate_bcast_unary_scalar(cb_fused_scale, pre_scale); diff --git a/ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/reader_unary_sharded_sm_rm_mask.cpp b/ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/reader_unary_sharded_sm_rm_mask.cpp index 4c5b8601a1c..fdf556be983 100644 --- a/ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/reader_unary_sharded_sm_rm_mask.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/reader_unary_sharded_sm_rm_mask.cpp @@ -15,7 +15,7 @@ void kernel_main() { const uint32_t mask_addr = get_arg_val(2); const uint32_t mask_start_tile_id = get_arg_val(3); - constexpr uint32_t cb_attn = tt::CB::c_in3; + constexpr uint32_t cb_attn = tt::CBIndex::c_3; uint32_t mask_tile_bytes = get_tile_size(cb_attn); #define stick_size_is_pow2 get_compile_time_arg_val(2) == 1 @@ -36,7 +36,7 @@ void kernel_main() { }; #endif - constexpr auto cb_fused_scale = tt::CB::c_in2; + constexpr auto cb_fused_scale = tt::CBIndex::c_2; const uint32_t pre_scale = get_arg_val(1); generate_bcast_unary_scalar(cb_fused_scale, pre_scale); @@ -58,7 +58,7 @@ void kernel_main() { #endif { - constexpr uint32_t cb_reduce_scaler = tt::CB::c_in1; + constexpr uint32_t cb_reduce_scaler = tt::CBIndex::c_1; const uint32_t 
reduce_scaler = get_arg_val(0); generate_reduce_scaler(cb_reduce_scaler, reduce_scaler); } diff --git a/ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked_sm.cpp b/ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked_sm.cpp index 885c558c41f..02619645dc4 100644 --- a/ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked_sm.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked_sm.cpp @@ -59,7 +59,7 @@ void kernel_main() { const uint32_t tile_bytes = get_tile_size(cb_id_out0); const DataFormat data_format = get_dataformat(cb_id_out0); - constexpr uint32_t cb_id_mask = tt::CB::c_in5; + constexpr uint32_t cb_id_mask = tt::CBIndex::c_5; const uint32_t mask_padded_data = get_arg_val(4); const uint32_t num_datum_padded = get_arg_val(5); const uint32_t val_to_pad = get_arg_val(6); diff --git a/ttnn/cpp/ttnn/operations/normalization/softmax/device/multi_core/softmax_op_multi_core.cpp b/ttnn/cpp/ttnn/operations/normalization/softmax/device/multi_core/softmax_op_multi_core.cpp index 28ae322b1e3..b358b894a26 100644 --- a/ttnn/cpp/ttnn/operations/normalization/softmax/device/multi_core/softmax_op_multi_core.cpp +++ b/ttnn/cpp/ttnn/operations/normalization/softmax/device/multi_core/softmax_op_multi_core.cpp @@ -113,7 +113,7 @@ operation::ProgramWithCallbacks scale_mask_softmax_multi_core( uint32_t im2_t = 1; uint32_t im4_t = tt::div_up(Wt, block_size)*block_size; - // cb_exps - keeps exps in tt::CB in L1 to avoid recomputing + // cb_exps - keeps exps in tt::CBIndex in L1 to avoid recomputing uint32_t im0_t = block_size*tt::div_up(Wt, block_size); TT_ASSERT(im0_t == Wt); @@ -192,38 +192,38 @@ operation::ProgramWithCallbacks scale_mask_softmax_multi_core( // Create circular buffers // see softmax.cpp for which buffers are needed - auto 
c_in0_config = CircularBufferConfig(in0_t * in0_tile_size, {{tt::CB::c_in0, in0_cb_data_format}}).set_page_size(tt::CB::c_in0, in0_tile_size); + auto c_in0_config = CircularBufferConfig(in0_t * in0_tile_size, {{tt::CBIndex::c_0, in0_cb_data_format}}).set_page_size(tt::CBIndex::c_0, in0_tile_size); auto cb_in0_id = CreateCircularBuffer( program, all_device_cores, c_in0_config); - auto c_out0_config = CircularBufferConfig(out0_t * out0_tile_size, {{tt::CB::c_out0, out0_cb_data_format}}).set_page_size(tt::CB::c_out0, out0_tile_size); + auto c_out0_config = CircularBufferConfig(out0_t * out0_tile_size, {{tt::CBIndex::c_16, out0_cb_data_format}}).set_page_size(tt::CBIndex::c_16, out0_tile_size); auto cb_out0_id = CreateCircularBuffer( program, all_device_cores, c_out0_config ); - auto c_intermed1_config = CircularBufferConfig(im1_t * im_tile_size, {{tt::CB::c_intermed1, im_cb_data_format}}).set_page_size(tt::CB::c_intermed1, im_tile_size); + auto c_intermed1_config = CircularBufferConfig(im1_t * im_tile_size, {{tt::CBIndex::c_25, im_cb_data_format}}).set_page_size(tt::CBIndex::c_25, im_tile_size); auto cb_intermed1_id = CreateCircularBuffer( program, all_device_cores, c_intermed1_config ); - auto c_in2_config = CircularBufferConfig(in2_t * scalar_tile_size, {{tt::CB::c_in2, scalar_cb_data_format}}).set_page_size(tt::CB::c_in2, scalar_tile_size); + auto c_in2_config = CircularBufferConfig(in2_t * scalar_tile_size, {{tt::CBIndex::c_2, scalar_cb_data_format}}).set_page_size(tt::CBIndex::c_2, scalar_tile_size); auto cb_in2_id = CreateCircularBuffer( program, all_device_cores, c_in2_config ); - auto c_intermed0_config = CircularBufferConfig(im0_t * im_tile_size, {{tt::CB::c_intermed0, im_cb_data_format}}).set_page_size(tt::CB::c_intermed0, im_tile_size); + auto c_intermed0_config = CircularBufferConfig(im0_t * im_tile_size, {{tt::CBIndex::c_24, im_cb_data_format}}).set_page_size(tt::CBIndex::c_24, im_tile_size); auto cb_intermed0_id = CreateCircularBuffer( program, 
all_device_cores, c_intermed0_config ); std::optional cb_intermed3_id; std::optional cb_in3_id; std::optional cb_in4_id; std::optional cb_in5_id; if (mask.has_value()) { - CircularBufferConfig c_intermed3_config = CircularBufferConfig(im3_t * im_tile_size, {{tt::CB::c_intermed3, im_cb_data_format}}).set_page_size(tt::CB::c_intermed3, im_tile_size); + CircularBufferConfig c_intermed3_config = CircularBufferConfig(im3_t * im_tile_size, {{tt::CBIndex::c_27, im_cb_data_format}}).set_page_size(tt::CBIndex::c_27, im_tile_size); cb_intermed3_id = CreateCircularBuffer( program, all_device_cores, c_intermed3_config ); - CircularBufferConfig c_in3_config = CircularBufferConfig(in3_t * scalar_tile_size, {{tt::CB::c_in3, scalar_cb_data_format}}).set_page_size(tt::CB::c_in3, scalar_tile_size); + CircularBufferConfig c_in3_config = CircularBufferConfig(in3_t * scalar_tile_size, {{tt::CBIndex::c_3, scalar_cb_data_format}}).set_page_size(tt::CBIndex::c_3, scalar_tile_size); cb_in3_id = CreateCircularBuffer( program, all_device_cores, c_in3_config ); - CircularBufferConfig c_in4_config = CircularBufferConfig(in4_t * mask_tile_size, {{tt::CB::c_in4, mask_cb_data_format}}).set_page_size(tt::CB::c_in4, mask_tile_size); + CircularBufferConfig c_in4_config = CircularBufferConfig(in4_t * mask_tile_size, {{tt::CBIndex::c_4, mask_cb_data_format}}).set_page_size(tt::CBIndex::c_4, mask_tile_size); cb_in4_id = CreateCircularBuffer( program, all_device_cores, c_in4_config); } - CircularBufferConfig c_in5_config = CircularBufferConfig(in5_t * mask_tile_size, {{tt::CB::c_in5, mask_cb_data_format}}).set_page_size(tt::CB::c_in5, mask_tile_size); + CircularBufferConfig c_in5_config = CircularBufferConfig(in5_t * mask_tile_size, {{tt::CBIndex::c_5, mask_cb_data_format}}).set_page_size(tt::CBIndex::c_5, mask_tile_size); cb_in5_id = CreateCircularBuffer( program, all_device_cores, c_in5_config); std::optional cb_intermed2_id; std::optional cb_intermed4_id; if (numeric_stable) { // cb_max - auto 
c_intermed2_config = CircularBufferConfig(im2_t * im_tile_size, {{tt::CB::c_intermed2, im_cb_data_format}}).set_page_size(tt::CB::c_intermed2, im_tile_size); + auto c_intermed2_config = CircularBufferConfig(im2_t * im_tile_size, {{tt::CBIndex::c_26, im_cb_data_format}}).set_page_size(tt::CBIndex::c_26, im_tile_size); cb_intermed2_id = CreateCircularBuffer( program, all_device_cores, c_intermed2_config ); // cb_x - auto c_x_config = CircularBufferConfig(im4_t * im_tile_size, {{tt::CB::c_intermed4, im_cb_data_format}}).set_page_size(tt::CB::c_intermed4, im_tile_size); + auto c_x_config = CircularBufferConfig(im4_t * im_tile_size, {{tt::CBIndex::c_28, im_cb_data_format}}).set_page_size(tt::CBIndex::c_28, im_tile_size); cb_intermed4_id = CreateCircularBuffer( program, all_device_cores, c_x_config); } @@ -340,7 +340,7 @@ operation::ProgramWithCallbacks scale_mask_softmax_multi_core( uint32_t im2_t = 1; uint32_t im4_t = tt::div_up(Wt, block_size)*block_size; - // cb_exps - keeps exps in tt::CB in L1 to avoid recomputing + // cb_exps - keeps exps in tt::CBIndex in L1 to avoid recomputing uint32_t im0_t = block_size*tt::div_up(Wt, block_size); TT_ASSERT(im0_t == Wt); @@ -552,7 +552,7 @@ operation::ProgramWithCallbacks scale_mask_softmax_sharded_multi_core( } else { in3_CB_size = block_wt * mask_tile_size; } - // cb_exps - keeps exps in tt::CB in L1 to avoid recomputing + // cb_exps - keeps exps in tt::CBIndex in L1 to avoid recomputing uint32_t im0_CB_size = block_wt * im_tile_size; // 1/sum(exp(x)) uint32_t im1_CB_size = 1 * im_tile_size; @@ -658,12 +658,12 @@ operation::ProgramWithCallbacks scale_mask_softmax_sharded_multi_core( // Create circular buffers // in0 sharded - auto c_in0_config = CircularBufferConfig(in0_CB_size, {{tt::CB::c_in0, in0_cb_data_format}}) - .set_page_size(tt::CB::c_in0, in0_tile_size).set_globally_allocated_address(*src0_buffer); + auto c_in0_config = CircularBufferConfig(in0_CB_size, {{tt::CBIndex::c_0, in0_cb_data_format}}) + 
.set_page_size(tt::CBIndex::c_0, in0_tile_size).set_globally_allocated_address(*src0_buffer); auto cb_in0_id = CreateCircularBuffer(program, all_device_cores, c_in0_config); // in1 scalar - auto c_in1_config = CircularBufferConfig(in1_CB_size, {{tt::CB::c_in1, scalar_cb_data_format}}) - .set_page_size(tt::CB::c_in1, scalar_tile_size); + auto c_in1_config = CircularBufferConfig(in1_CB_size, {{tt::CBIndex::c_1, scalar_cb_data_format}}) + .set_page_size(tt::CBIndex::c_1, scalar_tile_size); auto cb_in1_id = CreateCircularBuffer(program, all_device_cores, c_in1_config); // in2 in3 attn scale mask std::optional cb_intermed2_id; @@ -671,45 +671,45 @@ operation::ProgramWithCallbacks scale_mask_softmax_sharded_multi_core( std::optional cb_in3_id; if (mask.has_value()) { // im2 - auto c_intermed2_config = CircularBufferConfig(im2_CB_size, {{tt::CB::c_intermed2, im_cb_data_format}}) - .set_page_size(tt::CB::c_intermed2, im_tile_size); + auto c_intermed2_config = CircularBufferConfig(im2_CB_size, {{tt::CBIndex::c_26, im_cb_data_format}}) + .set_page_size(tt::CBIndex::c_26, im_tile_size); cb_intermed2_id = CreateCircularBuffer( program, all_device_cores, c_intermed2_config ); // in2 scale - auto c_in2_config = CircularBufferConfig(in2_CB_size, {{tt::CB::c_in2, scale_cb_data_format}}) - .set_page_size(tt::CB::c_in2, scale_tile_size); + auto c_in2_config = CircularBufferConfig(in2_CB_size, {{tt::CBIndex::c_2, scale_cb_data_format}}) + .set_page_size(tt::CBIndex::c_2, scale_tile_size); cb_in2_id = CreateCircularBuffer(program, all_device_cores, c_in2_config); // in3 attn mask if (mask->is_sharded()) { auto mask_buffer = mask->buffer(); - auto c_in3_config = CircularBufferConfig(in3_CB_size, {{tt::CB::c_in3, mask_cb_data_format}}) - .set_page_size(tt::CB::c_in3, mask_tile_size).set_globally_allocated_address(*mask_buffer); + auto c_in3_config = CircularBufferConfig(in3_CB_size, {{tt::CBIndex::c_3, mask_cb_data_format}}) + .set_page_size(tt::CBIndex::c_3, 
mask_tile_size).set_globally_allocated_address(*mask_buffer); cb_in3_id = CreateCircularBuffer( program, all_device_cores, c_in3_config); } else { - auto c_in3_config = CircularBufferConfig(in3_CB_size, {{tt::CB::c_in3, mask_cb_data_format}}) - .set_page_size(tt::CB::c_in3, mask_tile_size); + auto c_in3_config = CircularBufferConfig(in3_CB_size, {{tt::CBIndex::c_3, mask_cb_data_format}}) + .set_page_size(tt::CBIndex::c_3, mask_tile_size); cb_in3_id = CreateCircularBuffer( program, all_device_cores, c_in3_config); } } // out - auto c_out0_config = CircularBufferConfig(out_CB_size, {{tt::CB::c_out0, out0_cb_data_format}}) - .set_page_size(tt::CB::c_out0, out0_tile_size).set_globally_allocated_address(*out0_buffer); + auto c_out0_config = CircularBufferConfig(out_CB_size, {{tt::CBIndex::c_16, out0_cb_data_format}}) + .set_page_size(tt::CBIndex::c_16, out0_tile_size).set_globally_allocated_address(*out0_buffer); auto cb_out0_id = CreateCircularBuffer( program, all_device_cores, c_out0_config ); // im0 for exp(x) - auto c_intermed0_config = CircularBufferConfig(im0_CB_size, {{tt::CB::c_intermed0, im_cb_data_format}}) - .set_page_size(tt::CB::c_intermed0, im_tile_size); + auto c_intermed0_config = CircularBufferConfig(im0_CB_size, {{tt::CBIndex::c_24, im_cb_data_format}}) + .set_page_size(tt::CBIndex::c_24, im_tile_size); auto cb_intermed0_id = CreateCircularBuffer( program, all_device_cores, c_intermed0_config ); // im1 for 1/sum(exp(x)) - auto c_intermed1_config = CircularBufferConfig(im1_CB_size, {{tt::CB::c_intermed1, im_cb_data_format}}) - .set_page_size(tt::CB::c_intermed1, im_tile_size); + auto c_intermed1_config = CircularBufferConfig(im1_CB_size, {{tt::CBIndex::c_25, im_cb_data_format}}) + .set_page_size(tt::CBIndex::c_25, im_tile_size); auto cb_intermed1_id = CreateCircularBuffer( program, all_device_cores, c_intermed1_config ); if (numeric_stable) { // cb_max - auto c_intermed3_config = CircularBufferConfig(max_CB_size, {{tt::CB::c_intermed3, 
im_cb_data_format}}) - .set_page_size(tt::CB::c_intermed3, im_tile_size); + auto c_intermed3_config = CircularBufferConfig(max_CB_size, {{tt::CBIndex::c_27, im_cb_data_format}}) + .set_page_size(tt::CBIndex::c_27, im_tile_size); auto cb_intermed3_id = CreateCircularBuffer( program, all_device_cores, c_intermed3_config ); // cb_x - auto c_intermed4_config = CircularBufferConfig(x_CB_size, {{tt::CB::c_intermed4, im_cb_data_format}}) - .set_page_size(tt::CB::c_intermed4, im_tile_size); + auto c_intermed4_config = CircularBufferConfig(x_CB_size, {{tt::CBIndex::c_28, im_cb_data_format}}) + .set_page_size(tt::CBIndex::c_28, im_tile_size); auto cb_intermed4_id = CreateCircularBuffer( program, all_device_cores, c_intermed4_config ); } diff --git a/ttnn/cpp/ttnn/operations/pool/downsample/device/downsample_op.cpp b/ttnn/cpp/ttnn/operations/pool/downsample/device/downsample_op.cpp index 50ac5678920..951deb5b8c2 100644 --- a/ttnn/cpp/ttnn/operations/pool/downsample/device/downsample_op.cpp +++ b/ttnn/cpp/ttnn/operations/pool/downsample/device/downsample_op.cpp @@ -11,6 +11,7 @@ #include "tt_metal/common/constants.hpp" using namespace tt::constants; +using namespace tt::tt_metal; namespace ttnn::operations::downsample{ diff --git a/ttnn/cpp/ttnn/operations/pool/downsample/device/downsample_program_factory.cpp b/ttnn/cpp/ttnn/operations/pool/downsample/device/downsample_program_factory.cpp index c0b6399660a..95757b17eda 100644 --- a/ttnn/cpp/ttnn/operations/pool/downsample/device/downsample_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/pool/downsample/device/downsample_program_factory.cpp @@ -15,6 +15,7 @@ #include "tt_metal/host_api.hpp" using namespace tt::constants; +using namespace tt::tt_metal; namespace ttnn::operations::downsample::detail { @@ -431,7 +432,7 @@ operation::ProgramWithCallbacks downsample_single_core( TT_ASSERT(output_shard_height % TILE_HEIGHT == 0); uint32_t num_rows_of_output_tiles = output_shard_height / TILE_HEIGHT; - uint32_t input_cb_index = 
tt::CB::c_in0; + uint32_t input_cb_index = tt::CBIndex::c_0; uint32_t num_input_tiles = num_input_tiles_in_row * num_rows_of_input_tiles; tt::tt_metal::CircularBufferConfig input_cb_config = tt::tt_metal::CircularBufferConfig( @@ -449,7 +450,7 @@ operation::ProgramWithCallbacks downsample_single_core( // CB to store halo data // hardcode to store 1 row of tiles - uint32_t halo_prev_input_cb_index = tt::CB::c_in1; + uint32_t halo_prev_input_cb_index = tt::CBIndex::c_1; uint32_t halo_prev_input_cb_max_rows_of_tiles = 4; uint32_t num_halo_prev_cb_input_tiles = num_input_tiles_in_row * halo_prev_input_cb_max_rows_of_tiles; tt::tt_metal::CircularBufferConfig halo_prev_input_cb_config = @@ -465,7 +466,7 @@ operation::ProgramWithCallbacks downsample_single_core( num_halo_prev_cb_input_tiles, input_single_tile_size * num_halo_prev_cb_input_tiles); - uint32_t halo_next_input_cb_index = tt::CB::c_in2; + uint32_t halo_next_input_cb_index = tt::CBIndex::c_2; uint32_t halo_next_input_cb_max_rows_of_tiles = 33; // TODO: Remove hardcoding uint32_t num_halo_next_cb_input_tiles = num_input_tiles_in_row * halo_next_input_cb_max_rows_of_tiles; tt::tt_metal::CircularBufferConfig halo_next_input_cb_config = @@ -484,7 +485,7 @@ operation::ProgramWithCallbacks downsample_single_core( // CB to store reader pattern array // read pattern array size == output_height uint32_t reader_pattern_array_size = output_shard_height; - uint32_t reader_pattern_array_cb_index = tt::CB::c_intermed1; + uint32_t reader_pattern_array_cb_index = tt::CBIndex::c_25; tt::tt_metal::CircularBufferConfig reader_pattern_array_cb_config = tt::tt_metal::CircularBufferConfig( reader_pattern_array_size * 4, {{reader_pattern_array_cb_index, tt::DataFormat::Float16_b}}) @@ -499,7 +500,7 @@ operation::ProgramWithCallbacks downsample_single_core( 4 * reader_pattern_array_size); // untilized CB has size - [32, full width] - uint32_t untilize_cb_index = tt::CB::c_intermed2; + uint32_t untilize_cb_index = tt::CBIndex::c_26; 
uint32_t num_tiles_untilize_cb = num_input_tiles_in_row; tt::tt_metal::CircularBufferConfig untilize_cb_config = tt::tt_metal::CircularBufferConfig( @@ -515,7 +516,7 @@ operation::ProgramWithCallbacks downsample_single_core( untilized_single_tile_size * num_tiles_untilize_cb); uint32_t num_output_tiles = num_output_tiles_in_row * num_rows_of_output_tiles; - uint32_t untilize_downsampled_cb_index = tt::CB::c_intermed3; + uint32_t untilize_downsampled_cb_index = tt::CBIndex::c_27; uint32_t num_tiles_untilize_downsampled_cb = num_output_tiles; // untilize downsampled cb size == output size per core tt::tt_metal::CircularBufferConfig untilize_downsampled_cb_config = @@ -532,7 +533,7 @@ operation::ProgramWithCallbacks downsample_single_core( num_tiles_untilize_downsampled_cb, untilized_single_tile_size * num_tiles_untilize_downsampled_cb); - uint32_t final_tilize_output_cb_index = tt::CB::c_out0; + uint32_t final_tilize_output_cb_index = tt::CBIndex::c_16; uint32_t num_tiles_final_tilize_output_cb = num_output_tiles; // final output cb size == output size per core tt::tt_metal::CircularBufferConfig final_tilize_output_cb_config = tt::tt_metal::CircularBufferConfig( diff --git a/ttnn/cpp/ttnn/operations/pool/downsample/downsample.cpp b/ttnn/cpp/ttnn/operations/pool/downsample/downsample.cpp index 71f2acd36f2..76a230e93a4 100644 --- a/ttnn/cpp/ttnn/operations/pool/downsample/downsample.cpp +++ b/ttnn/cpp/ttnn/operations/pool/downsample/downsample.cpp @@ -5,6 +5,8 @@ #include "downsample.hpp" #include "device/downsample_op.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::downsample { Tensor ExecuteDownsample::invoke( diff --git a/ttnn/cpp/ttnn/operations/pool/avgpool/avg_pool.cpp b/ttnn/cpp/ttnn/operations/pool/global_avg_pool/global_avg_pool.cpp similarity index 89% rename from ttnn/cpp/ttnn/operations/pool/avgpool/avg_pool.cpp rename to ttnn/cpp/ttnn/operations/pool/global_avg_pool/global_avg_pool.cpp index d6b67a146ff..6b1ae1eea97 100644 --- 
a/ttnn/cpp/ttnn/operations/pool/avgpool/avg_pool.cpp +++ b/ttnn/cpp/ttnn/operations/pool/global_avg_pool/global_avg_pool.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "ttnn/operations/pool/avgpool/avg_pool.hpp" +#include "ttnn/operations/pool/global_avg_pool/global_avg_pool.hpp" #include "ttnn/operations/reduction/generic/generic_reductions.hpp" namespace tt { @@ -22,7 +22,7 @@ Tensor pool_2d(const Tensor& input, const MemoryConfig& memory_config, const std } } -Tensor avg_pool2d(const Tensor& input, const MemoryConfig& memory_config, const std::optional& output_dtype) { +Tensor global_avg_pool2d(const Tensor& input, const MemoryConfig& memory_config, const std::optional& output_dtype) { TT_FATAL(input.storage_type() == StorageType::DEVICE, "Input tensor needs to be on device"); auto output = input; diff --git a/ttnn/cpp/ttnn/operations/pool/avgpool/avg_pool.hpp b/ttnn/cpp/ttnn/operations/pool/global_avg_pool/global_avg_pool.hpp similarity index 72% rename from ttnn/cpp/ttnn/operations/pool/avgpool/avg_pool.hpp rename to ttnn/cpp/ttnn/operations/pool/global_avg_pool/global_avg_pool.hpp index 047c3bf7368..f4c22b5b805 100644 --- a/ttnn/cpp/ttnn/operations/pool/avgpool/avg_pool.hpp +++ b/ttnn/cpp/ttnn/operations/pool/global_avg_pool/global_avg_pool.hpp @@ -12,17 +12,17 @@ namespace tt { namespace tt_metal { -enum class PoolType { - AVG -}; +enum class PoolType { AVG }; -Tensor avg_pool2d(const Tensor& input, const MemoryConfig& memory_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG, const std::optional& output_dtype = std::nullopt); +Tensor global_avg_pool2d( + const Tensor& input, + const MemoryConfig& memory_config = operation::DEFAULT_OUTPUT_MEMORY_CONFIG, + const std::optional& output_dtype = std::nullopt); } // namespace tt_metal } // namespace tt - -#include "ttnn/operations/pool/avgpool/avg_pool.hpp" +#include "ttnn/operations/pool/global_avg_pool/global_avg_pool.hpp" #include "ttnn/decorators.hpp" #include 
"ttnn/operations/core/core.hpp" @@ -36,7 +36,7 @@ struct GlobalAveragePool2D { const std::optional& memory_config_arg = std::nullopt, const std::optional& output_dtype = std::nullopt) { auto memory_config = memory_config_arg.value_or(input.memory_config()); - auto result = tt::tt_metal::avg_pool2d(input, memory_config, output_dtype); + auto result = tt::tt_metal::global_avg_pool2d(input, memory_config, output_dtype); return result; } }; diff --git a/ttnn/cpp/ttnn/operations/pool/avgpool/avg_pool_pybind.hpp b/ttnn/cpp/ttnn/operations/pool/global_avg_pool/global_avg_pool_pybind.hpp similarity index 67% rename from ttnn/cpp/ttnn/operations/pool/avgpool/avg_pool_pybind.hpp rename to ttnn/cpp/ttnn/operations/pool/global_avg_pool/global_avg_pool_pybind.hpp index 6e71f05120c..320ff58ecc7 100644 --- a/ttnn/cpp/ttnn/operations/pool/avgpool/avg_pool_pybind.hpp +++ b/ttnn/cpp/ttnn/operations/pool/global_avg_pool/global_avg_pool_pybind.hpp @@ -8,7 +8,7 @@ #include #include "ttnn/cpp/pybind11/decorators.hpp" -#include "ttnn/operations/pool/avgpool/avg_pool.hpp" +#include "ttnn/operations/pool/global_avg_pool/global_avg_pool.hpp" #include "ttnn/types.hpp" namespace py = pybind11; @@ -64,23 +64,6 @@ void bind_global_avg_pool2d(py::module& module) { void py_module(py::module& module) { detail::bind_global_avg_pool2d(module); - module.def( - "avg_pool2d", - &avg_pool2d, - py::arg().noconvert(), - py::kw_only(), - py::arg("memory_config").noconvert() = operation::DEFAULT_OUTPUT_MEMORY_CONFIG, - py::arg("dtype").noconvert() = std::nullopt, - R"doc( - Average Pool 2D - It operates on tensors that have channels as the last dimension. 
- - +----------+----------------------------+------------+-------------------------------+----------+ - | Argument | Description | Data type | Valid range | Required | - +==========+============================+============+===============================+==========+ - | act | Input activations tensor | Tensor | | Yes | - +----------+----------------------------+------------+-------------------------------+----------+ - )doc"); } } // namespace avgpool diff --git a/ttnn/cpp/ttnn/operations/pool/maxpool/device/kernels/compute/max_pool_multi_core.cpp b/ttnn/cpp/ttnn/operations/pool/maxpool/device/kernels/compute/max_pool_multi_core.cpp index 4296c4c042b..7a7f1fc4e08 100644 --- a/ttnn/cpp/ttnn/operations/pool/maxpool/device/kernels/compute/max_pool_multi_core.cpp +++ b/ttnn/cpp/ttnn/operations/pool/maxpool/device/kernels/compute/max_pool_multi_core.cpp @@ -87,10 +87,10 @@ void MAIN { constexpr uint32_t in_c = get_compile_time_arg_val(14); constexpr uint32_t in_nblocks_c = get_compile_time_arg_val(15); - constexpr uint32_t in_cb_id = tt::CB::c_in0; // and tt::CB::c_in1 for split reader - constexpr uint32_t in_scalar_cb_id = tt::CB::c_in4; - constexpr uint32_t in_tiled_cb_id = tt::CB::c_intermed0; - constexpr uint32_t out_cb_id = tt::CB::c_out0; + constexpr uint32_t in_cb_id = tt::CBIndex::c_0; // and tt::CBIndex::c_1 for split reader + constexpr uint32_t in_scalar_cb_id = tt::CBIndex::c_4; + constexpr uint32_t in_tiled_cb_id = tt::CBIndex::c_24; + constexpr uint32_t out_cb_id = tt::CBIndex::c_16; constexpr bool is_partial_tile = in_c < 32; static_assert((!is_partial_tile || (in_c == 16)), "Partial tile must have c_dim 16"); diff --git a/ttnn/cpp/ttnn/operations/pool/maxpool/device/kernels/compute/max_pool_multi_core_large_kernel.cpp b/ttnn/cpp/ttnn/operations/pool/maxpool/device/kernels/compute/max_pool_multi_core_large_kernel.cpp index 346a8ef8652..88d3b76ee20 100644 --- 
a/ttnn/cpp/ttnn/operations/pool/maxpool/device/kernels/compute/max_pool_multi_core_large_kernel.cpp +++ b/ttnn/cpp/ttnn/operations/pool/maxpool/device/kernels/compute/max_pool_multi_core_large_kernel.cpp @@ -96,11 +96,11 @@ void MAIN { constexpr uint32_t in_nblocks_c = get_compile_time_arg_val(15); constexpr uint32_t max_rows_for_reduction = get_compile_time_arg_val(16); - constexpr uint32_t in_cb_id = tt::CB::c_in0; // and tt::CB::c_in1 for split reader - constexpr uint32_t in_scalar_cb_id = tt::CB::c_in4; - constexpr uint32_t in_tiled_cb_id = tt::CB::c_intermed0; - constexpr uint32_t out_cb_id = tt::CB::c_out0; - constexpr uint32_t interm_reduction_cb_id = tt::CB::c_intermed1; + constexpr uint32_t in_cb_id = tt::CBIndex::c_0; // and tt::CBIndex::c_1 for split reader + constexpr uint32_t in_scalar_cb_id = tt::CBIndex::c_4; + constexpr uint32_t in_tiled_cb_id = tt::CBIndex::c_24; + constexpr uint32_t out_cb_id = tt::CBIndex::c_16; + constexpr uint32_t interm_reduction_cb_id = tt::CBIndex::c_25; constexpr uint32_t MAX_TILES_PER_REDUCTION = 8; diff --git a/ttnn/cpp/ttnn/operations/pool/maxpool/device/kernels/dataflow/reader_max_pool_2d_multi_core_sharded_with_halo_large_kernel_v2.cpp b/ttnn/cpp/ttnn/operations/pool/maxpool/device/kernels/dataflow/reader_max_pool_2d_multi_core_sharded_with_halo_large_kernel_v2.cpp index be922e6da3c..32a22a28446 100644 --- a/ttnn/cpp/ttnn/operations/pool/maxpool/device/kernels/dataflow/reader_max_pool_2d_multi_core_sharded_with_halo_large_kernel_v2.cpp +++ b/ttnn/cpp/ttnn/operations/pool/maxpool/device/kernels/dataflow/reader_max_pool_2d_multi_core_sharded_with_halo_large_kernel_v2.cpp @@ -73,11 +73,11 @@ void kernel_main() { constexpr uint32_t MAX_ELE_PER_REDUCTION = 512; // TILE_WIDTH * 8 * numbytes constexpr uint32_t ROW_HW = 64; - constexpr uint32_t in_cb_id = (reader_id == 1) ? 
tt::CB::c_in1 : tt::CB::c_in0; - constexpr uint32_t in_shard_cb_id = tt::CB::c_in2; // local input shard - constexpr uint32_t in_reader_indices_cb_id = tt::CB::c_in3; - constexpr uint32_t in_scalar_cb_id = tt::CB::c_in4; - constexpr uint32_t interm_reduction_cb_id = tt::CB::c_intermed1; + constexpr uint32_t in_cb_id = (reader_id == 1) ? tt::CBIndex::c_1 : tt::CBIndex::c_0; + constexpr uint32_t in_shard_cb_id = tt::CBIndex::c_2; // local input shard + constexpr uint32_t in_reader_indices_cb_id = tt::CBIndex::c_3; + constexpr uint32_t in_scalar_cb_id = tt::CBIndex::c_4; + constexpr uint32_t interm_reduction_cb_id = tt::CBIndex::c_25; // minus infinity for bfp16 uint16_t minus_inf = 63487; diff --git a/ttnn/cpp/ttnn/operations/pool/maxpool/device/kernels/dataflow/reader_max_pool_2d_multi_core_sharded_with_halo_v2.cpp b/ttnn/cpp/ttnn/operations/pool/maxpool/device/kernels/dataflow/reader_max_pool_2d_multi_core_sharded_with_halo_v2.cpp index a313d7cf73d..4197dd633cf 100644 --- a/ttnn/cpp/ttnn/operations/pool/maxpool/device/kernels/dataflow/reader_max_pool_2d_multi_core_sharded_with_halo_v2.cpp +++ b/ttnn/cpp/ttnn/operations/pool/maxpool/device/kernels/dataflow/reader_max_pool_2d_multi_core_sharded_with_halo_v2.cpp @@ -65,10 +65,10 @@ void kernel_main() { constexpr uint32_t TILE_WIDTH = 32; - constexpr uint32_t in_cb_id = (reader_id == 1) ? tt::CB::c_in1 : tt::CB::c_in0; - constexpr uint32_t in_shard_cb_id = tt::CB::c_in2; // local input shard - constexpr uint32_t in_reader_indices_cb_id = tt::CB::c_in3; - constexpr uint32_t in_scalar_cb_id = tt::CB::c_in4; + constexpr uint32_t in_cb_id = (reader_id == 1) ? 
tt::CBIndex::c_1 : tt::CBIndex::c_0; + constexpr uint32_t in_shard_cb_id = tt::CBIndex::c_2; // local input shard + constexpr uint32_t in_reader_indices_cb_id = tt::CBIndex::c_3; + constexpr uint32_t in_scalar_cb_id = tt::CBIndex::c_4; constexpr uint32_t ROW_HW = 64; diff --git a/ttnn/cpp/ttnn/operations/pool/maxpool/device/kernels/dataflow/reader_max_pool_2d_multi_core_sharded_with_halo_wide.cpp b/ttnn/cpp/ttnn/operations/pool/maxpool/device/kernels/dataflow/reader_max_pool_2d_multi_core_sharded_with_halo_wide.cpp index c7bb703e645..2a1ead15314 100644 --- a/ttnn/cpp/ttnn/operations/pool/maxpool/device/kernels/dataflow/reader_max_pool_2d_multi_core_sharded_with_halo_wide.cpp +++ b/ttnn/cpp/ttnn/operations/pool/maxpool/device/kernels/dataflow/reader_max_pool_2d_multi_core_sharded_with_halo_wide.cpp @@ -68,10 +68,10 @@ void kernel_main() { constexpr uint32_t TILE_WIDTH = 32; constexpr uint32_t MAX_ELE_PER_REDUCTION = 512; // TILE_WIDTH * 8 * numbytes - constexpr uint32_t in_cb_id = (reader_id == 1) ? tt::CB::c_in1 : tt::CB::c_in0; - constexpr uint32_t in_shard_cb_id = tt::CB::c_in2; // local input shard - constexpr uint32_t in_reader_indices_cb_id = tt::CB::c_in3; - constexpr uint32_t in_scalar_cb_id = tt::CB::c_in4; + constexpr uint32_t in_cb_id = (reader_id == 1) ? 
tt::CBIndex::c_1 : tt::CBIndex::c_0; + constexpr uint32_t in_shard_cb_id = tt::CBIndex::c_2; // local input shard + constexpr uint32_t in_reader_indices_cb_id = tt::CBIndex::c_3; + constexpr uint32_t in_scalar_cb_id = tt::CBIndex::c_4; constexpr uint32_t ROW_HW = 64; diff --git a/ttnn/cpp/ttnn/operations/pool/maxpool/device/max_pool2d_multi_core_program_factory.cpp b/ttnn/cpp/ttnn/operations/pool/maxpool/device/max_pool2d_multi_core_program_factory.cpp index afc92ff0316..173d3077126 100644 --- a/ttnn/cpp/ttnn/operations/pool/maxpool/device/max_pool2d_multi_core_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/pool/maxpool/device/max_pool2d_multi_core_program_factory.cpp @@ -107,7 +107,7 @@ MaxPool2D::MultiCore::cached_program_t max_pool_2d_multi_core_sharded_with_halo_ uint32_t split_reader = 1; // scalar CB as coefficient of reduce - uint32_t in_scalar_cb_id = tt::CB::c_in4; + uint32_t in_scalar_cb_id = tt::CBIndex::c_4; uint32_t in_scalar_cb_pagesize = tile_size(in_df); uint32_t in_scalar_cb_npages = 1; CircularBufferConfig in_scalar_cb_config = @@ -118,7 +118,7 @@ MaxPool2D::MultiCore::cached_program_t max_pool_2d_multi_core_sharded_with_halo_ // incoming data is the input cb instead of raw l1/dram addr // this input shard has halo and padding inserted. 
- auto raw_in_cb_id = tt::CB::c_in2; + auto raw_in_cb_id = tt::CBIndex::c_2; uint32_t raw_in_cb_npages = input.shard_spec().value().shape[0]; uint32_t raw_in_cb_pagesize = in_nbytes_c; CircularBufferConfig raw_in_cb_config = @@ -129,7 +129,7 @@ MaxPool2D::MultiCore::cached_program_t max_pool_2d_multi_core_sharded_with_halo_ log_debug(tt::LogOp, "CB {} :: PS = {}, NP = {}", raw_in_cb_id, raw_in_cb_pagesize, raw_in_cb_npages); // reader indices - auto in_reader_indices_cb_id = tt::CB::c_in3; + auto in_reader_indices_cb_id = tt::CBIndex::c_3; uint32_t in_reader_indices_cb_pagesize = tt::round_up(out_nhw_per_core * indices_nbytes, 4); // pagesize needs to be multiple of 4 uint32_t in_reader_indices_cb_npages = 1; @@ -165,8 +165,8 @@ MaxPool2D::MultiCore::cached_program_t max_pool_2d_multi_core_sharded_with_halo_ } } // reader output == input to tilize - uint32_t in_cb_id_0 = tt::CB::c_in0; // input rows for "multiple (out_nelems)" output pixels - uint32_t in_cb_id_1 = tt::CB::c_in1; // input rows for "multiple (out_nelems)" output pixels + uint32_t in_cb_id_0 = tt::CBIndex::c_0; // input rows for "multiple (out_nelems)" output pixels + uint32_t in_cb_id_1 = tt::CBIndex::c_1; // input rows for "multiple (out_nelems)" output pixels uint32_t in_cb_page_padded = ceil_multiple_of( in_cb_sz, tt::constants::TILE_HW); // NOTE: ceil to tile size since triscs work with tilesize instead of pagesize @@ -186,7 +186,7 @@ MaxPool2D::MultiCore::cached_program_t max_pool_2d_multi_core_sharded_with_halo_ } // output of tilize == input to reduce - uint32_t in_tiled_cb_id = tt::CB::c_intermed0; // tiled input + uint32_t in_tiled_cb_id = tt::CBIndex::c_24; // tiled input uint32_t in_tiled_cb_pagesize = tile_size(in_df); uint32_t in_tiled_cb_npages = in_ntiles_c * in_ntiles_hw * nblocks; CircularBufferConfig in_tiled_cb_config = @@ -197,7 +197,7 @@ MaxPool2D::MultiCore::cached_program_t max_pool_2d_multi_core_sharded_with_halo_ // output of reduce == writer to write - uint32_t out_cb_id = 
tt::CB::c_out0; // output rows in RM + uint32_t out_cb_id = tt::CBIndex::c_16; // output rows in RM // after reduction uint32_t out_cb_pagesize = output.shard_spec().value().shape[1] * out_nbytes / in_nblocks_c; // there is just one row of channels after each reduction (or 1 block of c if its greater than 8 tiles) uint32_t out_cb_npages = output.shard_spec().value().shape[0] * in_nblocks_c; @@ -209,7 +209,7 @@ MaxPool2D::MultiCore::cached_program_t max_pool_2d_multi_core_sharded_with_halo_ log_debug(tt::LogOp, "CB {} :: PS = {}, NP = {}", out_cb_id, out_cb_pagesize, out_cb_npages); if (is_large_kernel) { - uint32_t max_pool_partials_cb_id = tt::CB::c_intermed1; // max_pool partials + uint32_t max_pool_partials_cb_id = tt::CBIndex::c_25; // max_pool partials uint32_t max_pool_partials_cb_pagesize = std::min(out_cb_pagesize, TILE_SIZE * 8 * out_nbytes); uint32_t max_pool_partials_cb_npages = nblocks; CircularBufferConfig max_pool_partials_cb_config = diff --git a/ttnn/cpp/ttnn/operations/pool/upsample/device/kernels/compute/bilinear.cpp b/ttnn/cpp/ttnn/operations/pool/upsample/device/kernels/compute/bilinear.cpp index c845f21f8b2..a7f50cae2e4 100644 --- a/ttnn/cpp/ttnn/operations/pool/upsample/device/kernels/compute/bilinear.cpp +++ b/ttnn/cpp/ttnn/operations/pool/upsample/device/kernels/compute/bilinear.cpp @@ -35,11 +35,11 @@ inline void reduce_h_fused( namespace NAMESPACE{ void MAIN{ - constexpr uint32_t out_cb_id = tt::CB::c_out0; - constexpr uint32_t in1_cb_id = tt::CB::c_in1; - constexpr uint32_t bias_cb_id = tt::CB::c_in2; - constexpr uint32_t in_scalar_cb_id = tt::CB::c_in4; - constexpr uint32_t in2_cb_id = tt::CB::c_intermed0; + constexpr uint32_t out_cb_id = tt::CBIndex::c_16; + constexpr uint32_t in1_cb_id = tt::CBIndex::c_1; + constexpr uint32_t bias_cb_id = tt::CBIndex::c_2; + constexpr uint32_t in_scalar_cb_id = tt::CBIndex::c_4; + constexpr uint32_t in2_cb_id = tt::CBIndex::c_24; constexpr uint32_t in_ntiles_hw = get_compile_time_arg_val(0); constexpr 
uint32_t in_ntiles_c = get_compile_time_arg_val(1); diff --git a/ttnn/cpp/ttnn/operations/pool/upsample/device/kernels/dataflow/reader_bilinear_multi_core_sharded.cpp b/ttnn/cpp/ttnn/operations/pool/upsample/device/kernels/dataflow/reader_bilinear_multi_core_sharded.cpp index 89edffbc171..f7d937147e1 100644 --- a/ttnn/cpp/ttnn/operations/pool/upsample/device/kernels/dataflow/reader_bilinear_multi_core_sharded.cpp +++ b/ttnn/cpp/ttnn/operations/pool/upsample/device/kernels/dataflow/reader_bilinear_multi_core_sharded.cpp @@ -39,7 +39,7 @@ void kernel_main() { constexpr bool src1_is_dram = false; constexpr uint32_t in_cb_id = get_compile_time_arg_val(0); - constexpr uint32_t out_cb_id = tt::CB::c_in1; + constexpr uint32_t out_cb_id = tt::CBIndex::c_1; //constexpr uint32_t is_reader = get_compile_time_arg_val(2); constexpr uint32_t scale_h_inv_comp = get_compile_time_arg_val(3); constexpr uint32_t scale_w_inv_comp = get_compile_time_arg_val(4); @@ -47,7 +47,7 @@ void kernel_main() { constexpr uint32_t x_index_compute_comp = get_compile_time_arg_val(6); uint32_t l1_read_addr = get_read_ptr(in_cb_id); - constexpr uint32_t in_scalar_cb_id = tt::CB::c_in4; + constexpr uint32_t in_scalar_cb_id = tt::CBIndex::c_4; // assuming shard begins with a new row. TODO: generalize? 
float scale_h_inv = uint32_to_float(scale_h_inv_comp); diff --git a/ttnn/cpp/ttnn/operations/pool/upsample/device/upsample_bilinear_program_factory_multicore.cpp b/ttnn/cpp/ttnn/operations/pool/upsample/device/upsample_bilinear_program_factory_multicore.cpp index 748d57c69f9..f910f1e1c77 100644 --- a/ttnn/cpp/ttnn/operations/pool/upsample/device/upsample_bilinear_program_factory_multicore.cpp +++ b/ttnn/cpp/ttnn/operations/pool/upsample/device/upsample_bilinear_program_factory_multicore.cpp @@ -130,7 +130,7 @@ operation::ProgramWithCallbacks bilinear_multi_core(const Tensor &input, Tensor& uint32_t buffering_factor = 1; // data is already fully buffered in the CBs since its sharded // input data is in a sharded CB - uint32_t in_cb_id = CB::c_in0; + uint32_t in_cb_id = CBIndex::c_0; uint32_t aligned_input_stick_nbytes = round_up_to_mul32(input_stick_nbytes); uint32_t in_cb_pagesize = aligned_input_stick_nbytes; uint32_t in_cb_npages = halo_shard_shape[0] * buffering_factor; @@ -142,7 +142,7 @@ operation::ProgramWithCallbacks bilinear_multi_core(const Tensor &input, Tensor& auto cb_src0 = tt_metal::CreateCircularBuffer(program, all_cores, cb_src0_config); //intermediate tensor CB - uint32_t in1_cb_id = CB::c_in1; + uint32_t in1_cb_id = CBIndex::c_1; CircularBufferConfig cb_src1_config = CircularBufferConfig( 4 * in_cb_pagesize, //since 4 pixels per page are needed for intermediate tensor. 
{{in1_cb_id, input_cb_data_format}}) @@ -150,7 +150,7 @@ operation::ProgramWithCallbacks bilinear_multi_core(const Tensor &input, Tensor& auto cb_src1 = tt_metal::CreateCircularBuffer(program, all_cores, cb_src1_config); //scaler CB - uint32_t in_scalar_cb_id = CB::c_in4; + uint32_t in_scalar_cb_id = CBIndex::c_4; uint32_t in_scalar_cb_pagesize = tile_size(input_cb_data_format); uint32_t in_scalar_cb_npages = 1; CircularBufferConfig in_scalar_cb_config = @@ -161,7 +161,7 @@ operation::ProgramWithCallbacks bilinear_multi_core(const Tensor &input, Tensor& auto in_scalar_cb = tt_metal::CreateCircularBuffer(program, all_cores, in_scalar_cb_config); // output sharded CB with upsampled data - uint32_t out_cb_id = CB::c_out0; + uint32_t out_cb_id = CBIndex::c_16; uint32_t aligned_output_stick_nbytes = round_up_to_mul32(output_stick_nbytes); uint32_t out_cb_pagesize = aligned_output_stick_nbytes; uint32_t out_cb_npages = output_nsticks_per_core * buffering_factor; diff --git a/ttnn/cpp/ttnn/operations/pool/upsample/device/upsample_op.cpp b/ttnn/cpp/ttnn/operations/pool/upsample/device/upsample_op.cpp index 43ff0989254..e98898ee1ab 100644 --- a/ttnn/cpp/ttnn/operations/pool/upsample/device/upsample_op.cpp +++ b/ttnn/cpp/ttnn/operations/pool/upsample/device/upsample_op.cpp @@ -15,6 +15,7 @@ namespace ttnn::operations::upsample { using namespace tt; +using namespace tt::tt_metal; void UpSample::validate(const std::vector &input_tensors) const { const auto& input_tensor_a = input_tensors.at(0); diff --git a/ttnn/cpp/ttnn/operations/pool/upsample/device/upsample_program_factory_multicore.cpp b/ttnn/cpp/ttnn/operations/pool/upsample/device/upsample_program_factory_multicore.cpp index b2deccc8f2f..0e12adcb29a 100644 --- a/ttnn/cpp/ttnn/operations/pool/upsample/device/upsample_program_factory_multicore.cpp +++ b/ttnn/cpp/ttnn/operations/pool/upsample/device/upsample_program_factory_multicore.cpp @@ -15,6 +15,7 @@ #include "tt_metal/tt_stl/reflection.hpp" using namespace 
tt::constants; +using namespace tt::tt_metal; namespace ttnn::operations::upsample { using namespace tt; @@ -76,7 +77,7 @@ operation::ProgramWithCallbacks upsample_multi_core(const Tensor &input, Tensor& uint32_t buffering_factor = 1; // data is already fully buffered in the CBs since its sharded // input data is in a sharded CB - uint32_t in_cb_id = CB::c_in0; + uint32_t in_cb_id = CBIndex::c_0; uint32_t aligned_input_stick_nbytes = round_up_to_mul32(input_stick_nbytes); uint32_t in_cb_pagesize = aligned_input_stick_nbytes; uint32_t in_cb_npages = input_nsticks_per_core * buffering_factor; @@ -88,7 +89,7 @@ operation::ProgramWithCallbacks upsample_multi_core(const Tensor &input, Tensor& auto cb_src0 = tt_metal::CreateCircularBuffer(program, all_cores, cb_src0_config); // output sharded CB with upsampled data - uint32_t out_cb_id = CB::c_out0; + uint32_t out_cb_id = CBIndex::c_16; uint32_t aligned_output_stick_nbytes = round_up_to_mul32(output_stick_nbytes); uint32_t out_cb_pagesize = aligned_output_stick_nbytes; uint32_t out_cb_npages = output_nsticks_per_core * buffering_factor; diff --git a/ttnn/cpp/ttnn/operations/pool/upsample/device/upsample_program_factory_singlecore.cpp b/ttnn/cpp/ttnn/operations/pool/upsample/device/upsample_program_factory_singlecore.cpp index 401cb1074a3..297be2d1fde 100644 --- a/ttnn/cpp/ttnn/operations/pool/upsample/device/upsample_program_factory_singlecore.cpp +++ b/ttnn/cpp/ttnn/operations/pool/upsample/device/upsample_program_factory_singlecore.cpp @@ -15,6 +15,7 @@ #include "tt_metal/tt_stl/reflection.hpp" using namespace tt::constants; +using namespace tt::tt_metal; namespace ttnn::operations::upsample { using namespace tt; @@ -35,7 +36,7 @@ operation::ProgramWithCallbacks upsample_single_core(const Tensor &input, Tensor tt_metal::Device *device = output.device(); //circulat buffer for input - uint32_t src0_cb_index = CB::c_in0; + uint32_t src0_cb_index = CBIndex::c_0; uint32_t num_input_units = 2; uint32_t 
aligned_input_unit_size = round_up_to_mul32(input_unit_size); tt_metal::CircularBufferConfig cb_src0_config = tt_metal::CircularBufferConfig(num_input_units * aligned_input_unit_size, {{src0_cb_index, input_cb_data_format}}) diff --git a/ttnn/cpp/ttnn/operations/reduction/argmax/device/argmax_op.cpp b/ttnn/cpp/ttnn/operations/reduction/argmax/device/argmax_op.cpp index 7acbd31c2c2..308e42dc351 100644 --- a/ttnn/cpp/ttnn/operations/reduction/argmax/device/argmax_op.cpp +++ b/ttnn/cpp/ttnn/operations/reduction/argmax/device/argmax_op.cpp @@ -5,6 +5,8 @@ #include "argmax_op.hpp" #include "argmax_program_factory.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::reduction { void ArgMax::validate_with_output_tensors( diff --git a/ttnn/cpp/ttnn/operations/reduction/argmax/device/argmax_program_factory.cpp b/ttnn/cpp/ttnn/operations/reduction/argmax/device/argmax_program_factory.cpp index b418e4d1a90..d1dc10b45ea 100644 --- a/ttnn/cpp/ttnn/operations/reduction/argmax/device/argmax_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/reduction/argmax/device/argmax_program_factory.cpp @@ -10,6 +10,8 @@ #include "tt_metal/host_api.hpp" #include "ttnn/operation.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::reduction::detail { using namespace tt::constants; @@ -38,7 +40,7 @@ operation::ProgramWithCallbacks argmax_single_core( const uint32_t H = input_shape[2]; const uint32_t W = input_shape[3]; - uint32_t src0_cb_index = tt::CB::c_in0; + uint32_t src0_cb_index = tt::CBIndex::c_0; uint32_t num_input_units = W; uint32_t aligned_input_unit_size = round_up_to_mul32(num_input_units * input_unit_size); tt::tt_metal::CircularBufferConfig cb_src0_config = @@ -46,7 +48,7 @@ operation::ProgramWithCallbacks argmax_single_core( .set_page_size(src0_cb_index, aligned_input_unit_size); auto cb_src0 = tt::tt_metal::CreateCircularBuffer(program, all_cores, cb_src0_config); - uint32_t intermed0_cb_index = tt::CB::c_intermed0; + uint32_t intermed0_cb_index = 
tt::CBIndex::c_24; uint32_t num_intermed0_units = B*C*H; uint32_t aligned_intermed0_unit_size = num_intermed0_units * output_unit_size; tt::tt_metal::CircularBufferConfig intermed0_cb_config = @@ -137,7 +139,7 @@ operation::ProgramWithCallbacks argmax_multi_core( const uint32_t H = input_shape[2]; const uint32_t W = input_shape[3]; - uint32_t src0_cb_index = tt::CB::c_in0; + uint32_t src0_cb_index = tt::CBIndex::c_0; uint32_t num_input_units = W; uint32_t aligned_input_unit_size = round_up_to_mul32(num_input_units * input_unit_size); tt::tt_metal::CircularBufferConfig cb_src0_config = @@ -145,7 +147,7 @@ operation::ProgramWithCallbacks argmax_multi_core( .set_page_size(src0_cb_index, aligned_input_unit_size); auto cb_src0 = tt::tt_metal::CreateCircularBuffer(program, all_cores, cb_src0_config); - uint32_t intermed0_cb_index = tt::CB::c_intermed0; + uint32_t intermed0_cb_index = tt::CBIndex::c_24; uint32_t num_intermed0_units = B*C*H; uint32_t aligned_intermed0_unit_size = num_intermed0_units * output_unit_size; tt::tt_metal::CircularBufferConfig intermed0_cb_config = @@ -154,7 +156,7 @@ operation::ProgramWithCallbacks argmax_multi_core( .set_page_size(intermed0_cb_index, aligned_intermed0_unit_size); /// page size shouldn't matter here auto cb_intermed0 = tt::tt_metal::CreateCircularBuffer(program, all_cores, intermed0_cb_config); - uint32_t intermed1_cb_index = tt::CB::dataflow0; + uint32_t intermed1_cb_index = tt::CBIndex::c_8; uint32_t num_intermed1_units = B*C*H; uint32_t aligned_intermed1_unit_size = round_up_to_mul32(num_intermed1_units * input_unit_size); tt::tt_metal::CircularBufferConfig intermed1_cb_config = @@ -163,7 +165,7 @@ operation::ProgramWithCallbacks argmax_multi_core( .set_page_size(intermed1_cb_index, aligned_intermed1_unit_size); /// page size shouldn't matter here auto cb_intermed1 = tt::tt_metal::CreateCircularBuffer(program, all_cores, intermed1_cb_config); - uint32_t out0_cb_index = tt::CB::c_out0; + uint32_t out0_cb_index = 
tt::CBIndex::c_16; uint32_t num_out0_units = B*C*H; uint32_t aligned_out0_unit_size = num_out0_units * output_unit_size; tt::tt_metal::CircularBufferConfig out0_cb_config = diff --git a/ttnn/cpp/ttnn/operations/reduction/generic/device/kernels/compute/reduce_h.cpp b/ttnn/cpp/ttnn/operations/reduction/generic/device/kernels/compute/reduce_h.cpp index 21e7a4f704e..4a81f53bef3 100644 --- a/ttnn/cpp/ttnn/operations/reduction/generic/device/kernels/compute/reduce_h.cpp +++ b/ttnn/cpp/ttnn/operations/reduction/generic/device/kernels/compute/reduce_h.cpp @@ -13,8 +13,8 @@ void MAIN { uint32_t Wt = get_compile_time_arg_val(1); uint32_t NC = get_compile_time_arg_val(2); - reduce_init(tt::CB::c_in0, tt::CB::c_in2); - cb_wait_front(tt::CB::c_in2, 1); // scaler tile from the reader + reduce_init(tt::CBIndex::c_0, tt::CBIndex::c_2); + cb_wait_front(tt::CBIndex::c_2, 1); // scaler tile from the reader for (uint32_t nc = 0; nc < NC; nc++) { @@ -26,15 +26,15 @@ void MAIN { // in this case we just sequentially add to accumulator all the H-tiles in a column acquire_dst(); for(uint32_t ht = 0; ht < Ht; ++ht) { - cb_wait_front(tt::CB::c_in0, onetile); + cb_wait_front(tt::CBIndex::c_0, onetile); // REDUCE_OP is expected to come from add_define - reduce_tile(tt::CB::c_in0, tt::CB::c_in2, 0, 0, reduce_dst_idx); - cb_pop_front(tt::CB::c_in0, onetile); + reduce_tile(tt::CBIndex::c_0, tt::CBIndex::c_2, 0, 0, reduce_dst_idx); + cb_pop_front(tt::CBIndex::c_0, onetile); } - cb_reserve_back(tt::CB::c_out0, onetile); - pack_tile(reduce_dst_idx, tt::CB::c_out0); - cb_push_back(tt::CB::c_out0, onetile); + cb_reserve_back(tt::CBIndex::c_16, onetile); + pack_tile(reduce_dst_idx, tt::CBIndex::c_16); + cb_push_back(tt::CBIndex::c_16, onetile); release_dst(); } } diff --git a/ttnn/cpp/ttnn/operations/reduction/generic/device/kernels/compute/reduce_hw.cpp b/ttnn/cpp/ttnn/operations/reduction/generic/device/kernels/compute/reduce_hw.cpp index e493d76ab02..3f5e15728a9 100644 --- 
a/ttnn/cpp/ttnn/operations/reduction/generic/device/kernels/compute/reduce_hw.cpp +++ b/ttnn/cpp/ttnn/operations/reduction/generic/device/kernels/compute/reduce_hw.cpp @@ -13,9 +13,9 @@ void MAIN { uint32_t Wt = get_compile_time_arg_val(1); uint32_t NC = get_compile_time_arg_val(2); - reduce_init(tt::CB::c_in0, tt::CB::c_in2); + reduce_init(tt::CBIndex::c_0, tt::CBIndex::c_2); - cb_wait_front(tt::CB::c_in2, 1); // scaler tile from the reader + cb_wait_front(tt::CBIndex::c_2, 1); // scaler tile from the reader for (uint32_t nc = 0; nc < NC; nc++) { constexpr int onetile = 1; int reduce_dst_idx = 0; @@ -25,15 +25,15 @@ void MAIN { // reducing in W means out[h][0] = sum(w=0..W-1, in[h][w]) // in this case we just sequentially add to accumulator all the W-tiles in a row for(uint32_t wt = 0; wt < Wt; ++wt) { - cb_wait_front(tt::CB::c_in0, onetile); + cb_wait_front(tt::CBIndex::c_0, onetile); // REDUCE_OP/DIM is expected to come from add_define - reduce_tile(tt::CB::c_in0, tt::CB::c_in2, 0, 0, reduce_dst_idx); - cb_pop_front(tt::CB::c_in0, onetile); + reduce_tile(tt::CBIndex::c_0, tt::CBIndex::c_2, 0, 0, reduce_dst_idx); + cb_pop_front(tt::CBIndex::c_0, onetile); } } - cb_reserve_back(tt::CB::c_out0, onetile); - pack_tile(reduce_dst_idx, tt::CB::c_out0); - cb_push_back(tt::CB::c_out0, onetile); + cb_reserve_back(tt::CBIndex::c_16, onetile); + pack_tile(reduce_dst_idx, tt::CBIndex::c_16); + cb_push_back(tt::CBIndex::c_16, onetile); release_dst(); } } diff --git a/ttnn/cpp/ttnn/operations/reduction/generic/device/kernels/compute/reduce_w.cpp b/ttnn/cpp/ttnn/operations/reduction/generic/device/kernels/compute/reduce_w.cpp index 95f7f9ce1dc..352f1b7c177 100644 --- a/ttnn/cpp/ttnn/operations/reduction/generic/device/kernels/compute/reduce_w.cpp +++ b/ttnn/cpp/ttnn/operations/reduction/generic/device/kernels/compute/reduce_w.cpp @@ -19,12 +19,12 @@ void MAIN { uint32_t NC = get_compile_time_arg_val(2); #ifndef REDUCE_ROW_SUM_VIA_MM - reduce_init(tt::CB::c_in0, tt::CB::c_in2); 
+ reduce_init(tt::CBIndex::c_0, tt::CBIndex::c_2); #else - mm_init(tt::CB::c_in0, tt::CB::c_in2); + mm_init(tt::CBIndex::c_0, tt::CBIndex::c_2); #endif - cb_wait_front(tt::CB::c_in2, 1); // scaler tile from the reader + cb_wait_front(tt::CBIndex::c_2, 1); // scaler tile from the reader for (uint32_t nc = 0; nc < NC; nc++) { constexpr int onetile = 1; int reduce_dst_idx = 0; @@ -34,19 +34,19 @@ void MAIN { // in this case we just sequentially add to accumulator all the W-tiles in a row acquire_dst(); for(uint32_t wt = 0; wt < Wt; ++wt) { - cb_wait_front(tt::CB::c_in0, onetile); + cb_wait_front(tt::CBIndex::c_0, onetile); // REDUCE_OP is expected to come from add_define #ifndef REDUCE_ROW_SUM_VIA_MM - reduce_tile(tt::CB::c_in0, tt::CB::c_in2, 0, 0, reduce_dst_idx); + reduce_tile(tt::CBIndex::c_0, tt::CBIndex::c_2, 0, 0, reduce_dst_idx); #else - matmul_tiles(tt::CB::c_in0, tt::CB::c_in2, 0, 0, 0, false); + matmul_tiles(tt::CBIndex::c_0, tt::CBIndex::c_2, 0, 0, 0, false); #endif - cb_pop_front(tt::CB::c_in0, onetile); + cb_pop_front(tt::CBIndex::c_0, onetile); } - cb_reserve_back(tt::CB::c_out0, onetile); - pack_tile(reduce_dst_idx, tt::CB::c_out0); - cb_push_back(tt::CB::c_out0, onetile); + cb_reserve_back(tt::CBIndex::c_16, onetile); + pack_tile(reduce_dst_idx, tt::CBIndex::c_16); + cb_push_back(tt::CBIndex::c_16, onetile); release_dst(); } } diff --git a/ttnn/cpp/ttnn/operations/reduction/generic/device/multi_core_h/reduce_op_multi_core_h.cpp b/ttnn/cpp/ttnn/operations/reduction/generic/device/multi_core_h/reduce_op_multi_core_h.cpp index d240f714a17..f8d8db2393a 100644 --- a/ttnn/cpp/ttnn/operations/reduction/generic/device/multi_core_h/reduce_op_multi_core_h.cpp +++ b/ttnn/cpp/ttnn/operations/reduction/generic/device/multi_core_h/reduce_op_multi_core_h.cpp @@ -62,9 +62,9 @@ operation::ProgramWithCallbacks reduce_multi_core_h( num_cols_per_core_group_2 = 0; } - uint32_t src0_cb_index = CB::c_in0; + uint32_t src0_cb_index = CBIndex::c_0; CBHandle cb_src0; - uint32_t 
src1_cb_index = CB::c_in1; + uint32_t src1_cb_index = CBIndex::c_1; CBHandle cb_src1 = 0; if (in_sharded) { uint32_t num_shard_tiles = a.shard_spec().value().numel() / TILE_HW; @@ -90,13 +90,13 @@ operation::ProgramWithCallbacks reduce_multi_core_h( cb_src0 = tt_metal::CreateCircularBuffer(program, all_cores, cb_src0_config); } - uint32_t scaler_cb_index = CB::c_in2; + uint32_t scaler_cb_index = CBIndex::c_2; tt_metal::CircularBufferConfig cb_scaler_config = tt_metal::CircularBufferConfig(1 * scaler_single_tile_size, {{scaler_cb_index, scaler_cb_data_format}}) .set_page_size(scaler_cb_index, scaler_single_tile_size); auto cb_scaler = tt_metal::CreateCircularBuffer(program, all_cores, cb_scaler_config); - uint32_t output_cb_index = CB::c_out0; // output operands start at index 16 + uint32_t output_cb_index = CBIndex::c_16; // output operands start at index 16 CBHandle cb_output; if (out_sharded) { uint32_t num_output_tiles = output.shard_spec().value().numel() / TILE_HW; diff --git a/ttnn/cpp/ttnn/operations/reduction/generic/device/multi_core_w/reduce_op_multi_core_w.cpp b/ttnn/cpp/ttnn/operations/reduction/generic/device/multi_core_w/reduce_op_multi_core_w.cpp index 414755689bf..ab6693d237b 100644 --- a/ttnn/cpp/ttnn/operations/reduction/generic/device/multi_core_w/reduce_op_multi_core_w.cpp +++ b/ttnn/cpp/ttnn/operations/reduction/generic/device/multi_core_w/reduce_op_multi_core_w.cpp @@ -60,11 +60,11 @@ operation::ProgramWithCallbacks reduce_multi_core_w( auto cb_src0 = tt_metal::CreateCircularBuffer(program, all_cores, cb_src0_config); tt_metal::CircularBufferConfig cb_scaler_config = - tt_metal::CircularBufferConfig(num_input_tiles * scaler_single_tile_size, {{CB::c_in2, scaler_cb_data_format}}) - .set_page_size(CB::c_in2, scaler_single_tile_size); + tt_metal::CircularBufferConfig(num_input_tiles * scaler_single_tile_size, {{CBIndex::c_2, scaler_cb_data_format}}) + .set_page_size(CBIndex::c_2, scaler_single_tile_size); auto cb_scaler = 
tt_metal::CreateCircularBuffer(program, all_cores, cb_scaler_config); - uint32_t output_cb_index = 16; // output operands start at index 16 + uint32_t output_cb_index = tt::CBIndex::c_16; uint32_t num_output_tiles = 2; tt_metal::CircularBufferConfig cb_output_config = tt_metal::CircularBufferConfig(num_output_tiles * dst_single_tile_size, {{output_cb_index, dst_cb_data_format}}) diff --git a/ttnn/cpp/ttnn/operations/reduction/generic/device/single_core_hw/reduce_op_single_core_hw.cpp b/ttnn/cpp/ttnn/operations/reduction/generic/device/single_core_hw/reduce_op_single_core_hw.cpp index 66eaccf2e20..c688c8f11c5 100644 --- a/ttnn/cpp/ttnn/operations/reduction/generic/device/single_core_hw/reduce_op_single_core_hw.cpp +++ b/ttnn/cpp/ttnn/operations/reduction/generic/device/single_core_hw/reduce_op_single_core_hw.cpp @@ -63,11 +63,11 @@ operation::ProgramWithCallbacks reduce_single_core_hw( auto cb_src0 = tt_metal::CreateCircularBuffer(program, core, cb_src0_config); tt_metal::CircularBufferConfig cb_scaler_config = - tt_metal::CircularBufferConfig(num_input_tiles * scaler_single_tile_size, {{CB::c_in2, scaler_cb_data_format}}) - .set_page_size(CB::c_in2, scaler_single_tile_size); + tt_metal::CircularBufferConfig(num_input_tiles * scaler_single_tile_size, {{CBIndex::c_2, scaler_cb_data_format}}) + .set_page_size(CBIndex::c_2, scaler_single_tile_size); auto cb_src1 = tt_metal::CreateCircularBuffer(program, core, cb_scaler_config); - uint32_t output_cb_index = 16; // output operands start at index 16 + uint32_t output_cb_index = tt::CBIndex::c_16; uint32_t num_output_tiles = 2; tt_metal::CircularBufferConfig cb_output_config = tt_metal::CircularBufferConfig(num_output_tiles * dst_single_tile_size, {{output_cb_index, dst_cb_data_format}}) diff --git a/ttnn/cpp/ttnn/operations/reduction/moe/device/kernels/dataflow/writer_unary_interleaved.cpp b/ttnn/cpp/ttnn/operations/reduction/moe/device/kernels/dataflow/writer_unary_interleaved.cpp index fc1c48c44a6..9b1a72e7b5c 100644 
--- a/ttnn/cpp/ttnn/operations/reduction/moe/device/kernels/dataflow/writer_unary_interleaved.cpp +++ b/ttnn/cpp/ttnn/operations/reduction/moe/device/kernels/dataflow/writer_unary_interleaved.cpp @@ -21,7 +21,7 @@ void kernel_main() { const DataFormat data_format = get_dataformat(out_cb_index); // Reduce ops need to multiply by a scalar. We always want to multiply by 1.0f - constexpr uint32_t scale_cb_index = tt::CB::c_in3; + constexpr uint32_t scale_cb_index = tt::CBIndex::c_3; generate_reduce_scaler(scale_cb_index, packed_identity_scalar); const InterleavedAddrGenFast interleaved_accessor0 = { diff --git a/ttnn/cpp/ttnn/operations/reduction/moe/device/moe_op.cpp b/ttnn/cpp/ttnn/operations/reduction/moe/device/moe_op.cpp index 42d1532c523..ef8e8377420 100644 --- a/ttnn/cpp/ttnn/operations/reduction/moe/device/moe_op.cpp +++ b/ttnn/cpp/ttnn/operations/reduction/moe/device/moe_op.cpp @@ -5,6 +5,8 @@ #include "moe_op.hpp" #include "moe_program_factory.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::reduction { void MoeDeviceOperation::validate_with_output_tensors( diff --git a/ttnn/cpp/ttnn/operations/reduction/moe/device/moe_program_factory.cpp b/ttnn/cpp/ttnn/operations/reduction/moe/device/moe_program_factory.cpp index 8bc43b3ab95..8784b90072a 100644 --- a/ttnn/cpp/ttnn/operations/reduction/moe/device/moe_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/reduction/moe/device/moe_program_factory.cpp @@ -9,6 +9,8 @@ #include "tt_metal/common/math.hpp" #include "ttnn/operation.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::reduction::detail { operation::ProgramWithCallbacks moe_single_core_interleaved(const Tensor &input_tensor, const Tensor &expert_mask_tensor, const Tensor &topk_mask_tensor, const uint16_t k, Tensor &out_tensor) { @@ -57,80 +59,80 @@ operation::ProgramWithCallbacks moe_single_core_interleaved(const Tensor &input_ // INPUT CBs // Two tiles are loaded in for topk_local_sort at a time, and we double buffer to 
avoid stalls, so allocate four tiles of space - uint32_t input_cb_index = tt::CB::c_in0; + uint32_t input_cb_index = tt::CBIndex::c_0; tt::tt_metal::CircularBufferConfig input_cb_config = tt::tt_metal::CircularBufferConfig( cb_in_units * input_tile_size, {{input_cb_index, input_cb_data_format}}) .set_page_size(input_cb_index, input_tile_size); auto cb_input_tensor = tt::tt_metal::CreateCircularBuffer(program, core, input_cb_config); - uint32_t expert_mask_cb_index = tt::CB::c_in1; + uint32_t expert_mask_cb_index = tt::CBIndex::c_1; tt::tt_metal::CircularBufferConfig expert_mask_cb_config = tt::tt_metal::CircularBufferConfig( cb_in_units * expert_mask_tile_size, {{expert_mask_cb_index, expert_mask_cb_data_format}}) .set_page_size(expert_mask_cb_index, expert_mask_tile_size); auto cb_expert_mask_tensor = tt::tt_metal::CreateCircularBuffer(program, core, expert_mask_cb_config); - uint32_t topk_mask_cb_index = tt::CB::c_in2; + uint32_t topk_mask_cb_index = tt::CBIndex::c_2; tt::tt_metal::CircularBufferConfig topk_mask_cb_config = tt::tt_metal::CircularBufferConfig( cb_in_units * topk_mask_tile_size, {{topk_mask_cb_index, topk_mask_cb_data_format}}) .set_page_size(topk_mask_cb_index, topk_mask_tile_size); auto cb_topk_mask_tensor = tt::tt_metal::CreateCircularBuffer(program, core, topk_mask_cb_config); // identity scale input - uint32_t scale_cb_index = tt::CB::c_in3; + uint32_t scale_cb_index = tt::CBIndex::c_3; tt::tt_metal::CircularBufferConfig scale_cb_config = tt::tt_metal::CircularBufferConfig(scale_tiles * scalar_tile_size, {{scale_cb_index, scalar_df}}).set_page_size(scale_cb_index, scalar_tile_size); auto scale_cb_tensor = tt::tt_metal::CreateCircularBuffer(program, core, scale_cb_config); // TOP K CBs // Two tiles are loaded in for topk_local_sort at a time, and we double buffer to avoid stalls, so allocate four tiles of space // This CB carries the indices that are created in the reader kernel - uint32_t index_cb_index = tt::CB::c_intermed0; + uint32_t 
index_cb_index = tt::CBIndex::c_24; tt::tt_metal::CircularBufferConfig index_input_intermed0_config = tt::tt_metal::CircularBufferConfig( cb_in_units * index_tile_size, {{index_cb_index, index_cb_data_format}}) .set_page_size(index_cb_index, index_tile_size); auto cb_index_tensor = tt::tt_metal::CreateCircularBuffer(program, core, index_input_intermed0_config); // Single buffered circular buffer that holds the transposed input tiles - uint32_t input_transposed_cb_index = tt::CB::c_intermed1; + uint32_t input_transposed_cb_index = tt::CBIndex::c_25; tt::tt_metal::CircularBufferConfig input_transposed_cb_config = tt::tt_metal::CircularBufferConfig( Wt * value_tile_size, {{input_transposed_cb_index, input_cb_data_format}}) .set_page_size(input_transposed_cb_index, input_tile_size); auto cb_input_transposed_tiles = tt::tt_metal::CreateCircularBuffer(program, core, input_transposed_cb_config); // Single buffered circular buffer that holds the transposed index tiles - uint32_t index_transposed_cb_index = tt::CB::c_intermed2; + uint32_t index_transposed_cb_index = tt::CBIndex::c_26; tt::tt_metal::CircularBufferConfig index_transposed_cb_config = tt::tt_metal::CircularBufferConfig( Wt * index_tile_size, {{index_transposed_cb_index, index_cb_data_format}}) .set_page_size(index_transposed_cb_index, index_tile_size); auto cb_index_transposed_tiles = tt::tt_metal::CreateCircularBuffer(program, core, index_transposed_cb_config); // topk values - uint32_t values_cb_index = tt::CB::c_intermed3; + uint32_t values_cb_index = tt::CBIndex::c_27; tt::tt_metal::CircularBufferConfig values_cb_config = tt::tt_metal::CircularBufferConfig( num_cb_unit * value_tile_size, {{values_cb_index, value_cb_data_format}}) .set_page_size(values_cb_index, value_tile_size); auto cb_values_tensor = tt::tt_metal::CreateCircularBuffer(program, core, values_cb_config); // topk indices - uint32_t output_ind_cb_index = tt::CB::c_intermed4; + uint32_t output_ind_cb_index = tt::CBIndex::c_28; 
tt::tt_metal::CircularBufferConfig output_ind_cb_config = tt::tt_metal::CircularBufferConfig( num_cb_unit * index_tile_size, {{output_ind_cb_index, index_cb_data_format}}) .set_page_size(output_ind_cb_index, index_tile_size); auto cb_output_ind_tensor = tt::tt_metal::CreateCircularBuffer(program, core, output_ind_cb_config); - uint32_t cb_cur_max_index = tt::CB::c_intermed5; + uint32_t cb_cur_max_index = tt::CBIndex::c_29; tt::tt_metal::CircularBufferConfig cb_cur_max_config = tt::tt_metal::CircularBufferConfig( num_out_tiles * out_tile_size, {{cb_cur_max_index, out_cb_data_format}}) .set_page_size(cb_cur_max_index, out_tile_size); auto cb_cur_max_tensor = tt::tt_metal::CreateCircularBuffer(program, core, cb_cur_max_config); - uint32_t cb_cur_sum_index = tt::CB::c_intermed6; + uint32_t cb_cur_sum_index = tt::CBIndex::c_30; tt::tt_metal::CircularBufferConfig cb_cur_sum_config = tt::tt_metal::CircularBufferConfig( num_out_tiles * out_tile_size, {{cb_cur_sum_index, out_cb_data_format}}) .set_page_size(cb_cur_sum_index, out_tile_size); auto cb_cur_sum_tensor = tt::tt_metal::CreateCircularBuffer(program, core, cb_cur_sum_config); // OUTPUT CBs - uint32_t out_cb_index = tt::CB::c_out0; + uint32_t out_cb_index = tt::CBIndex::c_16; tt::tt_metal::CircularBufferConfig c_out0_config = tt::tt_metal::CircularBufferConfig( num_out_tiles * out_tile_size, {{out_cb_index, out_cb_data_format}}) .set_page_size(out_cb_index, out_tile_size); diff --git a/ttnn/cpp/ttnn/operations/reduction/prod/device/kernels/compute/prod_all.cpp b/ttnn/cpp/ttnn/operations/reduction/prod/device/kernels/compute/prod_all.cpp index 9375737b75e..1dfc203c5f1 100644 --- a/ttnn/cpp/ttnn/operations/reduction/prod/device/kernels/compute/prod_all.cpp +++ b/ttnn/cpp/ttnn/operations/reduction/prod/device/kernels/compute/prod_all.cpp @@ -16,7 +16,7 @@ void MAIN { constexpr uint32_t num_tiles = get_compile_time_arg_val(0); constexpr uint32_t per_core_block_dim = get_compile_time_arg_val(1); - 
binary_op_init_common(tt::CB::c_in0, tt::CB::c_intermed0, tt::CB::c_out0); + binary_op_init_common(tt::CBIndex::c_0, tt::CBIndex::c_24, tt::CBIndex::c_16); bool last_tile = false; bool once = true; for (uint32_t t = 0; t < num_tiles; t++) { @@ -24,48 +24,48 @@ void MAIN { { last_tile = true; } - cb_reserve_back(tt::CB::c_out0, 1); + cb_reserve_back(tt::CBIndex::c_16, 1); for(uint32_t tile_index = 0; tile_index < per_core_block_dim; ++tile_index) { - cb_wait_front(tt::CB::c_in0, 1); + cb_wait_front(tt::CBIndex::c_0, 1); if (once) { - cb_reserve_back(tt::CB::c_intermed0, 1); + cb_reserve_back(tt::CBIndex::c_24, 1); tile_regs_acquire(); copy_tile_to_dst_init_short(); - copy_tile(tt::CB::c_in0, 0, 0); // copy from c_in[0] to DST[0] + copy_tile(tt::CBIndex::c_0, 0, 0); // copy from c_in[0] to DST[0] tile_regs_commit(); tile_regs_wait(); if constexpr (num_tiles == 1) - pack_tile(0, tt::CB::c_out0); + pack_tile(0, tt::CBIndex::c_16); else { - pack_tile(0, tt::CB::c_intermed0); - cb_push_back(tt::CB::c_intermed0, 1); + pack_tile(0, tt::CBIndex::c_24); + cb_push_back(tt::CBIndex::c_24, 1); } tile_regs_release(); }else { tile_regs_acquire(); mul_tiles_init(); - mul_tiles(tt::CB::c_in0, tt::CB::c_intermed0, 0, 0, 0); + mul_tiles(tt::CBIndex::c_0, tt::CBIndex::c_24, 0, 0, 0); tile_regs_commit(); tile_regs_wait(); if (last_tile) { - pack_tile(0, tt::CB::c_out0); + pack_tile(0, tt::CBIndex::c_16); } else { - cb_pop_front(tt::CB::c_intermed0, 1); - cb_reserve_back(tt::CB::c_intermed0, 1); - pack_tile(0, tt::CB::c_intermed0); - cb_push_back(tt::CB::c_intermed0, 1); + cb_pop_front(tt::CBIndex::c_24, 1); + cb_reserve_back(tt::CBIndex::c_24, 1); + pack_tile(0, tt::CBIndex::c_24); + cb_push_back(tt::CBIndex::c_24, 1); } tile_regs_release(); } once = false; - cb_pop_front(tt::CB::c_in0, 1); + cb_pop_front(tt::CBIndex::c_0, 1); } - cb_push_back(tt::CB::c_out0, 1); + cb_push_back(tt::CBIndex::c_16, 1); } } } diff --git 
a/ttnn/cpp/ttnn/operations/reduction/prod/device/kernels/compute/prod_nc.cpp b/ttnn/cpp/ttnn/operations/reduction/prod/device/kernels/compute/prod_nc.cpp index a599e01631a..cba7ca887fb 100644 --- a/ttnn/cpp/ttnn/operations/reduction/prod/device/kernels/compute/prod_nc.cpp +++ b/ttnn/cpp/ttnn/operations/reduction/prod/device/kernels/compute/prod_nc.cpp @@ -12,16 +12,16 @@ void MAIN { const auto num_input_tiles = get_arg_val(0); const auto num_output_tiles = get_arg_val(1); - constexpr auto cb_in0 = tt::CB::c_in0; - constexpr auto cb_in1 = tt::CB::c_in1; - constexpr auto cb_out0 = tt::CB::c_out0; - constexpr auto cb_intermed0 = tt::CB::c_intermed0; + constexpr auto cb_in0 = tt::CBIndex::c_0; + constexpr auto cb_in1 = tt::CBIndex::c_1; + constexpr auto cb_out0 = tt::CBIndex::c_16; + constexpr auto cb_intermed0 = tt::CBIndex::c_24; constexpr uint32_t onetile = 1; constexpr uint32_t dst0 = 0; constexpr uint32_t dst1 = 1; constexpr uint32_t first_tile = 0; - binary_op_init_common(tt::CB::c_in0, tt::CB::c_in1); + binary_op_init_common(tt::CBIndex::c_0, tt::CBIndex::c_1); cb_wait_front(cb_in1, onetile); for (uint32_t i = 0; i < num_output_tiles; i++) { diff --git a/ttnn/cpp/ttnn/operations/reduction/prod/device/kernels/dataflow/reader_prod_nc.cpp b/ttnn/cpp/ttnn/operations/reduction/prod/device/kernels/dataflow/reader_prod_nc.cpp index 29df2885ecd..4bbdab0fd1a 100644 --- a/ttnn/cpp/ttnn/operations/reduction/prod/device/kernels/dataflow/reader_prod_nc.cpp +++ b/ttnn/cpp/ttnn/operations/reduction/prod/device/kernels/dataflow/reader_prod_nc.cpp @@ -19,8 +19,8 @@ void kernel_main() { const auto dim = get_compile_time_arg_val(1); constexpr uint32_t onetile = 1; - constexpr uint32_t cb_id_in0 = 0; - constexpr uint32_t cb_id_in1 = 1; + constexpr uint32_t cb_id_in0 = tt::CBIndex::c_0; + constexpr uint32_t cb_id_in1 = tt::CBIndex::c_1; union { float f; diff --git a/ttnn/cpp/ttnn/operations/reduction/prod/device/prod_all_program_factory.cpp 
b/ttnn/cpp/ttnn/operations/reduction/prod/device/prod_all_program_factory.cpp index 390dec0034a..584c61dd0be 100644 --- a/ttnn/cpp/ttnn/operations/reduction/prod/device/prod_all_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/reduction/prod/device/prod_all_program_factory.cpp @@ -34,11 +34,11 @@ namespace primary { .set_page_size(src0_cb_index, single_tile_size); auto cb_src0 = tt_metal::CreateCircularBuffer(program, core, cb_src0_config); - tt_metal::CircularBufferConfig cb_inter_config = tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size, {{tt::CB::c_intermed0, cb_data_format}}) - .set_page_size(tt::CB::c_intermed0, single_tile_size); + tt_metal::CircularBufferConfig cb_inter_config = tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size, {{tt::CBIndex::c_24, cb_data_format}}) + .set_page_size(tt::CBIndex::c_24, single_tile_size); auto cb_interm = tt_metal::CreateCircularBuffer(program, core, cb_inter_config); - uint32_t output_cb_index = 16; // output operands start at index 16 + uint32_t output_cb_index = tt::CBIndex::c_16; uint32_t num_output_tiles = 2; tt_metal::CircularBufferConfig cb_output_config = tt_metal::CircularBufferConfig(num_output_tiles * single_tile_size, {{output_cb_index, cb_data_format}}) .set_page_size(output_cb_index, single_tile_size); diff --git a/ttnn/cpp/ttnn/operations/reduction/prod/device/prod_nc_program_factory.cpp b/ttnn/cpp/ttnn/operations/reduction/prod/device/prod_nc_program_factory.cpp index cb826642a7c..d9cdce35964 100644 --- a/ttnn/cpp/ttnn/operations/reduction/prod/device/prod_nc_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/reduction/prod/device/prod_nc_program_factory.cpp @@ -9,6 +9,8 @@ #include "tt_metal/host_api.hpp" #include "ttnn/operation.hpp" +using namespace tt::tt_metal; + namespace tt { using namespace constants; namespace operations { @@ -78,10 +80,10 @@ operation::ProgramWithCallbacks prod_nc_format(const Tensor &input, const Tensor all_cores, cb_data_format, { - {CB::c_in0, 
in0_t}, // input - {CB::c_in1, in1_t}, // zero - {CB::c_intermed0, intermed0_t}, // accumulated sum - {CB::c_out0, out0_t}, // output + {CBIndex::c_0, in0_t}, // input + {CBIndex::c_1, in1_t}, // zero + {CBIndex::c_24, intermed0_t}, // accumulated sum + {CBIndex::c_16, out0_t}, // output }); //////////////////////////////////////////////////////////////////////////// @@ -93,7 +95,7 @@ operation::ProgramWithCallbacks prod_nc_format(const Tensor &input, const Tensor std::vector reader_compile_time_args = {(std::uint32_t) input_is_dram, static_cast(dim)}; tt_metal::Buffer *output_buffer_type = output.buffer(); - constexpr uint32_t cb_id_out = 16; + constexpr uint32_t cb_id_out = CBIndex::c_16; bool output_is_dram = output_buffer_type->buffer_type() == tt_metal::BufferType::DRAM ? 1 : 0; std::vector writer_compile_time_args = {(std::uint32_t) cb_id_out, (std::uint32_t) output_is_dram}; diff --git a/ttnn/cpp/ttnn/operations/reduction/topk/device/kernels/compute/topk_final.cpp b/ttnn/cpp/ttnn/operations/reduction/topk/device/kernels/compute/topk_final.cpp index 57202fd8e56..7923047f0aa 100644 --- a/ttnn/cpp/ttnn/operations/reduction/topk/device/kernels/compute/topk_final.cpp +++ b/ttnn/cpp/ttnn/operations/reduction/topk/device/kernels/compute/topk_final.cpp @@ -39,7 +39,7 @@ void MAIN { // init pack, compute and unpack - init_sfpu(input_cb_index); + init_sfpu(input_cb_index, tt::CBIndex::c_16); ckernel::topk_tile_init(); diff --git a/ttnn/cpp/ttnn/operations/reduction/topk/device/kernels/dataflow/reader_final_topk.cpp b/ttnn/cpp/ttnn/operations/reduction/topk/device/kernels/dataflow/reader_final_topk.cpp index 7ac63d66021..5a1a57ea0e8 100644 --- a/ttnn/cpp/ttnn/operations/reduction/topk/device/kernels/dataflow/reader_final_topk.cpp +++ b/ttnn/cpp/ttnn/operations/reduction/topk/device/kernels/dataflow/reader_final_topk.cpp @@ -19,8 +19,8 @@ void kernel_main() { constexpr uint32_t Wt_final = get_compile_time_arg_val(7); constexpr uint32_t num_dests = 
get_compile_time_arg_val(8); - constexpr uint32_t final_values_cb_index = tt::CB::c_intermed2; - constexpr uint32_t final_indices_cb_index = tt::CB::c_intermed3; + constexpr uint32_t final_values_cb_index = tt::CBIndex::c_26; + constexpr uint32_t final_indices_cb_index = tt::CBIndex::c_27; volatile tt_l1_ptr uint32_t* receiver_semaphore_addr = reinterpret_cast(receiver_semaphore); volatile tt_l1_ptr uint32_t* sender_semaphore_addr = reinterpret_cast(sender_semaphore); diff --git a/ttnn/cpp/ttnn/operations/reduction/topk/device/kernels/dataflow/writer_local_topk.cpp b/ttnn/cpp/ttnn/operations/reduction/topk/device/kernels/dataflow/writer_local_topk.cpp index a2514d7029c..d53f00a6e54 100644 --- a/ttnn/cpp/ttnn/operations/reduction/topk/device/kernels/dataflow/writer_local_topk.cpp +++ b/ttnn/cpp/ttnn/operations/reduction/topk/device/kernels/dataflow/writer_local_topk.cpp @@ -17,16 +17,16 @@ void kernel_main() { uint32_t start_ht = get_arg_val(0); uint32_t start_wt = get_arg_val(1); - constexpr uint32_t values_cb_index = tt::CB::c_out0; - constexpr uint32_t output_ind_cb_index = tt::CB::c_out1; + constexpr uint32_t values_cb_index = tt::CBIndex::c_16; + constexpr uint32_t output_ind_cb_index = tt::CBIndex::c_17; - constexpr uint32_t topk_local_values_cb_index = tt::CB::c_intermed0; - constexpr uint32_t topk_local_indices_cb_index = tt::CB::c_intermed1; + constexpr uint32_t topk_local_values_cb_index = tt::CBIndex::c_24; + constexpr uint32_t topk_local_indices_cb_index = tt::CBIndex::c_25; - constexpr uint32_t final_values_cb_index = tt::CB::c_intermed2; - constexpr uint32_t final_indices_cb_index = tt::CB::c_intermed3; + constexpr uint32_t final_values_cb_index = tt::CBIndex::c_26; + constexpr uint32_t final_indices_cb_index = tt::CBIndex::c_27; // can amortize the noc reads by doing them side by side for the two tensors constexpr uint32_t onetile = 1; diff --git a/ttnn/cpp/ttnn/operations/reduction/topk/device/topk_program_factory.hpp 
b/ttnn/cpp/ttnn/operations/reduction/topk/device/topk_program_factory.hpp index 67a01da3103..8ade39a27e2 100644 --- a/ttnn/cpp/ttnn/operations/reduction/topk/device/topk_program_factory.hpp +++ b/ttnn/cpp/ttnn/operations/reduction/topk/device/topk_program_factory.hpp @@ -45,7 +45,7 @@ operation::ProgramWithCallbacks topk_single_core_interleaved(const Tensor &input // Two tiles are loaded in for topk_local_sort at a time, and we double buffer to avoid stalls, so allocate four tiles of space // TODO: In theory if we have enough memory we could allocate 2*Wt tiles to reduce stalls - uint32_t input_cb_index = tt::CB::c_in0; + uint32_t input_cb_index = tt::CBIndex::c_0; tt::tt_metal::CircularBufferConfig input_cb_config = tt::tt_metal::CircularBufferConfig( cb_in_units * value_tile_size, {{input_cb_index, input_cb_data_format}}) .set_page_size(input_cb_index, input_tile_size); @@ -53,28 +53,28 @@ operation::ProgramWithCallbacks topk_single_core_interleaved(const Tensor &input // Two tiles are loaded in for topk_local_sort at a time, and we double buffer to avoid stalls, so allocate four tiles of space // This CB carries the indices that are created in the reader kernel - uint32_t index_cb_index = tt::CB::c_in1; + uint32_t index_cb_index = tt::CBIndex::c_1; tt::tt_metal::CircularBufferConfig index_input_intermed0_config = tt::tt_metal::CircularBufferConfig( cb_in_units * index_tile_size, {{index_cb_index, index_cb_data_format}}) .set_page_size(index_cb_index, index_tile_size); auto cb_index_tensor = tt::tt_metal::CreateCircularBuffer(program, core, index_input_intermed0_config); // Single buffered circular buffer that holds the transposed input tiles - uint32_t input_transposed_cb_index = tt::CB::c_intermed0; + uint32_t input_transposed_cb_index = tt::CBIndex::c_24; tt::tt_metal::CircularBufferConfig input_transposed_cb_config = tt::tt_metal::CircularBufferConfig( Wt * value_tile_size, {{input_transposed_cb_index, input_cb_data_format}}) 
.set_page_size(input_transposed_cb_index, input_tile_size); auto cb_input_transposed_tiles = tt::tt_metal::CreateCircularBuffer(program, core, input_transposed_cb_config); // Single buffered circular buffer that holds the transposed index tiles - uint32_t index_transposed_cb_index = tt::CB::c_intermed1; + uint32_t index_transposed_cb_index = tt::CBIndex::c_25; tt::tt_metal::CircularBufferConfig index_transposed_cb_config = tt::tt_metal::CircularBufferConfig( Wt * index_tile_size, {{index_transposed_cb_index, index_cb_data_format}}) .set_page_size(index_transposed_cb_index, index_tile_size); auto cb_index_transposed_tiles = tt::tt_metal::CreateCircularBuffer(program, core, index_transposed_cb_config); // Output topk values - uint32_t values_cb_index = tt::CB::c_out0; + uint32_t values_cb_index = tt::CBIndex::c_16; tt::tt_metal::CircularBufferConfig values_cb_config = tt::tt_metal::CircularBufferConfig( num_cb_unit * value_tile_size, {{values_cb_index, value_cb_data_format}}) .set_page_size(values_cb_index, value_tile_size); @@ -82,7 +82,7 @@ operation::ProgramWithCallbacks topk_single_core_interleaved(const Tensor &input // Output topk indices - uint32_t output_ind_cb_index = tt::CB::c_out1; + uint32_t output_ind_cb_index = tt::CBIndex::c_17; tt::tt_metal::CircularBufferConfig output_ind_cb_config = tt::tt_metal::CircularBufferConfig( num_cb_unit * index_tile_size, {{output_ind_cb_index, index_cb_data_format}}) .set_page_size(output_ind_cb_index, index_tile_size); @@ -262,7 +262,7 @@ operation::ProgramWithCallbacks topk_multicore_interleaved(const Tensor &input_t // Two tiles are loaded in for topk_local_sort at a time, and we double buffer to avoid stalls, so allocate four tiles of space // TODO: In theory if we have enough memory we could allocate 2*Wt tiles to reduce stalls - uint32_t input_cb_index = tt::CB::c_in0; + uint32_t input_cb_index = tt::CBIndex::c_0; tt::tt_metal::CircularBufferConfig input_cb_config = tt::tt_metal::CircularBufferConfig( cb_in_units * 
value_tile_size, {{input_cb_index, input_cb_data_format}}) .set_page_size(input_cb_index, input_tile_size); @@ -270,52 +270,52 @@ operation::ProgramWithCallbacks topk_multicore_interleaved(const Tensor &input_t // Two tiles are loaded in for topk_local_sort at a time, and we double buffer to avoid stalls, so allocate four tiles of space // This CB carries the indices that are created in the reader kernel - uint32_t index_cb_index = tt::CB::c_in1; + uint32_t index_cb_index = tt::CBIndex::c_1; tt::tt_metal::CircularBufferConfig index_input_intermed0_config = tt::tt_metal::CircularBufferConfig( cb_in_units * index_tile_size, {{index_cb_index, index_cb_data_format}}) .set_page_size(index_cb_index, index_tile_size); auto cb_index_tensor = tt::tt_metal::CreateCircularBuffer(program, core, index_input_intermed0_config); // Single buffered circular buffer that holds the transposed input tiles - uint32_t input_transposed_cb_index = tt::CB::c_intermed0; + uint32_t input_transposed_cb_index = tt::CBIndex::c_24; tt::tt_metal::CircularBufferConfig input_transposed_cb_config = tt::tt_metal::CircularBufferConfig( Wt_local * value_tile_size, {{input_transposed_cb_index, input_cb_data_format}}) .set_page_size(input_transposed_cb_index, input_tile_size); auto cb_input_transposed_tiles = tt::tt_metal::CreateCircularBuffer(program, core, input_transposed_cb_config); // Single buffered circular buffer that holds the transposed index tiles - uint32_t index_transposed_cb_index = tt::CB::c_intermed1; + uint32_t index_transposed_cb_index = tt::CBIndex::c_25; tt::tt_metal::CircularBufferConfig index_transposed_cb_config = tt::tt_metal::CircularBufferConfig( Wt_local * index_tile_size, {{index_transposed_cb_index, index_cb_data_format}}) .set_page_size(index_transposed_cb_index, index_tile_size); auto cb_index_transposed_tiles = tt::tt_metal::CreateCircularBuffer(program, core, index_transposed_cb_config); - uint32_t gathered_values_cb_index = tt::CB::c_intermed2; + uint32_t 
gathered_values_cb_index = tt::CBIndex::c_26; tt::tt_metal::CircularBufferConfig gathered_values_cb_config = tt::tt_metal::CircularBufferConfig( Wt_final * value_tile_size, {{gathered_values_cb_index, value_cb_data_format}}) .set_page_size(gathered_values_cb_index, value_tile_size); auto cb_gathered_topk_values_tensor = tt::tt_metal::CreateCircularBuffer(program, core, gathered_values_cb_config); - uint32_t gathered_indices_cb_index = tt::CB::c_intermed3; + uint32_t gathered_indices_cb_index = tt::CBIndex::c_27; tt::tt_metal::CircularBufferConfig gathered_indices_cb_config = tt::tt_metal::CircularBufferConfig( Wt_final * index_tile_size, {{gathered_indices_cb_index, index_cb_data_format}}) .set_page_size(gathered_indices_cb_index, index_tile_size); auto cb_gathered_topk_indices_tensor = tt::tt_metal::CreateCircularBuffer(program, core, gathered_indices_cb_config); - uint32_t final_values_cb_index = tt::CB::c_intermed4; + uint32_t final_values_cb_index = tt::CBIndex::c_28; tt::tt_metal::CircularBufferConfig final_values_cb_config = tt::tt_metal::CircularBufferConfig( Wt_final * value_tile_size, {{final_values_cb_index, value_cb_data_format}}) .set_page_size(final_values_cb_index, value_tile_size); auto cb_final_topk_values_tensor = tt::tt_metal::CreateCircularBuffer(program, core, final_values_cb_config); - uint32_t final_indices_cb_index = tt::CB::c_intermed5; + uint32_t final_indices_cb_index = tt::CBIndex::c_29; tt::tt_metal::CircularBufferConfig final_indices_cb_config = tt::tt_metal::CircularBufferConfig( Wt_final * index_tile_size, {{final_indices_cb_index, index_cb_data_format}}) .set_page_size(final_indices_cb_index, index_tile_size); auto cb_final_topk_index_tensor = tt::tt_metal::CreateCircularBuffer(program, core, final_indices_cb_config); // Output topk values - uint32_t values_cb_index = tt::CB::c_out0; + uint32_t values_cb_index = tt::CBIndex::c_16; tt::tt_metal::CircularBufferConfig values_cb_config = tt::tt_metal::CircularBufferConfig( num_cb_unit * 
value_tile_size, {{values_cb_index, value_cb_data_format}}) .set_page_size(values_cb_index, value_tile_size); @@ -323,7 +323,7 @@ operation::ProgramWithCallbacks topk_multicore_interleaved(const Tensor &input_t // Output topk indices - uint32_t output_ind_cb_index = tt::CB::c_out1; + uint32_t output_ind_cb_index = tt::CBIndex::c_17; tt::tt_metal::CircularBufferConfig output_ind_cb_config = tt::tt_metal::CircularBufferConfig( num_cb_unit * index_tile_size, {{output_ind_cb_index, index_cb_data_format}}) .set_page_size(output_ind_cb_index, index_tile_size); diff --git a/ttnn/cpp/ttnn/operations/sliding_window/reference_sliding_window.cpp b/ttnn/cpp/ttnn/operations/sliding_window/reference_sliding_window.cpp index 06e8a52e7b5..83472c30cc6 100644 --- a/ttnn/cpp/ttnn/operations/sliding_window/reference_sliding_window.cpp +++ b/ttnn/cpp/ttnn/operations/sliding_window/reference_sliding_window.cpp @@ -12,6 +12,8 @@ #include "ttnn/tensor/host_buffer/functions.hpp" #include "ttnn/tensor/host_buffer/types.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::sliding_window { owned_buffer::Buffer ref_conv_op( diff --git a/ttnn/cpp/ttnn/operations/sliding_window/sliding_window.cpp b/ttnn/cpp/ttnn/operations/sliding_window/sliding_window.cpp index ac3e5c9eecd..c8edb6a9596 100644 --- a/ttnn/cpp/ttnn/operations/sliding_window/sliding_window.cpp +++ b/ttnn/cpp/ttnn/operations/sliding_window/sliding_window.cpp @@ -6,6 +6,8 @@ #include #include "tt_metal/common/assert.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::sliding_window{ std::size_t SlidingWindowConfig::get_hash() const { return std::hash{}(to_string()); diff --git a/ttnn/cpp/ttnn/operations/sliding_window/sliding_window.hpp b/ttnn/cpp/ttnn/operations/sliding_window/sliding_window.hpp index 8de71ee0c8a..248251fd0ed 100644 --- a/ttnn/cpp/ttnn/operations/sliding_window/sliding_window.hpp +++ b/ttnn/cpp/ttnn/operations/sliding_window/sliding_window.hpp @@ -15,8 +15,8 @@ namespace 
ttnn::operations::sliding_window { struct ParallelConfig { CoreRangeSet grid = {}; - TensorMemoryLayout shard_scheme; - ShardOrientation shard_orientation; + tt::tt_metal::TensorMemoryLayout shard_scheme; + tt::tt_metal::ShardOrientation shard_orientation; bool operator==(const ParallelConfig &other) { return (grid == other.grid && shard_scheme == other.shard_scheme && shard_orientation == other.shard_orientation); @@ -86,11 +86,11 @@ std::vector generate_op_trace_metadata(const SlidingWindowConfig& conf std::vector> generate_shard_boundaries(const SlidingWindowConfig& config, const std::vector& op_trace_metadata); std::vector> generate_tensor_metadata(const std::vector& pad_metadata, const SlidingWindowConfig& config, uint32_t reshard_num_cores_nhw = 0, bool is_in_tiled = true); uint32_t generate_max_out_nsticks_per_core(const std::vector>& shard_boundaries); -std::tuple>, std::vector>, std::vector>> generate_halo_kernel_config_tensors(const std::vector>& tensor_metadata, const std::vector>& shard_boundaries, bool is_block_sharded, bool transpose_mcast, bool remote_read, Device* device); +std::tuple>, std::vector>, std::vector>> generate_halo_kernel_config_tensors(const std::vector>& tensor_metadata, const std::vector>& shard_boundaries, bool is_block_sharded, bool transpose_mcast, bool remote_read, tt::tt_metal::Device* device); std::vector> generate_sliding_window_op_config(const std::vector& op_trace_metadata, const std::vector>& shard_boundaries, bool pad_tile = false, bool pad_cores = false); std::vector flatten(const std::vector>& input); Tensor construct_on_host_config_tensor(const std::vector>& config, const SlidingWindowConfig& sw_config, const ParallelConfig& p_config); -Tensor move_config_tensor_to_device(const Tensor& config_tensor, const ParallelConfig& p_config, bool is_block_sharded, Device* device); +Tensor move_config_tensor_to_device(const Tensor& config_tensor, const ParallelConfig& p_config, bool is_block_sharded, tt::tt_metal::Device* device); 
} // namespace ttnn::operations::sliding_window diff --git a/ttnn/cpp/ttnn/operations/sliding_window/sliding_window_pybind.cpp b/ttnn/cpp/ttnn/operations/sliding_window/sliding_window_pybind.cpp index 3206f2b6483..aee2d973d4c 100644 --- a/ttnn/cpp/ttnn/operations/sliding_window/sliding_window_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/sliding_window/sliding_window_pybind.cpp @@ -5,6 +5,8 @@ #include "ttnn/cpp/pybind11/decorators.hpp" #include "sliding_window.hpp" +using namespace tt::tt_metal; + namespace py = pybind11; namespace ttnn::operations::sliding_window { diff --git a/ttnn/cpp/ttnn/operations/transformer/concatenate_heads/concatenate_heads.cpp b/ttnn/cpp/ttnn/operations/transformer/concatenate_heads/concatenate_heads.cpp index 95d7c8864bf..fbba9b4d17d 100644 --- a/ttnn/cpp/ttnn/operations/transformer/concatenate_heads/concatenate_heads.cpp +++ b/ttnn/cpp/ttnn/operations/transformer/concatenate_heads/concatenate_heads.cpp @@ -7,6 +7,8 @@ #include "ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/nlp_concat_heads_device_operation.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::transformer { struct ConcatenateHeads : public ttnn::operations::experimental::transformer::NLPConcatHeadsDeviceOperation { diff --git a/ttnn/cpp/ttnn/operations/transformer/sdpa/device/kernels/compute/sdpa.cpp b/ttnn/cpp/ttnn/operations/transformer/sdpa/device/kernels/compute/sdpa.cpp index 761ef786168..be7ce29bc95 100644 --- a/ttnn/cpp/ttnn/operations/transformer/sdpa/device/kernels/compute/sdpa.cpp +++ b/ttnn/cpp/ttnn/operations/transformer/sdpa/device/kernels/compute/sdpa.cpp @@ -362,23 +362,23 @@ void MAIN { constexpr uint32_t qk_chunk_tiles = Sq_chunk_t * Sk_chunk_t; constexpr uint32_t out_chunk_tiles = Sq_chunk_t * DHt; - constexpr uint32_t cb_q_in = tt::CB::c_in0; - constexpr uint32_t cb_k_in = tt::CB::c_in1; - constexpr uint32_t cb_v_in = tt::CB::c_in2; - constexpr uint32_t cb_mask_in = tt::CB::c_in3; - constexpr uint32_t cb_scale_in = 
tt::CB::c_in4; - constexpr uint32_t cb_identity_scale_in = tt::CB::c_in5; - - constexpr uint32_t cb_qk_im = tt::CB::c_intermed0; - constexpr uint32_t cb_out_im = tt::CB::c_intermed1; - constexpr uint32_t cb_out_accumulate_im = tt::CB::c_intermed2; - constexpr uint32_t cb_cur_max = tt::CB::c_intermed3; - constexpr uint32_t cb_prev_max = tt::CB::c_intermed4; - constexpr uint32_t cb_cur_sum = tt::CB::c_intermed5; - constexpr uint32_t cb_prev_sum = tt::CB::c_intermed6; - constexpr uint32_t cb_exp_max_diff = tt::CB::c_intermed7; - - constexpr uint32_t cb_out = tt::CB::c_out0; + constexpr uint32_t cb_q_in = tt::CBIndex::c_0; + constexpr uint32_t cb_k_in = tt::CBIndex::c_1; + constexpr uint32_t cb_v_in = tt::CBIndex::c_2; + constexpr uint32_t cb_mask_in = tt::CBIndex::c_3; + constexpr uint32_t cb_scale_in = tt::CBIndex::c_4; + constexpr uint32_t cb_identity_scale_in = tt::CBIndex::c_5; + + constexpr uint32_t cb_qk_im = tt::CBIndex::c_24; + constexpr uint32_t cb_out_im = tt::CBIndex::c_25; + constexpr uint32_t cb_out_accumulate_im = tt::CBIndex::c_26; + constexpr uint32_t cb_cur_max = tt::CBIndex::c_27; + constexpr uint32_t cb_prev_max = tt::CBIndex::c_28; + constexpr uint32_t cb_cur_sum = tt::CBIndex::c_29; + constexpr uint32_t cb_prev_sum = tt::CBIndex::c_30; + constexpr uint32_t cb_exp_max_diff = tt::CBIndex::c_31; + + constexpr uint32_t cb_out = tt::CBIndex::c_16; mm_init(); diff --git a/ttnn/cpp/ttnn/operations/transformer/sdpa/device/kernels/dataflow/reader_interleaved.cpp b/ttnn/cpp/ttnn/operations/transformer/sdpa/device/kernels/dataflow/reader_interleaved.cpp index b57c4aa81c3..6e963d0aef9 100644 --- a/ttnn/cpp/ttnn/operations/transformer/sdpa/device/kernels/dataflow/reader_interleaved.cpp +++ b/ttnn/cpp/ttnn/operations/transformer/sdpa/device/kernels/dataflow/reader_interleaved.cpp @@ -45,10 +45,10 @@ void kernel_main() { constexpr bool is_dram = true; - constexpr uint32_t cb_q_in = tt::CB::c_in0; - constexpr uint32_t cb_k_in = tt::CB::c_in1; - constexpr uint32_t 
cb_v_in = tt::CB::c_in2; - constexpr uint32_t cb_mask_in = tt::CB::c_in3; + constexpr uint32_t cb_q_in = tt::CBIndex::c_0; + constexpr uint32_t cb_k_in = tt::CBIndex::c_1; + constexpr uint32_t cb_v_in = tt::CBIndex::c_2; + constexpr uint32_t cb_mask_in = tt::CBIndex::c_3; constexpr uint32_t onetile = 1; diff --git a/ttnn/cpp/ttnn/operations/transformer/sdpa/device/kernels/dataflow/writer_interleaved.cpp b/ttnn/cpp/ttnn/operations/transformer/sdpa/device/kernels/dataflow/writer_interleaved.cpp index f015ad85161..7edacf49769 100644 --- a/ttnn/cpp/ttnn/operations/transformer/sdpa/device/kernels/dataflow/writer_interleaved.cpp +++ b/ttnn/cpp/ttnn/operations/transformer/sdpa/device/kernels/dataflow/writer_interleaved.cpp @@ -162,8 +162,8 @@ void kernel_main() { constexpr uint32_t out_chunk_tiles = Sq_chunk_t * DHt; constexpr bool is_dram = true; - constexpr uint32_t cb_out = tt::CB::c_out0; - constexpr uint32_t cb_mask_in = tt::CB::c_in3; + constexpr uint32_t cb_out = tt::CBIndex::c_16; + constexpr uint32_t cb_mask_in = tt::CBIndex::c_3; constexpr uint32_t tile_bytes = get_tile_size(cb_out); constexpr DataFormat data_format = get_dataformat(cb_out); @@ -177,8 +177,8 @@ void kernel_main() { constexpr uint32_t barrier_threshold = get_barrier_read_threshold(); uint32_t barrier_count = 0; - constexpr uint32_t cb_scale_in = tt::CB::c_in4; - constexpr uint32_t cb_identity_scale_in = tt::CB::c_in5; + constexpr uint32_t cb_scale_in = tt::CBIndex::c_4; + constexpr uint32_t cb_identity_scale_in = tt::CBIndex::c_5; generate_bcast_unary_scalar(cb_scale_in, scale_val); generate_reduce_scaler(cb_identity_scale_in, identity_scalar_packed); diff --git a/ttnn/cpp/ttnn/operations/transformer/sdpa/device/sdpa_op.cpp b/ttnn/cpp/ttnn/operations/transformer/sdpa/device/sdpa_op.cpp index c8bb822e26e..7818abf812c 100644 --- a/ttnn/cpp/ttnn/operations/transformer/sdpa/device/sdpa_op.cpp +++ b/ttnn/cpp/ttnn/operations/transformer/sdpa/device/sdpa_op.cpp @@ -7,6 +7,8 @@ #include 
"sdpa_program_factory.hpp" #include "ttnn/run_operation.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::transformer { void ScaledDotProductAttention::validate( diff --git a/ttnn/cpp/ttnn/operations/transformer/sdpa/device/sdpa_program_factory.cpp b/ttnn/cpp/ttnn/operations/transformer/sdpa/device/sdpa_program_factory.cpp index 5d9782d2320..9e667d912b1 100644 --- a/ttnn/cpp/ttnn/operations/transformer/sdpa/device/sdpa_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/transformer/sdpa/device/sdpa_program_factory.cpp @@ -15,6 +15,7 @@ #include "ttnn/operation.hpp" using namespace tt::constants; +using namespace tt::tt_metal; namespace ttnn::operations::transformer::detail { @@ -343,76 +344,76 @@ operation::ProgramWithCallbacks sdpa_multi_core( // Q input auto c_in0_config = - CircularBufferConfig(q_tiles * q_tile_size, {{tt::CB::c_in0, q_df}}).set_page_size(tt::CB::c_in0, q_tile_size); + CircularBufferConfig(q_tiles * q_tile_size, {{tt::CBIndex::c_0, q_df}}).set_page_size(tt::CBIndex::c_0, q_tile_size); auto cb_in0_id = CreateCircularBuffer(program, core_grid, c_in0_config); // K input auto c_in1_config = - CircularBufferConfig(k_tiles * k_tile_size, {{tt::CB::c_in1, k_df}}).set_page_size(tt::CB::c_in1, k_tile_size); + CircularBufferConfig(k_tiles * k_tile_size, {{tt::CBIndex::c_1, k_df}}).set_page_size(tt::CBIndex::c_1, k_tile_size); auto cb_in1_id = CreateCircularBuffer(program, core_grid, c_in1_config); // V input auto c_in2_config = - CircularBufferConfig(v_tiles * v_tile_size, {{tt::CB::c_in2, v_df}}).set_page_size(tt::CB::c_in2, v_tile_size); + CircularBufferConfig(v_tiles * v_tile_size, {{tt::CBIndex::c_2, v_df}}).set_page_size(tt::CBIndex::c_2, v_tile_size); auto cb_in2_id = CreateCircularBuffer(program, core_grid, c_in2_config); // attn_mask input - auto c_in3_config = CircularBufferConfig(mask_tiles * mask_tile_size, {{tt::CB::c_in3, mask_df}}) - .set_page_size(tt::CB::c_in3, mask_tile_size); + auto c_in3_config = 
CircularBufferConfig(mask_tiles * mask_tile_size, {{tt::CBIndex::c_3, mask_df}}) + .set_page_size(tt::CBIndex::c_3, mask_tile_size); auto cb_in3_id = CreateCircularBuffer(program, core_grid, c_in3_config); // scale input - auto c_in4_config = CircularBufferConfig(scale_tiles * scalar_tile_size, {{tt::CB::c_in4, scalar_df}}) - .set_page_size(tt::CB::c_in4, scalar_tile_size); + auto c_in4_config = CircularBufferConfig(scale_tiles * scalar_tile_size, {{tt::CBIndex::c_4, scalar_df}}) + .set_page_size(tt::CBIndex::c_4, scalar_tile_size); auto cb_in4_id = CreateCircularBuffer(program, core_grid, c_in4_config); // identity scale input - auto c_in5_config = CircularBufferConfig(scale_tiles * scalar_tile_size, {{tt::CB::c_in5, scalar_df}}) - .set_page_size(tt::CB::c_in5, scalar_tile_size); + auto c_in5_config = CircularBufferConfig(scale_tiles * scalar_tile_size, {{tt::CBIndex::c_5, scalar_df}}) + .set_page_size(tt::CBIndex::c_5, scalar_tile_size); auto cb_in5_id = CreateCircularBuffer(program, core_grid, c_in5_config); // cb_qk_im - auto c_intermed0_config = CircularBufferConfig(qk_tiles * im_tile_size, {{tt::CB::c_intermed0, im_df}}) - .set_page_size(tt::CB::c_intermed0, im_tile_size); + auto c_intermed0_config = CircularBufferConfig(qk_tiles * im_tile_size, {{tt::CBIndex::c_24, im_df}}) + .set_page_size(tt::CBIndex::c_24, im_tile_size); auto cb_intermed0_id = CreateCircularBuffer(program, core_grid, c_intermed0_config); // cb_out_im - auto c_intermed1_config = CircularBufferConfig(out_im_tiles * im_tile_size, {{tt::CB::c_intermed1, im_df}}) - .set_page_size(tt::CB::c_intermed1, im_tile_size); + auto c_intermed1_config = CircularBufferConfig(out_im_tiles * im_tile_size, {{tt::CBIndex::c_25, im_df}}) + .set_page_size(tt::CBIndex::c_25, im_tile_size); auto cb_intermed1_id = CreateCircularBuffer(program, core_grid, c_intermed1_config); // cb_out_accumulate_im - auto c_intermed2_config = CircularBufferConfig(out_im_tiles * im_tile_size, {{tt::CB::c_intermed2, im_df}}) - 
.set_page_size(tt::CB::c_intermed2, im_tile_size); + auto c_intermed2_config = CircularBufferConfig(out_im_tiles * im_tile_size, {{tt::CBIndex::c_26, im_df}}) + .set_page_size(tt::CBIndex::c_26, im_tile_size); auto cb_intermed2_id = CreateCircularBuffer(program, core_grid, c_intermed2_config); // cb_cur_max - auto c_intermed3_config = CircularBufferConfig(statistics_tiles * stats_tile_size, {{tt::CB::c_intermed3, stats_df}}) - .set_page_size(tt::CB::c_intermed3, stats_tile_size); + auto c_intermed3_config = CircularBufferConfig(statistics_tiles * stats_tile_size, {{tt::CBIndex::c_27, stats_df}}) + .set_page_size(tt::CBIndex::c_27, stats_tile_size); auto cb_intermed3_id = CreateCircularBuffer(program, core_grid, c_intermed3_config); // cb_prev_max - auto c_intermed4_config = CircularBufferConfig(statistics_tiles * stats_tile_size, {{tt::CB::c_intermed4, stats_df}}) - .set_page_size(tt::CB::c_intermed4, stats_tile_size); + auto c_intermed4_config = CircularBufferConfig(statistics_tiles * stats_tile_size, {{tt::CBIndex::c_28, stats_df}}) + .set_page_size(tt::CBIndex::c_28, stats_tile_size); auto cb_intermed4_id = CreateCircularBuffer(program, core_grid, c_intermed4_config); // cb_cur_sum - auto c_intermed5_config = CircularBufferConfig(statistics_tiles * stats_tile_size, {{tt::CB::c_intermed5, stats_df}}) - .set_page_size(tt::CB::c_intermed5, stats_tile_size); + auto c_intermed5_config = CircularBufferConfig(statistics_tiles * stats_tile_size, {{tt::CBIndex::c_29, stats_df}}) + .set_page_size(tt::CBIndex::c_29, stats_tile_size); auto cb_intermed5_id = CreateCircularBuffer(program, core_grid, c_intermed5_config); // cb_prev_sum - auto c_intermed6_config = CircularBufferConfig(statistics_tiles * stats_tile_size, {{tt::CB::c_intermed6, stats_df}}) - .set_page_size(tt::CB::c_intermed6, stats_tile_size); + auto c_intermed6_config = CircularBufferConfig(statistics_tiles * stats_tile_size, {{tt::CBIndex::c_30, stats_df}}) + .set_page_size(tt::CBIndex::c_30, stats_tile_size); 
auto cb_intermed6_id = CreateCircularBuffer(program, core_grid, c_intermed6_config); // cb_exp_max_diff - auto c_intermed7_config = CircularBufferConfig(statistics_tiles * stats_tile_size, {{tt::CB::c_intermed7, stats_df}}) - .set_page_size(tt::CB::c_intermed7, stats_tile_size); + auto c_intermed7_config = CircularBufferConfig(statistics_tiles * stats_tile_size, {{tt::CBIndex::c_31, stats_df}}) + .set_page_size(tt::CBIndex::c_31, stats_tile_size); auto cb_intermed7_id = CreateCircularBuffer(program, core_grid, c_intermed7_config); // Output auto c_out0_config = - CircularBufferConfig(out0_t * out_tile_size, {{tt::CB::c_out0, out_df}}).set_page_size(tt::CB::c_out0, out_tile_size); + CircularBufferConfig(out0_t * out_tile_size, {{tt::CBIndex::c_16, out_df}}).set_page_size(tt::CBIndex::c_16, out_tile_size); auto cb_out0_id = CreateCircularBuffer(program, core_grid, c_out0_config); diff --git a/ttnn/cpp/ttnn/operations/transformer/sdpa/sdpa.cpp b/ttnn/cpp/ttnn/operations/transformer/sdpa/sdpa.cpp index ee3341cc05b..25e0f8668b6 100644 --- a/ttnn/cpp/ttnn/operations/transformer/sdpa/sdpa.cpp +++ b/ttnn/cpp/ttnn/operations/transformer/sdpa/sdpa.cpp @@ -8,6 +8,8 @@ #include "ttnn/common/constants.hpp" #include "ttnn/run_operation.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::transformer { ttnn::Tensor ExecuteScaledDotProductAttention::invoke( diff --git a/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/kernels/compute/sdpa_flash_decode.cpp b/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/kernels/compute/sdpa_flash_decode.cpp index d21dc41a84f..15d0f919128 100644 --- a/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/kernels/compute/sdpa_flash_decode.cpp +++ b/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/kernels/compute/sdpa_flash_decode.cpp @@ -383,31 +383,31 @@ void MAIN { constexpr uint32_t qk_chunk_tiles = Sq_chunk_t * Sk_chunk_t; constexpr uint32_t out_chunk_tiles = Sq_chunk_t * DHt; - constexpr uint32_t cb_q_in = 
tt::CB::c_in0; // reuse it also for reduce input o - constexpr uint32_t cb_k_in = tt::CB::c_in1; - constexpr uint32_t cb_v_in = tt::CB::c_in2; - constexpr uint32_t cb_mask_in = tt::CB::c_in3; - constexpr uint32_t cb_scale_in = tt::CB::c_in4; - constexpr uint32_t cb_identity_scale_in = tt::CB::c_in5; - constexpr uint32_t cb_m_in = tt::CB::c_in6; - constexpr uint32_t cb_l_in = tt::CB::c_in7; - - constexpr uint32_t cb_qk_im = tt::CB::c_intermed0; - constexpr uint32_t cb_out_im = tt::CB::c_intermed1; - constexpr uint32_t cb_out_accumulate_im = tt::CB::c_intermed2; - constexpr uint32_t cb_cur_max = tt::CB::c_intermed3; - constexpr uint32_t cb_prev_max = tt::CB::c_intermed4; - constexpr uint32_t cb_cur_sum = tt::CB::c_intermed5; - constexpr uint32_t cb_prev_sum = tt::CB::c_intermed6; - constexpr uint32_t cb_exp_max_diff = tt::CB::c_intermed7; - constexpr uint32_t cb_prev_sum_2 = tt::CB::c_out5; - constexpr uint32_t cb_exp_max_diff_2 = tt::CB::c_out6; - constexpr uint32_t cb_out_accumulate_im_2 = tt::CB::c_out7; - - constexpr uint32_t cb_out_o = tt::CB::c_out0; - constexpr uint32_t cb_out_m = tt::CB::c_out1; - constexpr uint32_t cb_out_l = tt::CB::c_out2; - constexpr uint32_t cb_out_final = tt::CB::c_out4; + constexpr uint32_t cb_q_in = tt::CBIndex::c_0; // reuse it also for reduce input o + constexpr uint32_t cb_k_in = tt::CBIndex::c_1; + constexpr uint32_t cb_v_in = tt::CBIndex::c_2; + constexpr uint32_t cb_mask_in = tt::CBIndex::c_3; + constexpr uint32_t cb_scale_in = tt::CBIndex::c_4; + constexpr uint32_t cb_identity_scale_in = tt::CBIndex::c_5; + constexpr uint32_t cb_m_in = tt::CBIndex::c_6; + constexpr uint32_t cb_l_in = tt::CBIndex::c_7; + + constexpr uint32_t cb_qk_im = tt::CBIndex::c_24; + constexpr uint32_t cb_out_im = tt::CBIndex::c_25; + constexpr uint32_t cb_out_accumulate_im = tt::CBIndex::c_26; + constexpr uint32_t cb_cur_max = tt::CBIndex::c_27; + constexpr uint32_t cb_prev_max = tt::CBIndex::c_28; + constexpr uint32_t cb_cur_sum = tt::CBIndex::c_29; + 
constexpr uint32_t cb_prev_sum = tt::CBIndex::c_30; + constexpr uint32_t cb_exp_max_diff = tt::CBIndex::c_31; + constexpr uint32_t cb_prev_sum_2 = tt::CBIndex::c_21; + constexpr uint32_t cb_exp_max_diff_2 = tt::CBIndex::c_22; + constexpr uint32_t cb_out_accumulate_im_2 = tt::CBIndex::c_23; + + constexpr uint32_t cb_out_o = tt::CBIndex::c_16; + constexpr uint32_t cb_out_m = tt::CBIndex::c_17; + constexpr uint32_t cb_out_l = tt::CBIndex::c_18; + constexpr uint32_t cb_out_final = tt::CBIndex::c_20; uint32_t arg_idx = 0; const bool do_reduce = get_arg_val(arg_idx++) == 1; @@ -433,7 +433,7 @@ void MAIN { cur_pos = cur_pos_arg; } else { - constexpr uint32_t cb_index_id = tt::CB::dataflow0; + constexpr uint32_t cb_index_id = tt::CBIndex::c_8; cb_wait_front(cb_index_id, 1); volatile uint32_t *index_addr_ptr; cb_get_tile(cb_index_id, 0, &index_addr_ptr); diff --git a/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/kernels/dataflow/reader_decode_all.cpp b/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/kernels/dataflow/reader_decode_all.cpp index 5e82ebcfa62..ab070a95f94 100644 --- a/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/kernels/dataflow/reader_decode_all.cpp +++ b/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/kernels/dataflow/reader_decode_all.cpp @@ -107,7 +107,7 @@ void kernel_main() { cur_pos = cur_pos_arg; } else { - constexpr uint32_t cb_index_id = tt::CB::dataflow0; + constexpr uint32_t cb_index_id = tt::CBIndex::c_8; const InterleavedAddrGen addrg = { .bank_base_address = pos_addr, .page_size = index_stick_size_B @@ -133,7 +133,7 @@ void kernel_main() { volatile tt_l1_ptr uint32_t* page_table_ptr; if constexpr (is_paged_attention) { - constexpr uint32_t cb_id_page_table = tt::CB::dataflow1; + constexpr uint32_t cb_id_page_table = tt::CBIndex::c_9; const InterleavedAddrGen page_table_gen = { .bank_base_address = page_table_addr, .page_size = page_table_page_size @@ -165,10 +165,10 @@ void kernel_main() { constexpr bool 
is_dram = true; - constexpr uint32_t cb_q_in = tt::CB::c_in0; - constexpr uint32_t cb_k_in = tt::CB::c_in1; - constexpr uint32_t cb_v_in = tt::CB::c_in2; - constexpr uint32_t cb_mask_in = tt::CB::c_in3; + constexpr uint32_t cb_q_in = tt::CBIndex::c_0; + constexpr uint32_t cb_k_in = tt::CBIndex::c_1; + constexpr uint32_t cb_v_in = tt::CBIndex::c_2; + constexpr uint32_t cb_mask_in = tt::CBIndex::c_3; constexpr uint32_t onetile = 1; diff --git a/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/kernels/dataflow/writer_decode_all.cpp b/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/kernels/dataflow/writer_decode_all.cpp index 4059ad84736..0fe7f5cb81e 100644 --- a/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/kernels/dataflow/writer_decode_all.cpp +++ b/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/kernels/dataflow/writer_decode_all.cpp @@ -271,7 +271,7 @@ void kernel_main() { cur_pos = cur_pos_arg; } else { - constexpr uint32_t cb_index_id = tt::CB::dataflow0; + constexpr uint32_t cb_index_id = tt::CBIndex::c_8; cb_wait_front(cb_index_id, 1); uint32_t index_cb_ptr = get_read_ptr(cb_index_id); volatile tt_l1_ptr uint32_t* index_ptr = reinterpret_cast(index_cb_ptr); @@ -310,19 +310,19 @@ void kernel_main() { uint32_t num_tiles_to_wait = (out_chunk_tiles+2*PNHt)*num_cores_to_wait; constexpr bool is_dram = true; - constexpr uint32_t cb_out = tt::CB::c_out4; - constexpr uint32_t cb_intermed_out = tt::CB::c_out3; // this cb holds the output intermediates from other worker cores - constexpr uint32_t cb_out_o = tt::CB::c_out0; - constexpr uint32_t cb_m_in = tt::CB::c_in6; - constexpr uint32_t cb_l_in = tt::CB::c_in7; - - constexpr uint32_t cb_mask_in = tt::CB::c_in3; - constexpr uint32_t cb_scale_in = tt::CB::c_in4; - constexpr uint32_t cb_identity_scale_in = tt::CB::c_in5; - - constexpr uint32_t cb_out_worker = tt::CB::c_out0; - constexpr uint32_t cb_out_m = tt::CB::c_out1; - constexpr uint32_t cb_out_l = tt::CB::c_out2; + constexpr uint32_t 
cb_out = tt::CBIndex::c_20; + constexpr uint32_t cb_intermed_out = tt::CBIndex::c_19; // this cb holds the output intermediates from other worker cores + constexpr uint32_t cb_out_o = tt::CBIndex::c_16; + constexpr uint32_t cb_m_in = tt::CBIndex::c_6; + constexpr uint32_t cb_l_in = tt::CBIndex::c_7; + + constexpr uint32_t cb_mask_in = tt::CBIndex::c_3; + constexpr uint32_t cb_scale_in = tt::CBIndex::c_4; + constexpr uint32_t cb_identity_scale_in = tt::CBIndex::c_5; + + constexpr uint32_t cb_out_worker = tt::CBIndex::c_16; + constexpr uint32_t cb_out_m = tt::CBIndex::c_17; + constexpr uint32_t cb_out_l = tt::CBIndex::c_18; // generate and send scaler to compute generate_bcast_unary_scalar(cb_scale_in, scale_val); diff --git a/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/sdpa_decode_op.cpp b/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/sdpa_decode_op.cpp index efb7ee090bc..ab05750a7ad 100644 --- a/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/sdpa_decode_op.cpp +++ b/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/sdpa_decode_op.cpp @@ -7,6 +7,8 @@ #include "sdpa_decode_program_factory.hpp" #include "ttnn/run_operation.hpp" +using namespace tt::tt_metal; + namespace ttnn::operations::transformer { void ScaledDotProductAttentionDecode::validate(const std::vector& input_tensors, const std::vector>& optional_input_tensors) const { diff --git a/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/sdpa_decode_program_factory.cpp b/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/sdpa_decode_program_factory.cpp index 841bd66a20b..3a6bdebf86b 100644 --- a/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/sdpa_decode_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/sdpa_decode_program_factory.cpp @@ -14,8 +14,10 @@ #include "tt_metal/host_api.hpp" #include "ttnn/operation.hpp" -using namespace tt::constants; using namespace tt; +using namespace tt::constants; +using namespace 
tt::tt_metal; + namespace ttnn::operations::transformer::detail { // implementation of softmax with optional scale/mask (see the header for input_tensor more detailed description) @@ -299,7 +301,7 @@ operation::ProgramWithCallbacks sdpa_decode_multi_core( index_stick_size = pos_buffer->aligned_page_size(); //cb pos - auto c_in8_config = CircularBufferConfig(pos_tensor_tile_size, {{CB::dataflow0, pos_df}}).set_page_size(CB::dataflow0, pos_tensor_tile_size); + auto c_in8_config = CircularBufferConfig(pos_tensor_tile_size, {{CBIndex::c_8, pos_df}}).set_page_size(CBIndex::c_8, pos_tensor_tile_size); auto cb_in8_id = CreateCircularBuffer(program, core_grid, c_in8_config); } @@ -314,7 +316,7 @@ operation::ProgramWithCallbacks sdpa_decode_multi_core( page_table_stick_size = page_table_buffer->aligned_page_size(); //cb page_table - auto c_in9_config = CircularBufferConfig(page_table_tile_size, {{CB::dataflow1, page_table_df}}).set_page_size(CB::dataflow1, page_table_tile_size); + auto c_in9_config = CircularBufferConfig(page_table_tile_size, {{CBIndex::c_9, page_table_df}}).set_page_size(CBIndex::c_9, page_table_tile_size); auto cb_in9_id = CreateCircularBuffer(program, core_grid, c_in9_config); } @@ -329,103 +331,103 @@ operation::ProgramWithCallbacks sdpa_decode_multi_core( // CBs // Q input - auto c_in0_config = CircularBufferConfig(q_tiles * q_tile_size, {{CB::c_in0, q_df}}).set_page_size(CB::c_in0, q_tile_size); + auto c_in0_config = CircularBufferConfig(q_tiles * q_tile_size, {{CBIndex::c_0, q_df}}).set_page_size(CBIndex::c_0, q_tile_size); auto cb_in0_id = CreateCircularBuffer(program, core_grid, c_in0_config); // K input - auto c_in1_config = CircularBufferConfig(k_tiles * k_tile_size, {{CB::c_in1, k_df}}).set_page_size(CB::c_in1, k_tile_size); + auto c_in1_config = CircularBufferConfig(k_tiles * k_tile_size, {{CBIndex::c_1, k_df}}).set_page_size(CBIndex::c_1, k_tile_size); auto cb_in1_id = CreateCircularBuffer(program, core_grid, c_in1_config); // V input - auto 
c_in2_config = CircularBufferConfig(v_tiles * v_tile_size, {{CB::c_in2, v_df}}).set_page_size(CB::c_in2, v_tile_size); + auto c_in2_config = CircularBufferConfig(v_tiles * v_tile_size, {{CBIndex::c_2, v_df}}).set_page_size(CBIndex::c_2, v_tile_size); auto cb_in2_id = CreateCircularBuffer(program, core_grid, c_in2_config); // attn_mask input - auto c_in3_config = CircularBufferConfig(qk_tiles * mask_tile_size, {{CB::c_in3, mask_df}}).set_page_size(CB::c_in3, mask_tile_size); + auto c_in3_config = CircularBufferConfig(qk_tiles * mask_tile_size, {{CBIndex::c_3, mask_df}}).set_page_size(CBIndex::c_3, mask_tile_size); auto cb_in3_id = CreateCircularBuffer(program, core_grid, c_in3_config); // scale input - auto c_in4_config = CircularBufferConfig(scale_tiles * scalar_tile_size, {{CB::c_in4, scalar_df}}).set_page_size(CB::c_in4, scalar_tile_size); + auto c_in4_config = CircularBufferConfig(scale_tiles * scalar_tile_size, {{CBIndex::c_4, scalar_df}}).set_page_size(CBIndex::c_4, scalar_tile_size); auto cb_in4_id = CreateCircularBuffer(program, core_grid, c_in4_config); // identity scale input - auto c_in5_config = CircularBufferConfig(scale_tiles * scalar_tile_size, {{CB::c_in5, scalar_df}}).set_page_size(CB::c_in5, scalar_tile_size); + auto c_in5_config = CircularBufferConfig(scale_tiles * scalar_tile_size, {{CBIndex::c_5, scalar_df}}).set_page_size(CBIndex::c_5, scalar_tile_size); auto cb_in5_id = CreateCircularBuffer(program, core_grid, c_in5_config); // cb_m_in - auto c_in6_config = CircularBufferConfig(statistics_tiles * stats_tile_size, {{CB::c_in6, stats_df}}).set_page_size(CB::c_in6, stats_tile_size); + auto c_in6_config = CircularBufferConfig(statistics_tiles * stats_tile_size, {{CBIndex::c_6, stats_df}}).set_page_size(CBIndex::c_6, stats_tile_size); auto cb_in6_id = CreateCircularBuffer(program, core_grid, c_in6_config); // cb_l_in - auto c_in7_config = CircularBufferConfig(statistics_tiles * stats_tile_size, {{CB::c_in7, stats_df}}).set_page_size(CB::c_in7, 
stats_tile_size); + auto c_in7_config = CircularBufferConfig(statistics_tiles * stats_tile_size, {{CBIndex::c_7, stats_df}}).set_page_size(CBIndex::c_7, stats_tile_size); auto c_in7_id = CreateCircularBuffer(program, core_grid, c_in7_config); // cb_qk_im - auto c_intermed0_config = CircularBufferConfig(qk_tiles * im_tile_size, {{CB::c_intermed0, im_df}}).set_page_size(CB::c_intermed0, im_tile_size); + auto c_intermed0_config = CircularBufferConfig(qk_tiles * im_tile_size, {{CBIndex::c_24, im_df}}).set_page_size(CBIndex::c_24, im_tile_size); auto cb_intermed0_id = CreateCircularBuffer(program, core_grid, c_intermed0_config); // cb_out_im - auto c_intermed1_config = CircularBufferConfig(out_im_tiles * im_tile_size, {{CB::c_intermed1, im_df}}).set_page_size(CB::c_intermed1, im_tile_size); + auto c_intermed1_config = CircularBufferConfig(out_im_tiles * im_tile_size, {{CBIndex::c_25, im_df}}).set_page_size(CBIndex::c_25, im_tile_size); auto cb_intermed1_id = CreateCircularBuffer(program, core_grid, c_intermed1_config); // cb_out_accumulate_im - auto c_intermed2_config = CircularBufferConfig(out_im_tiles * im_tile_size, {{CB::c_intermed2, im_df}}).set_page_size(CB::c_intermed2, im_tile_size); + auto c_intermed2_config = CircularBufferConfig(out_im_tiles * im_tile_size, {{CBIndex::c_26, im_df}}).set_page_size(CBIndex::c_26, im_tile_size); auto cb_intermed2_id = CreateCircularBuffer(program, core_grid, c_intermed2_config); // cb_cur_max - auto c_intermed3_config = CircularBufferConfig(statistics_tiles * stats_tile_size, {{CB::c_intermed3, stats_df}}).set_page_size(CB::c_intermed3, stats_tile_size); + auto c_intermed3_config = CircularBufferConfig(statistics_tiles * stats_tile_size, {{CBIndex::c_27, stats_df}}).set_page_size(CBIndex::c_27, stats_tile_size); auto cb_intermed3_id = CreateCircularBuffer(program, core_grid, c_intermed3_config); // cb_prev_max - auto c_intermed4_config = CircularBufferConfig(statistics_tiles * stats_tile_size, {{CB::c_intermed4, 
stats_df}}).set_page_size(CB::c_intermed4, stats_tile_size); + auto c_intermed4_config = CircularBufferConfig(statistics_tiles * stats_tile_size, {{CBIndex::c_28, stats_df}}).set_page_size(CBIndex::c_28, stats_tile_size); auto cb_intermed4_id = CreateCircularBuffer(program, core_grid, c_intermed4_config); // cb_cur_sum - auto c_intermed5_config = CircularBufferConfig(statistics_tiles * stats_tile_size, {{CB::c_intermed5, stats_df}}).set_page_size(CB::c_intermed5, stats_tile_size); + auto c_intermed5_config = CircularBufferConfig(statistics_tiles * stats_tile_size, {{CBIndex::c_29, stats_df}}).set_page_size(CBIndex::c_29, stats_tile_size); auto cb_intermed5_id = CreateCircularBuffer(program, core_grid, c_intermed5_config); // cb_prev_sum - auto c_intermed6_config = CircularBufferConfig(statistics_tiles * stats_tile_size, {{CB::c_intermed6, stats_df}}).set_page_size(CB::c_intermed6, stats_tile_size); + auto c_intermed6_config = CircularBufferConfig(statistics_tiles * stats_tile_size, {{CBIndex::c_30, stats_df}}).set_page_size(CBIndex::c_30, stats_tile_size); auto cb_intermed6_id = CreateCircularBuffer(program, core_grid, c_intermed6_config); // cb_exp_max_diff - auto c_intermed7_config = CircularBufferConfig(statistics_tiles * stats_tile_size, {{CB::c_intermed7, stats_df}}).set_page_size(CB::c_intermed7, stats_tile_size); + auto c_intermed7_config = CircularBufferConfig(statistics_tiles * stats_tile_size, {{CBIndex::c_31, stats_df}}).set_page_size(CBIndex::c_31, stats_tile_size); auto cb_intermed7_id = CreateCircularBuffer(program, core_grid, c_intermed7_config); // cb_prev_sum_2 - auto c_out5_config = CircularBufferConfig(statistics_tiles * stats_tile_size, {{CB::c_out5, stats_df}}).set_page_size(CB::c_out5, stats_tile_size); + auto c_out5_config = CircularBufferConfig(statistics_tiles * stats_tile_size, {{CBIndex::c_21, stats_df}}).set_page_size(CBIndex::c_21, stats_tile_size); auto c_out5_id = CreateCircularBuffer(program, core_grid, c_out5_config); // 
cb_exp_max_diff_2 - auto c_out6_config = CircularBufferConfig(statistics_tiles * stats_tile_size, {{CB::c_out6, stats_df}}).set_page_size(CB::c_out6, stats_tile_size); + auto c_out6_config = CircularBufferConfig(statistics_tiles * stats_tile_size, {{CBIndex::c_22, stats_df}}).set_page_size(CBIndex::c_22, stats_tile_size); auto c_out6_id = CreateCircularBuffer(program, core_grid, c_out6_config); // cb_out_accumulate_im_2 - auto c_out7_config = CircularBufferConfig(out_im_tiles * im_tile_size, {{CB::c_out7, im_df}}).set_page_size(CB::c_out7, im_tile_size); + auto c_out7_config = CircularBufferConfig(out_im_tiles * im_tile_size, {{CBIndex::c_23, im_df}}).set_page_size(CBIndex::c_23, im_tile_size); auto c_out7_id = CreateCircularBuffer(program, core_grid, c_out7_config); // Output // cb_out_o - auto c_out0_config = CircularBufferConfig(out0_t * stats_tile_size, {{CB::c_out0, stats_df}}).set_page_size(CB::c_out0, stats_tile_size); + auto c_out0_config = CircularBufferConfig(out0_t * stats_tile_size, {{CBIndex::c_16, stats_df}}).set_page_size(CBIndex::c_16, stats_tile_size); auto cb_out0_id = CreateCircularBuffer( program, core_grid, c_out0_config ); // cb_out_m - auto c_out1_config = CircularBufferConfig(statistics_tiles * stats_tile_size, {{CB::c_out1, stats_df}}).set_page_size(CB::c_out1, stats_tile_size); + auto c_out1_config = CircularBufferConfig(statistics_tiles * stats_tile_size, {{CBIndex::c_17, stats_df}}).set_page_size(CBIndex::c_17, stats_tile_size); auto cb_out1_id = CreateCircularBuffer(program, core_grid, c_out1_config); // cb_out_l - auto c_out2_config = CircularBufferConfig(statistics_tiles * stats_tile_size, {{CB::c_out2, stats_df}}).set_page_size(CB::c_out2, stats_tile_size); + auto c_out2_config = CircularBufferConfig(statistics_tiles * stats_tile_size, {{CBIndex::c_18, stats_df}}).set_page_size(CBIndex::c_18, stats_tile_size); auto c_out2_id = CreateCircularBuffer(program, core_grid, c_out2_config); // when there are worker cores if 
(intermed_output_tiles > 0){ // cb_intermed_out - auto c_out3_config = CircularBufferConfig(intermed_output_tiles * stats_tile_size, {{CB::c_out3, stats_df}}).set_page_size(CB::c_out3, stats_tile_size); + auto c_out3_config = CircularBufferConfig(intermed_output_tiles * stats_tile_size, {{CBIndex::c_19, stats_df}}).set_page_size(CBIndex::c_19, stats_tile_size); auto c_out3_id = CreateCircularBuffer(program, core_grid, c_out3_config); } // cb_out_final - auto c_out4_config = CircularBufferConfig(out0_t * out_tile_size, {{CB::c_out4, out_df}}).set_page_size(CB::c_out4, out_tile_size); + auto c_out4_config = CircularBufferConfig(out0_t * out_tile_size, {{CBIndex::c_20, out_df}}).set_page_size(CBIndex::c_20, out_tile_size); if (is_output_sharded) { c_out4_config.set_globally_allocated_address(*out0_buffer); } diff --git a/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/sdpa_decode.cpp b/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/sdpa_decode.cpp index d1da3bc4c99..8570be1576a 100644 --- a/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/sdpa_decode.cpp +++ b/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/sdpa_decode.cpp @@ -8,6 +8,8 @@ #include "ttnn/common/constants.hpp" #include "ttnn/run_operation.hpp" +using namespace tt::tt_metal; + namespace { inline uint32_t get_chunk_size(uint32_t s) { /* diff --git a/ttnn/cpp/ttnn/operations/uniform/device/kernels/compute_uniform.cpp b/ttnn/cpp/ttnn/operations/uniform/device/kernels/compute_uniform.cpp index fa2220e8edc..3e29acca0bd 100644 --- a/ttnn/cpp/ttnn/operations/uniform/device/kernels/compute_uniform.cpp +++ b/ttnn/cpp/ttnn/operations/uniform/device/kernels/compute_uniform.cpp @@ -23,7 +23,7 @@ void MAIN { const uint32_t num_tiles = get_arg_val(4); const uint32_t end_id = start_id + num_tiles; - init_sfpu(intermed_cb_id); + init_sfpu(intermed_cb_id, intermed_cb_id); rand_tile_init(seed); for (uint32_t i = start_id; i < end_id; ++i) { diff --git 
a/ttnn/cpp/ttnn/operations/uniform/device/uniform_program_factory.cpp b/ttnn/cpp/ttnn/operations/uniform/device/uniform_program_factory.cpp index d2528852d46..02d243c24e5 100644 --- a/ttnn/cpp/ttnn/operations/uniform/device/uniform_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/uniform/device/uniform_program_factory.cpp @@ -42,13 +42,13 @@ UniformDeviceOperation::ProgramFactory::cached_program_t UniformDeviceOperation: constexpr uint32_t in_out_num_tiles = 1; constexpr uint32_t intermed_num_tiles = 2; - constexpr uint32_t intermed_cb_id = CB::c_intermed0; + constexpr uint32_t intermed_cb_id = CBIndex::c_24; CircularBufferConfig cb_intermed_config = CircularBufferConfig(intermed_num_tiles * intermed_tile_size, {{intermed_cb_id, tt::DataFormat::Float32}}) .set_page_size(intermed_cb_id, intermed_tile_size); CBHandle cb_intermed = tt_metal::CreateCircularBuffer(program, all_cores, cb_intermed_config); - constexpr uint32_t dst_cb_id = CB::c_in0; + constexpr uint32_t dst_cb_id = CBIndex::c_0; CircularBufferConfig cb_output_config = CircularBufferConfig(in_out_num_tiles * dtype_tile_size, {{dst_cb_id, out_data_format}}) .set_page_size(dst_cb_id, dtype_tile_size); diff --git a/ttnn/cpp/ttnn/tensor/tensor_impl.cpp b/ttnn/cpp/ttnn/tensor/tensor_impl.cpp index df198691c5b..687af9a04bf 100644 --- a/ttnn/cpp/ttnn/tensor/tensor_impl.cpp +++ b/ttnn/cpp/ttnn/tensor/tensor_impl.cpp @@ -10,6 +10,8 @@ #include "ttnn/tensor/types.hpp" #include "ttnn/distributed/api.hpp" +using namespace tt::tt_metal; + namespace tt { namespace tt_metal { diff --git a/ttnn/cpp/ttnn/tensor/types.hpp b/ttnn/cpp/ttnn/tensor/types.hpp index 19b3d3ca1c0..3666c710113 100644 --- a/ttnn/cpp/ttnn/tensor/types.hpp +++ b/ttnn/cpp/ttnn/tensor/types.hpp @@ -464,10 +464,10 @@ struct Shape { explicit Shape(const std::initializer_list shape, const std::initializer_list shape_with_tile_padding) : value{tt::tt_metal::LegacyShape{shape, shape_with_tile_padding}} {} - explicit Shape(tt::stl::Span shape, const Padding 
&padding) : + explicit Shape(tt::stl::Span shape, const tt::tt_metal::Padding &padding) : value{tt::tt_metal::LegacyShape{shape, padding}} {} - explicit Shape(const Shape &shape, const Padding &padding) : + explicit Shape(const Shape &shape, const tt::tt_metal::Padding &padding) : value{tt::tt_metal::LegacyShape{shape.value, padding}} {} Shape(const SimpleShape& shape): value{shape.view()} {} diff --git a/ttnn/tt_lib/fused_ops/average_pool.py b/ttnn/tt_lib/fused_ops/average_pool.py index 8c13bc472b0..7af3e9b2808 100644 --- a/ttnn/tt_lib/fused_ops/average_pool.py +++ b/ttnn/tt_lib/fused_ops/average_pool.py @@ -6,9 +6,9 @@ import ttnn -def run_avg_pool_on_device_wrapper(device): - def avg_pool2d(x, output_mem_config, output_dtype=None): - out = ttnn.avg_pool2d(x, memory_config=output_mem_config, dtype=output_dtype) +def run_global_avg_pool_on_device_wrapper(device): + def global_avg_pool2d(x, output_mem_config, output_dtype=None): + out = ttnn.global_avg_pool2d(x, memory_config=output_mem_config, dtype=output_dtype) return out - return avg_pool2d + return global_avg_pool2d diff --git a/ttnn/ttnn/__init__.py b/ttnn/ttnn/__init__.py index 8709220aa13..31c31e7bcd5 100644 --- a/ttnn/ttnn/__init__.py +++ b/ttnn/ttnn/__init__.py @@ -293,7 +293,6 @@ def auto_register_ttnn_cpp_operations(module): ) from ttnn.operations.conv2d import Conv2dConfig, get_conv_padded_input_shape_and_mem_config, get_conv_output_dim -from ttnn.operations.pool import avg_pool2d from ttnn.operations.conv1d import Conv1d, Conv1dConfig from ttnn.operations.transformer import SDPAProgramConfig diff --git a/ttnn/ttnn/operations/moreh.py b/ttnn/ttnn/operations/moreh.py index 28d27466d01..50ba216df5e 100644 --- a/ttnn/ttnn/operations/moreh.py +++ b/ttnn/ttnn/operations/moreh.py @@ -3,7 +3,9 @@ # SPDX-License-Identifier: Apache-2.0 import ttnn +import ttnn._ttnn +abs = ttnn._ttnn.operations.moreh.moreh_abs_pow adam = ttnn._ttnn.operations.moreh.moreh_adam adamw = ttnn._ttnn.operations.moreh.moreh_adamw 
arange = ttnn._ttnn.operations.moreh.moreh_arange diff --git a/ttnn/ttnn/operations/pool.py b/ttnn/ttnn/operations/pool.py index bb4c3793a73..1fb97b768bf 100644 --- a/ttnn/ttnn/operations/pool.py +++ b/ttnn/ttnn/operations/pool.py @@ -43,7 +43,3 @@ def golden_global_avg_pool2d(input_tensor: ttnn.Tensor): ttnn.attach_golden_function(ttnn.global_avg_pool2d, golden_global_avg_pool2d) - -avg_pool2d = ttnn.register_python_operation(name="ttnn.avg_pool2d", golden_function=golden_global_avg_pool2d)( - ttnn._ttnn.operations.pool.avg_pool2d -)