Merge branch 'main' into llama3/sharded-residual

tenstorrent · Nov 25, 2024 · 92a5fe5 · 92a5fe5
2 parents e24d18d + af73fe3
commit 92a5fe5
Show file tree

Hide file tree

Showing 986 changed files with 13,472 additions and 11,558 deletions.
diff --git a/.clang-format b/.clang-format
@@ -1,7 +1,7 @@
 ---
 Language:        Cpp
 # BasedOnStyle:  Google
-AccessModifierOffset: -1
+AccessModifierOffset: -4
 AlignAfterOpenBracket: AlwaysBreak
 AlignConsecutiveAssignments: false
 AlignConsecutiveDeclarations: false
@@ -51,7 +51,7 @@ ConstructorInitializerAllOnOneLineOrOnePerLine: true
 ConstructorInitializerIndentWidth: 4
 ContinuationIndentWidth: 4
 Cpp11BracedListStyle: true
-DerivePointerAlignment: true
+DerivePointerAlignment: false
 DisableFormat:   false
 ExperimentalAutoDetectBinPacking: false
 FixNamespaceComments: true
@@ -93,7 +93,8 @@ PenaltyBreakString: 1000
 PenaltyBreakTemplateDeclaration: 10
 PenaltyExcessCharacter: 1000000
 PenaltyReturnTypeOnItsOwnLine: 200
-PointerAlignment: Right
+PointerAlignment: Left
+ReferenceAlignment: Left
 RawStringFormats:
   - Language:        Cpp
     Delimiters:
@@ -123,7 +124,7 @@ RawStringFormats:
     CanonicalDelimiter: ''
     BasedOnStyle:    google
 ReflowComments:  true
-SortIncludes:    true
+SortIncludes:    false
 SortUsingDeclarations: true
 SpaceAfterCStyleCast: false
 SpaceAfterTemplateKeyword: true

diff --git a/.github/workflows/publish-release-image-wrapper.yaml b/.github/workflows/publish-release-image-wrapper.yaml
@@ -0,0 +1,12 @@
+name: "Create and Publish Release Docker Image"
+
+on:
+  workflow_call:
+  workflow_dispatch:
+
+jobs:
+  to_be_filled_out:
+    steps:
+      - name: This workflow will be filled out in https://github.com/tenstorrent/tt-metal/pull/15013
+        run: |
+          echo "NOOP"
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -207,6 +207,7 @@ string(TOUPPER "$ENV{ARCH_NAME}" ARCH_NAME_DEF)
 add_compile_definitions(ARCH_${ARCH_NAME_DEF})
 add_compile_options(
     -Werror
+    -Wno-deprecated-declarations
     -Wdelete-non-virtual-dtor
     -Wreturn-type
     -Wswitch
@@ -226,9 +227,7 @@ add_compile_options(
     "$<$<CXX_COMPILER_ID:Clang>:-Wno-deprecated-this-capture>"
     "$<$<CXX_COMPILER_ID:Clang>:-Wno-deprecated-volatile>"
     "$<$<CXX_COMPILER_ID:Clang>:-Wno-deprecated-builtins>"
-    "$<$<CXX_COMPILER_ID:Clang>:-Wno-deprecated-declarations>"
     "$<$<CXX_COMPILER_ID:GNU>:-Wno-deprecated>"
-    "$<$<CXX_COMPILER_ID:GNU>:-Wno-deprecated-declarations>"
     "$<$<CXX_COMPILER_ID:GNU>:-Wno-attributes>"
     "$<$<CXX_COMPILER_ID:GNU>:-Wno-stringop-overread>"
     "$<$<CXX_COMPILER_ID:GNU>:-Wno-stringop-overflow>"

diff --git a/CODEOWNERS b/CODEOWNERS
@@ -53,7 +53,7 @@ tests/scripts/tgg/ @afuller-TT @ttmchiou
 tt_metal/ @abhullar-tt @pgkeller @aliuTT @tt-aho @tt-dma @tt-asaigal @ubcheema
 tt_metal/host_api.hpp @abhullar-tt @pgkeller @aliuTT @tt-aho @tt-dma @tt-asaigal @davorchap
 tt_metal/impl/device/ @abhullar-tt @pgkeller @aliuTT @tt-aho @tt-dma @tt-asaigal @ubcheema @davorchap @cfjchu
-tt_metal/distributed/ @cfjchu @aliuTT @tt-asaigal
+tt_metal/distributed/ @cfjchu @aliuTT @tt-asaigal @omilyutin-tt
 tt_metal/**/requirements*.txt @tt-rkim @TT-billteng @ttmchiou
 
 # metal - dispatch
@@ -105,13 +105,13 @@ ttnn/cpp/ttnn/operations/ccl/ @SeanNijjar @cfjchu
 ttnn/cpp/ttnn/operations/pool/ @mywoodstock @shwetankTT @sankarmanoj-tt @pavlejosipovic
 ttnn/cpp/ttnn/operations/conv/ @mywoodstock @shwetankTT @sankarmanoj-tt @pavlejosipovic @bbradelTT
 ttnn/cpp/ttnn/operations/sliding_window/ @mywoodstock @sankarmanoj-tt @pavlejosipovic
-ttnn/cpp/ttnn/operations/data_movement/ @ntarafdar @sjameelTT @jaykru-tt @yugi957
+ttnn/cpp/ttnn/operations/data_movement/ @ntarafdar @sjameelTT @jaykru-tt @yugi957 @jvegaTT @llongTT
 ttnn/cpp/ttnn/operations/matmul/ @TT-BrianLiu @bbradelTT @yugaoTT
 ttnn/cpp/ttnn/operations/experimental/matmul/ @TT-BrianLiu @bbradelTT @yugaoTT
 ttnn/cpp/ttnn/operations/eltwise/ @patrickroberts @yan-zaretskiy @eyonland
-ttnn/cpp/ttnn/operations/reduction/ @SeanNijjar @ntarafdar @sjameelTT
+ttnn/cpp/ttnn/operations/reduction/ @bbradelTT @asandhupatlaTT @sjameelTT
 ttnn/cpp/ttnn/operations/normalization/ @yugaoTT @tt-aho
-ttnn/cpp/ttnn/operations/embedding/ @ntarafdar @tt-aho @TT-BrianLiu
+ttnn/cpp/ttnn/operations/embedding/ @ntarafdar @tt-aho @TT-BrianLiu @yugi957 @sjameelTT @jaykru-tt @llongTT
 ttnn/cpp/ttnn/operations/embedding_backward/ @TT-BrianLiu @yan-zaretskiy
 ttnn/ttnn/operations/eltwise @patrickroberts @yan-zaretskiy @eyonland
 tests/ttnn/ @ayerofieiev-tt @dmakoviichuk-tt @rfurko-tt @cfjchu @TT-BrianLiu @razorback3 @dongjin-na
@@ -122,12 +122,12 @@ tests/sweep_framework/ @xanderchin @jdesousa-TT @sjameelTT
 tests/sweep_framework/sweeps
 tests/sweep_framework/sweeps/eltwise/ @patrickroberts @yan-zaretskiy @eyonland
 tests/sweep_framework/sweeps/conv2d/  @nkpatel-tt @mywoodstock @shwetankTT @sankarmanoj-tt @pavlejosipovic
-tests/sweep_framework/sweeps/data_movement/  @sjameelTT @ntarafdar @jaykru-tt @yugi957
+tests/sweep_framework/sweeps/data_movement/  @sjameelTT @ntarafdar @jaykru-tt @yugi957 @llongTT @jvegaTT
 
 # TTNN Distributed
-ttnn/cpp/ttnn/distributed/ @cfjchu @ayerofieiev-tt @dmakoviichuk-tt
-ttnn/ttnn/distributed/ @cfjchu @ayerofieiev-tt @dmakoviichuk-tt
-tests/ttnn/distributed/ @cfjchu @ayerofieiev-tt @dmakoviichuk-tt
+ttnn/cpp/ttnn/distributed/ @cfjchu @ayerofieiev-tt @dmakoviichuk-tt @omilyutin-tt
+ttnn/ttnn/distributed/ @cfjchu @ayerofieiev-tt @dmakoviichuk-tt @omilyutin-tt
+tests/ttnn/distributed/ @cfjchu @ayerofieiev-tt @dmakoviichuk-tt @omilyutin-tt
 
 # models
 /models/ @tt-rkim @uaydonat

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -251,8 +251,8 @@ The new fangled way we run our tests is with Googletest. The way we generally
 structure our tests with this framework is to bundle it into a single
 executable.
 
-You can use `--gtest_filter_test` to filter out the specific test you'd like.
-For example, to build and run the `CommonFixture.DRAMLoopbackSingleCore` on
+You can use `--gtest_filter` to select the specific test you'd like to run.
+For example, to build and run the `DispatchFixture.TensixDRAMLoopbackSingleCore` on
 fast dispatch, you can
 
 1. Build the unit tests:
@@ -261,7 +261,7 @@ fast dispatch, you can
    ```
 2. Run the test:
    ```
-   ./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter="CommonFixture.DRAMLoopbackSingleCore"
+   ./build/test/tt_metal/unit_tests_api --gtest_filter="DispatchFixture.TensixDRAMLoopbackSingleCore"
    ```
 
 On slow dispatch, to run another specific test, the equivalent would be:
@@ -270,7 +270,7 @@ On slow dispatch, to run another specific test, the equivalent would be:
 2. Run with the slow dispatch mode:
    ```
    export TT_METAL_SLOW_DISPATCH_MODE=1
-   ./build/test/tt_metal/unit_tests/fast_dispatch --gtest_filter_test="BasicFixture.TestL1BuffersAllocatedTopDown"
+   ./build/test/tt_metal/unit_tests_api --gtest_filter="DeviceSingleCardBufferFixture.TestL1BuffersAllocatedTopDown"
    ```
 
 We have split our tests into the two dispatch modes for less pollution of state

diff --git a/METALIUM_GUIDE.md b/METALIUM_GUIDE.md
@@ -128,26 +128,26 @@ void MAIN {
   mm_init();
   acquire_dst();
 
-  cb_wait_front(tt::CB::c_in0, /* number of tiles */ 1);
-  cb_wait_front(tt::CB::c_in1, /* number of tiles */ 1);
+  cb_wait_front(tt::CBIndex::c_0, /* number of tiles */ 1);
+  cb_wait_front(tt::CBIndex::c_1, /* number of tiles */ 1);
 
-  matmul_tiles(tt::CB::c_in0, tt::CB::c_in1, 0, 0, 0, false);
+  matmul_tiles(tt::CBIndex::c_0, tt::CBIndex::c_1, 0, 0, 0, false);
 
-  cb_pop_front(tt::CB::c_in1, /* number of tiles */ 1);
-  cb_pop_front(tt::CB::c_in0, /* number of tiles */ 1);
+  cb_pop_front(tt::CBIndex::c_1, /* number of tiles */ 1);
+  cb_pop_front(tt::CBIndex::c_0, /* number of tiles */ 1);
 
-  cb_reserve_back(tt::CB::c_out0, /* number of tiles */ 1);
-  pack_tile(0, tt::CB::c_out0);
-  cb_push_back(tt::CB::c_out0, /* number of tiles */ 1);
+  cb_reserve_back(tt::CBIndex::c_16, /* number of tiles */ 1);
+  pack_tile(0, tt::CBIndex::c_16);
+  cb_push_back(tt::CBIndex::c_16, /* number of tiles */ 1);
 
   release_dst();
 }
 }  // namespace NAMESPACE
 ```
 
-It takes two matrix tiles from `tt::CB::c_in0` and `tt::CB::c_in0` L1 and
+It takes two matrix tiles from `tt::CBIndex::c_0` and `tt::CBIndex::c_1` L1 and
 conducts a single-tile matrix multiplication. Finally, it packs the result to
-`tt::CB::c_out0`.
+`tt::CBIndex::c_16`.
 
 Note that tile registers are acquired by `acquire_dst()`, but actually we can
 use `tile_regs_..()` functions for the more fine-grained tile register lock
@@ -299,23 +299,23 @@ namespace NAMESPACE {
 void MAIN {
   mm_init();
 
-  cb_wait_front(tt::CB::c_in0, /* number of tiles */ 1);
-  cb_wait_front(tt::CB::c_in1, /* number of tiles */ 1);
+  cb_wait_front(tt::CBIndex::c_0, /* number of tiles */ 1);
+  cb_wait_front(tt::CBIndex::c_1, /* number of tiles */ 1);
 
   tile_regs_acquire();
 
-  matmul_tiles(tt::CB::c_in0, tt::CB::c_in1, 0, 0, 0, false);
+  matmul_tiles(tt::CBIndex::c_0, tt::CBIndex::c_1, 0, 0, 0, false);
 
   tile_regs_commit();
 
-  cb_pop_front(tt::CB::c_in1, /* number of tiles */ 1);
-  cb_pop_front(tt::CB::c_in0, /* number of tiles */ 1);
+  cb_pop_front(tt::CBIndex::c_1, /* number of tiles */ 1);
+  cb_pop_front(tt::CBIndex::c_0, /* number of tiles */ 1);
 
   tile_regs_wait();
 
-  cb_reserve_back(tt::CB::c_out0, /* number of tiles */ 1);
-  pack_tile(0, tt::CB::c_out0);
-  cb_push_back(tt::CB::c_out0, /* number of tiles */ 1);
+  cb_reserve_back(tt::CBIndex::c_16, /* number of tiles */ 1);
+  pack_tile(0, tt::CBIndex::c_16);
+  cb_push_back(tt::CBIndex::c_16, /* number of tiles */ 1);
 
   tile_regs_release();
 }
@@ -367,9 +367,9 @@ void MAIN {
     uint32_t per_core_block_cnt = get_arg_val<uint32_t>(0);
     uint32_t per_core_block_size = get_arg_val<uint32_t>(1); // should be <= 8 in this kernel
 
-    constexpr auto cb_in0 = tt::CB::c_in0;
-    constexpr auto cb_in1 = tt::CB::c_in1;
-    constexpr auto cb_out0 =  tt::CB::c_out0;
+    constexpr auto cb_in0 = tt::CBIndex::c_0;
+    constexpr auto cb_in1 = tt::CBIndex::c_1;
+    constexpr auto cb_out0 =  tt::CBIndex::c_16;
 
     binary_op_init_common(cb_in0, cb_in1, cb_out0);
     add_tiles_init();
@@ -400,7 +400,7 @@ void MAIN {
         cb_pop_front(cb_in0, per_core_block_size);
         cb_pop_front(cb_in1, per_core_block_size);
 
-        // push a block of tiles to output CB
+        // push a block of tiles to output CB
         cb_push_back(cb_out0, per_core_block_size);
     }
 

diff --git a/docs/source/common/images/MFB-Fig12.png b/docs/source/common/images/MFB-Fig12.png
diff --git a/docs/source/common/images/MFB-Fig3a.png b/docs/source/common/images/MFB-Fig3a.png
diff --git a/docs/source/common/images/MFB-Fig11.png → docs/source/common/images/MfB-Fig11.png b/docs/source/common/images/MFB-Fig11.png → docs/source/common/images/MfB-Fig11.png
diff --git a/docs/source/common/images/MfB-Fig12.png b/docs/source/common/images/MfB-Fig12.png
diff --git a/docs/source/common/images/MFB-Fig2.png → docs/source/common/images/MfB-Fig2.png b/docs/source/common/images/MFB-Fig2.png → docs/source/common/images/MfB-Fig2.png
diff --git a/docs/source/common/images/MfB-Fig3a.png b/docs/source/common/images/MfB-Fig3a.png
diff --git a/docs/source/tt-metalium/tools/kernel_print.rst b/docs/source/tt-metalium/tools/kernel_print.rst
@@ -83,7 +83,7 @@ Data from Circular Buffers can be printed using the ``TileSlice`` object. It can
 | print_untilized | bool                | Whether to untilize the CB data while printing it (always done for block float formats), default ``true``.                                                   |
 +-----------------+---------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------+
 
-An example of how to print data from a CB (in this case, ``CB::c_intermed1``) is shown below.  Note that sampling happens relative
+An example of how to print data from a CB (in this case, ``CBIndex::c_25``) is shown below.  Note that sampling happens relative
 to the current CB read or write pointer. This means that for printing a tile read from the front of the CB, the
 ``DPRINT`` call has to occur between the ``cb_wait_front`` and ``cb_pop_front`` calls. For printing a tile from the
 back of the CB, the ``DPRINT`` call has to occur between the ``cb_reserve_back`` and ``cb_push_back`` calls. Currently supported data
@@ -94,15 +94,15 @@ formats for printing from CBs are ``DataFormat::Float32``, ``DataFormat::Float16
     #include "debug/dprint.h"  // required in all kernels using DPRINT
 
     void kernel_main() {
-        // Assuming the tile we want to print from CB::c_intermed1 is from the front the CB, print must happen after
+        // Assuming the tile we want to print from CBIndex::c_25 is from the front of the CB, print must happen after
         // this call. If the tile is from the back of the CB, then print must happen after cb_reserve_back().
-        cb_wait_front(CB::c_intermed1, 1);
+        cb_wait_front(CBIndex::c_25, 1);
         ...
 
-        // Extract a numpy slice `[0:32:16, 0:32:16]` from tile `0` from `CB::c_intermed1` and print it.
-        DPRINT << TSLICE(CB::c_intermed1, 0, SliceRange::hw0_32_16()) << ENDL();
+        // Extract a numpy slice `[0:32:16, 0:32:16]` from tile `0` from `CBIndex::c_25` and print it.
+        DPRINT << TSLICE(CBIndex::c_25, 0, SliceRange::hw0_32_16()) << ENDL();
         // Note that since the MATH core does not have access to CBs, this is an invalid print:
-        DPRINT_MATH({ DPRINT  << TSLICE(CB::c_intermed1, 0, SliceRange::hw0_32_16()) << ENDL(); }); // Invalid
+        DPRINT_MATH({ DPRINT  << TSLICE(CBIndex::c_25, 0, SliceRange::hw0_32_16()) << ENDL(); }); // Invalid
 
         // Print a full tile
         for (int32_t r = 0; r < 32; ++r) {
@@ -118,5 +118,5 @@ formats for printing from CBs are ``DataFormat::Float32``, ``DataFormat::Float16
         }
 
         ...
-        cb_pop_front(CB::c_intermed1, 1);
+        cb_pop_front(CBIndex::c_25, 1);
     }
diff --git a/docs/source/tt-metalium/tt_metal/examples/eltwise_binary.rst b/docs/source/tt-metalium/tt_metal/examples/eltwise_binary.rst
@@ -32,19 +32,19 @@ We already have set the circular buffers needed for compute data communication.
 
 .. code-block:: cpp
 
-  constexpr uint32_t src0_cb_index = CB::c_in0;
+  constexpr uint32_t src0_cb_index = CBIndex::c_0;
   constexpr uint32_t src0_cb_addr = 200 * 1024;
   constexpr uint32_t num_input_tiles = 2;
   constexpr uint32_t input_cb_size = num_input_tiles * single_tile_size;
   CircularBufferConfig cb_src0_config = CircularBufferConfig(input_cb_size, {{src0_cb_index, tt::DataFormat::Float16_b}}, src0_cb_addr).set_page_size(src0_cb_index, single_tile_size);
   CBHandle cb_src0 = v0::CreateCircularBuffer(program, core, cb_src0_config);
 
-  constexpr uint32_t src1_cb_index = CB::c_in1;
+  constexpr uint32_t src1_cb_index = CBIndex::c_1;
   constexpr uint32_t src1_cb_addr = 300 * 1024;
   CircularBufferConfig cb_src1_config = CircularBufferConfig(input_cb_size, {{src1_cb_index, tt::DataFormat::Float16_b}}, src1_cb_addr).set_page_size(src1_cb_index, single_tile_size);
   CBHandle cb_src1 = v0::CreateCircularBuffer(program, core, cb_src1_config);
 
-  constexpr uint32_t output_cb_index = CB::c_out0;
+  constexpr uint32_t output_cb_index = CBIndex::c_16;
   constexpr uint32_t output_cb_addr = 400 * 1024;
   constexpr uint32_t num_output_tiles = 2;
   constexpr uint32_t input_cb_size = num_input_tiles * single_tile_size;

diff --git a/docs/source/tt-metalium/tt_metal/examples/eltwise_sfpu.rst b/docs/source/tt-metalium/tt_metal/examples/eltwise_sfpu.rst
@@ -31,12 +31,12 @@ compute, and writer engines.
 
 .. code-block:: cpp
 
-    constexpr uint32_t src0_cb_index = CB::c_in0;
+    constexpr uint32_t src0_cb_index = CBIndex::c_0;
     constexpr uint32_t num_input_tiles = 2;
     CircularBufferConfig cb_src0_config = CircularBufferConfig(num_input_tiles * single_tile_size, {{src0_cb_index, tt::DataFormat::Float16_b}}).set_page_size(src0_cb_index, single_tile_size);
     CBHandle cb_src0 = tt_metal::v0::CreateCircularBuffer(program, core, cb_src0_config);
 
-    constexpr uint32_t output_cb_index = CB::c_out0;
+    constexpr uint32_t output_cb_index = CBIndex::c_16;
     constexpr uint32_t num_output_tiles = 2;
     CircularBufferConfig cb_output_config = CircularBufferConfig(num_output_tiles * single_tile_size, {{output_cb_index, tt::DataFormat::Float16_b}}).set_page_size(output_cb_index, single_tile_size);
     CBHandle cb_output = tt_metal::v0::CreateCircularBuffer(program, core, cb_output_config);

diff --git a/...ce/tt-metalium/tt_metal/examples/matmul_multi_core_optimizations/data_reuse.rst b/...ce/tt-metalium/tt_metal/examples/matmul_multi_core_optimizations/data_reuse.rst
@@ -40,8 +40,8 @@ In addition to our double-buffer config, we introduce a third circular buffer de
 
 .. code-block:: cpp
 
-    uint32_t output_cb_index = CB::c_out0; // output operands start at index 16
-    uint32_t interm0_cb_index = 24; // Index for the intermediate circular buffer
+    uint32_t output_cb_index = CBIndex::c_16;
+    uint32_t interm0_cb_index = CBIndex::c_24; // Index for the intermediate circular buffer
     std::map<uint8_t, tt::DataFormat> output_cb_data_format_spec {
         {output_cb_index, cb_data_format}, // Output buffer configuration
         {interm0_cb_index, cb_data_format} // Intermediate buffer configuration
@@ -173,16 +173,16 @@ a. **Preparing the Intermediate Buffer**:
 
     .. code-block:: cpp
 
-        cb_reserve_back(tt::CB::c_intermed0, out_subblock_num_tiles);
+        cb_reserve_back(tt::CBIndex::c_24, out_subblock_num_tiles);
 
     - **Storing Partial Results**: Partial results are stored via a packing mechanism with ``pack_tile(...)`` into the above reserved space.
 
     .. code-block:: cpp
 
         for (uint32_t i = 0; i < out_subblock_num_tiles; i++) {
-            pack_tile(i, tt::CB::c_intermed0);
+            pack_tile(i, tt::CBIndex::c_24);
         }
-        cb_push_back(tt::CB::c_intermed0, out_subblock_num_tiles);
+        cb_push_back(tt::CBIndex::c_24, out_subblock_num_tiles);
 
 b. **Computing with Partial Results**:
 
@@ -191,11 +191,11 @@ b. **Computing with Partial Results**:
     .. code-block:: cpp
 
         if (enable_reload) {
-            cb_wait_front(tt::CB::c_intermed0, out_subblock_num_tiles);
+            cb_wait_front(tt::CBIndex::c_24, out_subblock_num_tiles);
             for (uint32_t i = 0; i < out_subblock_num_tiles; i++) {
-                copy_tile(tt::CB::c_intermed0, i, i);
+                copy_tile(tt::CBIndex::c_24, i, i);
             }
-            cb_pop_front(tt::CB::c_intermed0, out_subblock_num_tiles);
+            cb_pop_front(tt::CBIndex::c_24, out_subblock_num_tiles);
         }
 
     - **Execution with `matmul_tiles`**: Now we are ready to compute partial results and integrate them back into the computation stream (or for the last block of computation, culminate our data reuse to produce the final output tensor).  We call the ``matmul_tiles(...)`` function to execute our matmul on the core's subblocks of tiles.
@@ -211,7 +211,7 @@ b. **Computing with Partial Results**:
                 for (uint32_t inner_dim = 0; inner_dim < in0_block_w; inner_dim++) {
                     int in0_index = in0_index_subblock_offset + in0_index_h_offset + inner_dim;
                     int in1_index = in1_index_subblock_offset + in1_index_inner_dim_offset + w;
-                    matmul_tiles(tt::CB::c_in0, tt::CB::c_in1, in0_index, in1_index, dst_index, false /* transpose */);
+                    matmul_tiles(tt::CBIndex::c_0, tt::CBIndex::c_1, in0_index, in1_index, dst_index, false /* transpose */);
                     in1_index_inner_dim_offset += in1_per_core_w;
                 }
                 dst_index++;

diff --git a/docs/source/tt-metalium/tt_metal/examples/matmul_single_core.rst b/docs/source/tt-metalium/tt_metal/examples/matmul_single_core.rst
@@ -147,18 +147,18 @@ double buffering..
 
 .. code-block:: cpp
 
-    uint32_t src0_cb_index = CB::c_in0; //0
+    uint32_t src0_cb_index = CBIndex::c_0; //0
     uint32_t num_input_tiles = 2;
     tt_metal::CircularBufferConfig cb_src0_config = tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size, {{src0_cb_index, cb_data_format}})
         .set_page_size(src0_cb_index, single_tile_size);
     auto cb_src0 = tt_metal::v0::CreateCircularBuffer(program, core, cb_src0_config);
 
-    uint32_t src1_cb_index = CB::c_in1; // 1
+    uint32_t src1_cb_index = CBIndex::c_1; // 1
     tt_metal::CircularBufferConfig cb_src1_config = tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size, {{src1_cb_index, cb_data_format}})
         .set_page_size(src1_cb_index, single_tile_size);
     auto cb_src1 = tt_metal::v0::CreateCircularBuffer(program, core, cb_src1_config);
 
-    uint32_t output_cb_index = CB::c_out0; // output operands start at index 16
+    uint32_t output_cb_index = tt::CBIndex::c_16;
     uint32_t num_output_tiles = 2;
     tt_metal::CircularBufferConfig cb_output_config = tt_metal::CircularBufferConfig(num_output_tiles * single_tile_size, {{output_cb_index, cb_data_format}})
         .set_page_size(output_cb_index, single_tile_size);

diff --git a/models/demos/distilbert/tests/test_perf_distilbert.py b/models/demos/distilbert/tests/test_perf_distilbert.py
@@ -152,7 +152,7 @@ def test_distilbert_perf_device(batch_size, test, reset_seeds):
     margin = 0.03
     num_iterations = 1
     if is_grayskull():
-        expected_perf = 40.8772
+        expected_perf = 57.3
     elif is_wormhole_b0():
         expected_perf = 103.884