Skip to content

Commit

Permalink
Merge branch 'main' into llama3/sharded-residual
Browse files Browse the repository at this point in the history
  • Loading branch information
yieldthought authored Nov 25, 2024
2 parents e24d18d + af73fe3 commit 92a5fe5
Show file tree
Hide file tree
Showing 986 changed files with 13,472 additions and 11,558 deletions.
9 changes: 5 additions & 4 deletions .clang-format
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
---
Language: Cpp
# BasedOnStyle: Google
AccessModifierOffset: -1
AccessModifierOffset: -4
AlignAfterOpenBracket: AlwaysBreak
AlignConsecutiveAssignments: false
AlignConsecutiveDeclarations: false
Expand Down Expand Up @@ -51,7 +51,7 @@ ConstructorInitializerAllOnOneLineOrOnePerLine: true
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: true
DerivePointerAlignment: true
DerivePointerAlignment: false
DisableFormat: false
ExperimentalAutoDetectBinPacking: false
FixNamespaceComments: true
Expand Down Expand Up @@ -93,7 +93,8 @@ PenaltyBreakString: 1000
PenaltyBreakTemplateDeclaration: 10
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 200
PointerAlignment: Right
PointerAlignment: Left
ReferenceAlignment: Left
RawStringFormats:
- Language: Cpp
Delimiters:
Expand Down Expand Up @@ -123,7 +124,7 @@ RawStringFormats:
CanonicalDelimiter: ''
BasedOnStyle: google
ReflowComments: true
SortIncludes: true
SortIncludes: false
SortUsingDeclarations: true
SpaceAfterCStyleCast: false
SpaceAfterTemplateKeyword: true
Expand Down
12 changes: 12 additions & 0 deletions .github/workflows/publish-release-image-wrapper.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
name: "Create and Publish Release Docker Image"

on:
workflow_call:
workflow_dispatch:

jobs:
to_be_filled_out:
steps:
- name: This workflow will be filled out in https://github.com/tenstorrent/tt-metal/pull/15013
run: |
echo "NOOP"
3 changes: 1 addition & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,7 @@ string(TOUPPER "$ENV{ARCH_NAME}" ARCH_NAME_DEF)
add_compile_definitions(ARCH_${ARCH_NAME_DEF})
add_compile_options(
-Werror
-Wno-deprecated-declarations
-Wdelete-non-virtual-dtor
-Wreturn-type
-Wswitch
Expand All @@ -226,9 +227,7 @@ add_compile_options(
"$<$<CXX_COMPILER_ID:Clang>:-Wno-deprecated-this-capture>"
"$<$<CXX_COMPILER_ID:Clang>:-Wno-deprecated-volatile>"
"$<$<CXX_COMPILER_ID:Clang>:-Wno-deprecated-builtins>"
"$<$<CXX_COMPILER_ID:Clang>:-Wno-deprecated-declarations>"
"$<$<CXX_COMPILER_ID:GNU>:-Wno-deprecated>"
"$<$<CXX_COMPILER_ID:GNU>:-Wno-deprecated-declarations>"
"$<$<CXX_COMPILER_ID:GNU>:-Wno-attributes>"
"$<$<CXX_COMPILER_ID:GNU>:-Wno-stringop-overread>"
"$<$<CXX_COMPILER_ID:GNU>:-Wno-stringop-overflow>"
Expand Down
16 changes: 8 additions & 8 deletions CODEOWNERS
Validating CODEOWNERS rules …
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ tests/scripts/tgg/ @afuller-TT @ttmchiou
tt_metal/ @abhullar-tt @pgkeller @aliuTT @tt-aho @tt-dma @tt-asaigal @ubcheema
tt_metal/host_api.hpp @abhullar-tt @pgkeller @aliuTT @tt-aho @tt-dma @tt-asaigal @davorchap
tt_metal/impl/device/ @abhullar-tt @pgkeller @aliuTT @tt-aho @tt-dma @tt-asaigal @ubcheema @davorchap @cfjchu
tt_metal/distributed/ @cfjchu @aliuTT @tt-asaigal
tt_metal/distributed/ @cfjchu @aliuTT @tt-asaigal @omilyutin-tt
tt_metal/**/requirements*.txt @tt-rkim @TT-billteng @ttmchiou

# metal - dispatch
Expand Down Expand Up @@ -105,13 +105,13 @@ ttnn/cpp/ttnn/operations/ccl/ @SeanNijjar @cfjchu
ttnn/cpp/ttnn/operations/pool/ @mywoodstock @shwetankTT @sankarmanoj-tt @pavlejosipovic
ttnn/cpp/ttnn/operations/conv/ @mywoodstock @shwetankTT @sankarmanoj-tt @pavlejosipovic @bbradelTT
ttnn/cpp/ttnn/operations/sliding_window/ @mywoodstock @sankarmanoj-tt @pavlejosipovic
ttnn/cpp/ttnn/operations/data_movement/ @ntarafdar @sjameelTT @jaykru-tt @yugi957
ttnn/cpp/ttnn/operations/data_movement/ @ntarafdar @sjameelTT @jaykru-tt @yugi957 @jvegaTT @llongTT
ttnn/cpp/ttnn/operations/matmul/ @TT-BrianLiu @bbradelTT @yugaoTT
ttnn/cpp/ttnn/operations/experimental/matmul/ @TT-BrianLiu @bbradelTT @yugaoTT
ttnn/cpp/ttnn/operations/eltwise/ @patrickroberts @yan-zaretskiy @eyonland
ttnn/cpp/ttnn/operations/reduction/ @SeanNijjar @ntarafdar @sjameelTT
ttnn/cpp/ttnn/operations/reduction/ @bbradelTT @asandhupatlaTT @sjameelTT
ttnn/cpp/ttnn/operations/normalization/ @yugaoTT @tt-aho
ttnn/cpp/ttnn/operations/embedding/ @ntarafdar @tt-aho @TT-BrianLiu
ttnn/cpp/ttnn/operations/embedding/ @ntarafdar @tt-aho @TT-BrianLiu @yugi957 @sjameelTT @jaykru-tt @llongTT
ttnn/cpp/ttnn/operations/embedding_backward/ @TT-BrianLiu @yan-zaretskiy
ttnn/ttnn/operations/eltwise @patrickroberts @yan-zaretskiy @eyonland
tests/ttnn/ @ayerofieiev-tt @dmakoviichuk-tt @rfurko-tt @cfjchu @TT-BrianLiu @razorback3 @dongjin-na
Expand All @@ -122,12 +122,12 @@ tests/sweep_framework/ @xanderchin @jdesousa-TT @sjameelTT
tests/sweep_framework/sweeps
tests/sweep_framework/sweeps/eltwise/ @patrickroberts @yan-zaretskiy @eyonland
tests/sweep_framework/sweeps/conv2d/ @nkpatel-tt @mywoodstock @shwetankTT @sankarmanoj-tt @pavlejosipovic
tests/sweep_framework/sweeps/data_movement/ @sjameelTT @ntarafdar @jaykru-tt @yugi957
tests/sweep_framework/sweeps/data_movement/ @sjameelTT @ntarafdar @jaykru-tt @yugi957 @llongTT @jvegaTT

# TTNN Distributed
ttnn/cpp/ttnn/distributed/ @cfjchu @ayerofieiev-tt @dmakoviichuk-tt
ttnn/ttnn/distributed/ @cfjchu @ayerofieiev-tt @dmakoviichuk-tt
tests/ttnn/distributed/ @cfjchu @ayerofieiev-tt @dmakoviichuk-tt
ttnn/cpp/ttnn/distributed/ @cfjchu @ayerofieiev-tt @dmakoviichuk-tt @omilyutin-tt
ttnn/ttnn/distributed/ @cfjchu @ayerofieiev-tt @dmakoviichuk-tt @omilyutin-tt
tests/ttnn/distributed/ @cfjchu @ayerofieiev-tt @dmakoviichuk-tt @omilyutin-tt

# models
/models/ @tt-rkim @uaydonat
Expand Down
8 changes: 4 additions & 4 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -251,8 +251,8 @@ The new fangled way we run our tests is with Googletest. The way we generally
structure our tests with this framework is to bundle it into a single
executable.

You can use `--gtest_filter_test` to filter out the specific test you'd like.
For example, to build and run the `CommonFixture.DRAMLoopbackSingleCore` on
You can use `--gtest_filter` to select the specific test you'd like to run.
For example, to build and run the `DispatchFixture.TensixDRAMLoopbackSingleCore` on
fast dispatch, you can

1. Build the unit tests:
Expand All @@ -261,7 +261,7 @@ fast dispatch, you can
```
2. Run the test:
```
./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter="CommonFixture.DRAMLoopbackSingleCore"
./build/test/tt_metal/unit_tests_api --gtest_filter="DispatchFixture.TensixDRAMLoopbackSingleCore"
```

On slow dispatch, to run another specific test, the equivalent would be:
Expand All @@ -270,7 +270,7 @@ On slow dispatch, to run another specific test, the equivalent would be:
2. Run with the slow dispatch mode:
```
export TT_METAL_SLOW_DISPATCH_MODE=1
./build/test/tt_metal/unit_tests/fast_dispatch --gtest_filter_test="BasicFixture.TestL1BuffersAllocatedTopDown"
./build/test/tt_metal/unit_tests_api --gtest_filter="DeviceSingleCardBufferFixture.TestL1BuffersAllocatedTopDown"
```

We have split our tests into the two dispatch modes for less pollution of state
Expand Down
44 changes: 22 additions & 22 deletions METALIUM_GUIDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -128,26 +128,26 @@ void MAIN {
mm_init();
acquire_dst();
cb_wait_front(tt::CB::c_in0, /* number of tiles */ 1);
cb_wait_front(tt::CB::c_in1, /* number of tiles */ 1);
cb_wait_front(tt::CBIndex::c_0, /* number of tiles */ 1);
cb_wait_front(tt::CBIndex::c_1, /* number of tiles */ 1);
matmul_tiles(tt::CB::c_in0, tt::CB::c_in1, 0, 0, 0, false);
matmul_tiles(tt::CBIndex::c_0, tt::CBIndex::c_1, 0, 0, 0, false);
cb_pop_front(tt::CB::c_in1, /* number of tiles */ 1);
cb_pop_front(tt::CB::c_in0, /* number of tiles */ 1);
cb_pop_front(tt::CBIndex::c_1, /* number of tiles */ 1);
cb_pop_front(tt::CBIndex::c_0, /* number of tiles */ 1);
cb_reserve_back(tt::CB::c_out0, /* number of tiles */ 1);
pack_tile(0, tt::CB::c_out0);
cb_push_back(tt::CB::c_out0, /* number of tiles */ 1);
cb_reserve_back(tt::CBIndex::c_16, /* number of tiles */ 1);
pack_tile(0, tt::CBIndex::c_16);
cb_push_back(tt::CBIndex::c_16, /* number of tiles */ 1);
release_dst();
}
} // namespace NAMESPACE
```

It takes two matrix tiles from `tt::CB::c_in0` and `tt::CB::c_in0` L1 and
It takes two matrix tiles from `tt::CBIndex::c_0` and `tt::CBIndex::c_1` in L1 and
conducts a single-tile matrix multiplication. Finally, it packs the result to
`tt::CB::c_out0`.
`tt::CBIndex::c_16`.

Note that tile registers are acquired by `acquire_dst()`, but actually we can
use `tile_regs_..()` functions for the more fine-grained tile register lock
Expand Down Expand Up @@ -299,23 +299,23 @@ namespace NAMESPACE {
void MAIN {
mm_init();
cb_wait_front(tt::CB::c_in0, /* number of tiles */ 1);
cb_wait_front(tt::CB::c_in1, /* number of tiles */ 1);
cb_wait_front(tt::CBIndex::c_0, /* number of tiles */ 1);
cb_wait_front(tt::CBIndex::c_1, /* number of tiles */ 1);
tile_regs_acquire();
matmul_tiles(tt::CB::c_in0, tt::CB::c_in1, 0, 0, 0, false);
matmul_tiles(tt::CBIndex::c_0, tt::CBIndex::c_1, 0, 0, 0, false);
tile_regs_commit();
cb_pop_front(tt::CB::c_in1, /* number of tiles */ 1);
cb_pop_front(tt::CB::c_in0, /* number of tiles */ 1);
cb_pop_front(tt::CBIndex::c_1, /* number of tiles */ 1);
cb_pop_front(tt::CBIndex::c_0, /* number of tiles */ 1);
tile_regs_wait();
cb_reserve_back(tt::CB::c_out0, /* number of tiles */ 1);
pack_tile(0, tt::CB::c_out0);
cb_push_back(tt::CB::c_out0, /* number of tiles */ 1);
cb_reserve_back(tt::CBIndex::c_16, /* number of tiles */ 1);
pack_tile(0, tt::CBIndex::c_16);
cb_push_back(tt::CBIndex::c_16, /* number of tiles */ 1);
tile_regs_release();
}
Expand Down Expand Up @@ -367,9 +367,9 @@ void MAIN {
uint32_t per_core_block_cnt = get_arg_val<uint32_t>(0);
uint32_t per_core_block_size = get_arg_val<uint32_t>(1); // should be <= 8 in this kernel

constexpr auto cb_in0 = tt::CB::c_in0;
constexpr auto cb_in1 = tt::CB::c_in1;
constexpr auto cb_out0 = tt::CB::c_out0;
constexpr auto cb_in0 = tt::CBIndex::c_0;
constexpr auto cb_in1 = tt::CBIndex::c_1;
constexpr auto cb_out0 = tt::CBIndex::c_16;

binary_op_init_common(cb_in0, cb_in1, cb_out0);
add_tiles_init();
Expand Down Expand Up @@ -400,7 +400,7 @@ void MAIN {
cb_pop_front(cb_in0, per_core_block_size);
cb_pop_front(cb_in1, per_core_block_size);

// push a block of tiles to output CB
// push a block of tiles to output CB
cb_push_back(cb_out0, per_core_block_size);
}

Expand Down
Binary file removed docs/source/common/images/MFB-Fig12.png
Binary file not shown.
Binary file removed docs/source/common/images/MFB-Fig3a.png
Binary file not shown.
File renamed without changes
Binary file modified docs/source/common/images/MfB-Fig12.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
File renamed without changes
1 change: 1 addition & 0 deletions docs/source/common/images/MfB-Fig3a.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
14 changes: 7 additions & 7 deletions docs/source/tt-metalium/tools/kernel_print.rst
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ Data from Circular Buffers can be printed using the ``TileSlice`` object. It can
| print_untilized | bool | Whether to untilize the CB data while printing it (always done for block float formats), default ``true``. |
+-----------------+---------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------+

An example of how to print data from a CB (in this case, ``CB::c_intermed1``) is shown below. Note that sampling happens relative
An example of how to print data from a CB (in this case, ``CBIndex::c_25``) is shown below. Note that sampling happens relative
to the current CB read or write pointer. This means that for printing a tile read from the front of the CB, the
``DPRINT`` call has to occur between the ``cb_wait_front`` and ``cb_pop_front`` calls. For printing a tile from the
back of the CB, the ``DPRINT`` call has to occur between the ``cb_reserve_back`` and ``cb_push_back`` calls. Currently supported data
Expand All @@ -94,15 +94,15 @@ formats for printing from CBs are ``DataFormat::Float32``, ``DataFormat::Float16
#include "debug/dprint.h" // required in all kernels using DPRINT

void kernel_main() {
// Assuming the tile we want to print from CB::c_intermed1 is from the front the CB, print must happen after
// Assuming the tile we want to print from CBIndex::c_25 is from the front of the CB, print must happen after
// this call. If the tile is from the back of the CB, then print must happen after cb_reserve_back().
cb_wait_front(CB::c_intermed1, 1);
cb_wait_front(CBIndex::c_25, 1);
...

// Extract a numpy slice `[0:32:16, 0:32:16]` from tile `0` from `CB::c_intermed1` and print it.
DPRINT << TSLICE(CB::c_intermed1, 0, SliceRange::hw0_32_16()) << ENDL();
// Extract a numpy slice `[0:32:16, 0:32:16]` from tile `0` from `CBIndex::c_25` and print it.
DPRINT << TSLICE(CBIndex::c_25, 0, SliceRange::hw0_32_16()) << ENDL();
// Note that the MATH core does not have access to CBs, so this is an invalid print:
DPRINT_MATH({ DPRINT << TSLICE(CB::c_intermed1, 0, SliceRange::hw0_32_16()) << ENDL(); }); // Invalid
DPRINT_MATH({ DPRINT << TSLICE(CBIndex::c_25, 0, SliceRange::hw0_32_16()) << ENDL(); }); // Invalid

// Print a full tile
for (int32_t r = 0; r < 32; ++r) {
Expand All @@ -118,5 +118,5 @@ formats for printing from CBs are ``DataFormat::Float32``, ``DataFormat::Float16
}

...
cb_pop_front(CB::c_intermed1, 1);
cb_pop_front(CBIndex::c_25, 1);
}
6 changes: 3 additions & 3 deletions docs/source/tt-metalium/tt_metal/examples/eltwise_binary.rst
Original file line number Diff line number Diff line change
Expand Up @@ -32,19 +32,19 @@ We already have set the circular buffers needed for compute data communication.

.. code-block:: cpp
constexpr uint32_t src0_cb_index = CB::c_in0;
constexpr uint32_t src0_cb_index = CBIndex::c_0;
constexpr uint32_t src0_cb_addr = 200 * 1024;
constexpr uint32_t num_input_tiles = 2;
constexpr uint32_t input_cb_size = num_input_tiles * single_tile_size;
CircularBufferConfig cb_src0_config = CircularBufferConfig(input_cb_size, {{src0_cb_index, tt::DataFormat::Float16_b}}, src0_cb_addr).set_page_size(src0_cb_index, single_tile_size);
CBHandle cb_src0 = v0::CreateCircularBuffer(program, core, cb_src0_config);
constexpr uint32_t src1_cb_index = CB::c_in1;
constexpr uint32_t src1_cb_index = CBIndex::c_1;
constexpr uint32_t src1_cb_addr = 300 * 1024;
CircularBufferConfig cb_src1_config = CircularBufferConfig(input_cb_size, {{src1_cb_index, tt::DataFormat::Float16_b}}, src1_cb_addr).set_page_size(src1_cb_index, single_tile_size);
CBHandle cb_src1 = v0::CreateCircularBuffer(program, core, cb_src1_config);
constexpr uint32_t output_cb_index = CB::c_out0;
constexpr uint32_t output_cb_index = CBIndex::c_16;
constexpr uint32_t output_cb_addr = 400 * 1024;
constexpr uint32_t num_output_tiles = 2;
constexpr uint32_t input_cb_size = num_input_tiles * single_tile_size;
Expand Down
4 changes: 2 additions & 2 deletions docs/source/tt-metalium/tt_metal/examples/eltwise_sfpu.rst
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,12 @@ compute, and writer engines.

.. code-block:: cpp
constexpr uint32_t src0_cb_index = CB::c_in0;
constexpr uint32_t src0_cb_index = CBIndex::c_0;
constexpr uint32_t num_input_tiles = 2;
CircularBufferConfig cb_src0_config = CircularBufferConfig(num_input_tiles * single_tile_size, {{src0_cb_index, tt::DataFormat::Float16_b}}).set_page_size(src0_cb_index, single_tile_size);
CBHandle cb_src0 = tt_metal::v0::CreateCircularBuffer(program, core, cb_src0_config);
constexpr uint32_t output_cb_index = CB::c_out0;
constexpr uint32_t output_cb_index = CBIndex::c_16;
constexpr uint32_t num_output_tiles = 2;
CircularBufferConfig cb_output_config = CircularBufferConfig(num_output_tiles * single_tile_size, {{output_cb_index, tt::DataFormat::Float16_b}}).set_page_size(output_cb_index, single_tile_size);
CBHandle cb_output = tt_metal::v0::CreateCircularBuffer(program, core, cb_output_config);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,8 @@ In addition to our double-buffer config, we introduce a third circular buffer de

.. code-block:: cpp
uint32_t output_cb_index = CB::c_out0; // output operands start at index 16
uint32_t interm0_cb_index = 24; // Index for the intermediate circular buffer
uint32_t output_cb_index = CBIndex::c_16;
uint32_t interm0_cb_index = CBIndex::c_24; // Index for the intermediate circular buffer
std::map<uint8_t, tt::DataFormat> output_cb_data_format_spec {
{output_cb_index, cb_data_format}, // Output buffer configuration
{interm0_cb_index, cb_data_format} // Intermediate buffer configuration
Expand Down Expand Up @@ -173,16 +173,16 @@ a. **Preparing the Intermediate Buffer**:

.. code-block:: cpp
cb_reserve_back(tt::CB::c_intermed0, out_subblock_num_tiles);
cb_reserve_back(tt::CBIndex::c_24, out_subblock_num_tiles);
- **Storing Partial Results**: Partial results are stored via a packing mechanism with ``pack_tile(...)`` into the above reserved space.

.. code-block:: cpp
for (uint32_t i = 0; i < out_subblock_num_tiles; i++) {
pack_tile(i, tt::CB::c_intermed0);
pack_tile(i, tt::CBIndex::c_24);
}
cb_push_back(tt::CB::c_intermed0, out_subblock_num_tiles);
cb_push_back(tt::CBIndex::c_24, out_subblock_num_tiles);
b. **Computing with Partial Results**:

Expand All @@ -191,11 +191,11 @@ b. **Computing with Partial Results**:
.. code-block:: cpp
if (enable_reload) {
cb_wait_front(tt::CB::c_intermed0, out_subblock_num_tiles);
cb_wait_front(tt::CBIndex::c_24, out_subblock_num_tiles);
for (uint32_t i = 0; i < out_subblock_num_tiles; i++) {
copy_tile(tt::CB::c_intermed0, i, i);
copy_tile(tt::CBIndex::c_24, i, i);
}
cb_pop_front(tt::CB::c_intermed0, out_subblock_num_tiles);
cb_pop_front(tt::CBIndex::c_24, out_subblock_num_tiles);
}
- **Execution with `matmul_tiles`**: Now we are ready to compute partial results and integrate them back into the computation stream (or for the last block of computation, culminate our data reuse to produce the final output tensor). We call the ``matmul_tiles(...)`` function to execute our matmul on the core's subblocks of tiles.
Expand All @@ -211,7 +211,7 @@ b. **Computing with Partial Results**:
for (uint32_t inner_dim = 0; inner_dim < in0_block_w; inner_dim++) {
int in0_index = in0_index_subblock_offset + in0_index_h_offset + inner_dim;
int in1_index = in1_index_subblock_offset + in1_index_inner_dim_offset + w;
matmul_tiles(tt::CB::c_in0, tt::CB::c_in1, in0_index, in1_index, dst_index, false /* transpose */);
matmul_tiles(tt::CBIndex::c_0, tt::CBIndex::c_1, in0_index, in1_index, dst_index, false /* transpose */);
in1_index_inner_dim_offset += in1_per_core_w;
}
dst_index++;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -147,18 +147,18 @@ double buffering..

.. code-block:: cpp
uint32_t src0_cb_index = CB::c_in0; //0
uint32_t src0_cb_index = CBIndex::c_0; //0
uint32_t num_input_tiles = 2;
tt_metal::CircularBufferConfig cb_src0_config = tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size, {{src0_cb_index, cb_data_format}})
.set_page_size(src0_cb_index, single_tile_size);
auto cb_src0 = tt_metal::v0::CreateCircularBuffer(program, core, cb_src0_config);
uint32_t src1_cb_index = CB::c_in1; // 1
uint32_t src1_cb_index = CBIndex::c_1; // 1
tt_metal::CircularBufferConfig cb_src1_config = tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size, {{src1_cb_index, cb_data_format}})
.set_page_size(src1_cb_index, single_tile_size);
auto cb_src1 = tt_metal::v0::CreateCircularBuffer(program, core, cb_src1_config);
uint32_t output_cb_index = CB::c_out0; // output operands start at index 16
uint32_t output_cb_index = tt::CBIndex::c_16;
uint32_t num_output_tiles = 2;
tt_metal::CircularBufferConfig cb_output_config = tt_metal::CircularBufferConfig(num_output_tiles * single_tile_size, {{output_cb_index, cb_data_format}})
.set_page_size(output_cb_index, single_tile_size);
Expand Down
2 changes: 1 addition & 1 deletion models/demos/distilbert/tests/test_perf_distilbert.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ def test_distilbert_perf_device(batch_size, test, reset_seeds):
margin = 0.03
num_iterations = 1
if is_grayskull():
expected_perf = 40.8772
expected_perf = 57.3
elif is_wormhole_b0():
expected_perf = 103.884

Expand Down
Loading

0 comments on commit 92a5fe5

Please sign in to comment.