Skip to content

Commit

Permalink
#15809: Add SW Support for Coordinate Virtualization on WH
Browse files Browse the repository at this point in the history
  - Only Logical and Virtual Coordinates are now exposed to users
  - Tensix and Ethernet can be virtualized on WH. Users can only
    access DRAM controller end-points through Logical Coordinates/
    Bank IDs. Logical to Physical Translation for DRAM is done on device
    through the get_noc_addr_from_bank_id API
  - GS does not support Virtual Coordinates. BH will support this feature
    in future, but is currently blocked by Syseng. For these archs, Virtual
    and Physical coordinates are identical. Hence, on BH, users temporarily
    have access to Physical Coordinates
  - APIs returning Physical Coordinates have either been deprecated or migrated
    to return Virtual Coordinates instead
  - Modified Host/Device Runtime, TTNN Ops, Debug Tools and tests to support
    Virtual Coordinates
  - Unlocks offline compilation and Fast Dispatch Command Generation + Program
    Reuse and broadcasts (key features for TT-Mesh)
  • Loading branch information
tt-asaigal committed Dec 10, 2024
1 parent ff50e72 commit 707e366
Show file tree
Hide file tree
Showing 226 changed files with 3,010 additions and 3,869 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/all-static-checks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ jobs:
steps:
- uses: actions/checkout@v4
- name: Check kernel count in base metal is less than maximum
run: if (( $(find tt_metal/kernels/ -type f | wc -l) > 7 )); then exit 1; fi
run: if (( $(find tt_metal/kernels/ -type f | wc -l) > 8 )); then exit 1; fi
check-doc:
runs-on: ubuntu-latest
steps:
Expand Down
36 changes: 18 additions & 18 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -341,13 +341,13 @@ Breakpoint 1, tt::tt_metal::Device::Device (this=0x3c, device_id=21845, num_hw_c
TT_METAL_WATCHER=10 ./your_program
...
Always | WARNING | Watcher detected NOC error and stopped device: bad alignment in NOC transaction.
Always | WARNING | Device 0 worker core(x= 0,y= 0) phys(x= 1,y= 1): brisc using noc0 tried to access DRAM core w/ physical coords (x=0,y=11) DRAM[addr=0x00003820,len=102400], misaligned with local L1[addr=0x00064010]
Always | WARNING | Device 0 worker core(x= 0,y= 0) virtual(x= 1,y= 1): brisc using noc0 tried to access DRAM core w/ physical coords (x=0,y=11) DRAM[addr=0x00003820,len=102400], misaligned with local L1[addr=0x00064010]
Always | INFO | Last waypoint: NARW, W, W, W, W
Always | INFO | While running kernels:
Always | INFO | brisc : tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy.cpp
Always | INFO | ncrisc: blank
Always | INFO | triscs: blank
Test | INFO | Reported error: Device 0 worker core(x= 0,y= 0) phys(x= 1,y= 1): brisc using noc0 tried to access DRAM core w/ physical coords (x=0,y=11) DRAM[addr=0x00003820,len=102400], misaligned with local L1[addr=0x00064010]
Test | INFO | Reported error: Device 0 worker core(x= 0,y= 0) virtual(x= 1,y= 1): brisc using noc0 tried to access DRAM core w/ physical coords (x=0,y=11) DRAM[addr=0x00003820,len=102400], misaligned with local L1[addr=0x00064010]
Always | FATAL | Watcher detected NOC error and stopped device: bad alignment in NOC transaction.
```
- If no such error is reported, but the program is hanging, check the watcher log generated in `generated/watcher/watcher.log`. There is a legend at the top of the log showing how to interpret it, and a sample portion of a log is shown below:
Expand All @@ -371,22 +371,22 @@ Legend:
k_ids:<brisc id>|<ncrisc id>|<trisc id> (ID map to file at end of section)
...
Dump #7 at 8.992s
Device 0 worker core(x= 0,y= 0) phys(x= 1,y= 1): GW, W, W, W, W rmsg:D0D|BNT smsg:DDDD k_ids:14|13|15
Device 0 worker core(x= 1,y= 0) phys(x= 2,y= 1): GW, W, W, W, W rmsg:D0D|BNT smsg:DDDD k_ids:14|13|15
Device 0 worker core(x= 2,y= 0) phys(x= 3,y= 1): GW, W, W, W, W rmsg:D0D|BNT smsg:DDDD k_ids:14|13|15
Device 0 worker core(x= 3,y= 0) phys(x= 4,y= 1): GW, W, W, W, W rmsg:D0D|BNT smsg:DDDD k_ids:14|13|15
Device 0 worker core(x= 4,y= 0) phys(x= 6,y= 1): GW, W, W, W, W rmsg:D0D|BNT smsg:DDDD k_ids:14|13|15
Device 0 worker core(x= 5,y= 0) phys(x= 7,y= 1): GW, W, W, W, W rmsg:D0D|BNT smsg:DDDD k_ids:14|13|15
Device 0 worker core(x= 6,y= 0) phys(x= 8,y= 1): GW, W, W, W, W rmsg:D0D|BNT smsg:DDDD k_ids:14|13|15
Device 0 worker core(x= 7,y= 0) phys(x= 9,y= 1): GW, W, W, W, W rmsg:D0D|BNT smsg:DDDD k_ids:14|13|15
Device 0 worker core(x= 0,y= 7) phys(x= 1,y=10): NTW,UAPW, W, W, W rmsg:H1G|bNt smsg:GDDD k_ids:0|2|0
Device 0 worker core(x= 1,y= 7) phys(x= 2,y=10): NTW, HQW, W, W, W rmsg:H1G|bNt smsg:GDDD k_ids:0|1|0
Device 0 worker core(x= 2,y= 7) phys(x= 3,y=10): NTW, HQW, W, W, W rmsg:H1G|bNt smsg:GDDD k_ids:0|3|0
Device 0 worker core(x= 3,y= 7) phys(x= 4,y=10): NTW,UAPW, W, W, W rmsg:H1G|bNt smsg:GDDD k_ids:0|7|0
Device 0 worker core(x= 4,y= 7) phys(x= 6,y=10): NABD, W, W, W, W rmsg:H0G|Bnt smsg:DDDD k_ids:4|0|0
Device 0 worker core(x= 5,y= 7) phys(x= 7,y=10): NABD, W, W, W, W rmsg:H0G|Bnt smsg:DDDD k_ids:6|0|0
Device 0 worker core(x= 6,y= 7) phys(x= 8,y=10): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 7,y= 7) phys(x= 9,y=10): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 0,y= 0) virtual(x= 1,y= 1): GW, W, W, W, W rmsg:D0D|BNT smsg:DDDD k_ids:14|13|15
Device 0 worker core(x= 1,y= 0) virtual(x= 2,y= 1): GW, W, W, W, W rmsg:D0D|BNT smsg:DDDD k_ids:14|13|15
Device 0 worker core(x= 2,y= 0) virtual(x= 3,y= 1): GW, W, W, W, W rmsg:D0D|BNT smsg:DDDD k_ids:14|13|15
Device 0 worker core(x= 3,y= 0) virtual(x= 4,y= 1): GW, W, W, W, W rmsg:D0D|BNT smsg:DDDD k_ids:14|13|15
Device 0 worker core(x= 4,y= 0) virtual(x= 6,y= 1): GW, W, W, W, W rmsg:D0D|BNT smsg:DDDD k_ids:14|13|15
Device 0 worker core(x= 5,y= 0) virtual(x= 7,y= 1): GW, W, W, W, W rmsg:D0D|BNT smsg:DDDD k_ids:14|13|15
Device 0 worker core(x= 6,y= 0) virtual(x= 8,y= 1): GW, W, W, W, W rmsg:D0D|BNT smsg:DDDD k_ids:14|13|15
Device 0 worker core(x= 7,y= 0) virtual(x= 9,y= 1): GW, W, W, W, W rmsg:D0D|BNT smsg:DDDD k_ids:14|13|15
Device 0 worker core(x= 0,y= 7) virtual(x= 1,y=10): NTW,UAPW, W, W, W rmsg:H1G|bNt smsg:GDDD k_ids:0|2|0
Device 0 worker core(x= 1,y= 7) virtual(x= 2,y=10): NTW, HQW, W, W, W rmsg:H1G|bNt smsg:GDDD k_ids:0|1|0
Device 0 worker core(x= 2,y= 7) virtual(x= 3,y=10): NTW, HQW, W, W, W rmsg:H1G|bNt smsg:GDDD k_ids:0|3|0
Device 0 worker core(x= 3,y= 7) virtual(x= 4,y=10): NTW,UAPW, W, W, W rmsg:H1G|bNt smsg:GDDD k_ids:0|7|0
Device 0 worker core(x= 4,y= 7) virtual(x= 6,y=10): NABD, W, W, W, W rmsg:H0G|Bnt smsg:DDDD k_ids:4|0|0
Device 0 worker core(x= 5,y= 7) virtual(x= 7,y=10): NABD, W, W, W, W rmsg:H0G|Bnt smsg:DDDD k_ids:6|0|0
Device 0 worker core(x= 6,y= 7) virtual(x= 8,y=10): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
Device 0 worker core(x= 7,y= 7) virtual(x= 9,y=10): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0
k_id[0]: blank
k_id[1]: tt_metal/impl/dispatch/kernels/cq_prefetch.cpp
k_id[2]: tt_metal/impl/dispatch/kernels/cq_dispatch.cpp
Expand Down
2 changes: 1 addition & 1 deletion docs/source/tt-metalium/tools/watcher.rst
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,7 @@ per RISC in the log. If a stack overflow is detected, the core will hang and an

.. code-block::
Device 0 worker core(x= 0,y= 0) phys(x= 1,y= 1): GW, W, W, W, W rmsg:D1D|BNt smsg:DDDD k_ids:11|10|0
Device 0 worker core(x= 0,y= 0) virtual(x= 1,y= 1): GW, W, W, W, W rmsg:D1D|BNt smsg:DDDD k_ids:11|10|0
brisc stack usage: 228/768, kernel using most stack: ttnn/cpp/ttnn/operations/normalization/groupnorm/device/kernels/dataflow/reader_mcast_sender_unary_sharded_gn_v2.cpp
ncrisc stack usage: 192/768, kernel using most stack: ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/dataflow/reader_unary_sharded_blocks_interleaved_start_id.cpp
trisc0 stack usage: 252/320, kernel using most stack: ttnn/cpp/ttnn/operations/normalization/groupnorm/device/kernels/compute/groupnorm_sharded_v2.cpp
Expand Down
9 changes: 5 additions & 4 deletions docs/source/tt-metalium/tt_metal/examples/dram_loopback.rst
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,9 @@ Let's make the input and output DRAM buffers.
Buffer output_dram_buffer = CreateBuffer(dram_config);
const uint32_t output_dram_buffer_addr = output_dram_buffer.address();
const uint32_t input_bank_id = 0;
const uint32_t output_bank_id = 0;
Sending real data into DRAM
---------------------------

Expand All @@ -134,11 +137,9 @@ Setting runtime arguments for the data movement kernel
const std::vector<uint32_t> runtime_args = {
l1_buffer.address(),
input_dram_buffer.address(),
static_cast<uint32_t>(input_dram_buffer.noc_coordinates().x),
static_cast<uint32_t>(input_dram_buffer.noc_coordinates().y),
input_bank_id,
output_dram_buffer.address(),
static_cast<uint32_t>(output_dram_buffer.noc_coordinates().x),
static_cast<uint32_t>(output_dram_buffer.noc_coordinates().y),
output_bank_id,
l1_buffer.size()
};
Expand Down
3 changes: 1 addition & 2 deletions docs/source/tt-metalium/tt_metal/examples/eltwise_sfpu.rst
Original file line number Diff line number Diff line change
Expand Up @@ -100,8 +100,7 @@ Extra runtime arguments for reader/writer
core,
{
dst_dram_buffer.address(),
static_cast<uint32_t>(dst_dram_buffer.noc_coordinates().x),
static_cast<uint32_t>(dst_dram_buffer.noc_coordinates().y),
dst_bank_id,
num_tiles
}
);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,27 +27,18 @@ tt_metal::InterleavedBufferConfig dram_config{
.page_size = single_tile_size,
.buffer_type = tt_metal::BufferType::DRAM
};
uint32_t src0_bank_id = 0;
uint32_t src1_bank_id = 0;
uint32_t dst_bank_id = 0;
```

5. Define the tile size to fit BFloat16 values:
5. Allocate memory for each buffer:
```std::shared_ptr<tt::tt_metal::Buffer> src0_dram_buffer = CreateBuffer(dram_config);
std::shared_ptr<tt::tt_metal::Buffer> src1_dram_buffer = CreateBuffer(dram_config);
std::shared_ptr<tt::tt_metal::Buffer> dst_dram_buffer = CreateBuffer(dram_config);
```

6.Allocate memory for each buffer:
```auto src0_dram_noc_coord = src0_dram_buffer->noc_coordinates();
auto src1_dram_noc_coord = src1_dram_buffer->noc_coordinates();
auto dst_dram_noc_coord = dst_dram_buffer->noc_coordinates();
uint32_t src0_dram_noc_x = src0_dram_noc_coord.x;
uint32_t src0_dram_noc_y = src0_dram_noc_coord.y;
uint32_t src1_dram_noc_x = src1_dram_noc_coord.x;
uint32_t src1_dram_noc_y = src1_dram_noc_coord.y;
uint32_t dst_dram_noc_x = dst_dram_noc_coord.x;
uint32_t dst_dram_noc_y = dst_dram_noc_coord.y;
```

7. Specify NoC Coordinates:
6. Create circular buffers and assign them to the program:
```constexpr uint32_t src0_cb_index = CB::c_in0;
constexpr uint32_t num_input_tiles = 1;
CircularBufferConfig cb_src0_config = CircularBufferConfig(num_input_tiles * single_tile_size, {{src0_cb_index, tt::DataFormat::Float16_b}}).set_page_size(src0_cb_index, single_tile_size);
Expand All @@ -63,7 +54,7 @@ CircularBufferConfig cb_output_config = CircularBufferConfig(num_output_tiles *
CBHandle cb_output = tt_metal::CreateCircularBuffer(program, core, cb_output_config);
```

8. Create a data movement kernel:
7. Create a data movement kernel:
```KernelHandle binary_reader_kernel_id = CreateKernel(
program,
"tt_metal/programming_examples/add_2_integers_in_compute/kernels/dataflow/reader_binary_1_tile.cpp",
Expand All @@ -77,7 +68,7 @@ KernelHandle unary_writer_kernel_id = CreateKernel(
DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default});
```

9. Create a compute kernel:
8. Create a compute kernel:
```vector<uint32_t> compute_kernel_args = {};
KernelHandle eltwise_binary_kernel_id = CreateKernel(
program,
Expand All @@ -92,7 +83,7 @@ KernelHandle eltwise_binary_kernel_id = CreateKernel(
);
```

10. Create two source vectors:
9. Create two source vectors:
```std::vector<uint32_t> src0_vec;
std::vector<uint32_t> src1_vec;
src0_vec = create_constant_vector_of_bfloat16(single_tile_size, 14.0f);
Expand All @@ -102,16 +93,16 @@ EnqueueWriteBuffer(cq, src0_dram_buffer, src0_vec, false);
EnqueueWriteBuffer(cq, src1_dram_buffer, src1_vec, false);
```

11. Setup corresponding runtime arguments:
```SetRuntimeArgs(program, binary_reader_kernel_id, core, { src0_dram_buffer->address(), src1_dram_buffer->address(), src0_dram_noc_x, src0_dram_noc_y, src1_dram_noc_x, src1_dram_noc_y});
10. Setup corresponding runtime arguments:
```SetRuntimeArgs(program, binary_reader_kernel_id, core, { src0_dram_buffer->address(), src1_dram_buffer->address(), src0_bank_id, src1_bank_id, dst_bank_id});
SetRuntimeArgs(program, eltwise_binary_kernel_id, core, {});
SetRuntimeArgs(program, unary_writer_kernel_id, core, {dst_dram_buffer->address(), dst_dram_noc_x, dst_dram_noc_y});
EnqueueProgram(cq, program, false);
Finish(cq);
```

12. Execute the Program:
11. Execute the Program:
```uint32_t ublock_size_bytes_0 = get_tile_size(cb_id_in0);
uint32_t ublock_size_bytes_1 = get_tile_size(cb_id_in1);
Expand All @@ -129,7 +120,7 @@ noc_async_read_barrier();
cb_push_back(cb_id_in1, 1);
```

13. Unpack, compute, and pack the data:
12. Unpack, compute, and pack the data:
```binary_op_init_common(cb_in0, cb_in1, cb_out0);
add_tiles_init();
Expand All @@ -153,8 +144,8 @@ cb_pop_front(cb_in1, 1);
cb_push_back(cb_out0, 1);
```

14. Write integer values to the DRAM:
```uint64_t dst_noc_addr = get_noc_addr(dst_dram_noc_x, dst_dram_noc_y, dst_addr);
13. Write integer values to the DRAM:
```uint64_t dst_noc_addr = get_noc_addr_from_bank_id<true>(dst_bank_id, dst_dram);
constexpr uint32_t cb_id_out0 = tt::CB::c_out0;
uint32_t ublock_size_bytes = get_tile_size(cb_id_out0);
Expand All @@ -166,6 +157,6 @@ noc_async_write_barrier();
cb_pop_front(cb_id_out0, 1);
```

15. Close the device:
14. Close the device:
```CloseDevice(device);
```
Original file line number Diff line number Diff line change
Expand Up @@ -47,18 +47,12 @@ std::shared_ptr<tt::tt_metal::Buffer> dst_dram_buffer = CreateBuffer(dram_config
Next, we allocate memory for each buffer with the specified configuration for each of the input vectors and another buffer for the output vector. The source data will be sent to the corresponding DRAM buffers to be accessed by the cores, and the results of the computation will be sent to the DRAM to be read by the destination vector.

``` cpp
auto src0_dram_noc_coord = src0_dram_buffer->noc_coordinates();
auto src1_dram_noc_coord = src1_dram_buffer->noc_coordinates();
auto dst_dram_noc_coord = dst_dram_buffer->noc_coordinates();
uint32_t src0_dram_noc_x = src0_dram_noc_coord.x;
uint32_t src0_dram_noc_y = src0_dram_noc_coord.y;
uint32_t src1_dram_noc_x = src1_dram_noc_coord.x;
uint32_t src1_dram_noc_y = src1_dram_noc_coord.y;
uint32_t dst_dram_noc_x = dst_dram_noc_coord.x;
uint32_t dst_dram_noc_y = dst_dram_noc_coord.y;
uint32_t src0_bank_id = 0;
uint32_t src1_bank_id = 0;
uint32_t dst_bank_id = 0;
```

For this example, we will also specify the NoC coordinates to pass into the kernel functions as runtime arguments. We will use this to ensure that the kernels will access the data at the correct NoC addresses.
For this example, we will also specify the Buffer Bank IDs to pass into the kernel functions as runtime arguments. We will use this to ensure that the kernels will access the data from the correct DRAM Memory Banks corresponding to each buffer.

``` cpp
constexpr uint32_t src0_cb_index = CBIndex::c_0;
Expand Down Expand Up @@ -129,9 +123,9 @@ EnqueueWriteBuffer(cq, src1_dram_buffer, src1_vec, false);
Next, we create two source vectors, each loaded with a constant value, before queueing the command to feed it to the corresponding DRAM buffers using `EnqueueWriteBuffer`.

``` cpp
SetRuntimeArgs(program, binary_reader_kernel_id, core, { src0_dram_buffer->address(), src1_dram_buffer->address(), src0_dram_noc_x, src0_dram_noc_y, src1_dram_noc_x, src1_dram_noc_y});
SetRuntimeArgs(program, binary_reader_kernel_id, core, { src0_dram_buffer->address(), src1_dram_buffer->address(), src0_bank_id, src1_bank_id});
SetRuntimeArgs(program, eltwise_binary_kernel_id, core, {});
SetRuntimeArgs(program, unary_writer_kernel_id, core, {dst_dram_buffer->address(), dst_dram_noc_x, dst_dram_noc_y});
SetRuntimeArgs(program, unary_writer_kernel_id, core, {dst_dram_buffer->address(), dst_bank_id});

EnqueueProgram(cq, program, false);
Finish(cq);
Expand Down Expand Up @@ -192,7 +186,7 @@ In the compute kernel, a single tile is read from each of the circular buffers c
## Writer kernel function
``` cpp
uint64_t dst_noc_addr = get_noc_addr(dst_dram_noc_x, dst_dram_noc_y, dst_addr);
uint64_t dst_noc_addr = get_noc_addr_from_bank_id<true>(dst_bank_id, dst_dram);
constexpr uint32_t cb_id_out0 = tt::CBIndex::c_16;
uint32_t ublock_size_bytes = get_tile_size(cb_id_out0);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ In this example, we are using data movement processors for basic computation. As
## Configure and execute program
``` cpp
SetRuntimeArgs(program, binary_reader_kernel_id, core, {src0_dram_buffer->address(), src1_dram_buffer->address(), dst_dram_buffer->address(),});
SetRuntimeArgs(program, binary_reader_kernel_id, core, {src0_dram_buffer->address(), src1_dram_buffer->address(), dst_dram_buffer->address(), src0_bank_id, src1_bank_id, dst_bank_id});
EnqueueProgram(cq, program, false);
Finish(cq);
Expand All @@ -100,9 +100,9 @@ In order to execute the program, we need to load the runtime arguments for the k

``` cpp
// NoC coords (x,y) depending on DRAM location on-chip
uint64_t src0_dram_noc_addr = get_noc_addr(src0_dram_noc_x, src0_dram_noc_y, src0_dram);
uint64_t src1_dram_noc_addr = get_noc_addr(src1_dram_noc_x, src1_dram_noc_y, src1_dram);
uint64_t dst_dram_noc_addr = get_noc_addr(dst_dram_noc_x, dst_dram_noc_y, dst_dram);
uint64_t src0_dram_noc_addr = get_noc_addr_from_bank_id<true>(src0_bank_id, src0_dram);
uint64_t src1_dram_noc_addr = get_noc_addr_from_bank_id<true>(src1_bank_id, src1_dram);
uint64_t dst_dram_noc_addr = get_noc_addr_from_bank_id<true>(dst_bank_id, dst_dram);

constexpr uint32_t cb_id_in0 = tt::CBIndex::c_0; // index=0
constexpr uint32_t cb_id_in1 = tt::CBIndex::c_1; // index=1
Expand Down
10 changes: 4 additions & 6 deletions tech_reports/prog_examples/dram_loopback/dram_loopback.md
Original file line number Diff line number Diff line change
Expand Up @@ -110,11 +110,9 @@ We use a non-blocking call so we can continue setting up our program.
const std::vector<uint32_t> runtime_args = {
l1_buffer.address(),
input_dram_buffer.address(),
static_cast<uint32_t>(input_dram_buffer.noc_coordinates().x),
static_cast<uint32_t>(input_dram_buffer.noc_coordinates().y),
input_bank_id,
output_dram_buffer.address(),
static_cast<uint32_t>(output_dram_buffer.noc_coordinates().x),
static_cast<uint32_t>(output_dram_buffer.noc_coordinates().y),
output_bank_id,
l1_buffer.size()
};

Expand All @@ -131,9 +129,9 @@ particular kernel, we have to provide:

- Where the L1 buffer starts (memory address)
- Where the input DRAM buffer starts (memory address)
- The location of the input DRAM buffer\'s channel on the NOC
- The Bank ID of the input DRAM buffer
- Where the output DRAM buffer starts (memory address)
- The location of the output DRAM buffer\'s channel on the NOC
- The Bank ID of the output DRAM buffer
- The size of the buffers

## Running the program
Expand Down
3 changes: 1 addition & 2 deletions tech_reports/prog_examples/eltwise_sfpu/eltwise_sfpu.md
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,7 @@ SetRuntimeArgs(
core,
{
dst_dram_buffer.address(),
static_cast<uint32_t>(dst_dram_buffer.noc_coordinates().x),
static_cast<uint32_t>(dst_dram_buffer.noc_coordinates().y),
dst_bank_id,
num_tiles
}
);
Expand Down
2 changes: 1 addition & 1 deletion tests/tt_eager/kernels/dataflow/reader_unary_8bank.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ void generate_bcast_scaler() {
void kernel_main() {
uint32_t src_addr = get_arg_val<uint32_t>(0);
uint32_t num_tiles =
get_arg_val<uint32_t>(3); // same arg index as in reader_unary and in reader_unary_transpose_wh_8bank
get_arg_val<uint32_t>(2); // same arg index as in reader_unary and in reader_unary_transpose_wh_8bank

constexpr uint32_t cb_id_in0 = 0, cb_id_in1 = 1;

Expand Down
11 changes: 5 additions & 6 deletions tests/tt_eager/kernels/dataflow/reader_unary_push_4.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,9 @@
#include "dataflow_api.h"

void kernel_main() {
uint32_t src_addr = get_arg_val<uint32_t>(0);
uint32_t src_noc_x = get_arg_val<uint32_t>(1);
uint32_t src_noc_y = get_arg_val<uint32_t>(2);
uint32_t num_tiles = get_arg_val<uint32_t>(3);
uint32_t src_addr = get_arg_val<uint32_t>(0);
uint32_t bank_id = get_arg_val<uint32_t>(1);
uint32_t num_tiles = get_arg_val<uint32_t>(2);

constexpr uint32_t cb_id_in0 = 0;

Expand All @@ -18,8 +17,8 @@ void kernel_main() {
uint32_t ublock_size_bytes = get_tile_size(cb_id_in0) * ublock_size_tiles;

// read a ublock of tiles from src to CB, and then push the ublock to unpacker
for (uint32_t i = 0; i < num_tiles; i += ublock_size_tiles) {
uint64_t src_noc_addr = get_noc_addr(src_noc_x, src_noc_y, src_addr);
for (uint32_t i = 0; i<num_tiles; i += ublock_size_tiles) {
uint64_t src_buffer_noc_addr = get_noc_addr_from_bank_id<true>(bank_id, src_addr);

cb_reserve_back(cb_id_in0, ublock_size_tiles);
uint32_t l1_write_addr = get_write_ptr(cb_id_in0);
Expand Down
4 changes: 2 additions & 2 deletions tests/tt_eager/kernels/dataflow/writer_unary_8bank.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
#include "dataflow_api.h"

void kernel_main() {
uint32_t dst_addr = get_arg_val<uint32_t>(0);
uint32_t num_tiles = get_arg_val<uint32_t>(3); // Index 3 to match with regular writer_unary
uint32_t dst_addr = get_arg_val<uint32_t>(0);
uint32_t num_tiles = get_arg_val<uint32_t>(2); // Index 2 to match with regular writer_unary

constexpr uint32_t cb_id_out0 = 16;
constexpr uint32_t onetile = 1;
Expand Down
Loading

0 comments on commit 707e366

Please sign in to comment.