diff --git a/tests/scripts/run_moreh_microbenchmark.sh b/tests/scripts/run_moreh_microbenchmark.sh index cdccd2f8302..2b7107bb7df 100755 --- a/tests/scripts/run_moreh_microbenchmark.sh +++ b/tests/scripts/run_moreh_microbenchmark.sh @@ -35,6 +35,7 @@ run_profiling_test() { if [[ "$ARCH_NAME" == "wormhole_b0" ]]; then pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_matmul_single_core_sharded -k $ARCH_NAME pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_dram_read_12_core -k $ARCH_NAME + pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_dram_read_remote_cb_sync -k $ARCH_NAME fi # bypass wh_b0 for now until we can move FD cores to last col if [[ "$ARCH_NAME" != "wormhole_b0" ]]; then diff --git a/tests/scripts/test_moreh_microbenchmark.py b/tests/scripts/test_moreh_microbenchmark.py index dc1e3b9b4c9..c93b82c45d6 100755 --- a/tests/scripts/test_moreh_microbenchmark.py +++ b/tests/scripts/test_moreh_microbenchmark.py @@ -287,6 +287,33 @@ def run_dram_read_l1_write_cmd(k, n, num_blocks, df, num_banks, bank_start_id): run_moreh_single_test("DRAM BW test multi-core", command) +def run_dram_read_remote_cb_sync_cmd( + k, n, num_blocks, cb_num_blocks, cb_padding, df, num_receivers, num_mixed_df_layers +): + command = ( + "TT_METAL_DEVICE_PROFILER=1 ./build/test/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/test_dram_read_remote_cb " + + " --k " + + str(k) + + " --n " + + str(n) + + " --num-blocks " + + str(num_blocks) + + " --cb-num-blocks " + + str(cb_num_blocks) + + " --cb-padding " + + str(cb_padding) + + " --num-tests " + + str(1) + + " --data-type " + + str(df) + + " --num-receivers " + + str(num_receivers) + + " --num-mixed-df-layers " + + str(num_mixed_df_layers) + ) + run_moreh_single_test("DRAM read remote CB sync single-core ", command) + + # noc def test_noc_local(r=9, c=12, nt=256, cb=1): command = ( @@ -739,6 +766,64 @@ def test_dram_read_l1_write_core(arch, freq, test_vector, num_tests, nblock, dat assert bw_bound <= throughput +@pytest.mark.parametrize( + "arch, freq, test_vector, num_tests, nblock, cb_nblock, cb_padding, data_format, num_receivers, num_mixed_df_layers", + [ + # single layer single receiver test + ("wormhole_b0", 1000, np.array([32768, 128]), 1, 64, 5, 256, 1, 1, 1), + # single layer multi receiver test + ("wormhole_b0", 1000, np.array([32768, 128]), 1, 64, 3, 256, 1, 2, 1), + # multi layer multi receiver test + ("wormhole_b0", 1000, np.array([32768, 256]), 1, 64, 5, 256, 1, 4, 15), + ], +) +def test_dram_read_remote_cb_sync( + arch, freq, test_vector, num_tests, nblock, cb_nblock, cb_padding, data_format, num_receivers, num_mixed_df_layers +): + data = [] + cycle_list = [] + time_list = [] + throughput_list = [] + for _ in range(num_tests): + k = int(test_vector[0]) + n = int(test_vector[1]) + input_size = 0 + if data_format == 0: + input_size += k * n * 1088 // 1024 + elif data_format == 1: + input_size += k * n * 2048 // 1024 + for i in range(num_mixed_df_layers - 1): + if i % 2 == 0: + input_size += k * n * 1088 // 1024 + else: + input_size += k * n * 2048 // 1024 + run_dram_read_remote_cb_sync_cmd( + k, n, nblock, cb_nblock, cb_padding, data_format, num_receivers, num_mixed_df_layers + ) + cycle = profile_results_kernel_duration() + time = cycle / freq / 1000.0 / 1000.0 + throughput = input_size / cycle * freq / 1000.0 + cycle_list.append(cycle) + time_list.append(time) + throughput_list.append(throughput) + cycle = sum(cycle_list) / 
len(cycle_list)
+    time = sum(time_list) / len(time_list)
+    throughput = sum(throughput_list) / len(throughput_list)
+    logger.info("DRAM read cycle: " + str(cycle))
+    logger.info("DRAM read time: " + str(time))
+    logger.info("DRAM read throughput: " + str(throughput))
+    data.append([throughput])
+    # check within range
+    dev_freq = get_device_freq()
+    if arch == "grayskull":
+        bw_bound = 100.0
+    elif arch == "wormhole_b0":
+        bw_bound = 22.0
+    elif arch == "blackhole":
+        bw_bound = 340.0
+    assert bw_bound <= throughput
+
+
 @pytest.mark.parametrize(
     "arch, freq, r, c, test_vector_global, test_vector_local",
     [
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/kernels/reader_dram.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/kernels/reader_dram.cpp
new file mode 100644
index 00000000000..9b5988f4e63
--- /dev/null
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/kernels/reader_dram.cpp
@@ -0,0 +1,145 @@
+// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include <stdint.h>
+
+#include "dataflow_api.h"
+#include "ttnn/cpp/ttnn/operations/ccl/kernel_common/worker_sync_utils.hpp"
+
+#include "debug/dprint.h"
+
+template <uint32_t bank_base_address, uint32_t page_size, bool use_vc>
+FORCE_INLINE
+void noc_async_read_tile_dram_sharded(uint32_t src_addr, uint32_t dest_addr, uint32_t bank_id = 0, const uint32_t vc = 0) {
+    uint32_t src_addr_;
+    uint32_t src_noc_xy;
+
+    src_addr_ = src_addr + bank_base_address;
+    src_addr_ += bank_to_dram_offset[bank_id];
+    src_noc_xy = dram_bank_to_noc_xy[noc_index][bank_id];
+
+    WAYPOINT("NRTW");
+    DEBUG_SANITIZE_NOC_READ_TRANSACTION(noc_index, get_noc_addr_helper(src_noc_xy, src_addr_), dest_addr, page_size);
+    while (!noc_cmd_buf_ready(noc_index, NCRISC_RD_CMD_BUF));
+    WAYPOINT("NRTD");
+
+    if constexpr(use_vc) {
+        uint32_t noc_rd_cmd_field = NOC_CMD_CPY | NOC_CMD_RD | NOC_CMD_RESP_MARKED | NOC_CMD_VC_STATIC | NOC_CMD_STATIC_VC(vc);
+        NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_RD_CMD_BUF, NOC_CTRL, noc_rd_cmd_field);
+    }
+
+    NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_RD_CMD_BUF, NOC_RET_ADDR_LO, dest_addr);
+    NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_RD_CMD_BUF, NOC_TARG_ADDR_LO, src_addr_);  // (uint32_t)src_addr
+    NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_RD_CMD_BUF, NOC_TARG_ADDR_COORDINATE, src_noc_xy);  // src_addr >> 32
+    NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_RD_CMD_BUF, NOC_AT_LEN_BE, page_size);  // len_bytes
+    NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_RD_CMD_BUF, NOC_CMD_CTRL, NOC_CTRL_SEND_REQ);
+    noc_reads_num_issued[noc_index] += 1;
+}
+
+void kernel_main() {
+    constexpr uint32_t input_addr = get_compile_time_arg_val(0);
+    constexpr uint32_t input_start_tile_id = get_compile_time_arg_val(1);
+    constexpr uint32_t noc = get_compile_time_arg_val(2);
+    constexpr uint32_t num_layers = get_compile_time_arg_val(3);
+
+    uint32_t rt_args_idx = 0;
+    const uint32_t bank_id = get_arg_val<uint32_t>(rt_args_idx++);
+    const uint32_t vc = get_arg_val<uint32_t>(rt_args_idx++);
+    tt_l1_ptr uint32_t* page_size = (tt_l1_ptr uint32_t*)(get_arg_addr(increment_arg_idx(rt_args_idx, num_layers)));
+    tt_l1_ptr uint32_t* num_pages = (tt_l1_ptr uint32_t*)(get_arg_addr(increment_arg_idx(rt_args_idx, num_layers)));
+    tt_l1_ptr uint32_t* num_blocks = (tt_l1_ptr uint32_t*)(get_arg_addr(increment_arg_idx(rt_args_idx, num_layers)));
+    tt_l1_ptr uint32_t* block_num_tiles = (tt_l1_ptr uint32_t*)(get_arg_addr(increment_arg_idx(rt_args_idx, num_layers)));
+
+    constexpr uint32_t cb_id = 0;
+    constexpr uint32_t total_num_blocks_in_buffer = 3;
+
+    uint32_t 
block_size_bytes = num_pages[0] * page_size[0]; + uint32_t l1_buffer_start_addr = get_write_ptr(cb_id); + uint32_t l1_buffer_end_addr = get_write_ptr(cb_id) + block_size_bytes * total_num_blocks_in_buffer; + + uint32_t src_read_addr = 0; + uint32_t src_read_addr_offset_bytes = 0; + + for (uint32_t l = 0; l < num_layers; ++l) { + uint32_t curr_page_size = page_size[l]; + uint32_t curr_num_pages = num_pages[l]; + uint32_t curr_num_blocks = num_blocks[l]; + uint32_t curr_block_num_tiles = block_num_tiles[l]; + + uint32_t curr_block_size_bytes = curr_num_pages * curr_page_size; + uint32_t curr_layer_size_bytes = curr_num_blocks * curr_block_size_bytes; + + uint32_t src_base_addr = noc_async_read_tile_dram_sharded_set_state(input_addr, curr_page_size, bank_id, vc); + src_read_addr = src_read_addr_offset_bytes; + + // For debug purpose, use trivial DRAM read method + // for (uint32_t block = 0; block < curr_num_blocks; ++block) { + // // Operand 1 + // cb_reserve_back(cb_id, curr_block_num_tiles); + // auto l1_write_addr = get_write_ptr(cb_id); + + // for (uint32_t h = 0; h < curr_num_pages; ++h) { + // noc_async_read_tile_dram_sharded_with_state(src_base_addr, src_read_addr, l1_write_addr); + // src_read_addr += curr_page_size; + // l1_write_addr += curr_page_size; + // } + + // noc_async_read_barrier(); + + // cb_push_back(cb_id, curr_block_num_tiles); + // } + + uint32_t num_free_blocks_in_buffer = total_num_blocks_in_buffer; + uint32_t curr_block_trid = 1; + uint32_t block_trid_to_wait = 1; + + cb_reserve_back(cb_id, curr_block_num_tiles); + uint32_t l1_write_addr_offset = 0; + uint32_t l1_write_addr_start = get_write_ptr(cb_id); + if (l1_write_addr_start >= l1_buffer_end_addr) { + l1_write_addr_start = l1_buffer_start_addr; + } + uint32_t l1_write_addr = l1_write_addr_start; + for (uint32_t block = 0; block < curr_num_blocks; ++block) { + noc_async_read_tile_dram_sharded_set_trid(curr_block_trid); + + uint32_t temp_l1_write_addr = l1_write_addr; + for (uint32_t h = 0; h < curr_num_pages; ++h) { + noc_async_read_tile_dram_sharded_with_state_with_trid( + src_base_addr, src_read_addr, temp_l1_write_addr, curr_block_trid); + src_read_addr += curr_page_size; + temp_l1_write_addr += curr_page_size; + } + + if (num_free_blocks_in_buffer == 2) { + noc_async_read_barrier_with_trid(block_trid_to_wait); + cb_push_back(cb_id, curr_block_num_tiles); + // wait for next block trid + block_trid_to_wait = block_trid_to_wait == 3 ? 1 : (block_trid_to_wait + 1); + // reserve for next block + cb_reserve_back(cb_id, curr_block_num_tiles * 2); + } else { + num_free_blocks_in_buffer -= 1; + } + + if (curr_block_trid == total_num_blocks_in_buffer) { + curr_block_trid = 1; + } else { + curr_block_trid += 1; + } + + l1_write_addr += block_size_bytes; + if (l1_write_addr >= l1_buffer_end_addr) { + l1_write_addr = l1_buffer_start_addr; + } + } + // last block to wait + noc_async_read_barrier_with_trid(block_trid_to_wait); + cb_push_back(cb_id, curr_block_num_tiles); + + src_read_addr_offset_bytes += curr_layer_size_bytes; + + } + +} diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/kernels/receiver_l1.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/kernels/receiver_l1.cpp new file mode 100644 index 00000000000..7e702916608 --- /dev/null +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/kernels/receiver_l1.cpp @@ -0,0 +1,164 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include <stdint.h>
+
+#include "dataflow_api.h"
+#include "ttnn/cpp/ttnn/operations/ccl/kernel_common/worker_sync_utils.hpp"
+
+#include "debug/dprint.h"
+
+constexpr uint32_t ALIGNED_PAGE_SIZE = 16;
+
+constexpr uint32_t cb_start_addr = get_compile_time_arg_val(0);
+constexpr uint32_t cb_rd_ptr = get_compile_time_arg_val(0);
+constexpr uint32_t cb_size = get_compile_time_arg_val(1);
+constexpr uint32_t num_layers = get_compile_time_arg_val(2);
+
+uint32_t rt_args_idx = 0;
+uint32_t vc;
+uint32_t noc_x;
+uint32_t noc_y;
+uint32_t pages_acked_semaphore_addr;
+uint32_t pages_sent_semaphore_addr;
+tt_l1_ptr uint32_t* page_size;
+tt_l1_ptr uint32_t* num_blocks;
+tt_l1_ptr uint32_t* block_num_tiles;
+
+uint32_t start_page_size;
+
+struct RemoteReceiverCBInterface {
+    volatile tt_l1_ptr uint32_t* pages_acked;
+    volatile tt_l1_ptr uint32_t* pages_sent;
+
+    uint32_t fifo_size;
+    uint32_t fifo_limit;
+    uint32_t fifo_limit_page_aligned;
+
+    uint32_t fifo_page_size;
+    uint32_t fifo_aligned_num_pages;
+
+    uint32_t fifo_rd_ptr;
+
+    uint32_t fifo_start_addr;
+
+    uint32_t aligned_page_size;
+};
+
+RemoteReceiverCBInterface remote_cb_interface;
+
+template <uint32_t aligned_page_size = ALIGNED_PAGE_SIZE>
+FORCE_INLINE void setup_remote_receiver_cb_interface() {
+    uint32_t num_pages = cb_size / start_page_size;
+    uint32_t cb_size_page_aligned = num_pages * start_page_size;
+
+    remote_cb_interface.fifo_size = cb_size;
+    remote_cb_interface.fifo_limit = cb_size + cb_start_addr;
+    remote_cb_interface.fifo_limit_page_aligned = cb_size_page_aligned + cb_start_addr;
+
+    remote_cb_interface.fifo_page_size = start_page_size;
+    remote_cb_interface.fifo_aligned_num_pages = num_pages * start_page_size / aligned_page_size;
+
+    remote_cb_interface.fifo_rd_ptr = cb_rd_ptr;
+
+    remote_cb_interface.fifo_start_addr = cb_start_addr;
+
+    remote_cb_interface.pages_acked = reinterpret_cast<volatile tt_l1_ptr uint32_t*>(get_semaphore(pages_acked_semaphore_addr));
+    remote_cb_interface.pages_sent = reinterpret_cast<volatile tt_l1_ptr uint32_t*>(get_semaphore(pages_sent_semaphore_addr));
+
+    remote_cb_interface.aligned_page_size = aligned_page_size;
+}
+
+FORCE_INLINE void setup_remote_cb_page_size(uint32_t page_size, uint32_t remote_noc_x, uint32_t remote_noc_y, uint8_t noc = noc_index) {
+    uint32_t num_pages = remote_cb_interface.fifo_size / page_size;
+    uint32_t cb_size_page_aligned = num_pages * page_size;
+
+    remote_cb_interface.fifo_limit_page_aligned = cb_size_page_aligned + remote_cb_interface.fifo_start_addr;
+    remote_cb_interface.fifo_page_size = page_size;
+    remote_cb_interface.fifo_aligned_num_pages = num_pages * page_size / remote_cb_interface.aligned_page_size;
+
+    uint32_t curr_fifo_rd_ptr = remote_cb_interface.fifo_rd_ptr;
+    bool fifo_rd_ptr_exceed_fifo_limit = curr_fifo_rd_ptr > remote_cb_interface.fifo_limit_page_aligned;
+    uint32_t num_pages_till_fifo_limit = (remote_cb_interface.fifo_limit_page_aligned - curr_fifo_rd_ptr) / page_size;
+
+    if (fifo_rd_ptr_exceed_fifo_limit) {
+        remote_cb_interface.fifo_rd_ptr = remote_cb_interface.fifo_start_addr;
+    } else {
+        uint32_t next_fifo_rd_ptr = remote_cb_interface.fifo_limit_page_aligned - num_pages_till_fifo_limit * page_size;
+        uint32_t pages_acked = (next_fifo_rd_ptr - remote_cb_interface.fifo_rd_ptr) / remote_cb_interface.aligned_page_size;
+        remote_cb_interface.fifo_rd_ptr = next_fifo_rd_ptr;
+
+        // increment the aligned pages acked because we skipped to the next aligned page location
+        *remote_cb_interface.pages_acked += pages_acked;
+        uint64_t remote_ack_ptr_addr = get_noc_addr(remote_noc_x, remote_noc_y, 
(uint32_t)remote_cb_interface.pages_acked, noc);
+        noc_semaphore_inc(remote_ack_ptr_addr, pages_acked, noc);
+    }
+}
+
+FORCE_INLINE void remote_cb_wait_front(uint32_t num_pages) {
+    uint32_t len_bytes = num_pages * remote_cb_interface.fifo_page_size;
+    uint32_t num_pages_wait = len_bytes / remote_cb_interface.aligned_page_size;
+    volatile uint32_t num_pages_recv = 0;
+    uint32_t pages_acked = 0;
+    uint32_t pages_sent = 0;
+
+    do {
+        pages_acked = (uint32_t)reg_read((uint32_t)remote_cb_interface.pages_acked);
+        pages_sent = (uint32_t)reg_read((uint32_t)remote_cb_interface.pages_sent);
+        num_pages_recv = pages_sent - pages_acked;
+    } while (num_pages_recv < num_pages_wait);
+}
+
+FORCE_INLINE void remote_cb_pop_front(uint32_t num_pages, uint32_t remote_noc_x, uint32_t remote_noc_y, uint8_t noc = noc_index) {
+    uint32_t len_bytes = num_pages * remote_cb_interface.fifo_page_size;
+    uint32_t num_aligned_pages = len_bytes / remote_cb_interface.aligned_page_size;
+
+    *remote_cb_interface.pages_acked += num_aligned_pages;
+    remote_cb_interface.fifo_rd_ptr += len_bytes;
+
+    if (remote_cb_interface.fifo_rd_ptr >= remote_cb_interface.fifo_limit_page_aligned) {
+        remote_cb_interface.fifo_rd_ptr = remote_cb_interface.fifo_start_addr;
+    }
+
+    uint64_t remote_ack_ptr_addr = get_noc_addr(remote_noc_x, remote_noc_y, (uint32_t)remote_cb_interface.pages_acked, noc);
+    noc_semaphore_inc(remote_ack_ptr_addr, num_aligned_pages, noc);
+}
+
+
+void kernel_main() {
+
+    uint32_t rt_args_idx = 0;
+    vc = get_arg_val<uint32_t>(rt_args_idx++);
+    noc_x = get_arg_val<uint32_t>(rt_args_idx++);
+    noc_y = get_arg_val<uint32_t>(rt_args_idx++);
+    pages_acked_semaphore_addr = get_arg_val<uint32_t>(rt_args_idx++);
+    pages_sent_semaphore_addr = get_arg_val<uint32_t>(rt_args_idx++);
+
+    page_size = (tt_l1_ptr uint32_t*)(get_arg_addr(increment_arg_idx(rt_args_idx, num_layers)));
+    num_blocks = (tt_l1_ptr uint32_t*)(get_arg_addr(increment_arg_idx(rt_args_idx, num_layers)));
+    block_num_tiles = (tt_l1_ptr uint32_t*)(get_arg_addr(increment_arg_idx(rt_args_idx, num_layers)));
+
+    start_page_size = page_size[0];
+
+    constexpr uint32_t cb_id = 0;
+
+    setup_remote_receiver_cb_interface();
+
+    for (uint32_t l = 0; l < num_layers; ++l) {
+        uint32_t curr_page_size = page_size[l];
+        uint32_t curr_num_blocks = num_blocks[l];
+        uint32_t curr_block_num_tiles = block_num_tiles[l];
+
+        setup_remote_cb_page_size(curr_page_size, noc_x, noc_y);
+
+        for (uint32_t block = 0; block < curr_num_blocks; ++block) {
+            remote_cb_wait_front(curr_block_num_tiles);
+
+            remote_cb_pop_front(curr_block_num_tiles, noc_x, noc_y);
+        }
+    }
+
+}
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/kernels/writer_l1.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/kernels/writer_l1.cpp
new file mode 100644
index 00000000000..0fefcfbf9b1
--- /dev/null
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/kernels/writer_l1.cpp
@@ -0,0 +1,331 @@
+// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include <stdint.h>
+
+#include "dataflow_api.h"
+#include "ttnn/cpp/ttnn/operations/ccl/kernel_common/worker_sync_utils.hpp"
+
+#include "debug/dprint.h"
+
+constexpr uint32_t ALIGNED_PAGE_SIZE = 16;
+
+constexpr uint32_t noc = get_compile_time_arg_val(0);
+constexpr uint32_t cb_start_addr = get_compile_time_arg_val(1);
+constexpr uint32_t cb_wr_ptr = get_compile_time_arg_val(1);
+constexpr uint32_t cb_size = get_compile_time_arg_val(2);
+constexpr uint32_t num_receivers = get_compile_time_arg_val(3);
+constexpr uint32_t num_layers = get_compile_time_arg_val(4);
+
+tt_l1_ptr uint32_t* noc_x;
+tt_l1_ptr uint32_t* noc_y;
+tt_l1_ptr uint32_t* pages_acked_semaphore_addr;
+tt_l1_ptr uint32_t* pages_sent_semaphore_addr;
+tt_l1_ptr uint32_t* coalesced_page_size;
+tt_l1_ptr uint32_t* coalesced_num_pages;
+tt_l1_ptr uint32_t* num_blocks;
+tt_l1_ptr uint32_t* block_num_tiles;
+tt_l1_ptr uint32_t* page_size;
+tt_l1_ptr uint32_t* num_tile_rows;
+
+uint32_t start_page_size;
+uint32_t layer = 0;
+
+template <uint32_t num_recv_cbs>
+struct RemoteSenderCBInterface {
+    uint32_t num_receivers;
+
+    volatile tt_l1_ptr uint32_t* pages_acked[num_recv_cbs];
+    volatile tt_l1_ptr uint32_t* pages_sent[num_recv_cbs];
+
+    uint32_t fifo_size;
+    uint32_t fifo_limit;
+    uint32_t fifo_limit_page_aligned;
+
+    uint32_t fifo_page_size;
+    uint32_t fifo_aligned_num_pages;
+
+    uint32_t fifo_wr_ptr;
+
+    uint32_t fifo_start_addr;
+
+    uint32_t aligned_page_size;
+};
+
+RemoteSenderCBInterface<num_receivers> remote_cb_interface;
+
+template <uint32_t aligned_page_size = ALIGNED_PAGE_SIZE>
+FORCE_INLINE void setup_remote_sender_cb_interface() {
+    uint32_t num_pages = cb_size / start_page_size;
+    uint32_t cb_size_page_aligned = num_pages * start_page_size;
+
+    remote_cb_interface.fifo_size = cb_size;
+    remote_cb_interface.fifo_limit = cb_size + cb_start_addr;
+    remote_cb_interface.fifo_limit_page_aligned = cb_size_page_aligned + cb_start_addr;
+
+    remote_cb_interface.fifo_page_size = start_page_size;
+    remote_cb_interface.fifo_aligned_num_pages = num_pages * start_page_size / aligned_page_size;
+
+    remote_cb_interface.fifo_wr_ptr = cb_wr_ptr;
+
+    remote_cb_interface.fifo_start_addr = cb_start_addr;
+
+    remote_cb_interface.num_receivers = num_receivers;
+
+    for (uint32_t i = 0; i < num_receivers; ++i) {
+        remote_cb_interface.pages_acked[i] = reinterpret_cast<volatile tt_l1_ptr uint32_t*>(get_semaphore(pages_acked_semaphore_addr[i]));
+        remote_cb_interface.pages_sent[i] = reinterpret_cast<volatile tt_l1_ptr uint32_t*>(get_semaphore(pages_sent_semaphore_addr[i]));
+    }
+
+    remote_cb_interface.aligned_page_size = aligned_page_size;
+}
+
+FORCE_INLINE void setup_remote_cb_page_size(uint32_t page_size, uint32_t* remote_noc_x, uint32_t* remote_noc_y, uint8_t noc = noc_index) {
+    uint32_t num_pages = remote_cb_interface.fifo_size / page_size;
+    uint32_t cb_size_page_aligned = num_pages * page_size;
+
+    remote_cb_interface.fifo_limit_page_aligned = cb_size_page_aligned + remote_cb_interface.fifo_start_addr;
+    remote_cb_interface.fifo_page_size = page_size;
+    remote_cb_interface.fifo_aligned_num_pages = num_pages * page_size / remote_cb_interface.aligned_page_size;
+
+    uint32_t curr_fifo_wr_ptr = remote_cb_interface.fifo_wr_ptr;
+    bool fifo_wr_ptr_exceed_fifo_limit = curr_fifo_wr_ptr > remote_cb_interface.fifo_limit_page_aligned;
+    uint32_t num_pages_till_fifo_limit = (remote_cb_interface.fifo_limit_page_aligned - curr_fifo_wr_ptr) / page_size;
+
+    if (fifo_wr_ptr_exceed_fifo_limit) {
+        remote_cb_interface.fifo_wr_ptr = remote_cb_interface.fifo_start_addr;
+    } else {
+        uint32_t next_fifo_wr_ptr = remote_cb_interface.fifo_limit_page_aligned - 
num_pages_till_fifo_limit * page_size;
+        uint32_t pages_sent = (next_fifo_wr_ptr - remote_cb_interface.fifo_wr_ptr) / remote_cb_interface.aligned_page_size;
+        remote_cb_interface.fifo_wr_ptr = next_fifo_wr_ptr;
+
+        // increment the aligned pages sent because we skipped to the next aligned page location
+        for (uint32_t i = 0; i < remote_cb_interface.num_receivers; ++i) {
+            uint32_t remote_noc_xy = uint32_t(NOC_XY_ENCODING(DYNAMIC_NOC_X(noc, remote_noc_x[i]), DYNAMIC_NOC_Y(noc, remote_noc_y[i])));
+            *remote_cb_interface.pages_sent[i] += pages_sent;
+            uint64_t remote_ack_ptr_addr = get_noc_addr_helper(remote_noc_xy, (uint32_t)remote_cb_interface.pages_sent[i]);
+            noc_semaphore_inc(remote_ack_ptr_addr, pages_sent, noc);
+        }
+    }
+}
+
+FORCE_INLINE void remote_cb_reserve_back(uint32_t num_pages) {
+    uint32_t len_bytes = num_pages * remote_cb_interface.fifo_page_size;
+    uint32_t num_pages_wait = len_bytes / remote_cb_interface.aligned_page_size;
+    uint32_t free_pages;
+
+    for (uint32_t i = 0; i < remote_cb_interface.num_receivers; ++i) {
+        do {
+            uint32_t pages_acked = (uint32_t)reg_read((uint32_t)remote_cb_interface.pages_acked[i]);
+            uint32_t pages_sent = (uint32_t)reg_read((uint32_t)remote_cb_interface.pages_sent[i]);
+            free_pages = remote_cb_interface.fifo_aligned_num_pages - (pages_sent - pages_acked);
+        } while (free_pages < num_pages_wait);
+    }
+}
+
+// unused for now, but we might need this one if we want to transfer maximum-size noc packets
+FORCE_INLINE void remote_cb_push_back_and_write_pages_(uint32_t local_cb_addr, uint32_t num_pages, uint32_t remote_noc_x, uint32_t remote_noc_y, uint8_t noc = noc_index) {
+    uint32_t len_bytes = num_pages * remote_cb_interface.fifo_page_size;
+    uint32_t pages_sent = len_bytes / remote_cb_interface.aligned_page_size;
+
+    uint32_t local_fifo_rd_ptr = local_cb_addr;
+    uint32_t remote_fifo_wr_ptr = remote_cb_interface.fifo_wr_ptr;
+
+    uint32_t src_addr = local_cb_addr;
+    uint32_t dest_addr = remote_cb_interface.fifo_wr_ptr;
+    uint32_t remote_noc_xy = uint32_t(NOC_XY_ENCODING(DYNAMIC_NOC_X(noc, remote_noc_x), DYNAMIC_NOC_Y(noc, remote_noc_y)));
+    uint64_t dest_noc_addr = get_noc_addr_helper(remote_noc_xy, dest_addr);
+
+    while (len_bytes > NOC_MAX_BURST_SIZE) {
+        src_addr = local_fifo_rd_ptr;
+        dest_addr = remote_fifo_wr_ptr;
+        dest_noc_addr = get_noc_addr_helper(remote_noc_xy, dest_addr);
+
+        // split one write into two chunks when it would wrap past the FIFO limit
+        if ((dest_addr + NOC_MAX_BURST_SIZE) >= remote_cb_interface.fifo_limit_page_aligned) {
+            uint32_t first_len_bytes = remote_cb_interface.fifo_limit_page_aligned - dest_addr;
+            uint32_t second_len_bytes = NOC_MAX_BURST_SIZE - first_len_bytes;
+
+            // issue first write transfer
+            while (!noc_cmd_buf_ready(noc, write_cmd_buf));
+            ncrisc_noc_fast_write(noc, write_cmd_buf, src_addr, dest_noc_addr, first_len_bytes, NOC_UNICAST_WRITE_VC, false, false, 1, true);
+            src_addr += first_len_bytes;
+            dest_addr = remote_cb_interface.fifo_start_addr;
+            dest_noc_addr = get_noc_addr_helper(remote_noc_xy, dest_addr);
+
+            if (second_len_bytes != 0) {
+                // issue second write transfer
+                while (!noc_cmd_buf_ready(noc, write_cmd_buf));
+                ncrisc_noc_fast_write(noc, write_cmd_buf, src_addr, dest_noc_addr, second_len_bytes, NOC_UNICAST_WRITE_VC, false, false, 1, true);
+                src_addr += second_len_bytes;
+                dest_addr += second_len_bytes;
+            }
+
+        } else { // issue write in one request
+            while (!noc_cmd_buf_ready(noc, write_cmd_buf));
+            ncrisc_noc_fast_write(noc, write_cmd_buf, src_addr, dest_noc_addr, NOC_MAX_BURST_SIZE, NOC_UNICAST_WRITE_VC, false, false, 1, 
true);
+            src_addr += NOC_MAX_BURST_SIZE;
+            dest_addr += NOC_MAX_BURST_SIZE;
+        }
+
+        // update local and remote pointers
+        local_fifo_rd_ptr = src_addr;
+        remote_fifo_wr_ptr = dest_addr;
+
+        len_bytes -= NOC_MAX_BURST_SIZE;
+    }
+
+    dest_noc_addr = get_noc_addr_helper(remote_noc_xy, dest_addr);
+    // split the last write into two chunks when it would wrap past the FIFO limit
+    if ((dest_addr + len_bytes) >= remote_cb_interface.fifo_limit_page_aligned) {
+        uint32_t first_len_bytes = remote_cb_interface.fifo_limit_page_aligned - dest_addr;
+        uint32_t second_len_bytes = len_bytes - first_len_bytes;
+
+        // issue first write transfer
+        while (!noc_cmd_buf_ready(noc, write_cmd_buf));
+        ncrisc_noc_fast_write(noc, write_cmd_buf, src_addr, dest_noc_addr, first_len_bytes, NOC_UNICAST_WRITE_VC, false, false, 1, true);
+        src_addr += first_len_bytes;
+        dest_addr = remote_cb_interface.fifo_start_addr;
+        dest_noc_addr = get_noc_addr_helper(remote_noc_xy, dest_addr);
+
+        if (second_len_bytes != 0) {
+            // issue second write transfer
+            while (!noc_cmd_buf_ready(noc, write_cmd_buf));
+            ncrisc_noc_fast_write(noc, write_cmd_buf, src_addr, dest_noc_addr, second_len_bytes, NOC_UNICAST_WRITE_VC, false, false, 1, true);
+            src_addr += second_len_bytes;
+            dest_addr += second_len_bytes;
+        }
+
+    } else { // issue write in one request
+        while (!noc_cmd_buf_ready(noc, write_cmd_buf));
+        ncrisc_noc_fast_write(noc, write_cmd_buf, src_addr, dest_noc_addr, len_bytes, NOC_UNICAST_WRITE_VC, false, false, 1, true);
+        src_addr += len_bytes;
+        dest_addr += len_bytes;
+    }
+
+    *remote_cb_interface.pages_sent[0] += pages_sent;
+    remote_cb_interface.fifo_wr_ptr = dest_addr;
+
+    uint64_t remote_ack_ptr_addr = get_noc_addr_helper(remote_noc_xy, (uint32_t)remote_cb_interface.pages_sent[0]);
+    noc_semaphore_inc(remote_ack_ptr_addr, pages_sent, noc);
+}
+
+FORCE_INLINE void remote_cb_push_back_and_write_pages(uint32_t local_cb_addr, uint32_t num_pages, uint32_t num_rows, uint32_t coalesced_num_pages_per_row, uint32_t coalesced_page_size, uint32_t* remote_noc_x, uint32_t* remote_noc_y, uint8_t noc = noc_index) {
+    uint32_t len_bytes = num_pages * remote_cb_interface.fifo_page_size;
+    uint32_t pages_sent = len_bytes / remote_cb_interface.aligned_page_size;
+
+    uint32_t next_receiver_start_addr_stride = coalesced_num_pages_per_row * coalesced_page_size;
+    uint32_t next_block_row_stride = next_receiver_start_addr_stride * remote_cb_interface.num_receivers;
+
+    uint32_t dest_addr;
+
+    uint32_t next_receiver_start_addr_offset = 0;
+    for (uint32_t i = 0; i < remote_cb_interface.num_receivers; ++i) {
+        uint32_t src_addr = local_cb_addr + next_receiver_start_addr_offset;
+        dest_addr = remote_cb_interface.fifo_wr_ptr;
+
+        uint32_t remote_noc_xy = uint32_t(NOC_XY_ENCODING(DYNAMIC_NOC_X(noc, remote_noc_x[i]), DYNAMIC_NOC_Y(noc, remote_noc_y[i])));
+        uint64_t dest_noc_addr = get_noc_addr_helper(remote_noc_xy, dest_addr);
+
+        noc_async_write_one_packet_set_state(dest_noc_addr, coalesced_page_size, noc);
+
+        for (uint32_t h = 0; h < num_rows; ++h) {
+            uint32_t prev_src_addr = src_addr;
+            for (uint32_t w = 0; w < coalesced_num_pages_per_row; ++w) {
+                dest_noc_addr = get_noc_addr_helper(remote_noc_xy, dest_addr);
+
+                if ((dest_addr + coalesced_page_size) > remote_cb_interface.fifo_limit_page_aligned) {
+                    uint32_t first_len_bytes = remote_cb_interface.fifo_limit_page_aligned - dest_addr;
+                    uint32_t second_len_bytes = coalesced_page_size - first_len_bytes;
+
+                    if (first_len_bytes != 0) {
+                        noc_async_write_one_packet(src_addr, dest_noc_addr, first_len_bytes, noc);
+                        src_addr += 
first_len_bytes; + } + + dest_addr = remote_cb_interface.fifo_start_addr; + dest_noc_addr = get_noc_addr_helper(remote_noc_xy, dest_addr); + + noc_async_write_one_packet(src_addr, dest_noc_addr, second_len_bytes, noc); + + src_addr += second_len_bytes; + dest_addr += second_len_bytes; + dest_noc_addr = get_noc_addr_helper(remote_noc_xy, dest_addr); + + noc_async_write_one_packet_set_state(dest_noc_addr, coalesced_page_size, noc); + + } else { + noc_async_write_one_packet_with_state(src_addr, dest_noc_addr, noc); + + src_addr += coalesced_page_size; + dest_addr += coalesced_page_size; + } + } + src_addr = prev_src_addr + next_block_row_stride; + } + next_receiver_start_addr_offset += next_receiver_start_addr_stride; + + *remote_cb_interface.pages_sent[i] += pages_sent; + + uint64_t remote_ack_ptr_addr = get_noc_addr_helper(remote_noc_xy, (uint32_t)remote_cb_interface.pages_sent[i]); + noc_semaphore_inc(remote_ack_ptr_addr, pages_sent, noc); + } + + remote_cb_interface.fifo_wr_ptr = dest_addr; + +} + +void kernel_main() { + + uint32_t rt_args_idx = 0; + noc_x = (tt_l1_ptr uint32_t*)(get_arg_addr(increment_arg_idx(rt_args_idx, num_receivers))); + noc_y = (tt_l1_ptr uint32_t*)(get_arg_addr(increment_arg_idx(rt_args_idx, num_receivers))); + pages_acked_semaphore_addr = (tt_l1_ptr uint32_t*)(get_arg_addr(increment_arg_idx(rt_args_idx, num_receivers))); + pages_sent_semaphore_addr = (tt_l1_ptr uint32_t*)(get_arg_addr(increment_arg_idx(rt_args_idx, num_receivers))); + + coalesced_page_size = (tt_l1_ptr uint32_t*)(get_arg_addr(increment_arg_idx(rt_args_idx, num_layers))); + coalesced_num_pages = (tt_l1_ptr uint32_t*)(get_arg_addr(increment_arg_idx(rt_args_idx, num_layers))); + num_blocks = (tt_l1_ptr uint32_t*)(get_arg_addr(increment_arg_idx(rt_args_idx, num_layers))); + block_num_tiles = (tt_l1_ptr uint32_t*)(get_arg_addr(increment_arg_idx(rt_args_idx, num_layers))); + page_size = (tt_l1_ptr uint32_t*)(get_arg_addr(increment_arg_idx(rt_args_idx, num_layers))); + num_tile_rows = (tt_l1_ptr uint32_t*)(get_arg_addr(increment_arg_idx(rt_args_idx, num_layers))); + + start_page_size = page_size[0]; + + constexpr uint32_t cb_id = 0; + + setup_remote_sender_cb_interface(); + + for (uint32_t l = 0; l < num_layers; ++l) { + uint32_t curr_coalesced_page_size = coalesced_page_size[l]; + uint32_t curr_coalesced_num_pages = coalesced_num_pages[l]; + uint32_t curr_num_blocks = num_blocks[l]; + uint32_t curr_block_num_tiles = block_num_tiles[l]; + uint32_t curr_page_size = page_size[l]; + uint32_t curr_num_tile_rows = num_tile_rows[l]; + uint32_t curr_receiver_block_num_tiles = curr_block_num_tiles / num_receivers; + + setup_remote_cb_page_size(curr_page_size, noc_x, noc_y, noc); + + for (uint32_t block = 0; block < curr_num_blocks; ++block) { + + cb_wait_front(cb_id, curr_block_num_tiles); + + uint32_t local_cb_addr = get_read_ptr(cb_id); + remote_cb_reserve_back(curr_receiver_block_num_tiles); + remote_cb_push_back_and_write_pages(local_cb_addr, curr_receiver_block_num_tiles, curr_num_tile_rows, curr_coalesced_num_pages, curr_coalesced_page_size, noc_x, noc_y, noc); + + cb_pop_front(cb_id, curr_block_num_tiles); + + } + layer++; + } + +} diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/test_dram_read_remote_cb.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/test_dram_read_remote_cb.cpp new file mode 100644 index 00000000000..5bbf0ca25b0 --- /dev/null +++ 
b/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/test_dram_read_remote_cb.cpp
@@ -0,0 +1,832 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include <algorithm>
+#include <cassert>
+#include <chrono>
+#include <cmath>
+#include <cstring>
+#include <functional>
+#include <iostream>
+#include <random>
+
+#include "common/bfloat8.hpp"
+#include "common/bfloat16.hpp"
+#include "common/tt_backend_api_types.hpp"
+#include "tt_metal/detail/tt_metal.hpp"
+#include "tt_metal/detail/util.hpp"
+#include "tt_metal/host_api.hpp"
+#include "tt_metal/tt_metal/perf_microbenchmark/common/util.hpp"
+#include "tt_metal/common/work_split.hpp"
+#include "tests/tt_metal/test_utils/tilization.hpp"
+#include "tt_metal/test_utils/deprecated/tensor.hpp"
+#include "tests/tt_metal/tt_metal/unit_tests_common/compute/matmul/matmul_utils.hpp"
+#include <yaml-cpp/yaml.h>
+
+using namespace tt;
+using std::chrono::duration_cast;
+using std::chrono::microseconds;
+
+////////////////////////////////////////////////////////////////////////////////
+// A tensix core that's next to a DRAM bank reads from the bank and writes to
+// the neighbouring receiver tensix cores. The test creates a bfloat16/bfloat8_b
+// format DRAM buffer of a given input size and writes it to the DRAM banks in
+// round-robin style.
+//
+// Disclaimer:
+// - This benchmark is designed to support an input size larger than 4GB, but
+//   current tt-metal does not yet seem to support buffer allocations larger
+//   than 4GB.
+// - Also, the detail::ReadFromBuffer API used in the DRAM write test may take
+//   a long time if the input size is large.
+//
+// Usage example:
+//   ./test_dram_read_remote_cb
+//     --k
+//     --n
+//     --num-blocks
+//     --cb-num-blocks
+//     --cb-padding
+//     --num-tests
+//     --data-type
+//     --num-receivers
+//     --num-mixed-df-layers
+//     --use-device-profiler (set to measure kernel duration with the device profiler)
+////////////////////////////////////////////////////////////////////////////////
+
+
+
+template <typename T>
+std::vector<T> slice_vec(std::vector<T> const &v, int m, int n) {
+    auto first = v.cbegin() + m;
+    auto last = v.cbegin() + n + 1;
+
+    std::vector<T> vec(first, last);
+    return vec;
+}
+
+void get_max_page_size_and_num_pages(uint32_t num_tiles, uint32_t num_datums_per_tile, uint32_t& page_size, uint32_t& num_pages) {
+    uint64_t total_size = static_cast<uint64_t>(num_tiles) * num_datums_per_tile;
+
+    page_size = (8192 / num_datums_per_tile) * num_datums_per_tile;
+    while (total_size % page_size != 0 && page_size >= num_datums_per_tile) {
+        page_size -= num_datums_per_tile;
+    }
+    num_pages = total_size / page_size;
+}
+
+std::tuple<tt_metal::Program, tt_metal::KernelHandle, uint32_t> create_program(
+    tt_metal::Device *device,
+    const CoreRangeSet &dram_reader_core,
+    const CoreRangeSet &l1_receiver_cores,
+    const uint32_t &single_tile_size,
+    const tt::DataFormat &tile_format,
+    uint32_t k,
+    uint32_t n,
+    uint32_t num_blocks,
+    uint32_t cb_num_blocks,
+    uint32_t num_receivers,
+    uint32_t num_mixed_df_layers,
+    uint32_t cb_padding,
+    std::shared_ptr<tt_metal::Buffer> input_buffer,
+    std::shared_ptr<tt_metal::Buffer> output_buffer
+    ) {
+
+    log_info("created program");
+
+    tt_metal::Program program = tt_metal::Program();
+
+    auto all_cores = dram_reader_core.merge(l1_receiver_cores);
+
+    uint32_t start_tile_id = 0;
+    uint32_t kt = k / 32;
+    uint32_t nt = n / 32;
+    uint32_t block_h = kt / num_blocks;
+    uint32_t num_tile_rows_write = block_h;
+    uint32_t block_w = nt;
+    uint32_t block_num_tiles = block_h * block_w;
+
+    // DRAM reader CB
+    uint32_t reader_cb_index = 0;
+    uint32_t reader_cb_size = block_h * block_w * single_tile_size * 3;
+    // For debug purpose
+    // uint32_t reader_cb_size = block_h * block_w * single_tile_size;
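+    // Note: the reader CB holds 3 blocks so DRAM reads can be triple-buffered:
+    // the reader kernel tags each in-flight block with one of 3 transaction ids
+    // and only waits on the oldest one (this matches total_num_blocks_in_buffer = 3
+    // in kernels/reader_dram.cpp).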
+    uint32_t reader_page_size, reader_num_pages;
+    get_max_page_size_and_num_pages(block_num_tiles, single_tile_size, reader_page_size, reader_num_pages);
+
+    uint32_t receiver_block_num_tile = block_h * block_w / num_receivers;
+    uint32_t writer_page_size, writer_num_pages;
+    get_max_page_size_and_num_pages(block_w / num_receivers, single_tile_size, writer_page_size, writer_num_pages);
+
+    log_info("writer_page_size: {}", writer_page_size);
+    log_info("writer_num_pages: {}", writer_num_pages);
+
+    uint32_t reader_cb_addr = device->get_base_allocator_addr(HalMemType::L1);
+    tt_metal::CircularBufferConfig reader_cb_config =
+        tt_metal::CircularBufferConfig(reader_cb_size, {{reader_cb_index, tile_format}})
+            .set_page_size(reader_cb_index, single_tile_size);
+    auto reader_cb = tt_metal::CreateCircularBuffer(program, dram_reader_core, reader_cb_config);
+
+    // mixed cb dataformat
+    uint32_t next_layer_num_blocks = num_blocks * 2;
+    uint32_t next_layer_block_h = kt / next_layer_num_blocks;
+    uint32_t next_layer_block_num_tiles = next_layer_block_h * block_w;
+    uint32_t next_layer_num_tile_rows_write = next_layer_block_h;
+    uint32_t next_layer_receiver_block_num_tile = next_layer_block_num_tiles / num_receivers;
+
+    uint32_t next_layer_single_tile_size = single_tile_size;
+    if (tile_format == tt::DataFormat::Float16_b) {
+        next_layer_single_tile_size = 1088;
+    } else {
+        next_layer_single_tile_size = 2048;
+    }
+    uint32_t next_layer_reader_page_size, next_layer_reader_num_pages;
+    get_max_page_size_and_num_pages(next_layer_block_num_tiles, next_layer_single_tile_size, next_layer_reader_page_size, next_layer_reader_num_pages);
+
+    uint32_t next_layer_writer_page_size, next_layer_writer_num_pages;
+    get_max_page_size_and_num_pages(block_w / num_receivers, next_layer_single_tile_size, next_layer_writer_page_size, next_layer_writer_num_pages);
+
+    // L1 receiver CB
+    uint32_t receiver_cb_index = 0;
+    uint32_t receiver_cb_size = block_h * block_w * single_tile_size * cb_num_blocks / num_receivers + cb_padding;
+    uint32_t receiver_page_size = 32;
+    uint32_t receiver_cb_addr = output_buffer->address();
+    tt_metal::CircularBufferConfig receiver_cb_config =
+        tt_metal::CircularBufferConfig(receiver_cb_size, {{receiver_cb_index, tile_format}})
+            .set_page_size(receiver_cb_index, receiver_page_size).set_globally_allocated_address(*output_buffer);
+    auto receiver_cb = tt_metal::CreateCircularBuffer(program, l1_receiver_cores, receiver_cb_config);
+
+    log_info("reader_cb_size: {}", reader_cb_size);
+    log_info("receiver_cb_size: {}", receiver_cb_size);
+
+    // semaphore
+    std::vector<uint32_t> pages_acked_semaphore_ids(num_receivers);
+    std::vector<uint32_t> pages_sent_semaphore_ids(num_receivers);
+    for (uint32_t i = 0; i < num_receivers; ++i) {
+        pages_acked_semaphore_ids[i] = tt_metal::CreateSemaphore(program, all_cores, INVALID);
+        pages_sent_semaphore_ids[i] = tt_metal::CreateSemaphore(program, all_cores, INVALID);
+    }
+
+    std::vector<uint32_t> reader_compile_time_args = {
+        (std::uint32_t) input_buffer->address(),
+        (std::uint32_t) start_tile_id,
+        (std::uint32_t) tt_metal::NOC::RISCV_0_default,
+        (std::uint32_t) num_mixed_df_layers
+    };
+
+    auto reader_kernel = tt_metal::CreateKernel(
+        program,
+        "tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/kernels/reader_dram.cpp",
+        dram_reader_core,
+        tt_metal::DataMovementConfig{
+            .processor = tt_metal::DataMovementProcessor::RISCV_0,
+            .noc = tt_metal::NOC::RISCV_0_default,
+            .noc_mode = tt_metal::NOC_MODE::DM_DYNAMIC_NOC,
+            .compile_args = reader_compile_time_args});
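+    // The writer below runs on the same dram_reader_core but on the other
+    // data-movement RISC (RISCV_1); both kernels use NOC_MODE::DM_DYNAMIC_NOC,
+    // so NoC command buffers are assigned dynamically rather than each kernel
+    // statically owning one NoC.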
+
+    std::vector<uint32_t> writer_compile_time_args = {
+        (std::uint32_t) tt_metal::NOC::RISCV_0_default,
+        (std::uint32_t) receiver_cb_addr,
+        (std::uint32_t) receiver_cb_size,
+        (std::uint32_t) num_receivers,
+        (std::uint32_t) num_mixed_df_layers
+    };
+
+    auto writer_kernel = tt_metal::CreateKernel(
+        program,
+        "tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/kernels/writer_l1.cpp",
+        dram_reader_core,
+        tt_metal::DataMovementConfig{
+            .processor = tt_metal::DataMovementProcessor::RISCV_1,
+            .noc = tt_metal::NOC::RISCV_1_default,
+            .noc_mode = tt_metal::NOC_MODE::DM_DYNAMIC_NOC,
+            .compile_args = writer_compile_time_args});
+
+    std::vector<uint32_t> receiver_compile_time_args = {
+        (std::uint32_t) reader_cb_addr,
+        (std::uint32_t) receiver_cb_size,
+        (std::uint32_t) num_mixed_df_layers,
+    };
+
+    auto receiver_kernel = tt_metal::CreateKernel(
+        program,
+        "tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/kernels/receiver_l1.cpp",
+        l1_receiver_cores,
+        tt_metal::DataMovementConfig{
+            .processor = tt_metal::DataMovementProcessor::RISCV_1,
+            .noc = tt_metal::NOC::RISCV_1_default,
+            .compile_args = receiver_compile_time_args});
+
+    // reader rt
+    auto dram_reader_core_coord = dram_reader_core.ranges().begin()->start_coord;
+    log_info("dram_reader_core_coord: {}", dram_reader_core_coord);
+    auto dram_reader_core_coord_physical = device->worker_core_from_logical_core(dram_reader_core_coord);
+    uint32_t bank_id = 0;
+    uint32_t vc = bank_id & 0x1;
+    std::vector<uint32_t> reader_rt_args = {
+        (std::uint32_t) bank_id,
+        (std::uint32_t) vc
+    };
+    for (uint32_t i = 0; i < num_mixed_df_layers; ++i) {
+        reader_rt_args.push_back(i % 2 == 0 ? reader_page_size : next_layer_reader_page_size);
+    }
+    for (uint32_t i = 0; i < num_mixed_df_layers; ++i) {
+        reader_rt_args.push_back(i % 2 == 0 ? reader_num_pages : next_layer_reader_num_pages);
+    }
+    for (uint32_t i = 0; i < num_mixed_df_layers; ++i) {
+        reader_rt_args.push_back(i % 2 == 0 ? num_blocks : next_layer_num_blocks);
+    }
+    for (uint32_t i = 0; i < num_mixed_df_layers; ++i) {
+        reader_rt_args.push_back(i % 2 == 0 ? block_num_tiles : next_layer_block_num_tiles);
+    }
+    tt_metal::SetRuntimeArgs(program, reader_kernel, dram_reader_core_coord, reader_rt_args);
+
+    // writer rt
+    std::vector<CoreCoord> l1_receiver_core_coords;
+    for (auto l1_receiver_core_coord : *l1_receiver_cores.ranges().begin()) {
+        l1_receiver_core_coords.push_back(l1_receiver_core_coord);
+    }
+    std::vector<uint32_t> writer_rt_args;
+    for (uint32_t i = 0; i < num_receivers; ++i) {
+        auto l1_receiver_core_coord_physical = device->worker_core_from_logical_core(l1_receiver_core_coords[i]);
+        writer_rt_args.push_back(l1_receiver_core_coord_physical.x);
+    }
+    for (uint32_t i = 0; i < num_receivers; ++i) {
+        auto l1_receiver_core_coord_physical = device->worker_core_from_logical_core(l1_receiver_core_coords[i]);
+        writer_rt_args.push_back(l1_receiver_core_coord_physical.y);
+    }
+    for (uint32_t i = 0; i < num_receivers; ++i) {
+        writer_rt_args.push_back(pages_acked_semaphore_ids[i]);
+    }
+    for (uint32_t i = 0; i < num_receivers; ++i) {
+        writer_rt_args.push_back(pages_sent_semaphore_ids[i]);
+    }
+    for (uint32_t i = 0; i < num_mixed_df_layers; ++i) {
+        writer_rt_args.push_back(i % 2 == 0 ? writer_page_size : next_layer_writer_page_size);
+    }
+    for (uint32_t i = 0; i < num_mixed_df_layers; ++i) {
+        writer_rt_args.push_back(i % 2 == 0 ? writer_num_pages : next_layer_writer_num_pages);
+    }
+    for (uint32_t i = 0; i < num_mixed_df_layers; ++i) {
+        writer_rt_args.push_back(i % 2 == 0 ? 
num_blocks : next_layer_num_blocks);
+    }
+    for (uint32_t i = 0; i < num_mixed_df_layers; ++i) {
+        writer_rt_args.push_back(i % 2 == 0 ? block_num_tiles : next_layer_block_num_tiles);
+    }
+    for (uint32_t i = 0; i < num_mixed_df_layers; ++i) {
+        writer_rt_args.push_back(i % 2 == 0 ? single_tile_size : next_layer_single_tile_size);
+    }
+    for (uint32_t i = 0; i < num_mixed_df_layers; ++i) {
+        writer_rt_args.push_back(i % 2 == 0 ? num_tile_rows_write : next_layer_num_tile_rows_write);
+    }
+    tt_metal::SetRuntimeArgs(program, writer_kernel, dram_reader_core_coord, writer_rt_args);
+
+    // receiver rt
+    for (uint32_t i = 0; i < num_receivers; ++i) {
+        std::vector<uint32_t> receiver_rt_args = {
+            (std::uint32_t) vc & 0x3,
+            (std::uint32_t) dram_reader_core_coord_physical.x,
+            (std::uint32_t) dram_reader_core_coord_physical.y
+        };
+        vc++;
+
+        receiver_rt_args.push_back(pages_acked_semaphore_ids[i]);
+        receiver_rt_args.push_back(pages_sent_semaphore_ids[i]);
+
+        for (uint32_t j = 0; j < num_mixed_df_layers; ++j) {
+            receiver_rt_args.push_back(j % 2 == 0 ? single_tile_size : next_layer_single_tile_size);
+        }
+        for (uint32_t j = 0; j < num_mixed_df_layers; ++j) {
+            receiver_rt_args.push_back(j % 2 == 0 ? num_blocks : next_layer_num_blocks);
+        }
+        for (uint32_t j = 0; j < num_mixed_df_layers; ++j) {
+            receiver_rt_args.push_back(j % 2 == 0 ? receiver_block_num_tile : next_layer_receiver_block_num_tile);
+        }
+
+        log_info("l1_receiver_core_coords: {}", l1_receiver_core_coords[i]);
+
+        tt_metal::SetRuntimeArgs(program, receiver_kernel, l1_receiver_core_coords[i], receiver_rt_args);
+    }
+
+    return {std::move(program), reader_kernel, reader_cb_addr};
+}
+
+float to_float(bfloat16 bfloat16_num) {
+    return bfloat16_num.to_float();
+}
+
+float pcc(const std::vector<float>& x, const std::vector<float>& y) {
+    if (x.size() != y.size()) {
+        throw std::invalid_argument("Vectors must be of the same length.");
+    }
+
+    int n = x.size();
+    float mean_x = 0, mean_y = 0;
+    for (int i = 0; i < n; ++i) {
+        mean_x += x[i];
+        mean_y += y[i];
+    }
+    mean_x /= n;
+    mean_y /= n;
+
+    float numerator = 0, sum_sq_x = 0, sum_sq_y = 0;
+    for (int i = 0; i < n; ++i) {
+        float diff_x = x[i] - mean_x;
+        float diff_y = y[i] - mean_y;
+        numerator += diff_x * diff_y;
+        sum_sq_x += diff_x * diff_x;
+        sum_sq_y += diff_y * diff_y;
+    }
+
+    float denominator = std::sqrt(sum_sq_x * sum_sq_y);
+    if (denominator == 0) {
+        return 0;
+    }
+
+    return numerator / denominator;
+}
+
+bool validation_bfp8_b(
+    tt::deprecated::Tensor<float> input_tensor,
+    const tt::DataFormat &data_format,
+    uint32_t num_blocks,
+    uint32_t cb_num_blocks,
+    uint32_t kt,
+    uint32_t nt,
+    std::shared_ptr<tt_metal::Buffer> out_buffer
+) {
+    bool pass = true;
+    std::vector<float> golden_vec(kt * nt * 32 * 32 / num_blocks * cb_num_blocks, 0); // Initialize with zeros
+    std::vector<float> result_vec(kt * nt * 32 * 32 / num_blocks * cb_num_blocks, 0);
+    auto num_datums_per_cb = kt * nt * 32 * 32 / num_blocks * cb_num_blocks;
+
+    std::vector<float> result_untilized;
+    std::vector<uint32_t> result;
+    tt::tt_metal::detail::ReadFromBuffer(out_buffer, result);
+    auto result_bfp8 = unpack_bfp8_tiles_into_float_vec(result, true, false);
+    result_untilized = tt::test_utils::untilize(result_bfp8, kt*32 / num_blocks * cb_num_blocks, nt*32);
+
+    const auto& values = input_tensor.get_values();
+
+    int index = 0;
+    for (int i = 0; i < kt * nt * 32 * 32; ++i) {
+        golden_vec[index] = float(values[i]);
+        index++;
+
+        if (index == num_datums_per_cb) {
+            index = 0;
+        }
+    }
+
+    for (int i = 0; i < result_vec.size(); ++i) {
+        result_vec[i] = result_untilized[i];
+    }
+
+    pass &= pcc(golden_vec, result_vec) >= 0.9999;
+    if (!pass) {
+        log_error(LogTest, "validation single core failed");
+    }
+    return pass;
+}
+
+
+bool validation_fp16(
+    tt::deprecated::Tensor<bfloat16> input_tensor,
+    const tt::DataFormat &data_format,
+    uint32_t num_blocks,
+    uint32_t cb_num_blocks,
+    uint32_t kt,
+    uint32_t nt,
+    std::shared_ptr<tt_metal::Buffer> out_buffer
+) {
+    bool pass = true;
+    std::vector<float> golden_vec(kt * nt * 32 * 32 / num_blocks * cb_num_blocks, 0); // Initialize with zeros
+    std::vector<float> result_vec(kt * nt * 32 * 32 / num_blocks * cb_num_blocks, 0);
+    auto num_datums_per_cb = kt * nt * 32 * 32 / num_blocks * cb_num_blocks;
+
+    std::vector<uint32_t> result;
+    tt::tt_metal::detail::ReadFromBuffer(out_buffer, result);
+    auto result_bfp16 = unpack_uint32_vec_into_bfloat16_vec(result);
+    auto result_flat_layout = convert_to_flat_layout(result_bfp16);
+    auto result_untilized = tt::test_utils::untilize(result_flat_layout, kt*32 / num_blocks * cb_num_blocks, nt*32);
+
+    const auto& values = input_tensor.get_values();
+
+    int index = 0;
+    for (int i = 0; i < kt * nt * 32 * 32; ++i) {
+        golden_vec[index] = to_float(values[i]);
+        index++;
+
+        if (index == num_datums_per_cb) {
+            index = 0;
+        }
+    }
+
+    for (int i = 0; i < result_vec.size(); ++i) {
+        result_vec[i] = to_float(static_cast<bfloat16>(result_untilized[i]));
+    }
+
+    pass &= (golden_vec == result_vec);
+    if (!pass) {
+        log_error(LogTest, "validation single core failed");
+    }
+    return pass;
+}
+
+bool validation_mixed_df(
+    tt::deprecated::Tensor<bfloat16> input_tensor_fp16,
+    tt::deprecated::Tensor<float> input_tensor_fp8,
+    const tt::DataFormat &data_format,
+    uint32_t num_blocks,
+    uint32_t cb_num_blocks,
+    uint32_t kt,
+    uint32_t nt,
+    std::shared_ptr<tt_metal::Buffer> out_buffer,
+    uint32_t num_mixed_df_layers,
+    uint32_t num_receivers
+) {
+    bool pass = true;
+
+    std::vector<uint32_t> result;
+    tt::tt_metal::detail::ReadFromBuffer(out_buffer, result);
+
+    auto result_bfp16 = unpack_uint32_vec_into_bfloat16_vec(result);
+    auto result_untilized_fp16 = convert_to_flat_layout(result_bfp16);
+
+    std::vector<float> golden_vec(kt*32 / num_blocks * cb_num_blocks * nt*32);
+    std::vector<float> result_vec_fp16(kt*32 / num_blocks * cb_num_blocks * nt*32);
+
+    // tilize the fp16 input so it can be compared against the tilized result
+    auto values_fp16 = tt::test_utils::tilize(input_tensor_fp16.get_values(), kt*32, nt*32);
+
+    auto num_datums_per_cb = kt * nt * 32 * 32 / num_blocks * cb_num_blocks / num_receivers;
+    int start_index = 0;
+    int fifo_size = kt*32 / num_blocks * cb_num_blocks * nt*32 * 2 / num_receivers;
+    int fifo_size_page_aligned, page_size, num_pages, layer_transfer_size, fifo_wr_ptr = 0;
+    for (int l = 0; l < num_mixed_df_layers; ++l) {
+        if (l % 2 == 0) { // fp16
+            page_size = 2048;
+        } else {
+            page_size = 1088;
+        }
+        layer_transfer_size = page_size * kt * nt / num_receivers;
+        num_pages = fifo_size / page_size;
+        fifo_size_page_aligned = page_size * num_pages;
+
+        bool fifo_wr_ptr_exceed_fifo_limit = fifo_wr_ptr > fifo_size_page_aligned;
+        uint32_t num_pages_till_fifo_limit = (fifo_size_page_aligned - fifo_wr_ptr) / page_size;
+        // start pointer addr of current layer
+        fifo_wr_ptr = fifo_wr_ptr_exceed_fifo_limit ? 0 : fifo_size_page_aligned - num_pages_till_fifo_limit * page_size;
+        // start index to read; fifo_wr_ptr / 2 because the data is in fp16 format
+        start_index = fifo_wr_ptr == fifo_size_page_aligned ? 
0 : fifo_wr_ptr / 2;
+        // end pointer addr of current layer
+        fifo_wr_ptr = (fifo_wr_ptr + layer_transfer_size) % fifo_size_page_aligned;
+    }
+
+    std::vector<std::vector<float>> values_fp16_split(num_receivers, std::vector<float>(values_fp16.size() / num_receivers));
+
+    int index = 0;
+    for (int k = 0; k < kt; ++k) {
+        for (int n = 0; n < num_receivers; ++n) {
+            for (int i = 0; i < nt * 32 * 32 / num_receivers; ++i) {
+                values_fp16_split[n][i + k * nt * 32 * 32 / num_receivers] = to_float(values_fp16[index]);
+                index++;
+            }
+        }
+    }
+
+    std::vector<std::vector<float>> golden_vec_split(num_receivers, std::vector<float>(golden_vec.size() / num_receivers));
+
+    for (int n = 0; n < num_receivers; ++n) {
+        index = start_index;
+        for (int i = 0; i < kt * nt * 32 * 32 / num_receivers; ++i) {
+            golden_vec_split[n][index] = values_fp16_split[n][i];
+            index++;
+
+            if (index == num_datums_per_cb) {
+                index = 0;
+            }
+        }
+    }
+
+    index = 0;
+    for (int k = 0; k < kt / num_blocks * cb_num_blocks; ++k) {
+        for (int n = 0; n < num_receivers; ++n) {
+            for (int i = 0; i < nt * 32 * 32 / num_receivers; ++i) {
+                golden_vec[index] = golden_vec_split[n][i + k * nt * 32 * 32 / num_receivers];
+                index++;
+            }
+        }
+    }
+
+    for (int i = 0; i < result_vec_fp16.size(); ++i) {
+        result_vec_fp16[i] = to_float(static_cast<bfloat16>(result_untilized_fp16[i]));
+    }
+
+    // For debug purpose
+    // for (int i = 0; i < golden_vec.size(); ++i) {
+    //     std::cout << golden_vec[i] << " ";
+    //     if ((i+1) % 32 == 0) {
+    //         std::cout << std::endl;
+    //     }
+    // }
+    // std::cout << std::endl;
+    // std::cout << std::endl;
+    // for (int i = 0; i < result_vec_fp16.size(); ++i) {
+    //     std::cout << result_vec_fp16[i] << " ";
+    //     if ((i+1) % 32 == 0) {
+    //         std::cout << std::endl;
+    //     }
+    // }
+
+    pass &= pcc(golden_vec, result_vec_fp16) == 1.0;
+
+    if (!pass) {
+        log_error(LogTest, "validation single core failed");
+    }
+    return pass;
+}
+
+std::shared_ptr<tt_metal::Buffer> create_and_transfer_data_sharded_cb(
+    tt_metal::Device* device,
+    vector<uint32_t> input_vec,
+    uint32_t ht,
+    uint32_t wt,
+    BufferType buffer_type,
+    tt::DataFormat data_format,
+    CoreRangeSet cores,
+    uint32_t num_receivers
+) {
+
+    uint32_t size_bytes;
+    uint32_t page_size_bytes;
+    if (data_format == tt::DataFormat::Bfp8_b) {
+        size_bytes = ht * wt * 1088;
+        page_size_bytes = 1088;
+    } else {
+        size_bytes = ht * tt::constants::TILE_HEIGHT * wt * tt::constants::TILE_WIDTH * 2;
+        page_size_bytes = tt::constants::TILE_HW * 2;
+    }
+
+    ShardSpecBuffer shard_spec = ShardSpecBuffer(
+        cores,
+        {ht * tt::constants::TILE_HEIGHT, wt * tt::constants::TILE_WIDTH / num_receivers},
+        ShardOrientation::ROW_MAJOR,
+        false,
+        {tt::constants::TILE_HEIGHT, tt::constants::TILE_WIDTH},
+        {ht, wt});
+
+    log_info("cores: {}", cores);
+    log_info("size_bytes: {}", size_bytes);
+    log_info("page_size_bytes: {}", page_size_bytes);
+
+    auto input_buffer = CreateBuffer(tt::tt_metal::ShardedBufferConfig{
+        .device = device,
+        .size = size_bytes,
+        .page_size = page_size_bytes,
+        .buffer_type = buffer_type,
+        .buffer_layout = TensorMemoryLayout::WIDTH_SHARDED,
+        .shard_parameters = shard_spec});
+    tt::tt_metal::detail::WriteToBuffer(input_buffer, input_vec);
+
+    log_info("created sharded tensor");
+
+    return input_buffer;
+}
+
+int main(int argc, char **argv) {
+    if (getenv("TT_METAL_SLOW_DISPATCH_MODE") != nullptr) {
+        log_error("Test not supported w/ slow dispatch, exiting");
+    }
+
+    bool pass = true;
+    bool use_device_profiler = false;
+    uint32_t df = 0;
+    std::vector<double> dram_bandwidth;
+    uint32_t num_tests = 1;
+    uint32_t num_blocks = 8;
+    uint32_t cb_num_blocks = 8;
+    uint32_t cb_padding = 16;
+    uint32_t num_receivers = 1;
+    uint32_t num_mixed_df_layers = 1;
+    
uint64_t k = 8192, n = 128;
+
+    try {
+        ////////////////////////////////////////////////////////////////////////////
+        // Initial Runtime Args Parse
+        ////////////////////////////////////////////////////////////////////////////
+        std::vector<std::string> input_args(argv, argv + argc);
+        try {
+            std::tie(k, input_args) =
+                test_args::get_command_option_uint64_and_remaining_args(input_args, "--k", 8192);
+            std::tie(n, input_args) =
+                test_args::get_command_option_uint64_and_remaining_args(input_args, "--n", 12*128);
+            std::tie(num_blocks, input_args) =
+                test_args::get_command_option_uint64_and_remaining_args(input_args, "--num-blocks", 8);
+            std::tie(cb_num_blocks, input_args) =
+                test_args::get_command_option_uint64_and_remaining_args(input_args, "--cb-num-blocks", 8);
+            std::tie(cb_padding, input_args) =
+                test_args::get_command_option_uint64_and_remaining_args(input_args, "--cb-padding", 16);
+            std::tie(num_tests, input_args) =
+                test_args::get_command_option_uint32_and_remaining_args(input_args, "--num-tests", 1);
+            std::tie(use_device_profiler, input_args) =
+                test_args::has_command_option_and_remaining_args(input_args, "--use-device-profiler");
+            std::tie(df, input_args) =
+                test_args::get_command_option_uint32_and_remaining_args(input_args, "--data-type", 0);
+            std::tie(num_receivers, input_args) =
+                test_args::get_command_option_uint64_and_remaining_args(input_args, "--num-receivers", 1);
+            std::tie(num_mixed_df_layers, input_args) =
+                test_args::get_command_option_uint64_and_remaining_args(input_args, "--num-mixed-df-layers", 1);
+
+            test_args::validate_remaining_args(input_args);
+        } catch (const std::exception &e) {
+            log_error(tt::LogTest, "Command line arguments found exception: {}", e.what());
+            TT_ASSERT(false);
+        }
+
+        log_info("num_mixed_df_layers: {} ", num_mixed_df_layers);
+        log_info("num_receivers: {} ", num_receivers);
+
+        TT_FATAL(num_mixed_df_layers % 2 == 1, "currently only an odd number of layers is supported, due to an issue with validation");
+        if (num_mixed_df_layers > 1) {
+            TT_FATAL(df == 1, "must start with the bfloat16 format for the mixed-df test");
+        }
+
+        if (use_device_profiler) {
+            #if !defined(TRACY_ENABLE)
+            log_error(
+                LogTest,
+                "Metal library and test code should be built with the "
+                "profiler option using ./scripts/build_scripts/build_with_profiler_opt.sh");
+            #endif
+            auto device_profiler = getenv("TT_METAL_DEVICE_PROFILER");
+            TT_FATAL(
+                device_profiler,
+                "Before running the program, do one of the following in a shell: "
+                "either export the environment variable by executing export TT_METAL_DEVICE_PROFILER=1, "
+                "or run the program with TT_METAL_DEVICE_PROFILER=1 prefixed to the command");
+        }
+
+        ////////////////////////////////////////////////////////////////////////////
+        // Parameters Setup
+        ////////////////////////////////////////////////////////////////////////////
+        uint32_t num_banks = 1;
+        uint32_t input_size = 0;
+        tt::DataFormat tile_format = tt::DataFormat::Bfp8_b;
+        if (df == 0) {
+            input_size = k * n * 1088 / 1024;
+            tile_format = tt::DataFormat::Bfp8_b;
+        } else if (df == 1) {
+            input_size = k * n * 2;
+            tile_format = tt::DataFormat::Float16_b;
+        } else {
+            TT_THROW("Input data format {} is invalid. 
Please change.", df);
+        }
+        uint32_t output_size = input_size / num_blocks * cb_num_blocks;
+        uint32_t kt = k / 32;
+        uint32_t nt = n / 32;
+        uint32_t block_h = kt / num_blocks;
+        uint32_t block_w = nt;
+        uint32_t num_datums_per_tile = 32 * 32;
+
+        uint32_t single_tile_size = tt_metal::detail::TileSize(tile_format);
+
+        TT_FATAL(input_size % single_tile_size == 0, "input size is not aligned to tile size");
+        ////////////////////////////////////////////////////////////////////////////
+        // Device Setup
+        ////////////////////////////////////////////////////////////////////////////
+        int device_id = 0;
+        tt_metal::Device *device = tt_metal::CreateDevice(device_id);
+
+        CoreCoord dram_bank_coord = CoreCoord{0, 0};
+        CoreCoord dram_reader_core_coord = CoreCoord{0, 0};
+        CoreRange dram_reader_core_coord_range = CoreRange(dram_reader_core_coord);
+        CoreRangeSet dram_reader_core{std::set<CoreRange>{CoreRange{dram_reader_core_coord}}};
+        CoreRange l1_receiver_core_coord_range = CoreRange(CoreCoord{0, 0});
+        if (device->arch() == tt::ARCH::GRAYSKULL) {
+            l1_receiver_core_coord_range = CoreRange{CoreCoord{0, 1}, CoreCoord{0, num_receivers}};
+        } else {
+            l1_receiver_core_coord_range = CoreRange{CoreCoord{1, 0}, CoreCoord{num_receivers, 0}};
+        }
+        CoreRangeSet l1_receiver_core{std::set<CoreRange>{l1_receiver_core_coord_range}};
+
+        ////////////////////////////////////////////////////////////////////////////
+        // Input Setup
+        ////////////////////////////////////////////////////////////////////////////
+        std::vector<std::shared_ptr<tt_metal::Buffer>> input_buffers(num_mixed_df_layers);
+        std::shared_ptr<tt_metal::Buffer> output_buffer;
+        auto input_shape = SHAPE{1, 1, k, n};
+        tt::deprecated::Tensor<bfloat16> tensor_fp16 = tt::deprecated::initialize_tensor<bfloat16>(input_shape, tt::deprecated::Initialize::INCREMENT, 100, std::chrono::system_clock::now().time_since_epoch().count());
+        tt::deprecated::Tensor<float> tensor_fp8 = tt::deprecated::initialize_tensor<float>(input_shape, tt::deprecated::Initialize::INCREMENT, 100, std::chrono::system_clock::now().time_since_epoch().count());
+        if (tile_format == tt::DataFormat::Bfp8_b) {
+            for (uint32_t i = 0; i < num_mixed_df_layers; ++i) {
+                if (i % 2 == 0) { // even layers
+                    auto input_vec_tilized = tt::test_utils::tilize(tensor_fp8.get_values(), k, n);
+                    std::vector<uint32_t> packed_input_vec_tile_layout = pack_fp32_vec_as_bfp8_tiles(input_vec_tilized, true, false);
+                    input_buffers[i] = create_and_transfer_data_sharded_cb(device, packed_input_vec_tile_layout, kt, nt, tt_metal::BufferType::DRAM, tt::DataFormat::Bfp8_b, dram_reader_core, num_banks);
+                } else { // odd layers
+                    auto input_vec_tilized = tt::test_utils::tilize(tensor_fp16.get_values(), k, n);
+                    auto input_vec_tile_layout = convert_to_tile_layout(input_vec_tilized);
+                    vector<uint32_t> packed_input_vec_tile_layout = pack_bfloat16_vec_into_uint32_vec(input_vec_tile_layout);
+                    input_buffers[i] = create_and_transfer_data_sharded_cb(device, packed_input_vec_tile_layout, kt, nt, tt_metal::BufferType::DRAM, tt::DataFormat::Float16_b, dram_reader_core, num_banks);
+                }
+            }
+
+            // output
+            vector<uint32_t> outputs = create_constant_vector_of_bfp8(output_size, 0, true);
+            output_buffer = create_and_transfer_data_sharded_cb(device, outputs, kt / num_blocks * cb_num_blocks, nt, tt_metal::BufferType::L1, tt::DataFormat::Bfp8_b, l1_receiver_core, num_receivers);
+
+        } else {
+            for (uint32_t i = 0; i < num_mixed_df_layers; ++i) {
+                if (i % 2 == 0) { // even layers
+                    auto input_vec_tilized = tt::test_utils::tilize(tensor_fp16.get_values(), k, n);
+                    auto input_vec_tile_layout = convert_to_tile_layout(input_vec_tilized);
+                    vector<uint32_t> 
+                    vector<uint32_t> packed_input_vec_tile_layout = pack_bfloat16_vec_into_uint32_vec(input_vec_tile_layout);
+                    input_buffers[i] = create_and_transfer_data_sharded_cb(device, packed_input_vec_tile_layout, kt, nt, tt_metal::BufferType::DRAM, tt::DataFormat::Float16_b, dram_reader_core, num_banks);
+                } else { // odd layers: bfp8_b
+                    auto input_vec_tilized = tt::test_utils::tilize(tensor_fp8.get_values(), k, n);
+                    std::vector<uint32_t> packed_input_vec_tile_layout = pack_fp32_vec_as_bfp8_tiles(input_vec_tilized, true, false);
+                    input_buffers[i] = create_and_transfer_data_sharded_cb(device, packed_input_vec_tile_layout, kt, nt, tt_metal::BufferType::DRAM, tt::DataFormat::Bfp8_b, dram_reader_core, num_banks);
+                }
+            }
+
+            // output
+            vector<uint32_t> outputs = create_constant_vector_of_bfloat16(output_size, 0);
+            output_buffer = create_and_transfer_data_sharded_cb(device, outputs, kt / num_blocks * cb_num_blocks, nt, tt_metal::BufferType::L1, tt::DataFormat::Float16_b, l1_receiver_core, num_receivers);
+        }
+
+        for (uint32_t i = 0; i < num_mixed_df_layers; ++i) {
+            log_info("input_buffers addr: {}", input_buffers[i]->address());
+        }
+
+        ////////////////////////////////////////////////////////////////////////////
+        //                      Application Setup
+        ////////////////////////////////////////////////////////////////////////////
+        auto [program, kernel, output_cb_addr] = create_program(device, dram_reader_core, l1_receiver_core, single_tile_size, tile_format, k, n, num_blocks, cb_num_blocks, num_receivers, num_mixed_df_layers, cb_padding, input_buffers[0], output_buffer);
+
+        ////////////////////////////////////////////////////////////////////////////
+        //                      Execute Application
+        ////////////////////////////////////////////////////////////////////////////
+        tt_metal::detail::CompileProgram(device, program);
+
+        log_info(LogTest, "Num tests {}", num_tests);
+        for (uint32_t i = 0; i < num_tests; ++i) {
+            EnqueueProgram(device->command_queue(), program, false);
+            Finish(device->command_queue());
+            tt_metal::DumpDeviceProfileResults(device, program);
+        }
+
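+        // Single-layer runs validate the receiver CB contents against the one source tensor;
+        // mixed-df runs alternate bfp8_b and bfloat16 layers, so they use the dedicated mixed-df checker.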
+        ////////////////////////////////////////////////////////////////////////////
+        //                      Validation & Teardown
+        ////////////////////////////////////////////////////////////////////////////
+        if (num_mixed_df_layers == 1) {
+            if (tile_format == tt::DataFormat::Bfp8_b) {
+                pass = validation_bfp8_b(
+                    tensor_fp8,
+                    tile_format,
+                    num_blocks,
+                    cb_num_blocks,
+                    kt,
+                    nt,
+                    output_buffer);
+            } else {
+                pass = validation_fp16(
+                    tensor_fp16,
+                    tile_format,
+                    num_blocks,
+                    cb_num_blocks,
+                    kt,
+                    nt,
+                    output_buffer);
+            }
+        } else {
+            pass = validation_mixed_df(
+                tensor_fp16,
+                tensor_fp8,
+                tile_format,
+                num_blocks,
+                cb_num_blocks,
+                kt,
+                nt,
+                output_buffer,
+                num_mixed_df_layers,
+                num_receivers);
+        }
+
+        pass &= tt_metal::CloseDevice(device);
+    } catch (const std::exception &e) {
+        pass = false;
+        log_error(LogTest, "{}", e.what());
+        log_error(LogTest, "System error message: {}", std::strerror(errno));
+    }
+
+    if (pass) {
+        log_info(LogTest, "Test Passed");
+    } else {
+        log_error(LogTest, "Test Failed");
+    }
+
+    return 0;
+}
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/kernels/reader_dram.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/kernels/reader_dram.cpp
index e42ab99525a..17509788f2e 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/kernels/reader_dram.cpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/kernels/reader_dram.cpp
@@ -49,7 +49,7 @@ void kernel_main() {
 
     constexpr uint32_t cb_id = 0;
 
-    uint32_t src_base_addr = noc_async_read_tile_dram_sharded_set_state(input_addr, bank_id, vc);
+    uint32_t src_base_addr = noc_async_read_tile_dram_sharded_set_state(input_addr, page_size, bank_id, vc);
 
     uint32_t l1_read_addr = 0;
     constexpr uint32_t total_num_blocks_in_buffer = 3;
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/kernels/reader_dram.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/kernels/reader_dram.cpp
index 48c659c54ce..479dec38ec1 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/kernels/reader_dram.cpp
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/kernels/reader_dram.cpp
@@ -51,7 +51,7 @@ void kernel_main() {
 
     constexpr uint32_t cb_id = 0;
 
-    uint32_t src_base_addr = noc_async_read_tile_dram_sharded_set_state(input_addr, bank_id, vc);
+    uint32_t src_base_addr = noc_async_read_tile_dram_sharded_set_state(input_addr, page_size, bank_id, vc);
 
     uint32_t src_read_addr = 0;
 
 #ifdef ARCH_GRAYSKULL
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/CMakeLists.txt b/tests/tt_metal/tt_metal/perf_microbenchmark/CMakeLists.txt
index 94875c6114f..5d839ed65ba 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/CMakeLists.txt
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/CMakeLists.txt
@@ -38,6 +38,7 @@ set(PERF_MICROBENCH_TESTS_SRCS
     7_kernel_launch/test_kernel_launch.cpp
     8_dram_adjacent_core_read/test_dram_read.cpp
     9_dram_adjacent_read_remote_l1_write/test_dram_read_l1_write.cpp
+    10_dram_read_remote_cb_sync/test_dram_read_remote_cb.cpp
 )
 
 foreach (TEST_SRC ${PERF_MICROBENCH_TESTS_SRCS})
diff --git a/tt_metal/hw/inc/dataflow_api.h b/tt_metal/hw/inc/dataflow_api.h
index 7c771096094..56ba958d3cf 100644
--- a/tt_metal/hw/inc/dataflow_api.h
+++ b/tt_metal/hw/inc/dataflow_api.h
@@ -1621,9 +1621,9 @@ inline void RISC_POST_HEARTBEAT(uint32_t &heartbeat) {
 
 FORCE_INLINE uint32_t min(uint32_t a, uint32_t b) { return (a < b) ? a : b; }
 
-template
+template
 FORCE_INLINE
-uint32_t noc_async_read_tile_dram_sharded_set_state(uint32_t bank_base_address, uint32_t bank_id = 0, const uint32_t vc = 0, uint8_t noc = noc_index) {
+uint32_t noc_async_read_tile_dram_sharded_set_state(uint32_t bank_base_address, uint32_t page_size, uint32_t bank_id = 0, const uint32_t vc = 0, uint8_t noc = noc_index) {
     uint32_t src_addr_;
     uint32_t src_noc_xy;
 
diff --git a/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_dram_sharded.cpp b/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_dram_sharded.cpp
index 0a7f90d5b9d..01ebc270bbc 100644
--- a/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_dram_sharded.cpp
+++ b/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_dram_sharded.cpp
@@ -61,7 +61,7 @@ void kernel_main() {
     constexpr DataFormat in1_data_format = get_dataformat(cb_id_in1);
 
     uint32_t in1_base_addr =
-        noc_async_read_tile_dram_sharded_set_state(in1_tensor_addr, dram_bank_id, vc);
+        noc_async_read_tile_dram_sharded_set_state(in1_tensor_addr, in1_page_size, dram_bank_id, vc);
 
 #ifdef ARCH_GRAYSKULL
     for (uint32_t block = 0; block < num_blocks; ++block) {
@@ -131,7 +131,7 @@ void kernel_main() {
         uint32_t l1_read_addr_in3 = 0;
         uint32_t in3_base_addr =
-            noc_async_read_tile_dram_sharded_set_state(in3_tensor_addr, dram_bank_id, vc);
+            noc_async_read_tile_dram_sharded_set_state(in3_tensor_addr, in3_page_size, dram_bank_id, vc);
 
         for (uint32_t h = 0; h < in3_num_pages; ++h) {
             noc_async_read_tile_dram_sharded_with_state(in3_base_addr, l1_read_addr_in3, l1_write_addr_in3);
diff --git a/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp b/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp
index 94fced2de44..73664b05953 100644
--- a/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp
+++ b/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp
@@ -207,8 +207,8 @@ void kernel_main() {
         uint32_t next_bank_id_and_dram_stride_index = 0;
 
         for (uint32_t i = 0; i < num_dram_shards_to_read; ++i) {
-            uint32_t in1_base_addr = noc_async_read_tile_dram_sharded_set_state(
-                in1_tensor_addr, current_dram_bank_id[next_bank_id_and_dram_stride_index], vc);
+            uint32_t in1_base_addr = noc_async_read_tile_dram_sharded_set_state(
+                in1_tensor_addr, in1_single_tile_size_bytes, current_dram_bank_id[next_bank_id_and_dram_stride_index], vc);
 
             if (i == 0) {
                 in1_base_addr += dram_tensor_start_offset;
@@ -313,8 +313,8 @@ void kernel_main() {
             uint32_t next_bank_id_and_dram_stride_index = 0;
 
             for (uint32_t i = 0; i < num_dram_shards_to_read; ++i) {
-                uint32_t in3_base_addr = noc_async_read_tile_dram_sharded_set_state(
-                    in3_tensor_addr, current_dram_bank_id[next_bank_id_and_dram_stride_index], vc);
+                uint32_t in3_base_addr = noc_async_read_tile_dram_sharded_set_state(
+                    in3_tensor_addr, bias_single_tile_size_bytes, current_dram_bank_id[next_bank_id_and_dram_stride_index], vc);
 
                 if (i == 0) {
                     in3_base_addr += dram_tensor_start_offset;