Commit: Yugao/cb (#14043)
* #0: add single receiver core remote cb impl

* #0: add multiple receivers remote cb sync

* #0: add L1 gaps between diff data format layers

* #0: multi-layer multi-receiver remote CB sync

* #0: add u-bench test to WH flow

* #0: add pages sent/acked in setup_page_size when incrementing the rd/wr ptr (a rough sketch of this bookkeeping follows the change summary below)
yugaoTT authored Oct 22, 2024
1 parent 9bb0866 commit 044a345
Showing 12 changed files with 1,569 additions and 10 deletions.
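The last commit bullet refers to the page-credit bookkeeping between the sender and receiver sides of the remote circular buffer: the sender advances a pages-sent counter together with its write pointer, the receiver returns credits through a pages-acked counter, and the sender stalls while the outstanding difference would overflow the receiver-side buffer; the bullet indicates the commit also keeps these counters consistent where setup_page_size advances the rd/wr pointers between layers. The following is only a rough, hypothetical sketch of that accounting; the names are illustrative and do not correspond to the kernels or APIs added in this commit.

#include <cstdint>

// Hypothetical sketch of remote-CB page credit accounting (illustrative names,
// not the interfaces used by this commit).
struct RemoteCbCredits {
    uint32_t fifo_size_pages;   // capacity of the receiver-side CB, in pages
    uint32_t pages_sent = 0;    // advanced by the sender as pages are pushed
    uint32_t pages_acked = 0;   // advanced by the receiver as pages are freed
    uint32_t wr_ptr = 0;        // sender-side write pointer, in pages

    // Sender side: push only when the receiver has freed enough pages, then
    // advance the write pointer and the sent counter together.
    bool try_push(uint32_t num_pages) {
        if (pages_sent - pages_acked + num_pages > fifo_size_pages) {
            return false;  // not enough credits returned yet
        }
        wr_ptr = (wr_ptr + num_pages) % fifo_size_pages;
        pages_sent += num_pages;
        return true;
    }

    // Receiver side: consume pages and return the credits to the sender.
    void pop(uint32_t num_pages) { pages_acked += num_pages; }
};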
1 change: 1 addition & 0 deletions tests/scripts/run_moreh_microbenchmark.sh
@@ -35,6 +35,7 @@ run_profiling_test() {
if [[ "$ARCH_NAME" == "wormhole_b0" ]]; then
pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_matmul_single_core_sharded -k $ARCH_NAME
pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_dram_read_12_core -k $ARCH_NAME
pytest --capture=tee-sys $TT_METAL_HOME/tests/scripts/test_moreh_microbenchmark.py::test_dram_read_remote_cb_sync -k $ARCH_NAME
fi
# bypass wh_b0 for now until we can move FD cores to last col
if [[ "$ARCH_NAME" != "wormhole_b0" ]]; then
85 changes: 85 additions & 0 deletions tests/scripts/test_moreh_microbenchmark.py
@@ -287,6 +287,33 @@ def run_dram_read_l1_write_cmd(k, n, num_blocks, df, num_banks, bank_start_id):
run_moreh_single_test("DRAM BW test multi-core", command)


def run_dram_read_remote_cb_sync_cmd(
k, n, num_blocks, cb_num_blocks, cb_padding, df, num_receivers, num_mixed_df_layers
):
command = (
"TT_METAL_DEVICE_PROFILER=1 ./build/test/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/test_dram_read_remote_cb "
+ " --k "
+ str(k)
+ " --n "
+ str(n)
+ " --num-blocks "
+ str(num_blocks)
+ " --cb-num-blocks "
+ str(cb_num_blocks)
+ " --cb-padding "
+ str(cb_padding)
+ " --num-tests "
+ str(1)
+ " --data-type "
+ str(df)
+ " --num-receivers "
+ str(num_receivers)
+ " --num-mixed-df-layers "
+ str(num_mixed_df_layers)
)
run_moreh_single_test("DRAM read remote CB sync single-core ", command)
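# Illustrative only (not part of this commit): for the first parametrization of
# test_dram_read_remote_cb_sync below, the helper above is called as
#   run_dram_read_remote_cb_sync_cmd(k=32768, n=128, num_blocks=64, cb_num_blocks=5,
#                                    cb_padding=256, df=1, num_receivers=1,
#                                    num_mixed_df_layers=1)
# and launches roughly:
#   TT_METAL_DEVICE_PROFILER=1 ./build/test/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/test_dram_read_remote_cb \
#       --k 32768 --n 128 --num-blocks 64 --cb-num-blocks 5 --cb-padding 256 \
#       --num-tests 1 --data-type 1 --num-receivers 1 --num-mixed-df-layers 1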


# noc
def test_noc_local(r=9, c=12, nt=256, cb=1):
command = (
@@ -739,6 +766,64 @@ def test_dram_read_l1_write_core(arch, freq, test_vector, num_tests, nblock, dat
assert bw_bound <= throughput


@pytest.mark.parametrize(
"arch, freq, test_vector, num_tests, nblock, cb_nblock, cb_padding, data_format, num_receivers, num_mixed_df_layers",
[
# single layer single receiver test
("wormhole_b0", 1000, np.array([32768, 128]), 1, 64, 5, 256, 1, 1, 1),
# single layer multi receiver test
("wormhole_b0", 1000, np.array([32768, 128]), 1, 64, 3, 256, 1, 2, 1),
# multi layer multi receiver test
("wormhole_b0", 1000, np.array([32768, 256]), 1, 64, 5, 256, 1, 4, 15),
],
)
def test_dram_read_remote_cb_sync(
arch, freq, test_vector, num_tests, nblock, cb_nblock, cb_padding, data_format, num_receivers, num_mixed_df_layers
):
data = []
cycle_list = []
time_list = []
throughput_list = []
for _ in range(num_tests):
k = int(test_vector[0])
n = int(test_vector[1])
input_size = 0
if data_format == 0:
input_size += k * n * 1088 // 1024
elif data_format == 1:
input_size += k * n * 2048 // 1024
for i in range(num_mixed_df_layers - 1):
if i % 2 == 0:
input_size += k * n * 1088 // 1024
else:
input_size += k * n * 2048 // 1024
run_dram_read_remote_cb_sync_cmd(
k, n, nblock, cb_nblock, cb_padding, data_format, num_receivers, num_mixed_df_layers
)
cycle = profile_results_kernel_duration()
time = cycle / freq / 1000.0 / 1000.0
throughput = input_size / cycle * freq / 1000.0
cycle_list.append(cycle)
time_list.append(time)
throughput_list.append(throughput)
cycle = sum(cycle_list) / len(cycle_list)
time = sum(time_list) / len(time_list)
throughput = sum(throughput_list) / len(throughput_list)
logger.info("DRAM read cycle: " + str(cycle))
logger.info("DRAM read time: " + str(time))
logger.info("DRAM read throughput: " + str(throughput))
data.append([throughput])
# check within range
dev_freq = get_device_freq()
if arch == "grayskull":
bw_bound = 100.0
elif arch == "wormhole_b0":
bw_bound = 22.0
elif arch == "blackhole":
bw_bound = 340.0
assert bw_bound <= throughput
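# Worked example (hypothetical numbers, not measured results): for the first
# parametrization above (k=32768, n=128, data_format=1, num_mixed_df_layers=1),
# input_size = 32768 * 128 * 2048 // 1024 = 8,388,608 bytes. Assuming a kernel
# duration of 380,000 cycles at freq = 1000 MHz:
#   time       = 380000 / 1000 / 1000 / 1000      = 0.00038 s
#   throughput = 8388608 / 380000 * 1000 / 1000   ≈ 22.1 GB/s
# which is then checked against the 22.0 GB/s lower bound for wormhole_b0.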


@pytest.mark.parametrize(
"arch, freq, r, c, test_vector_global, test_vector_local",
[
@@ -0,0 +1,145 @@
// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include <stdint.h>

#include "dataflow_api.h"
#include "ttnn/cpp/ttnn/operations/ccl/kernel_common/worker_sync_utils.hpp"

#include "debug/dprint.h"

// Issues a single read of page_size bytes from the selected DRAM bank into L1 at
// dest_addr via the NCRISC read command buffer; when use_vc is set, the read is
// pinned to the static virtual channel passed in vc.
template <uint32_t bank_base_address, uint32_t page_size, bool use_vc>
FORCE_INLINE
void noc_async_read_tile_dram_sharded(uint32_t src_addr, uint32_t dest_addr, uint32_t bank_id = 0, const uint32_t vc = 0) {
uint32_t src_addr_;
uint32_t src_noc_xy;

src_addr_ = src_addr + bank_base_address;
src_addr_ += bank_to_dram_offset[bank_id];
src_noc_xy = dram_bank_to_noc_xy[noc_index][bank_id];

WAYPOINT("NRTW");
DEBUG_SANITIZE_NOC_READ_TRANSACTION(noc_index, get_noc_addr_helper(src_noc_xy, src_addr_), dest_addr, page_size);
while (!noc_cmd_buf_ready(noc_index, NCRISC_RD_CMD_BUF));
WAYPOINT("NRTD");

if constexpr(use_vc) {
uint32_t noc_rd_cmd_field = NOC_CMD_CPY | NOC_CMD_RD | NOC_CMD_RESP_MARKED | NOC_CMD_VC_STATIC | NOC_CMD_STATIC_VC(vc);
NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_RD_CMD_BUF, NOC_CTRL, noc_rd_cmd_field);
}

NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_RD_CMD_BUF, NOC_RET_ADDR_LO, dest_addr);
NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_RD_CMD_BUF, NOC_TARG_ADDR_LO, src_addr_); // (uint32_t)src_addr
NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_RD_CMD_BUF, NOC_TARG_ADDR_COORDINATE, src_noc_xy); // src_addr >> 32
NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_RD_CMD_BUF, NOC_AT_LEN_BE, page_size); // len_bytes
NOC_CMD_BUF_WRITE_REG(noc_index, NCRISC_RD_CMD_BUF, NOC_CMD_CTRL, NOC_CTRL_SEND_REQ);
noc_reads_num_issued[noc_index] += 1;
}
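// Illustrative only (not part of this commit): the helper above could be
// instantiated as, for example,
//   noc_async_read_tile_dram_sharded<bank_base_addr, page_size, true>(src_addr, l1_dest_addr, bank_id, vc);
// The kernel below instead uses the noc_async_read_tile_dram_sharded_set_state /
// ..._with_state_with_trid variants so that reads can be tagged with transaction ids.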

void kernel_main() {
constexpr uint32_t input_addr = get_compile_time_arg_val(0);
constexpr uint32_t input_start_tile_id = get_compile_time_arg_val(1);
constexpr uint32_t noc = get_compile_time_arg_val(2);
constexpr uint32_t num_layers = get_compile_time_arg_val(3);

uint32_t rt_args_idx = 0;
const uint32_t bank_id = get_arg_val<uint32_t>(rt_args_idx++);
const uint32_t vc = get_arg_val<uint32_t>(rt_args_idx++);
tt_l1_ptr uint32_t* page_size = (tt_l1_ptr uint32_t*)(get_arg_addr(increment_arg_idx(rt_args_idx, num_layers)));
tt_l1_ptr uint32_t* num_pages = (tt_l1_ptr uint32_t*)(get_arg_addr(increment_arg_idx(rt_args_idx, num_layers)));
tt_l1_ptr uint32_t* num_blocks = (tt_l1_ptr uint32_t*)(get_arg_addr(increment_arg_idx(rt_args_idx, num_layers)));
tt_l1_ptr uint32_t* block_num_tiles = (tt_l1_ptr uint32_t*)(get_arg_addr(increment_arg_idx(rt_args_idx, num_layers)));

constexpr uint32_t cb_id = 0;
constexpr uint32_t total_num_blocks_in_buffer = 3;
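// The CB backing store is used as a ring of three block-sized slots: each block
// read is tagged with a NOC transaction id (1..3), so a finished slot can be
// pushed to the consumer while reads for the following slots are still in flight.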

uint32_t block_size_bytes = num_pages[0] * page_size[0];
uint32_t l1_buffer_start_addr = get_write_ptr(cb_id);
uint32_t l1_buffer_end_addr = get_write_ptr(cb_id) + block_size_bytes * total_num_blocks_in_buffer;

uint32_t src_read_addr = 0;
uint32_t src_read_addr_offset_bytes = 0;

for (uint32_t l = 0; l < num_layers; ++l) {
uint32_t curr_page_size = page_size[l];
uint32_t curr_num_pages = num_pages[l];
uint32_t curr_num_blocks = num_blocks[l];
uint32_t curr_block_num_tiles = block_num_tiles[l];

uint32_t curr_block_size_bytes = curr_num_pages * curr_page_size;
uint32_t curr_layer_size_bytes = curr_num_blocks * curr_block_size_bytes;

uint32_t src_base_addr = noc_async_read_tile_dram_sharded_set_state<true>(input_addr, curr_page_size, bank_id, vc);
src_read_addr = src_read_addr_offset_bytes;

// For debugging, a trivial blocking DRAM read path is kept below (commented out):
// for (uint32_t block = 0; block < curr_num_blocks; ++block) {
// // Operand 1
// cb_reserve_back(cb_id, curr_block_num_tiles);
// auto l1_write_addr = get_write_ptr(cb_id);

// for (uint32_t h = 0; h < curr_num_pages; ++h) {
// noc_async_read_tile_dram_sharded_with_state(src_base_addr, src_read_addr, l1_write_addr);
// src_read_addr += curr_page_size;
// l1_write_addr += curr_page_size;
// }

// noc_async_read_barrier();

// cb_push_back(cb_id, curr_block_num_tiles);
// }

uint32_t num_free_blocks_in_buffer = total_num_blocks_in_buffer;
uint32_t curr_block_trid = 1;
uint32_t block_trid_to_wait = 1;

cb_reserve_back(cb_id, curr_block_num_tiles);
uint32_t l1_write_addr_offset = 0;
uint32_t l1_write_addr_start = get_write_ptr(cb_id);
if (l1_write_addr_start >= l1_buffer_end_addr) {
l1_write_addr_start = l1_buffer_start_addr;
}
uint32_t l1_write_addr = l1_write_addr_start;
for (uint32_t block = 0; block < curr_num_blocks; ++block) {
noc_async_read_tile_dram_sharded_set_trid(curr_block_trid);

uint32_t temp_l1_write_addr = l1_write_addr;
for (uint32_t h = 0; h < curr_num_pages; ++h) {
noc_async_read_tile_dram_sharded_with_state_with_trid(
src_base_addr, src_read_addr, temp_l1_write_addr, curr_block_trid);
src_read_addr += curr_page_size;
temp_l1_write_addr += curr_page_size;
}

// From the second block onward, retire the oldest in-flight block: wait on its
// transaction id, push it to the circular buffer, advance the trid to wait on,
// and reserve space for the upcoming blocks; the first iteration only consumes
// one of the free slots.
if (num_free_blocks_in_buffer == 2) {
noc_async_read_barrier_with_trid(block_trid_to_wait);
cb_push_back(cb_id, curr_block_num_tiles);
// wait for next block trid
block_trid_to_wait = block_trid_to_wait == 3 ? 1 : (block_trid_to_wait + 1);
// reserve for next block
cb_reserve_back(cb_id, curr_block_num_tiles * 2);
} else {
num_free_blocks_in_buffer -= 1;
}

if (curr_block_trid == total_num_blocks_in_buffer) {
curr_block_trid = 1;
} else {
curr_block_trid += 1;
}

l1_write_addr += block_size_bytes;
if (l1_write_addr >= l1_buffer_end_addr) {
l1_write_addr = l1_buffer_start_addr;
}
}
// last block to wait
noc_async_read_barrier_with_trid(block_trid_to_wait);
cb_push_back(cb_id, curr_block_num_tiles);

src_read_addr_offset_bytes += curr_layer_size_bytes;

}

}
