From 83d596e07f002cf8e2b92d15ea45d945a9ec035b Mon Sep 17 00:00:00 2001 From: Paul Keller Date: Wed, 10 Jul 2024 21:47:29 +0000 Subject: [PATCH 1/2] #10107: Fix hangs w/ launch_msg size >32bytes Change default MMIO TLB ordering from posted to strict (will have a negative perf impact) Split launch message into kernel_config and go Write these separately from host w/ an sfence between --- .../test_kernels/dataflow/dram_copy.cpp | 3 +- .../test_kernels/misc/watcher_asserts.cpp | 4 +- .../unit_tests_common/watcher/test_assert.cpp | 2 +- tt_metal/hw/firmware/src/brisc.cc | 29 ++++---- tt_metal/hw/firmware/src/erisc.cc | 10 +-- tt_metal/hw/firmware/src/erisck.cc | 8 ++- tt_metal/hw/firmware/src/idle_erisc.cc | 24 ++++--- tt_metal/hw/firmware/src/ncrisc.cc | 10 +-- tt_metal/hw/firmware/src/trisc.cc | 10 +-- tt_metal/hw/inc/debug/assert.h | 2 +- tt_metal/hw/inc/debug/sanitize_noc.h | 2 +- tt_metal/hw/inc/dev_msgs.h | 15 +++- tt_metal/impl/debug/watcher_server.cpp | 70 +++++++++---------- tt_metal/impl/device/device.cpp | 12 ++-- tt_metal/impl/dispatch/command_queue.cpp | 16 ++--- tt_metal/impl/dispatch/kernels/cq_helpers.hpp | 2 +- .../impl/dispatch/kernels/eth_tunneler.cpp | 2 +- tt_metal/impl/program/program.cpp | 26 +++---- tt_metal/llrt/llrt.cpp | 23 ++++-- tt_metal/llrt/tlb_config.cpp | 8 ++- tt_metal/tt_metal.cpp | 4 +- 21 files changed, 162 insertions(+), 120 deletions(-) diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy.cpp index e9d945ec3c5..0c8f57140ab 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy.cpp @@ -34,7 +34,8 @@ void kernel_main() { #else tt_l1_ptr mailboxes_t* const mailboxes = (tt_l1_ptr mailboxes_t*)(MEM_MAILBOX_BASE); #endif - uint64_t dispatch_addr = NOC_XY_ADDR(NOC_X(mailboxes->launch.dispatch_core_x), NOC_Y(mailboxes->launch.dispatch_core_y), DISPATCH_MESSAGE_ADDR); + uint64_t dispatch_addr = NOC_XY_ADDR(NOC_X(mailboxes->launch.kernel_config.dispatch_core_x), + NOC_Y(mailboxes->launch.kernel_config.dispatch_core_y), DISPATCH_MESSAGE_ADDR); noc_fast_atomic_increment(noc_index, NCRISC_AT_CMD_BUF, dispatch_addr, NOC_UNICAST_WRITE_VC, 1, 31, false); #endif diff --git a/tests/tt_metal/tt_metal/test_kernels/misc/watcher_asserts.cpp b/tests/tt_metal/tt_metal/test_kernels/misc/watcher_asserts.cpp index 276d637fcbe..391ee669d3a 100644 --- a/tests/tt_metal/tt_metal/test_kernels/misc/watcher_asserts.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/misc/watcher_asserts.cpp @@ -40,7 +40,9 @@ void MAIN { #else tt_l1_ptr mailboxes_t* const mailboxes = (tt_l1_ptr mailboxes_t*)(MEM_MAILBOX_BASE); #endif - uint64_t dispatch_addr = NOC_XY_ADDR(NOC_X(mailboxes->launch.dispatch_core_x), NOC_Y(mailboxes->launch.dispatch_core_y), DISPATCH_MESSAGE_ADDR); + uint64_t dispatch_addr = + NOC_XY_ADDR(NOC_X(mailboxes->launch.kernel_config.dispatch_core_x), + NOC_Y(mailboxes->launch.kernel_config.dispatch_core_y), DISPATCH_MESSAGE_ADDR); noc_fast_atomic_increment(noc_index, NCRISC_AT_CMD_BUF, dispatch_addr, NOC_UNICAST_WRITE_VC, 1, 31 /*wrap*/, false /*linked*/); } #else diff --git a/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_assert.cpp b/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_assert.cpp index 1eebbfdfd91..6511334b618 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_assert.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_common/watcher/test_assert.cpp @@ -154,7 +154,7 @@ static void RunTest(WatcherFixture *fixture, Device *device, riscv_id_t riscv_ty // We should be able to find the expected watcher error in the log as well, // expected error message depends on the risc we're running on. string kernel = "tests/tt_metal/tt_metal/test_kernels/misc/watcher_asserts.cpp"; - int line_num = 55; + int line_num = 57; string expected = fmt::format( "Device {} {} core(x={:2},y={:2}) phys(x={:2},y={:2}): {} tripped an assert on line {}. Current kernel: {}.", diff --git a/tt_metal/hw/firmware/src/brisc.cc b/tt_metal/hw/firmware/src/brisc.cc index 221f2175bb7..41134ccaf90 100644 --- a/tt_metal/hw/firmware/src/brisc.cc +++ b/tt_metal/hw/firmware/src/brisc.cc @@ -273,7 +273,7 @@ inline void deassert_ncrisc_trisc() { // Below sets ncrisc to go so we can wait until it is cleared on first iteration mailboxes->slave_sync.all = RUN_SYNC_MSG_ALL_SLAVES_DONE; - uint16_t fw_size16 = mailboxes->launch.ncrisc_kernel_size16; + uint16_t fw_size16 = mailboxes->launch.kernel_config.ncrisc_kernel_size16; ncrisc_kernel_start_offset16 = fw_size16; // Copies from L1 to IRAM on chips where NCRISC has IRAM @@ -354,42 +354,44 @@ int main() { // Wait for ncrisc to halt wait_for_ncrisc_to_halt(); - mailboxes->launch.run = RUN_MSG_DONE; + mailboxes->launch.go.run = RUN_MSG_DONE; while (1) { init_sync_registers(); reset_ncrisc_with_iram(); DEBUG_STATUS("GW"); - while (mailboxes->launch.run != RUN_MSG_GO); + while (mailboxes->launch.go.run != RUN_MSG_GO); DEBUG_STATUS("GD"); { DeviceZoneScopedMainN("BRISC-FW"); // Copies from L1 to IRAM on chips where NCRISC has IRAM - l1_to_ncrisc_iram_copy(mailboxes->launch.ncrisc_kernel_size16, ncrisc_kernel_start_offset16); + l1_to_ncrisc_iram_copy(mailboxes->launch.kernel_config.ncrisc_kernel_size16, ncrisc_kernel_start_offset16); // Invalidate the i$ now the kernels have loaded and before running volatile tt_reg_ptr uint32_t* cfg_regs = core.cfg_regs_base(0); cfg_regs[RISCV_IC_INVALIDATE_InvalidateAll_ADDR32] = RISCV_IC_BRISC_MASK | RISCV_IC_TRISC_ALL_MASK | RISCV_IC_NCRISC_MASK; - enum dispatch_core_processor_masks enables = (enum dispatch_core_processor_masks)mailboxes->launch.enables; + enum dispatch_core_processor_masks enables = (enum dispatch_core_processor_masks)mailboxes->launch.kernel_config.enables; run_triscs(enables); - noc_index = mailboxes->launch.brisc_noc_id; + noc_index = mailboxes->launch.kernel_config.brisc_noc_id; setup_cb_read_write_interfaces(0, num_cbs_to_early_init, true, true); finish_ncrisc_copy_and_run(enables); // Run the BRISC kernel DEBUG_STATUS("R"); - uint32_t kernel_config_base = mailboxes->launch.kernel_config_base; - rta_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base + mailboxes->launch.mem_map[DISPATCH_CLASS_TENSIX_DM0].rta_offset); - crta_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base + mailboxes->launch.mem_map[DISPATCH_CLASS_TENSIX_DM0].crta_offset); + uint32_t kernel_config_base = mailboxes->launch.kernel_config.kernel_config_base; + rta_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base + + mailboxes->launch.kernel_config.mem_map[DISPATCH_CLASS_TENSIX_DM0].rta_offset); + crta_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base + + mailboxes->launch.kernel_config.mem_map[DISPATCH_CLASS_TENSIX_DM0].crta_offset); if (enables & DISPATCH_CLASS_MASK_TENSIX_ENABLE_DM0) { - setup_cb_read_write_interfaces(num_cbs_to_early_init, mailboxes->launch.max_cb_index, true, true); + setup_cb_read_write_interfaces(num_cbs_to_early_init, mailboxes->launch.kernel_config.max_cb_index, true, true); kernel_init(); } else { // This was not initialized in kernel_init @@ -399,12 +401,13 @@ int main() { wait_ncrisc_trisc(); - mailboxes->launch.run = RUN_MSG_DONE; + mailboxes->launch.go.run = RUN_MSG_DONE; // Notify dispatcher core that it has completed - if (mailboxes->launch.mode == DISPATCH_MODE_DEV) { + if (mailboxes->launch.kernel_config.mode == DISPATCH_MODE_DEV) { uint64_t dispatch_addr = - NOC_XY_ADDR(NOC_X(mailboxes->launch.dispatch_core_x), NOC_Y(mailboxes->launch.dispatch_core_y), DISPATCH_MESSAGE_ADDR); + NOC_XY_ADDR(NOC_X(mailboxes->launch.kernel_config.dispatch_core_x), + NOC_Y(mailboxes->launch.kernel_config.dispatch_core_y), DISPATCH_MESSAGE_ADDR); DEBUG_SANITIZE_NOC_ADDR(dispatch_addr, 4); noc_fast_atomic_increment( noc_index, diff --git a/tt_metal/hw/firmware/src/erisc.cc b/tt_metal/hw/firmware/src/erisc.cc index 071fd990f53..4523be27cfa 100644 --- a/tt_metal/hw/firmware/src/erisc.cc +++ b/tt_metal/hw/firmware/src/erisc.cc @@ -77,12 +77,14 @@ void __attribute__((section("erisc_l1_code.1"), noinline)) Application(void) { while (routing_info->routing_enabled) { // FD: assume that no more host -> remote writes are pending - if (mailboxes->launch.run == RUN_MSG_GO) { + if (mailboxes->launch.go.run == RUN_MSG_GO) { DeviceZoneScopedMainN("ERISC-FW"); DEBUG_STATUS("R"); - uint32_t kernel_config_base = mailboxes->launch.kernel_config_base; - rta_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base + mailboxes->launch.mem_map[DISPATCH_CLASS_ETH_DM0].rta_offset); - crta_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base + mailboxes->launch.mem_map[DISPATCH_CLASS_ETH_DM0].crta_offset); + uint32_t kernel_config_base = mailboxes->launch.kernel_config.kernel_config_base; + rta_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base + + mailboxes->launch.kernel_config.mem_map[DISPATCH_CLASS_ETH_DM0].rta_offset); + crta_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base + + mailboxes->launch.kernel_config.mem_map[DISPATCH_CLASS_ETH_DM0].crta_offset); kernel_init(); } else { diff --git a/tt_metal/hw/firmware/src/erisck.cc b/tt_metal/hw/firmware/src/erisck.cc index 76fdb164505..4458179c7d0 100644 --- a/tt_metal/hw/firmware/src/erisck.cc +++ b/tt_metal/hw/firmware/src/erisck.cc @@ -32,9 +32,11 @@ void __attribute__((section("erisc_l1_code"))) kernel_launch() { rtos_context_switch_ptr = (void (*)())RtosTable[0]; kernel_main(); - mailboxes->launch.run = RUN_MSG_DONE; - uint64_t dispatch_addr = NOC_XY_ADDR(NOC_X(mailboxes->launch.dispatch_core_x), NOC_Y(mailboxes->launch.dispatch_core_y), DISPATCH_MESSAGE_ADDR); - if (routing_info->routing_enabled and mailboxes->launch.mode == DISPATCH_MODE_DEV) { + mailboxes->launch.go.run = RUN_MSG_DONE; + uint64_t dispatch_addr = + NOC_XY_ADDR(NOC_X(mailboxes->launch.kernel_config.dispatch_core_x), + NOC_Y(mailboxes->launch.kernel_config.dispatch_core_y), DISPATCH_MESSAGE_ADDR); + if (routing_info->routing_enabled and mailboxes->launch.kernel_config.mode == DISPATCH_MODE_DEV) { internal_::notify_dispatch_core_done(dispatch_addr); } } diff --git a/tt_metal/hw/firmware/src/idle_erisc.cc b/tt_metal/hw/firmware/src/idle_erisc.cc index 9a191bf7e4d..bf7b746cb71 100644 --- a/tt_metal/hw/firmware/src/idle_erisc.cc +++ b/tt_metal/hw/firmware/src/idle_erisc.cc @@ -92,7 +92,7 @@ int main() { //device_setup(); noc_init(); - mailboxes->launch.run = RUN_MSG_DONE; + mailboxes->launch.go.run = RUN_MSG_DONE; // Cleanup profiler buffer incase we never get the go message while (1) { @@ -100,7 +100,7 @@ int main() { init_sync_registers(); // Wait... DEBUG_STATUS("GW"); - while (mailboxes->launch.run != RUN_MSG_GO) + while (mailboxes->launch.go.run != RUN_MSG_GO) { RISC_POST_HEARTBEAT(heartbeat); }; @@ -109,7 +109,7 @@ int main() { { DeviceZoneScopedMainN("ERISC-IDLE-FW"); - noc_index = mailboxes->launch.brisc_noc_id; + noc_index = mailboxes->launch.kernel_config.brisc_noc_id; //UC FIXME: do i need this? setup_cb_read_write_interfaces(0, num_cbs_to_early_init, true, true); @@ -118,10 +118,12 @@ int main() { DEBUG_STATUS("R"); //if (mailboxes->launch.enable_brisc) { //UC FIXME: do i need this? - setup_cb_read_write_interfaces(num_cbs_to_early_init, mailboxes->launch.max_cb_index, true, true); - uint32_t kernel_config_base = mailboxes->launch.kernel_config_base; - rta_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base + mailboxes->launch.mem_map[DISPATCH_CLASS_ETH_DM0].rta_offset); - crta_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base + mailboxes->launch.mem_map[DISPATCH_CLASS_ETH_DM0].crta_offset); + setup_cb_read_write_interfaces(num_cbs_to_early_init, mailboxes->launch.kernel_config.max_cb_index, true, true); + uint32_t kernel_config_base = mailboxes->launch.kernel_config.kernel_config_base; + rta_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base + + mailboxes->launch.kernel_config.mem_map[DISPATCH_CLASS_ETH_DM0].rta_offset); + crta_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base + + mailboxes->launch.kernel_config.mem_map[DISPATCH_CLASS_ETH_DM0].crta_offset); kernel_init(); //} else { @@ -130,12 +132,14 @@ int main() { //} DEBUG_STATUS("D"); - mailboxes->launch.run = RUN_MSG_DONE; + mailboxes->launch.go.run = RUN_MSG_DONE; // Notify dispatcher core that it has completed - if (mailboxes->launch.mode == DISPATCH_MODE_DEV) { - uint64_t dispatch_addr = NOC_XY_ADDR(NOC_X(mailboxes->launch.dispatch_core_x), NOC_Y(mailboxes->launch.dispatch_core_y), DISPATCH_MESSAGE_ADDR); + if (mailboxes->launch.kernel_config.mode == DISPATCH_MODE_DEV) { + uint64_t dispatch_addr = + NOC_XY_ADDR(NOC_X(mailboxes->launch.kernel_config.dispatch_core_x), + NOC_Y(mailboxes->launch.kernel_config.dispatch_core_y), DISPATCH_MESSAGE_ADDR); DEBUG_SANITIZE_NOC_ADDR(dispatch_addr, 4); noc_fast_atomic_increment(noc_index, NCRISC_AT_CMD_BUF, dispatch_addr, NOC_UNICAST_WRITE_VC, 1, 31 /*wrap*/, false /*linked*/); } diff --git a/tt_metal/hw/firmware/src/ncrisc.cc b/tt_metal/hw/firmware/src/ncrisc.cc index 03730f03d3c..3e8d66ab4d1 100644 --- a/tt_metal/hw/firmware/src/ncrisc.cc +++ b/tt_metal/hw/firmware/src/ncrisc.cc @@ -94,11 +94,13 @@ int main(int argc, char *argv[]) { notify_brisc_and_wait(); DeviceZoneScopedMainN("NCRISC-FW"); - setup_cb_read_write_interfaces(0, mailboxes->launch.max_cb_index, true, true); + setup_cb_read_write_interfaces(0, mailboxes->launch.kernel_config.max_cb_index, true, true); - uint32_t kernel_config_base = mailboxes->launch.kernel_config_base; - rta_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base + mailboxes->launch.mem_map[DISPATCH_CLASS_TENSIX_DM1].rta_offset); - crta_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base + mailboxes->launch.mem_map[DISPATCH_CLASS_TENSIX_DM1].crta_offset); + uint32_t kernel_config_base = mailboxes->launch.kernel_config.kernel_config_base; + rta_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base + + mailboxes->launch.kernel_config.mem_map[DISPATCH_CLASS_TENSIX_DM1].rta_offset); + crta_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base + + mailboxes->launch.kernel_config.mem_map[DISPATCH_CLASS_TENSIX_DM1].crta_offset); DEBUG_STATUS("R"); kernel_init(); diff --git a/tt_metal/hw/firmware/src/trisc.cc b/tt_metal/hw/firmware/src/trisc.cc index f1be7f4566a..83cae8a7398 100644 --- a/tt_metal/hw/firmware/src/trisc.cc +++ b/tt_metal/hw/firmware/src/trisc.cc @@ -106,12 +106,14 @@ int main(int argc, char *argv[]) { DeviceZoneScopedMainN("TRISC-FW"); #if !defined(UCK_CHLKC_MATH) - setup_cb_read_write_interfaces(0, mailboxes->launch.max_cb_index, cb_init_read, cb_init_write); + setup_cb_read_write_interfaces(0, mailboxes->launch.kernel_config.max_cb_index, cb_init_read, cb_init_write); #endif - uint32_t kernel_config_base = mailboxes->launch.kernel_config_base; - rta_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base + mailboxes->launch.mem_map[DISPATCH_CLASS_TENSIX_COMPUTE].rta_offset); - crta_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base + mailboxes->launch.mem_map[DISPATCH_CLASS_TENSIX_COMPUTE].crta_offset); + uint32_t kernel_config_base = mailboxes->launch.kernel_config.kernel_config_base; + rta_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base + + mailboxes->launch.kernel_config.mem_map[DISPATCH_CLASS_TENSIX_COMPUTE].rta_offset); + crta_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base + + mailboxes->launch.kernel_config.mem_map[DISPATCH_CLASS_TENSIX_COMPUTE].crta_offset); DEBUG_STATUS("R"); kernel_init(); diff --git a/tt_metal/hw/inc/debug/assert.h b/tt_metal/hw/inc/debug/assert.h index e8eaf04102e..109c0263ad4 100644 --- a/tt_metal/hw/inc/debug/assert.h +++ b/tt_metal/hw/inc/debug/assert.h @@ -19,7 +19,7 @@ void assert_and_hang(uint32_t line_num) { // Update launch msg to show that we've exited. tt_l1_ptr launch_msg_t *launch_msg = GET_MAILBOX_ADDRESS_DEV(launch); - launch_msg->run = RUN_MSG_DONE; + launch_msg->go.run = RUN_MSG_DONE; // Hang, or in the case of erisc, early exit. #if defined(COMPILE_FOR_ERISC) diff --git a/tt_metal/hw/inc/debug/sanitize_noc.h b/tt_metal/hw/inc/debug/sanitize_noc.h index 35a349b3632..eeb260feffb 100644 --- a/tt_metal/hw/inc/debug/sanitize_noc.h +++ b/tt_metal/hw/inc/debug/sanitize_noc.h @@ -83,7 +83,7 @@ inline void debug_sanitize_post_noc_addr_and_hang( // Update launch msg to show that we've exited. tt_l1_ptr launch_msg_t *launch_msg = GET_MAILBOX_ADDRESS_DEV(launch); - launch_msg->run = RUN_MSG_DONE; + launch_msg->go.run = RUN_MSG_DONE; #if defined(COMPILE_FOR_ERISC) // For erisc, we can't hang the kernel/fw, because the core doesn't get restarted when a new diff --git a/tt_metal/hw/inc/dev_msgs.h b/tt_metal/hw/inc/dev_msgs.h index 6db9e63964e..c6a6cde74a2 100644 --- a/tt_metal/hw/inc/dev_msgs.h +++ b/tt_metal/hw/inc/dev_msgs.h @@ -76,7 +76,7 @@ struct dyn_mem_map_t { volatile uint16_t crta_offset; }; -struct launch_msg_t { // must be cacheline aligned +struct kernel_config_msg_t { volatile uint16_t watcher_kernel_ids[DISPATCH_CLASS_MAX]; volatile uint16_t ncrisc_kernel_size16; // size in 16 byte units @@ -91,7 +91,16 @@ struct launch_msg_t { // must be cacheline aligned volatile uint8_t dispatch_core_x; volatile uint8_t dispatch_core_y; volatile uint8_t exit_erisc_kernel; - volatile uint8_t run; // must be in last cacheline of this msg + volatile uint8_t pad1; +} __attribute__((packed)); + +struct go_msg_t { + volatile uint32_t run; // must be in last cacheline of this msg +} __attribute__((packed)); + +struct launch_msg_t { // must be cacheline aligned + kernel_config_msg_t kernel_config; + go_msg_t go; } __attribute__((packed)); struct slave_sync_msg_t { @@ -194,7 +203,7 @@ struct mailboxes_t { struct debug_insert_delays_msg_t debug_insert_delays; }; -static_assert(sizeof(launch_msg_t) % sizeof(uint32_t) == 0); +static_assert(sizeof(kernel_config_msg_t) % sizeof(uint32_t) == 0); #ifndef TENSIX_FIRMWARE // Validate assumptions on mailbox layout on host compile diff --git a/tt_metal/impl/debug/watcher_server.cpp b/tt_metal/impl/debug/watcher_server.cpp index c9bba72e30d..a70cae6f3c6 100644 --- a/tt_metal/impl/debug/watcher_server.cpp +++ b/tt_metal/impl/debug/watcher_server.cpp @@ -126,9 +126,9 @@ void create_kernel_file() { static void log_running_kernels(const launch_msg_t *launch_msg) { log_info("While running kernels:"); - log_info(" brisc : {}", kernel_names[launch_msg->watcher_kernel_ids[DISPATCH_CLASS_TENSIX_DM0]]); - log_info(" ncrisc: {}", kernel_names[launch_msg->watcher_kernel_ids[DISPATCH_CLASS_TENSIX_DM1]]); - log_info(" triscs: {}", kernel_names[launch_msg->watcher_kernel_ids[DISPATCH_CLASS_TENSIX_COMPUTE]]); + log_info(" brisc : {}", kernel_names[launch_msg->kernel_config.watcher_kernel_ids[DISPATCH_CLASS_TENSIX_DM0]]); + log_info(" ncrisc: {}", kernel_names[launch_msg->kernel_config.watcher_kernel_ids[DISPATCH_CLASS_TENSIX_DM1]]); + log_info(" triscs: {}", kernel_names[launch_msg->kernel_config.watcher_kernel_ids[DISPATCH_CLASS_TENSIX_COMPUTE]]); } static void dump_l1_status(FILE *f, Device *device, CoreCoord core, const launch_msg_t *launch_msg) { @@ -158,13 +158,13 @@ static const char *get_riscv_name(CoreCoord core, uint32_t type) { static string get_kernel_name(CoreCoord core, const launch_msg_t *launch_msg, uint32_t type) { switch (type) { - case DebugBrisc: return kernel_names[launch_msg->watcher_kernel_ids[DISPATCH_CLASS_TENSIX_DM0]]; + case DebugBrisc: return kernel_names[launch_msg->kernel_config.watcher_kernel_ids[DISPATCH_CLASS_TENSIX_DM0]]; case DebugErisc: - case DebugIErisc: return kernel_names[launch_msg->watcher_kernel_ids[DISPATCH_CLASS_ETH_DM0]]; - case DebugNCrisc: return kernel_names[launch_msg->watcher_kernel_ids[DISPATCH_CLASS_TENSIX_DM1]]; + case DebugIErisc: return kernel_names[launch_msg->kernel_config.watcher_kernel_ids[DISPATCH_CLASS_ETH_DM0]]; + case DebugNCrisc: return kernel_names[launch_msg->kernel_config.watcher_kernel_ids[DISPATCH_CLASS_TENSIX_DM1]]; case DebugTrisc0: case DebugTrisc1: - case DebugTrisc2: return kernel_names[launch_msg->watcher_kernel_ids[DISPATCH_CLASS_TENSIX_COMPUTE]]; + case DebugTrisc2: return kernel_names[launch_msg->kernel_config.watcher_kernel_ids[DISPATCH_CLASS_TENSIX_COMPUTE]]; default: log_running_kernels(launch_msg); TT_THROW("Watcher data corrupted, unexpected riscv type on core {}: {}", core.str(), type); @@ -460,57 +460,57 @@ static void dump_run_mailboxes( FILE *f, CoreCoord core, const launch_msg_t *launch_msg, const slave_sync_msg_t *slave_sync) { fprintf(f, "rmsg:"); - if (launch_msg->mode == DISPATCH_MODE_DEV) { + if (launch_msg->kernel_config.mode == DISPATCH_MODE_DEV) { fprintf(f, "D"); - } else if (launch_msg->mode == DISPATCH_MODE_HOST) { + } else if (launch_msg->kernel_config.mode == DISPATCH_MODE_HOST) { fprintf(f, "H"); } else { log_running_kernels(launch_msg); TT_THROW( "Watcher data corruption, unexpected launch mode on core {}: {} (expected {} or {})", core.str(), - launch_msg->mode, + launch_msg->kernel_config.mode, DISPATCH_MODE_DEV, DISPATCH_MODE_HOST); } - if (launch_msg->brisc_noc_id == 0 || launch_msg->brisc_noc_id == 1) { - fprintf(f, "%d", launch_msg->brisc_noc_id); + if (launch_msg->kernel_config.brisc_noc_id == 0 || launch_msg->kernel_config.brisc_noc_id == 1) { + fprintf(f, "%d", launch_msg->kernel_config.brisc_noc_id); } else { log_running_kernels(launch_msg); TT_THROW( "Watcher data corruption, unexpected brisc noc_id on core {}: {} (expected 0 or 1)", core.str(), - launch_msg->brisc_noc_id); + launch_msg->kernel_config.brisc_noc_id); } - dump_run_state(f, core, launch_msg, launch_msg->run); + dump_run_state(f, core, launch_msg, launch_msg->go.run); fprintf(f, "|"); - if (launch_msg->enables & ~(DISPATCH_CLASS_MASK_TENSIX_ENABLE_DM0 | - DISPATCH_CLASS_MASK_TENSIX_ENABLE_DM1 | - DISPATCH_CLASS_MASK_TENSIX_ENABLE_COMPUTE)) { + if (launch_msg->kernel_config.enables & ~(DISPATCH_CLASS_MASK_TENSIX_ENABLE_DM0 | + DISPATCH_CLASS_MASK_TENSIX_ENABLE_DM1 | + DISPATCH_CLASS_MASK_TENSIX_ENABLE_COMPUTE)) { log_running_kernels(launch_msg); TT_THROW( "Watcher data corruption, unexpected kernel enable on core {}: {} (expected only low bits set)", core.str(), - launch_msg->enables); + launch_msg->kernel_config.enables); } - if (launch_msg->enables & DISPATCH_CLASS_MASK_TENSIX_ENABLE_DM0) { + if (launch_msg->kernel_config.enables & DISPATCH_CLASS_MASK_TENSIX_ENABLE_DM0) { fprintf(f, "B"); } else { fprintf(f, "b"); } - if (launch_msg->enables & DISPATCH_CLASS_MASK_TENSIX_ENABLE_DM1) { + if (launch_msg->kernel_config.enables & DISPATCH_CLASS_MASK_TENSIX_ENABLE_DM1) { fprintf(f, "N"); } else { fprintf(f, "n"); } - if (launch_msg->enables & DISPATCH_CLASS_MASK_TENSIX_ENABLE_COMPUTE) { + if (launch_msg->kernel_config.enables & DISPATCH_CLASS_MASK_TENSIX_ENABLE_COMPUTE) { fprintf(f, "T"); } else { fprintf(f, "t"); @@ -557,35 +557,35 @@ static void dump_sync_regs(FILE *f, Device *device, CoreCoord core) { static void validate_kernel_ids( FILE *f, std::map &used_kernel_names, chip_id_t device_id, CoreCoord core, const launch_msg_t *launch) { - if (launch->watcher_kernel_ids[DISPATCH_CLASS_TENSIX_DM0] >= kernel_names.size()) { + if (launch->kernel_config.watcher_kernel_ids[DISPATCH_CLASS_TENSIX_DM0] >= kernel_names.size()) { TT_THROW( "Watcher data corruption, unexpected brisc kernel id on Device {} core {}: {} (last valid {})", device_id, core.str(), - launch->watcher_kernel_ids[DISPATCH_CLASS_TENSIX_DM0], + launch->kernel_config.watcher_kernel_ids[DISPATCH_CLASS_TENSIX_DM0], kernel_names.size()); } - used_kernel_names[launch->watcher_kernel_ids[DISPATCH_CLASS_TENSIX_DM0]] = true; + used_kernel_names[launch->kernel_config.watcher_kernel_ids[DISPATCH_CLASS_TENSIX_DM0]] = true; - if (launch->watcher_kernel_ids[DISPATCH_CLASS_TENSIX_DM1] >= kernel_names.size()) { + if (launch->kernel_config.watcher_kernel_ids[DISPATCH_CLASS_TENSIX_DM1] >= kernel_names.size()) { TT_THROW( "Watcher data corruption, unexpected ncrisc kernel id on Device {} core {}: {} (last valid {})", device_id, core.str(), - launch->watcher_kernel_ids[DISPATCH_CLASS_TENSIX_DM1], + launch->kernel_config.watcher_kernel_ids[DISPATCH_CLASS_TENSIX_DM1], kernel_names.size()); } - used_kernel_names[launch->watcher_kernel_ids[DISPATCH_CLASS_TENSIX_DM1]] = true; + used_kernel_names[launch->kernel_config.watcher_kernel_ids[DISPATCH_CLASS_TENSIX_DM1]] = true; - if (launch->watcher_kernel_ids[DISPATCH_CLASS_TENSIX_COMPUTE] >= kernel_names.size()) { + if (launch->kernel_config.watcher_kernel_ids[DISPATCH_CLASS_TENSIX_COMPUTE] >= kernel_names.size()) { TT_THROW( "Watcher data corruption, unexpected trisc kernel id on Device {} core {}: {} (last valid {})", device_id, core.str(), - launch->watcher_kernel_ids[DISPATCH_CLASS_TENSIX_COMPUTE], + launch->kernel_config.watcher_kernel_ids[DISPATCH_CLASS_TENSIX_COMPUTE], kernel_names.size()); } - used_kernel_names[launch->watcher_kernel_ids[DISPATCH_CLASS_TENSIX_COMPUTE]] = true; + used_kernel_names[launch->kernel_config.watcher_kernel_ids[DISPATCH_CLASS_TENSIX_COMPUTE]] = true; } static void dump_core( @@ -669,20 +669,20 @@ static void dump_core( } } else { fprintf(f, "rmsg:"); - dump_run_state(f, core, &mbox_data->launch, mbox_data->launch.run); + dump_run_state(f, core, &mbox_data->launch, mbox_data->launch.go.run); fprintf(f, " "); } // Eth core only reports erisc kernel id, uses the brisc field if (is_eth_core) { - fprintf(f, "k_id:%d", mbox_data->launch.watcher_kernel_ids[DISPATCH_CLASS_ETH_DM0]); + fprintf(f, "k_id:%d", mbox_data->launch.kernel_config.watcher_kernel_ids[DISPATCH_CLASS_ETH_DM0]); } else { fprintf( f, "k_ids:%d|%d|%d", - mbox_data->launch.watcher_kernel_ids[DISPATCH_CLASS_TENSIX_DM0], - mbox_data->launch.watcher_kernel_ids[DISPATCH_CLASS_TENSIX_DM1], - mbox_data->launch.watcher_kernel_ids[DISPATCH_CLASS_TENSIX_COMPUTE]); + mbox_data->launch.kernel_config.watcher_kernel_ids[DISPATCH_CLASS_TENSIX_DM0], + mbox_data->launch.kernel_config.watcher_kernel_ids[DISPATCH_CLASS_TENSIX_DM1], + mbox_data->launch.kernel_config.watcher_kernel_ids[DISPATCH_CLASS_TENSIX_COMPUTE]); } // Ring buffer at the end because it can print a bunch of data diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp index 258c27134fe..7e952d00f89 100644 --- a/tt_metal/impl/device/device.cpp +++ b/tt_metal/impl/device/device.cpp @@ -285,7 +285,7 @@ void Device::initialize_firmware(CoreCoord phys_core, launch_msg_t *launch_msg) llrt::get_risc_binary(firmware_build_states_[riscv_id]->get_target_out_path("")); uint32_t kernel_size16 = llrt::get_binary_code_size16(binary_mem, riscv_id); if (riscv_id == 1) { - launch_msg->ncrisc_kernel_size16 = kernel_size16; + launch_msg->kernel_config.ncrisc_kernel_size16 = kernel_size16; } log_debug(LogDevice, "RISC {} fw binary size: {} in bytes", riscv_id, kernel_size16 * 16); llrt::test_load_write_read_risc_binary(binary_mem, this->id(), phys_core, riscv_id); @@ -301,7 +301,7 @@ void Device::reset_cores() { ZoneScoped; auto kernel_still_running = [](launch_msg_t *launch_msg) { - return launch_msg->run == RUN_MSG_GO && launch_msg->exit_erisc_kernel == 0; + return launch_msg->go.run == RUN_MSG_GO && launch_msg->kernel_config.exit_erisc_kernel == 0; }; auto mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(this->id_); @@ -320,7 +320,7 @@ void Device::reset_cores() { this->id(), physical_core.str(), this->id()); - launch_msg->exit_erisc_kernel = 1; + launch_msg->kernel_config.exit_erisc_kernel = 1; llrt::write_launch_msg_to_core(this->id(), physical_core, launch_msg); device_to_early_exit_cores[this->id()].insert(physical_core); } @@ -345,7 +345,7 @@ void Device::reset_cores() { this->id(), phys_core.str(), id_and_cores.first); - launch_msg->exit_erisc_kernel = 1; + launch_msg->kernel_config.exit_erisc_kernel = 1; llrt::write_launch_msg_to_core(id_and_cores.first, phys_core, launch_msg); device_to_early_exit_cores[id_and_cores.first].insert(phys_core); } @@ -387,8 +387,8 @@ void Device::initialize_and_launch_firmware() { launch_msg_t launch_msg; std::memset(&launch_msg, 0, sizeof(launch_msg_t)); - launch_msg.mode = DISPATCH_MODE_HOST, - launch_msg.run = RUN_MSG_INIT, + launch_msg.kernel_config.mode = DISPATCH_MODE_HOST, + launch_msg.go.run = RUN_MSG_INIT, // Download to worker cores log_debug("Initializing firmware"); diff --git a/tt_metal/impl/dispatch/command_queue.cpp b/tt_metal/impl/dispatch/command_queue.cpp index 2a0346edc09..2f36d29a638 100644 --- a/tt_metal/impl/dispatch/command_queue.cpp +++ b/tt_metal/impl/dispatch/command_queue.cpp @@ -892,9 +892,9 @@ void EnqueueProgramCommand::assemble_device_commands() { constexpr uint32_t aligned_go_signal_sizeB = align(go_signal_sizeB, L1_ALIGNMENT); constexpr uint32_t go_signal_size_words = aligned_go_signal_sizeB / sizeof(uint32_t); for (KernelGroup& kernel_group : program.get_kernel_groups(CoreType::WORKER)) { - kernel_group.launch_msg.mode = DISPATCH_MODE_DEV; - kernel_group.launch_msg.dispatch_core_x = this->dispatch_core.x; - kernel_group.launch_msg.dispatch_core_y = this->dispatch_core.y; + kernel_group.launch_msg.kernel_config.mode = DISPATCH_MODE_DEV; + kernel_group.launch_msg.kernel_config.dispatch_core_x = this->dispatch_core.x; + kernel_group.launch_msg.kernel_config.dispatch_core_y = this->dispatch_core.y; const void* launch_message_data = (const void*)(&kernel_group.launch_msg); for (const CoreRange& core_range : kernel_group.core_ranges.ranges()) { CoreCoord physical_start = @@ -919,9 +919,9 @@ void EnqueueProgramCommand::assemble_device_commands() { } for (KernelGroup& kernel_group : program.get_kernel_groups(CoreType::ETH)) { - kernel_group.launch_msg.mode = DISPATCH_MODE_DEV; - kernel_group.launch_msg.dispatch_core_x = this->dispatch_core.x; - kernel_group.launch_msg.dispatch_core_y = this->dispatch_core.y; + kernel_group.launch_msg.kernel_config.mode = DISPATCH_MODE_DEV; + kernel_group.launch_msg.kernel_config.dispatch_core_x = this->dispatch_core.x; + kernel_group.launch_msg.kernel_config.dispatch_core_y = this->dispatch_core.y; const void* launch_message_data = (const launch_msg_t*)(&kernel_group.launch_msg); for (const CoreRange& core_range : kernel_group.core_ranges.ranges()) { for (auto x = core_range.start.x; x <= core_range.end.x; x++) { @@ -1132,8 +1132,8 @@ void EnqueueProgramCommand::assemble_device_commands() { i++; } for (auto& go_signal : cached_program_command_sequence.go_signals) { - go_signal->dispatch_core_x = this->dispatch_core.x; - go_signal->dispatch_core_y = this->dispatch_core.y; + go_signal->kernel_config.dispatch_core_x = this->dispatch_core.x; + go_signal->kernel_config.dispatch_core_y = this->dispatch_core.y; } } } diff --git a/tt_metal/impl/dispatch/kernels/cq_helpers.hpp b/tt_metal/impl/dispatch/kernels/cq_helpers.hpp index 62b6ae9a92a..1b4b97b5c57 100644 --- a/tt_metal/impl/dispatch/kernels/cq_helpers.hpp +++ b/tt_metal/impl/dispatch/kernels/cq_helpers.hpp @@ -11,7 +11,7 @@ // Helper function to determine if the dispatch kernel needs to early exit, only valid for IERISC. FORCE_INLINE bool early_exit() { tt_l1_ptr mailboxes_t * const mailbox = (tt_l1_ptr mailboxes_t *)(MEM_IERISC_MAILBOX_BASE); - return mailbox->launch.exit_erisc_kernel; + return mailbox->launch.kernel_config.exit_erisc_kernel; } #define IDLE_ERISC_RETURN(...) \ diff --git a/tt_metal/impl/dispatch/kernels/eth_tunneler.cpp b/tt_metal/impl/dispatch/kernels/eth_tunneler.cpp index 38a543a8082..0961356d899 100644 --- a/tt_metal/impl/dispatch/kernels/eth_tunneler.cpp +++ b/tt_metal/impl/dispatch/kernels/eth_tunneler.cpp @@ -153,7 +153,7 @@ void kernel_main() { } tt_l1_ptr launch_msg_t * const launch_msg = GET_MAILBOX_ADDRESS_DEV(launch); - if (launch_msg->exit_erisc_kernel) { + if (launch_msg->kernel_config.exit_erisc_kernel) { return; } // need to optimize this. diff --git a/tt_metal/impl/program/program.cpp b/tt_metal/impl/program/program.cpp index 3b593aa1ff2..a355e2dbe21 100644 --- a/tt_metal/impl/program/program.cpp +++ b/tt_metal/impl/program/program.cpp @@ -136,9 +136,9 @@ KernelGroup::KernelGroup( // Use 0 if neither brisc nor ncrisc specify a noc if (core_type == CoreType::WORKER) { // Dynamic address map - this->launch_msg.kernel_config_base = L1_KERNEL_CONFIG_BASE; + this->launch_msg.kernel_config.kernel_config_base = L1_KERNEL_CONFIG_BASE; } else { - this->launch_msg.kernel_config_base = + this->launch_msg.kernel_config.kernel_config_base = erisc_is_idle ? IDLE_ERISC_L1_KERNEL_CONFIG_BASE : eth_l1_mem::address_map::ERISC_L1_KERNEL_CONFIG_BASE; } @@ -146,26 +146,26 @@ KernelGroup::KernelGroup( auto& optional_id = kernel_ids[class_id]; if (optional_id) { const auto kernel = program.get_kernel(optional_id.value()); - this->launch_msg.watcher_kernel_ids[class_id] = kernel->get_watcher_kernel_id(); - this->launch_msg.enables |= 1 << class_id; + this->launch_msg.kernel_config.watcher_kernel_ids[class_id] = kernel->get_watcher_kernel_id(); + this->launch_msg.kernel_config.enables |= 1 << class_id; if (core_type == CoreType::WORKER) { if (class_id == DISPATCH_CLASS_TENSIX_DM0) { // Use brisc's noc if brisc specifies a noc - this->launch_msg.brisc_noc_id = std::get(kernel->config()).noc; + this->launch_msg.kernel_config.brisc_noc_id = std::get(kernel->config()).noc; } else if (class_id == DISPATCH_CLASS_TENSIX_DM1) { // Use 1-ncrisc's noc (the other noc) if ncrisc specifies a noc // If both brisc and ncrisc set the noc, then this is safe due to prior correctness validation - this->launch_msg.brisc_noc_id = 1 - std::get(kernel->config()).noc; - this->launch_msg.ncrisc_kernel_size16 = kernel->get_binary_size16(); + this->launch_msg.kernel_config.brisc_noc_id = 1 - std::get(kernel->config()).noc; + this->launch_msg.kernel_config.ncrisc_kernel_size16 = kernel->get_binary_size16(); } } } } - this->launch_msg.exit_erisc_kernel = false; - this->launch_msg.max_cb_index = last_cb_index + 1; - this->launch_msg.run = RUN_MSG_GO; + this->launch_msg.kernel_config.exit_erisc_kernel = false; + this->launch_msg.kernel_config.max_cb_index = last_cb_index + 1; + this->launch_msg.go.run = RUN_MSG_GO; } CoreType KernelGroup::get_core_type() const { @@ -800,10 +800,10 @@ void Program::finalize_rt_args() { if (optional_id) { auto kernel = detail::GetKernel(*this, optional_id.value()); kernel->set_runtime_args_count(kg.core_ranges, max_rtas[dispatch_class]); - kg.launch_msg.mem_map[dispatch_class].rta_offset = offset; + kg.launch_msg.kernel_config.mem_map[dispatch_class].rta_offset = offset; offset += max_rtas[dispatch_class] * sizeof(uint32_t); } else { - kg.launch_msg.mem_map[dispatch_class].rta_offset = 0; + kg.launch_msg.kernel_config.mem_map[dispatch_class].rta_offset = 0; } } @@ -847,7 +847,7 @@ void Program::finalize_rt_args() { // Set the kernel group common runtime arg offsets use in the launch message for (auto& kg : this->get_kernel_groups(core_type)) { for (int dispatch_class = 0; dispatch_class < DISPATCH_CLASS_MAX; dispatch_class++) { - kg.launch_msg.mem_map[dispatch_class].crta_offset = this->crta_offsets[core_type == CoreType::WORKER][dispatch_class]; + kg.launch_msg.kernel_config.mem_map[dispatch_class].crta_offset = this->crta_offsets[core_type == CoreType::WORKER][dispatch_class]; } } diff --git a/tt_metal/llrt/llrt.cpp b/tt_metal/llrt/llrt.cpp index e8f47f63f1f..2870da94e06 100644 --- a/tt_metal/llrt/llrt.cpp +++ b/tt_metal/llrt/llrt.cpp @@ -163,17 +163,26 @@ void write_launch_msg_to_core(chip_id_t chip, const CoreCoord core, launch_msg_t assert(is_active_eth_core or is_inactive_eth_core); } - msg->mode = DISPATCH_MODE_HOST; + msg->kernel_config.mode = DISPATCH_MODE_HOST; if (is_active_eth_core) { tt::Cluster::instance().write_core( - (void *)msg, sizeof(launch_msg_t), tt_cxy_pair(chip, core), GET_ETH_MAILBOX_ADDRESS_HOST(launch)); + (void *)&msg->kernel_config, sizeof(kernel_config_msg_t), tt_cxy_pair(chip, core), GET_ETH_MAILBOX_ADDRESS_HOST(launch.kernel_config)); + tt_driver_atomics::sfence(); + tt::Cluster::instance().write_core( + (void *)&msg->go, sizeof(go_msg_t), tt_cxy_pair(chip, core), GET_ETH_MAILBOX_ADDRESS_HOST(launch.go)); } else { if (is_inactive_eth_core) { tt::Cluster::instance().write_core( - (void *)msg, sizeof(launch_msg_t), tt_cxy_pair(chip, core), GET_IERISC_MAILBOX_ADDRESS_HOST(launch)); + (void *)&msg->kernel_config, sizeof(kernel_config_msg_t), tt_cxy_pair(chip, core), GET_IERISC_MAILBOX_ADDRESS_HOST(launch.kernel_config)); + tt_driver_atomics::sfence(); + tt::Cluster::instance().write_core( + (void *)&msg->go, sizeof(go_msg_t), tt_cxy_pair(chip, core), GET_IERISC_MAILBOX_ADDRESS_HOST(launch.go)); } else { tt::Cluster::instance().write_core( - (void *)msg, sizeof(launch_msg_t), tt_cxy_pair(chip, core), GET_MAILBOX_ADDRESS_HOST(launch)); + (void *)&msg->kernel_config, sizeof(kernel_config_msg_t), tt_cxy_pair(chip, core), GET_MAILBOX_ADDRESS_HOST(launch.kernel_config)); + tt_driver_atomics::sfence(); + tt::Cluster::instance().write_core( + (void *)&msg->go, sizeof(go_msg_t), tt_cxy_pair(chip, core), GET_MAILBOX_ADDRESS_HOST(launch.go)); } } } @@ -313,15 +322,15 @@ static bool check_if_riscs_on_specified_core_done(chip_id_t chip_id, const CoreC assert(is_active_eth_core or is_inactive_eth_core); } - uint64_t run_mailbox_addr = is_active_eth_core ? GET_ETH_MAILBOX_ADDRESS_HOST(launch.run) : - is_inactive_eth_core ? GET_IERISC_MAILBOX_ADDRESS_HOST(launch.run) : GET_MAILBOX_ADDRESS_HOST(launch.run); + uint64_t run_mailbox_addr = is_active_eth_core ? GET_ETH_MAILBOX_ADDRESS_HOST(launch.go.run) : + is_inactive_eth_core ? GET_IERISC_MAILBOX_ADDRESS_HOST(launch.go.run) : GET_MAILBOX_ADDRESS_HOST(launch.go.run); auto get_mailbox_is_done = [&](uint64_t run_mailbox_address) { constexpr int RUN_MAILBOX_BOGUS = 3; std::vector run_mailbox_read_val = {RUN_MAILBOX_BOGUS}; // read a single uint32_t even though launch.run is smaller than that run_mailbox_read_val = read_hex_vec_from_core(chip_id, core, run_mailbox_address & ~0x3, sizeof(uint32_t)); - uint8_t run = run_mailbox_read_val[0] >> (8 * (offsetof(launch_msg_t, run) & 3)); + uint8_t run = run_mailbox_read_val[0] >> (8 * (offsetof(launch_msg_t, go.run) & 3)); if (run != run_state && run != RUN_MSG_DONE) { fprintf( stderr, diff --git a/tt_metal/llrt/tlb_config.cpp b/tt_metal/llrt/tlb_config.cpp index 61f74c7c887..d39018d10b0 100644 --- a/tt_metal/llrt/tlb_config.cpp +++ b/tt_metal/llrt/tlb_config.cpp @@ -185,7 +185,13 @@ void configure_static_tlbs(tt::ARCH arch, chip_id_t mmio_device_id, const metal_ // Setup static TLBs for all worker cores for (auto &core : statically_mapped_cores) { auto tlb_index = get_static_tlb_index(core); - device_driver.configure_tlb(mmio_device_id, core, tlb_index, address); + // TODO + // Note: see issue #10107 + // Strict is less performant than Posted, however, metal doesn't presently + // use this on a perf path and the launch_msg "kernel config" needs to + // arrive prior to the "go" message during device init and slow dispatch + // Revisit this when we have a more flexible UMD api + device_driver.configure_tlb(mmio_device_id, core, tlb_index, address, TLB_DATA::Strict); } // TODO (#9932): Remove workaround for BH diff --git a/tt_metal/tt_metal.cpp b/tt_metal/tt_metal.cpp index 17521341d56..54d60c44523 100644 --- a/tt_metal/tt_metal.cpp +++ b/tt_metal/tt_metal.cpp @@ -647,7 +647,7 @@ void WriteRuntimeArgsToDevice(Device *device, Program &program) { } if (rt_args.size() > 0) { - auto args_base_addr = kernel_config_base + kg.launch_msg.mem_map[dispatch_class].rta_offset; + auto args_base_addr = kernel_config_base + kg.launch_msg.kernel_config.mem_map[dispatch_class].rta_offset; log_trace( tt::LogMetal, "{} - Writing {} unique rtargs to core {} (physical: {}) addr 0x{:x} => args: {}", @@ -662,7 +662,7 @@ void WriteRuntimeArgsToDevice(Device *device, Program &program) { const auto &common_rt_args = kernel->common_runtime_args(); if (common_rt_args.size() > 0) { - auto common_rt_args_addr = kernel_config_base + kg.launch_msg.mem_map[dispatch_class].crta_offset; + auto common_rt_args_addr = kernel_config_base + kg.launch_msg.kernel_config.mem_map[dispatch_class].crta_offset; log_trace( tt::LogMetal, "{} - Writing {} common rtargs to core {} (physical: {}) addr 0x{:x} => args: {}", From 374d29e987886b57fe187b3294e1b4631962efaa Mon Sep 17 00:00:00 2001 From: Austin Ho Date: Wed, 26 Jun 2024 21:29:06 +0000 Subject: [PATCH 2/2] #3764: Device side opID support --- tests/scripts/run_profiler_regressions.sh | 43 +++++++------- tt_eager/tt_dnn/op_library/run_operation.cpp | 9 +++ tt_metal/hw/firmware/src/brisc.cc | 1 + tt_metal/hw/firmware/src/erisc.cc | 1 + tt_metal/hw/firmware/src/idle_erisc.cc | 1 + tt_metal/hw/inc/dev_msgs.h | 3 + tt_metal/impl/dispatch/command_queue.cpp | 6 +- tt_metal/impl/program/program.cpp | 4 +- tt_metal/impl/program/program.hpp | 3 + tt_metal/third_party/tracy | 2 +- tt_metal/tools/profiler/kernel_profiler.hpp | 58 ++++++++++++------- tt_metal/tools/profiler/process_device_log.py | 10 +++- tt_metal/tools/profiler/process_ops_logs.py | 4 ++ tt_metal/tools/profiler/profiler.cpp | 14 +++-- tt_metal/tools/profiler/profiler.hpp | 1 + tt_metal/tt_metal.cpp | 2 +- ttnn/cpp/ttnn/device_operation.hpp | 2 + 17 files changed, 111 insertions(+), 53 deletions(-) diff --git a/tests/scripts/run_profiler_regressions.sh b/tests/scripts/run_profiler_regressions.sh index f8fc6a69759..fc8fa936a59 100755 --- a/tests/scripts/run_profiler_regressions.sh +++ b/tests/scripts/run_profiler_regressions.sh @@ -4,26 +4,6 @@ source scripts/tools_setup_common.sh set -eo pipefail -run_additional_T3000_test(){ - remove_default_log_locations - mkdir -p $PROFILER_ARTIFACTS_DIR - - ./tt_metal/tools/profiler/profile_this.py -c "pytest tests/tt_eager/python_api_testing/unit_testing/misc/test_all_gather.py::test_all_gather_on_t3000_post_commit[mem_config0-input_dtype0-8-1-input_shape1-0-layout1]" > $PROFILER_ARTIFACTS_DIR/test_out.log - - cat $PROFILER_ARTIFACTS_DIR/test_out.log - - if cat $PROFILER_ARTIFACTS_DIR/test_out.log | grep "SKIPPED" - then - echo "No verification as test was skipped" - else - echo "Verifying test results" - runDate=$(ls $PROFILER_OUTPUT_DIR/) - LINE_COUNT=9 #1 header + 8 devices - res=$(verify_perf_line_count "$PROFILER_OUTPUT_DIR/$runDate/ops_perf_results_$runDate.csv" "$LINE_COUNT") - echo $res - fi -} - run_async_mode_T3000_test(){ #Some tests here do not skip grayskull if [ "$ARCH_NAME" != "grayskull" ]; then @@ -47,6 +27,27 @@ run_async_mode_T3000_test(){ fi } +run_additional_T3000_test(){ + remove_default_log_locations + mkdir -p $PROFILER_ARTIFACTS_DIR + + ./tt_metal/tools/profiler/profile_this.py -c "pytest tests/tt_eager/python_api_testing/unit_testing/misc/test_all_gather.py::test_all_gather_on_t3000_post_commit[mem_config0-input_dtype0-8-1-input_shape1-0-layout1]" > $PROFILER_ARTIFACTS_DIR/test_out.log + + cat $PROFILER_ARTIFACTS_DIR/test_out.log + + if cat $PROFILER_ARTIFACTS_DIR/test_out.log | grep "SKIPPED" + then + echo "No verification as test was skipped" + else + echo "Verifying test results" + runDate=$(ls $PROFILER_OUTPUT_DIR/) + LINE_COUNT=9 #1 header + 8 devices + res=$(verify_perf_line_count "$PROFILER_OUTPUT_DIR/$runDate/ops_perf_results_$runDate.csv" "$LINE_COUNT") + echo $res + run_async_mode_T3000_test + fi +} + run_profiling_test(){ if [[ -z "$ARCH_NAME" ]]; then echo "Must provide ARCH_NAME in environment" 1>&2 @@ -60,8 +61,6 @@ run_profiling_test(){ run_additional_T3000_test - run_async_mode_T3000_test - TT_METAL_DEVICE_PROFILER=1 pytest $PROFILER_TEST_SCRIPTS_ROOT/test_device_profiler.py remove_default_log_locations diff --git a/tt_eager/tt_dnn/op_library/run_operation.cpp b/tt_eager/tt_dnn/op_library/run_operation.cpp index 2c6085773b3..cc6954969e9 100644 --- a/tt_eager/tt_dnn/op_library/run_operation.cpp +++ b/tt_eager/tt_dnn/op_library/run_operation.cpp @@ -224,6 +224,15 @@ OutputTensors run_device_operation( operation, input_tensors, optional_input_tensors, output_tensors, optional_output_tensors); uint32_t device_id = detail::get_device(input_tensors, optional_input_tensors)->id(); + if (std::holds_alternative>(program)) + { + std::get>(program).get().set_global_id(op_id); + } + else + { + std::get>(program)->set_global_id(op_id); + } + // Enqueue or Launch Program std::visit( [&operation, &input_tensors, &optional_input_tensors, &output_tensors, queue](auto&& program) { diff --git a/tt_metal/hw/firmware/src/brisc.cc b/tt_metal/hw/firmware/src/brisc.cc index 41134ccaf90..739828edc20 100644 --- a/tt_metal/hw/firmware/src/brisc.cc +++ b/tt_metal/hw/firmware/src/brisc.cc @@ -366,6 +366,7 @@ int main() { { DeviceZoneScopedMainN("BRISC-FW"); + DeviceZoneSetCounter(mailboxes->launch.kernel_config.host_assigned_op_id); // Copies from L1 to IRAM on chips where NCRISC has IRAM l1_to_ncrisc_iram_copy(mailboxes->launch.kernel_config.ncrisc_kernel_size16, ncrisc_kernel_start_offset16); diff --git a/tt_metal/hw/firmware/src/erisc.cc b/tt_metal/hw/firmware/src/erisc.cc index 4523be27cfa..c3ebdb826a1 100644 --- a/tt_metal/hw/firmware/src/erisc.cc +++ b/tt_metal/hw/firmware/src/erisc.cc @@ -79,6 +79,7 @@ void __attribute__((section("erisc_l1_code.1"), noinline)) Application(void) { // FD: assume that no more host -> remote writes are pending if (mailboxes->launch.go.run == RUN_MSG_GO) { DeviceZoneScopedMainN("ERISC-FW"); + DeviceZoneSetCounter(mailboxes->launch.kernel_config.host_assigned_op_id); DEBUG_STATUS("R"); uint32_t kernel_config_base = mailboxes->launch.kernel_config.kernel_config_base; rta_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base + diff --git a/tt_metal/hw/firmware/src/idle_erisc.cc b/tt_metal/hw/firmware/src/idle_erisc.cc index bf7b746cb71..3d825bb98ba 100644 --- a/tt_metal/hw/firmware/src/idle_erisc.cc +++ b/tt_metal/hw/firmware/src/idle_erisc.cc @@ -108,6 +108,7 @@ int main() { { DeviceZoneScopedMainN("ERISC-IDLE-FW"); + DeviceZoneSetCounter(mailboxes->launch.kernel_config.host_assigned_op_id); noc_index = mailboxes->launch.kernel_config.brisc_noc_id; diff --git a/tt_metal/hw/inc/dev_msgs.h b/tt_metal/hw/inc/dev_msgs.h index c6a6cde74a2..87fe99896ba 100644 --- a/tt_metal/hw/inc/dev_msgs.h +++ b/tt_metal/hw/inc/dev_msgs.h @@ -80,6 +80,8 @@ struct kernel_config_msg_t { volatile uint16_t watcher_kernel_ids[DISPATCH_CLASS_MAX]; volatile uint16_t ncrisc_kernel_size16; // size in 16 byte units + volatile uint16_t host_assigned_op_id; + // Ring buffer of kernel configuration data volatile uint32_t kernel_config_base; dyn_mem_map_t mem_map[DISPATCH_CLASS_MAX]; @@ -92,6 +94,7 @@ struct kernel_config_msg_t { volatile uint8_t dispatch_core_y; volatile uint8_t exit_erisc_kernel; volatile uint8_t pad1; + volatile uint16_t pad2; } __attribute__((packed)); struct go_msg_t { diff --git a/tt_metal/impl/dispatch/command_queue.cpp b/tt_metal/impl/dispatch/command_queue.cpp index 2f36d29a638..c4fa3c5f62b 100644 --- a/tt_metal/impl/dispatch/command_queue.cpp +++ b/tt_metal/impl/dispatch/command_queue.cpp @@ -651,6 +651,7 @@ void EnqueueProgramCommand::assemble_runtime_args_commands() { } void EnqueueProgramCommand::assemble_device_commands() { + ZoneScoped; auto& cached_program_command_sequence = this->cached_program_command_sequences[this->program.id]; if (!program.is_finalized()) { // Calculate size of command and fill program indices of data to update @@ -895,6 +896,7 @@ void EnqueueProgramCommand::assemble_device_commands() { kernel_group.launch_msg.kernel_config.mode = DISPATCH_MODE_DEV; kernel_group.launch_msg.kernel_config.dispatch_core_x = this->dispatch_core.x; kernel_group.launch_msg.kernel_config.dispatch_core_y = this->dispatch_core.y; + kernel_group.launch_msg.kernel_config.host_assigned_op_id = program.get_global_id(); const void* launch_message_data = (const void*)(&kernel_group.launch_msg); for (const CoreRange& core_range : kernel_group.core_ranges.ranges()) { CoreCoord physical_start = @@ -917,11 +919,11 @@ void EnqueueProgramCommand::assemble_device_commands() { this->packed_write_max_unicast_sub_cmds, multicast_go_signals_payload); } - for (KernelGroup& kernel_group : program.get_kernel_groups(CoreType::ETH)) { kernel_group.launch_msg.kernel_config.mode = DISPATCH_MODE_DEV; kernel_group.launch_msg.kernel_config.dispatch_core_x = this->dispatch_core.x; kernel_group.launch_msg.kernel_config.dispatch_core_y = this->dispatch_core.y; + kernel_group.launch_msg.kernel_config.host_assigned_op_id = program.get_global_id(); const void* launch_message_data = (const launch_msg_t*)(&kernel_group.launch_msg); for (const CoreRange& core_range : kernel_group.core_ranges.ranges()) { for (auto x = core_range.start.x; x <= core_range.end.x; x++) { @@ -1113,6 +1115,7 @@ void EnqueueProgramCommand::assemble_device_commands() { } } else { uint32_t i = 0; + ZoneScopedN("program_loaded_on_device"); for (const auto& cbs_on_core_range : cached_program_command_sequence.circular_buffers_on_core_ranges) { uint32_t* cb_config_payload = cached_program_command_sequence.cb_configs_payloads[i]; for (const shared_ptr& cb : cbs_on_core_range) { @@ -1134,6 +1137,7 @@ void EnqueueProgramCommand::assemble_device_commands() { for (auto& go_signal : cached_program_command_sequence.go_signals) { go_signal->kernel_config.dispatch_core_x = this->dispatch_core.x; go_signal->kernel_config.dispatch_core_y = this->dispatch_core.y; + go_signal->kernel_config.host_assigned_op_id = program.get_global_id(); } } } diff --git a/tt_metal/impl/program/program.cpp b/tt_metal/impl/program/program.cpp index a355e2dbe21..614700e7186 100644 --- a/tt_metal/impl/program/program.cpp +++ b/tt_metal/impl/program/program.cpp @@ -81,7 +81,7 @@ void DisablePersistentKernelCache() { enable_persistent_kernel_cache = false; } std::atomic Program::program_counter = 0; Program::Program() : - id(program_counter++), worker_crs_({}), local_circular_buffer_allocation_needed_(false), loaded_onto_device(false) { + id(program_counter++), global_id(0), worker_crs_({}), local_circular_buffer_allocation_needed_(false), loaded_onto_device(false) { std::set supported_core_types = {CoreType::WORKER, CoreType::ETH}; for (const auto &core_type : supported_core_types) { kernels_.insert({core_type, {}}); @@ -934,5 +934,7 @@ void Program::compile(Device *device) { this->loaded_onto_device = false; } +void Program::set_global_id(uint64_t id) { this->global_id = id; } + Program::~Program() {} } // namespace tt::tt_metal diff --git a/tt_metal/impl/program/program.hpp b/tt_metal/impl/program/program.hpp index 48cb966b976..ea19daff48d 100644 --- a/tt_metal/impl/program/program.hpp +++ b/tt_metal/impl/program/program.hpp @@ -80,11 +80,13 @@ class Program { Program(Program &&other) = default; Program& operator=(Program &&other) = default; + void set_global_id(uint64_t id); ~Program(); void construct_core_range_set_for_worker_cores(); const uint64_t get_id() const { return this->id; } + const uint64_t get_global_id() const { return this->global_id; } size_t num_kernels() const { size_t count = 0; @@ -177,6 +179,7 @@ class Program { }; uint64_t id; // Need to make non-const due to move constructor + uint64_t global_id; // Need to make non-const due to move constructor static std::atomic program_counter; std::unordered_map >> kernels_; std::unordered_map grid_extent_; diff --git a/tt_metal/third_party/tracy b/tt_metal/third_party/tracy index 2591e70eaca..71d4c8d378b 160000 --- a/tt_metal/third_party/tracy +++ b/tt_metal/third_party/tracy @@ -1 +1 @@ -Subproject commit 2591e70eaca0a12705ea23cbe4059e086c9a2a9f +Subproject commit 71d4c8d378b52af7da7012b9b595a61e9304f0bb diff --git a/tt_metal/tools/profiler/kernel_profiler.hpp b/tt_metal/tools/profiler/kernel_profiler.hpp index 343dcbbd3c5..b307be780eb 100644 --- a/tt_metal/tools/profiler/kernel_profiler.hpp +++ b/tt_metal/tools/profiler/kernel_profiler.hpp @@ -111,7 +111,6 @@ namespace kernel_profiler{ { core_flat_id = noc_xy_to_profiler_flat_id[my_x[0]][my_y[0]]; -#pragma GCC unroll 65534 for (int i = ID_HH; i < GUARANTEED_MARKER_1_H; i ++) { eriscBuffer[i] = 0; @@ -124,14 +123,13 @@ namespace kernel_profiler{ profiler_control_buffer[FLAT_ID] = core_flat_id; } -#pragma GCC unroll 65534 for (int i = GUARANTEED_MARKER_1_H; i < CUSTOM_MARKERS; i ++) { //TODO(MO): Clean up magic numbers eriscBuffer[i] = 0x80000000; } - eriscBuffer [ID_LL] = runCounter; + eriscBuffer [ID_LL] = (runCounter & 0xFFFF) | (eriscBuffer [ID_LL] & 0xFFFF0000); #endif //ERISC_INIT #if defined(COMPILE_FOR_BRISC) @@ -146,7 +144,6 @@ namespace kernel_profiler{ { core_flat_id = noc_xy_to_profiler_flat_id[my_x[0]][my_y[0]]; -#pragma GCC unroll 65534 for (int i = ID_HH; i < GUARANTEED_MARKER_1_H; i ++) { briscBuffer[i] = 0; @@ -167,10 +164,9 @@ namespace kernel_profiler{ profiler_control_buffer[FLAT_ID] = core_flat_id; } -#pragma GCC unroll 65534 for (int i = GUARANTEED_MARKER_1_H; i < CUSTOM_MARKERS; i ++) { - //TODO(MO): Clean up magic numbers + //TODO(MO): Clean up magic numbers briscBuffer[i] = 0x80000000; ncriscBuffer[i] = 0x80000000; trisc0Buffer[i] = 0x80000000; @@ -178,12 +174,11 @@ namespace kernel_profiler{ trisc2Buffer[i] = 0x80000000; } - //TODO(MO): Clean up magic numbers - briscBuffer [ID_LL] = runCounter; - ncriscBuffer[ID_LL] = runCounter; - trisc0Buffer[ID_LL] = runCounter; - trisc1Buffer[ID_LL] = runCounter; - trisc2Buffer[ID_LL] = runCounter; + briscBuffer [ID_LL] = (runCounter & 0xFFFF) | (briscBuffer [ID_LL] & 0xFFFF0000); + ncriscBuffer[ID_LL] = (runCounter & 0xFFFF) | (ncriscBuffer[ID_LL] & 0xFFFF0000); + trisc0Buffer[ID_LL] = (runCounter & 0xFFFF) | (trisc0Buffer[ID_LL] & 0xFFFF0000); + trisc1Buffer[ID_LL] = (runCounter & 0xFFFF) | (trisc1Buffer[ID_LL] & 0xFFFF0000); + trisc2Buffer[ID_LL] = (runCounter & 0xFFFF) | (trisc2Buffer[ID_LL] & 0xFFFF0000); #endif //BRISC_INIT @@ -208,14 +203,6 @@ namespace kernel_profiler{ buffer[index+1] = p_reg[WALL_CLOCK_LOW_INDEX]; } - inline __attribute__((always_inline)) void mark_end_at_index_inlined(uint32_t index, uint32_t timer_id_s, uint32_t timer_id) - { - volatile tt_l1_ptr uint32_t *buffer = reinterpret_cast(kernel_profiler::profilerBuffer); - volatile tt_reg_ptr uint32_t *p_reg = reinterpret_cast (RISCV_DEBUG_REG_WALL_CLOCK_L); - buffer[index+2] = 0x80000000 | ((timer_id & 0x7FFFF) << 12) | (p_reg[WALL_CLOCK_HIGH_INDEX] & 0xFFF); - buffer[index+3] = p_reg[WALL_CLOCK_LOW_INDEX]; - } - inline __attribute__((always_inline)) void mark_padding() { if (wIndex < PROFILER_L1_VECTOR_SIZE) @@ -233,6 +220,28 @@ namespace kernel_profiler{ profiler_control_buffer[DROPPED_ZONES] = (1 << index) | curr; } + inline __attribute__((always_inline)) void set_host_counter(uint32_t counterValue) + { +#if defined(COMPILE_FOR_ERISC) + volatile tt_l1_ptr uint32_t *eriscBuffer = reinterpret_cast(eth_l1_mem::address_map::PROFILER_L1_BUFFER_ER); + + eriscBuffer[ID_LL] = (counterValue << 16) | (eriscBuffer[ID_LL] & 0xFFFF); +#endif //ERISC_INIT + +#if defined(COMPILE_FOR_BRISC) + volatile tt_l1_ptr uint32_t *briscBuffer = reinterpret_cast(PROFILER_L1_BUFFER_BR); + volatile tt_l1_ptr uint32_t *ncriscBuffer = reinterpret_cast(PROFILER_L1_BUFFER_NC); + volatile tt_l1_ptr uint32_t *trisc0Buffer = reinterpret_cast(PROFILER_L1_BUFFER_T0); + volatile tt_l1_ptr uint32_t *trisc1Buffer = reinterpret_cast(PROFILER_L1_BUFFER_T1); + volatile tt_l1_ptr uint32_t *trisc2Buffer = reinterpret_cast(PROFILER_L1_BUFFER_T2); + + briscBuffer[ID_LL] = (counterValue << 16) | (briscBuffer[ID_LL] & 0xFFFF); + ncriscBuffer[ID_LL] = (counterValue << 16) | (ncriscBuffer[ID_LL] & 0xFFFF); + trisc0Buffer[ID_LL] = (counterValue << 16) | (trisc0Buffer[ID_LL] & 0xFFFF); + trisc1Buffer[ID_LL] = (counterValue << 16) | (trisc1Buffer[ID_LL] & 0xFFFF); + trisc2Buffer[ID_LL] = (counterValue << 16) | (trisc2Buffer[ID_LL] & 0xFFFF); +#endif //ERISC_INIT + } inline __attribute__((always_inline)) void risc_finished_profiling() { @@ -374,6 +383,7 @@ namespace kernel_profiler{ #if defined(DISPATCH_KERNEL) && defined(COMPILE_FOR_NCRISC) && (PROFILE_KERNEL == PROFILER_OPT_DO_DISPATCH_CORES) SrcLocNameToHash("PROFILER-NOC-QUICK-SEND"); mark_time_at_index_inlined(wIndex, hash); + wIndex += PROFILER_L1_MARKER_UINT32_SIZE; core_flat_id = noc_xy_to_profiler_flat_id[my_x[0]][my_y[0]]; uint32_t dram_offset = @@ -389,8 +399,8 @@ namespace kernel_profiler{ uint64_t dram_bank_dst_noc_addr = s.get_noc_addr(core_flat_id / profiler_core_count_per_dram, dram_offset); - mark_end_at_index_inlined(wIndex, hash, get_end_timer_id(hash)); - wIndex += QUICK_PUSH_MARKER_COUNT * PROFILER_L1_MARKER_UINT32_SIZE; + mark_time_at_index_inlined(wIndex, get_end_timer_id(hash)); + wIndex += PROFILER_L1_MARKER_UINT32_SIZE; uint32_t currEndIndex = profiler_control_buffer[HOST_BUFFER_END_INDEX_NC] + wIndex; @@ -523,6 +533,8 @@ namespace kernel_profiler{ #define DeviceZoneScopedSumN2( name ) DO_PRAGMA(message(PROFILER_MSG_NAME(name))); auto constexpr hash = kernel_profiler::Hash16_CT(PROFILER_MSG_NAME(name)); kernel_profiler::profileScopeAccumulate zone = kernel_profiler::profileScopeAccumulate(); +#define DeviceZoneSetCounter( counter ) kernel_profiler::set_host_counter(counter); + #else #define DeviceZoneScopedMainN( name ) @@ -537,4 +549,6 @@ namespace kernel_profiler{ #define DeviceZoneScopedND( name , nocBuffer, nocIndex ) +#define DeviceZoneSetCounter( counter ) + #endif diff --git a/tt_metal/tools/profiler/process_device_log.py b/tt_metal/tools/profiler/process_device_log.py index 65a47046be0..8495150c608 100755 --- a/tt_metal/tools/profiler/process_device_log.py +++ b/tt_metal/tools/profiler/process_device_log.py @@ -209,7 +209,15 @@ def import_device_profile_log(logPath): timerID = {"id": int(row[4].strip()), "zone_name": "", "zone_phase": "", "src_line": "", "src_file": ""} timeData = int(row[5].strip()) statData = 0 - if len(row) > 6: + if len(row) == 13: + statData = int(row[6].strip()) + timerID["run_id"] = int(row[7].strip()) + timerID["op_id"] = int(row[8].strip()) + timerID["zone_name"] = row[9].strip() + timerID["zone_phase"] = row[10].strip() + timerID["src_line"] = int(row[11].strip()) + timerID["src_file"] = row[12].strip() + elif len(row) == 12: statData = int(row[6].strip()) timerID["run_id"] = int(row[7].strip()) timerID["zone_name"] = row[8].strip() diff --git a/tt_metal/tools/profiler/process_ops_logs.py b/tt_metal/tools/profiler/process_ops_logs.py index 5a0259a5f9b..6b531be4ced 100755 --- a/tt_metal/tools/profiler/process_ops_logs.py +++ b/tt_metal/tools/profiler/process_ops_logs.py @@ -197,6 +197,10 @@ def append_device_data(ops, deviceLogFolder): cores = set() for timeID, ts, statData, risc, core in deviceOpTime["timeseries"]: if "zone_name" in timeID.keys() and "FW" in timeID["zone_name"]: + if "op_id" in timeID.keys(): + assert ( + timeID["op_id"] == deviceOp["global_call_count"] + ), f"op id {timeID['op_id']} reproted by device is not matching assigned op id {deviceOp['global_call_count']}" if core not in cores: cores.add(core) deviceOp["core_usage"] = {"count": len(cores), "cores": [str(core) for core in cores]} diff --git a/tt_metal/tools/profiler/profiler.cpp b/tt_metal/tools/profiler/profiler.cpp index ed8000cf91f..b5e64d7143b 100644 --- a/tt_metal/tools/profiler/profiler.cpp +++ b/tt_metal/tools/profiler/profiler.cpp @@ -88,6 +88,7 @@ void DeviceProfiler::readRiscProfilerResults( uint32_t riscNumRead = 0; uint32_t coreFlatIDRead = 0; uint32_t runCounterRead = 0; + uint32_t runHostCounterRead = 0; bool newRunStart = false; @@ -108,7 +109,8 @@ void DeviceProfiler::readRiscProfilerResults( //TODO(MO): Cleanup magic numbers riscNumRead = profile_buffer[index] & 0x7; coreFlatIDRead = (profile_buffer[index] >> 3) & 0xFF; - runCounterRead = profile_buffer[index + 1]; + runCounterRead = profile_buffer[index + 1] & 0xFFFF; + runHostCounterRead = (profile_buffer[index + 1] >> 16 ) & 0xFFFF; } else @@ -149,6 +151,7 @@ void DeviceProfiler::readRiscProfilerResults( dumpResultToFile( runCounterRead, + runHostCounterRead, device_id, worker_core, coreFlatID, @@ -167,6 +170,7 @@ void DeviceProfiler::readRiscProfilerResults( uint32_t time_L = opTime_L; dumpResultToFile( runCounterRead, + runHostCounterRead, device_id, worker_core, coreFlatID, @@ -201,6 +205,7 @@ void DeviceProfiler::firstTimestamp(uint64_t timestamp) void DeviceProfiler::dumpResultToFile( uint32_t run_id, + uint32_t run_host_id, int device_id, CoreCoord core, int core_flat, @@ -237,7 +242,7 @@ void DeviceProfiler::dumpResultToFile( source_line = stoi(source_line_str); } - tracy::TTDeviceEvent event = tracy::TTDeviceEvent(run_id, device_id, core.x, core.y, risc_num, timer_id, timestamp, source_line, source_file, zone_name, zone_phase); + tracy::TTDeviceEvent event = tracy::TTDeviceEvent(run_host_id, device_id, core.x, core.y, risc_num, timer_id, timestamp, source_line, source_file, zone_name, zone_phase); auto ret = device_events.insert(event); @@ -249,7 +254,7 @@ void DeviceProfiler::dumpResultToFile( { log_file.open(log_path); log_file << "ARCH: " << get_string_lowercase(device_architecture) << ", CHIP_FREQ[MHz]: " << device_core_frequency << std::endl; - log_file << "PCIe slot, core_x, core_y, RISC processor type, timer_id, time[cycles since reset], stat value, Run ID, zone name, zone phase, source line, source file" << std::endl; + log_file << "PCIe slot, core_x, core_y, RISC processor type, timer_id, time[cycles since reset], stat value, run ID, run host ID, zone name, zone phase, source line, source file" << std::endl; new_log = false; } else @@ -258,7 +263,7 @@ void DeviceProfiler::dumpResultToFile( } //log_file << fmt::format("{:4},{:3},{:3},{:>7},{:7},{:15},{:15},{:5},{:>25},{:>6},{:6},{}", - log_file << fmt::format("{},{},{},{},{},{},{},{},{},{},{},{}", + log_file << fmt::format("{},{},{},{},{},{},{},{},{},{},{},{},{}", device_id, core.x, core.y, @@ -267,6 +272,7 @@ void DeviceProfiler::dumpResultToFile( timestamp, stat_value, run_id, + run_host_id, zone_name, magic_enum::enum_name(zone_phase), source_line, diff --git a/tt_metal/tools/profiler/profiler.hpp b/tt_metal/tools/profiler/profiler.hpp index 79bbf2a3777..8370e3ec97c 100644 --- a/tt_metal/tools/profiler/profiler.hpp +++ b/tt_metal/tools/profiler/profiler.hpp @@ -72,6 +72,7 @@ class DeviceProfiler { // Dumping profile result to file void dumpResultToFile( uint32_t runID, + uint32_t runHostID, int device_id, CoreCoord core, int core_flat, diff --git a/tt_metal/tt_metal.cpp b/tt_metal/tt_metal.cpp index 54d60c44523..c2d9dc1f18a 100644 --- a/tt_metal/tt_metal.cpp +++ b/tt_metal/tt_metal.cpp @@ -532,7 +532,7 @@ void LaunchProgram(Device *device, Program &program, bool wait_until_cores_done) for (const auto &[core_type, logical_cores] : logical_cores_used_in_program) { for (const auto &logical_core : logical_cores) { launch_msg_t *msg = &program.kernels_on_core(logical_core, core_type)->launch_msg; - + msg->kernel_config.host_assigned_op_id = program.get_global_id(); auto physical_core = device->physical_core_from_logical_core(logical_core, core_type); not_done_cores.insert(physical_core); tt::llrt::write_launch_msg_to_core(device->id(), physical_core, msg); diff --git a/ttnn/cpp/ttnn/device_operation.hpp b/ttnn/cpp/ttnn/device_operation.hpp index a29bb23dca0..82a057ee433 100644 --- a/ttnn/cpp/ttnn/device_operation.hpp +++ b/ttnn/cpp/ttnn/device_operation.hpp @@ -293,6 +293,8 @@ typename device_operation_t::tensor_return_value_t run( auto& program = create_or_get_program_from_cache( program_cache, program_cache_hit, program_hash, operation_attributes, tensor_args, tensor_return_value); + program.set_global_id(operation_id); + if (USE_FAST_DISPATCH) { ZoneScopedN("EnqueueProgram"); auto& queue = device->command_queue(cq_id);