From e1f2f085296f9d0093576e4f2c545c750de71c7d Mon Sep 17 00:00:00 2001 From: Aditya Saigal <129097327+tt-asaigal@users.noreply.github.com> Date: Mon, 30 Sep 2024 17:15:53 -0400 Subject: [PATCH] #0: Move remote chip event synchronization to dispatch core (#13256) - The previous R-Chip implementation required event syncs to be processed on prefetch_h, because the tunneler did not have VCs and there was a latent deadlock - With VCs in place, event synchronization can be performed on dispatch_d, which is more performant because the prefetcher can keep buffering commands while dispatch_d is waiting --- .../gtests/test_multi_cq_multi_dev.cpp | 2 +- tt_metal/impl/device/device.cpp | 13 ++++-- tt_metal/impl/dispatch/command_queue.cpp | 27 ++---------- tt_metal/impl/dispatch/cq_commands.hpp | 29 ++----------- tt_metal/impl/dispatch/debug_tools.cpp | 8 ---- tt_metal/impl/dispatch/device_command.hpp | 42 ------------------- .../impl/dispatch/kernels/cq_dispatch.cpp | 19 --------- .../impl/dispatch/kernels/cq_prefetch.cpp | 19 --------- 8 files changed, 16 insertions(+), 143 deletions(-) diff --git a/tests/ttnn/unit_tests/gtests/test_multi_cq_multi_dev.cpp b/tests/ttnn/unit_tests/gtests/test_multi_cq_multi_dev.cpp index 2c11a97d330..e06824058b6 100644 --- a/tests/ttnn/unit_tests/gtests/test_multi_cq_multi_dev.cpp +++ b/tests/ttnn/unit_tests/gtests/test_multi_cq_multi_dev.cpp @@ -73,7 +73,7 @@ TEST_F(MultiCommandQueueT3KFixture, Test2CQMultiDeviceProgramsOnCQ1) { ttnn::record_event(device->command_queue(1), workload_event); ttnn::wait_for_event(device->command_queue(0), workload_event); - ttnn::read_buffer(1, output_tensor, {readback_data, readback_data, readback_data, readback_data, readback_data, readback_data, readback_data, readback_data}); + ttnn::read_buffer(0, output_tensor, {readback_data, readback_data, readback_data, readback_data, readback_data, readback_data, readback_data, readback_data}); for (int j = 0; j < 3 * 2048 * 2048; j++) { ASSERT_EQ(readback_data[j].to_float(), -1 * (i + dev_idx) * 
32 + 500); diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp index cccacf8e7d0..2775a4ac380 100644 --- a/tt_metal/impl/device/device.cpp +++ b/tt_metal/impl/device/device.cpp @@ -1913,8 +1913,11 @@ void Device::configure_command_queue_programs() { tt_cxy_pair prefetch_location = dispatch_core_manager::instance().prefetcher_core(device_id, channel, cq_id); tt_cxy_pair completion_q_writer_location = dispatch_core_manager::instance().completion_queue_writer_core(device_id, channel, cq_id); tt_cxy_pair dispatch_location = dispatch_core_manager::instance().dispatcher_core(device_id, channel, cq_id); + tt_cxy_pair remote_dispatcher_location; + if (not this->is_mmio_capable()) { + remote_dispatcher_location = dispatch_core_manager::instance().dispatcher_d_core(device_id, channel, cq_id); + } CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(mmio_device_id); - TT_ASSERT(prefetch_location.chip == mmio_device_id and completion_q_writer_location.chip == mmio_device_id, "Issue queue interface is on device {} and completion queue interface is on device {} but they are expected to be on device {}", prefetch_location.chip, completion_q_writer_location.chip, mmio_device_id); @@ -1927,9 +1930,11 @@ void Device::configure_command_queue_programs() { detail::WriteToDeviceL1(mmio_device, prefetch_location, CQ_PREFETCH_Q_RD_PTR, prefetch_q_rd_ptr_addr_data, dispatch_core_type); detail::WriteToDeviceL1(mmio_device, prefetch_location, CQ_PREFETCH_Q_PCIE_RD_PTR, prefetch_q_pcie_rd_ptr_addr_data, dispatch_core_type); detail::WriteToDeviceL1(mmio_device, prefetch_location, dispatch_constants::PREFETCH_Q_BASE, prefetch_q, dispatch_core_type); - // Used for prefetch_h, since a wait_for_event on remote chips requires prefetch_h to spin until dispatch_d notfiies completion - detail::WriteToDeviceL1(mmio_device, prefetch_location, CQ0_COMPLETION_LAST_EVENT, zero, dispatch_core_type); - detail::WriteToDeviceL1(mmio_device, 
prefetch_location, CQ1_COMPLETION_LAST_EVENT, zero, dispatch_core_type); + if (not this->is_mmio_capable()) { + // Initialize event counters to 0 on dispatch_d on r-chip + detail::WriteToDeviceL1(this, remote_dispatcher_location, CQ0_COMPLETION_LAST_EVENT, zero, dispatch_core_type); + detail::WriteToDeviceL1(this, remote_dispatcher_location, CQ1_COMPLETION_LAST_EVENT, zero, dispatch_core_type); + } // Initialize completion queue write pointer and read pointer copy uint32_t issue_queue_size = this->sysmem_manager_->get_issue_queue_size(cq_id); uint32_t completion_queue_start_addr = CQ_START + issue_queue_size + get_absolute_cq_offset(channel, cq_id, cq_size); diff --git a/tt_metal/impl/dispatch/command_queue.cpp b/tt_metal/impl/dispatch/command_queue.cpp index bc5344a552f..2a1353b164d 100644 --- a/tt_metal/impl/dispatch/command_queue.cpp +++ b/tt_metal/impl/dispatch/command_queue.cpp @@ -1521,12 +1521,6 @@ void EnqueueRecordEventCommand::process() { sizeof(CQPrefetchCmd) + sizeof(CQDispatchCmd) + dispatch_constants::EVENT_PADDED_SIZE, pcie_alignment); // CQ_PREFETCH_CMD_RELAY_INLINE + CQ_DISPATCH_CMD_WRITE_LINEAR_HOST + event ID - if (not device->is_mmio_capable()) { - cmd_sequence_sizeB += - CQ_PREFETCH_CMD_BARE_MIN_SIZE * - num_hw_cqs; // CQ_DISPATCH_REMOTE_WRITE (number of writes = number of prefetch_h cores on this CQ) - } - void* cmd_region = this->manager.issue_queue_reserve(cmd_sequence_sizeB, this->command_queue_id); HugepageDeviceCommand command_sequence(cmd_region, cmd_sequence_sizeB); @@ -1564,18 +1558,6 @@ void EnqueueRecordEventCommand::process() { event_payloads, packed_write_max_unicast_sub_cmds); - if (not device->is_mmio_capable()) { - for (uint8_t cq_id = 0; cq_id < num_hw_cqs; cq_id++) { - tt_cxy_pair prefetch_location = - dispatch_core_manager::instance().prefetcher_core(this->device->id(), channel, cq_id); - CoreCoord prefetch_physical_core = get_physical_core_coordinate(prefetch_location, core_type); - 
command_sequence.add_dispatch_write_remote( - this->event_id, - this->device->get_noc_unicast_encoding(this->noc_index, prefetch_physical_core), - address); - } - } - bool flush_prefetch = true; command_sequence.add_dispatch_write_host( flush_prefetch, dispatch_constants::EVENT_PADDED_SIZE, true, event_payload.data()); @@ -1606,18 +1588,15 @@ EnqueueWaitForEventCommand::EnqueueWaitForEventCommand( void EnqueueWaitForEventCommand::process() { uint32_t cmd_sequence_sizeB = CQ_PREFETCH_CMD_BARE_MIN_SIZE; // CQ_PREFETCH_CMD_RELAY_INLINE + CQ_DISPATCH_CMD_WAIT - // or CQ_PREFETCH_CMD_WAIT_FOR_EVENT void* cmd_region = this->manager.issue_queue_reserve(cmd_sequence_sizeB, this->command_queue_id); HugepageDeviceCommand command_sequence(cmd_region, cmd_sequence_sizeB); uint32_t last_completed_event_address = sync_event.cq_id == 0 ? CQ0_COMPLETION_LAST_EVENT : CQ1_COMPLETION_LAST_EVENT; - if (this->device->is_mmio_capable()) { - command_sequence.add_dispatch_wait(false, last_completed_event_address, sync_event.event_id, this->clear_count); - } else { - command_sequence.add_prefetch_wait_for_event(sync_event.event_id, last_completed_event_address); - } + + command_sequence.add_dispatch_wait(false, last_completed_event_address, sync_event.event_id, this->clear_count); + this->manager.issue_queue_push_back(cmd_sequence_sizeB, this->command_queue_id); this->manager.fetch_queue_reserve_back(this->command_queue_id); diff --git a/tt_metal/impl/dispatch/cq_commands.hpp b/tt_metal/impl/dispatch/cq_commands.hpp index 3f71158ed65..2f9659de091 100644 --- a/tt_metal/impl/dispatch/cq_commands.hpp +++ b/tt_metal/impl/dispatch/cq_commands.hpp @@ -27,8 +27,7 @@ enum CQPrefetchCmdId : uint8_t { CQ_PREFETCH_CMD_EXEC_BUF_END = 7, // finish executing commands from a buffer (return), payload like relay_inline CQ_PREFETCH_CMD_STALL = 8, // drain pipe through dispatcher CQ_PREFETCH_CMD_DEBUG = 9, // log waypoint data to watcher, checksum - CQ_PREFETCH_CMD_WAIT_FOR_EVENT = 10, // wait_for_event: 
stall until dispatcher signals event completion - CQ_PREFETCH_CMD_TERMINATE = 11, // quit + CQ_PREFETCH_CMD_TERMINATE = 10, // quit CQ_PREFETCH_CMD_MAX_COUNT, // for checking legal IDs }; @@ -47,9 +46,8 @@ enum CQDispatchCmdId : uint8_t { CQ_DISPATCH_CMD_DEBUG = 10, // log waypoint data to watcher, checksum CQ_DISPATCH_CMD_DELAY = 11, // insert delay (for testing) CQ_DISPATCH_CMD_EXEC_BUF_END = 12, // dispatch_d notify prefetch_h that exec_buf has completed - CQ_DISPATCH_CMD_REMOTE_WRITE = 13, // dispatch_d issues write to address on L-Chip through dispatch_h - CQ_DISPATCH_CMD_SET_WRITE_OFFSET = 14, // set the offset to add to all non-host destination addresses (relocation) - CQ_DISPATCH_CMD_TERMINATE = 15, // quit + CQ_DISPATCH_CMD_SET_WRITE_OFFSET = 13, // set the offset to add to all non-host destination addresses (relocation) + CQ_DISPATCH_CMD_TERMINATE = 14, // quit CQ_DISPATCH_CMD_MAX_COUNT, // for checking legal IDs }; @@ -115,13 +113,6 @@ struct CQPrefetchRelayInlineCmd { uint32_t stride; // explicit stride saves a few insns on device } __attribute__((packed)); -struct CQPrefetchWaitForEventCmd { - uint8_t pad1; - uint16_t pad2; - uint32_t sync_event; - uint32_t sync_event_addr; -} __attribute__((packed)); - struct CQPrefetchExecBufCmd { uint8_t pad1; uint16_t pad2; @@ -138,7 +129,6 @@ struct CQPrefetchCmd { CQPrefetchRelayPagedPackedCmd relay_paged_packed; CQPrefetchRelayInlineCmd relay_inline; CQPrefetchExecBufCmd exec_buf; - CQPrefetchWaitForEventCmd event_wait; CQGenericDebugCmd debug; } __attribute__((packed)); }; @@ -241,18 +231,6 @@ struct CQDispatchDelayCmd { uint32_t delay; } __attribute__((packed)); -// When dispatch_d gets this command, it will be -// forwarded to dispatch_h, which will write data -// to local noc_xy_addr at offset addr. The data -// is currently a uint32_t field, which is inlined in -// the command. 
-struct CQDispatchRemoteWriteCmd { - uint32_t data; - uint32_t noc_xy_addr; - uint32_t addr; -} __attribute__((packed)); - - struct CQDispatchSetWriteOffsetCmd { uint8_t pad1; uint16_t pad2; @@ -270,7 +248,6 @@ struct CQDispatchCmd { CQDispatchWritePagedCmd write_paged; CQDispatchWritePackedCmd write_packed; CQDispatchWritePackedLargeCmd write_packed_large; - CQDispatchRemoteWriteCmd write_from_remote; CQDispatchWaitCmd wait; CQGenericDebugCmd debug; CQDispatchDelayCmd delay; diff --git a/tt_metal/impl/dispatch/debug_tools.cpp b/tt_metal/impl/dispatch/debug_tools.cpp index 1216b600141..5e58aa53f97 100644 --- a/tt_metal/impl/dispatch/debug_tools.cpp +++ b/tt_metal/impl/dispatch/debug_tools.cpp @@ -183,7 +183,6 @@ uint32_t dump_dispatch_cmd(CQDispatchCmd *cmd, uint32_t cmd_addr, std::ofstream case CQ_DISPATCH_CMD_GO: break; case CQ_DISPATCH_CMD_SINK: break; case CQ_DISPATCH_CMD_EXEC_BUF_END: break; - case CQ_DISPATCH_CMD_REMOTE_WRITE: break; case CQ_DISPATCH_CMD_TERMINATE: break; case CQ_DISPATCH_CMD_SET_WRITE_OFFSET: break; default: TT_THROW("Unrecognized dispatch command: {}", cmd_id); break; @@ -249,13 +248,6 @@ uint32_t dump_prefetch_cmd(CQPrefetchCmd *cmd, uint32_t cmd_addr, std::ofstream val(cmd->debug.stride)); stride = cmd->debug.stride; break; - case CQ_PREFETCH_CMD_WAIT_FOR_EVENT: - iq_file << fmt::format( - " (sync_event={:#08x}, sync_event_addr={:#08x})", - val(cmd->event_wait.sync_event), - val(cmd->event_wait.sync_event_addr)); - stride = CQ_PREFETCH_CMD_BARE_MIN_SIZE + sizeof(CQPrefetchHToPrefetchDHeader); - break; // These commands don't have any additional data to dump. 
case CQ_PREFETCH_CMD_ILLEGAL: break; case CQ_PREFETCH_CMD_STALL: break; diff --git a/tt_metal/impl/dispatch/device_command.hpp b/tt_metal/impl/dispatch/device_command.hpp index 7d2b14fdaa0..7ad8aab48b6 100644 --- a/tt_metal/impl/dispatch/device_command.hpp +++ b/tt_metal/impl/dispatch/device_command.hpp @@ -129,48 +129,6 @@ class DeviceCommand { } } - void add_prefetch_wait_for_event(uint32_t event_id, uint32_t event_addr) { - uint32_t increment_sizeB = align(sizeof(CQPrefetchCmd), this->pcie_alignment); - auto initialize_wait_cmd = [&](CQPrefetchCmd *wait_cmd) { - *wait_cmd = {}; - wait_cmd->base.cmd_id = CQ_PREFETCH_CMD_WAIT_FOR_EVENT; - wait_cmd->event_wait.sync_event = event_id; - wait_cmd->event_wait.sync_event_addr = event_addr; - }; - CQPrefetchCmd *wait_cmd_dst = this->reserve_space(increment_sizeB); - if constexpr (hugepage_write) { - alignas(MEMCPY_ALIGNMENT) CQPrefetchCmd wait_cmd; - initialize_wait_cmd(&wait_cmd); - this->memcpy(wait_cmd_dst, &wait_cmd, sizeof(CQPrefetchCmd)); - } else { - initialize_wait_cmd(wait_cmd_dst); - } - } - - void add_dispatch_write_remote(uint32_t data, uint32_t noc_xy_addr, uint32_t addr) { - auto initialize_cross_prefetch_write = [&](CQPrefetchCmd *relay_write, CQDispatchCmd *write_cmd) { - relay_write->base.cmd_id = CQ_PREFETCH_CMD_RELAY_INLINE; - relay_write->relay_inline.length = sizeof(CQDispatchCmd); - relay_write->relay_inline.stride = CQ_PREFETCH_CMD_BARE_MIN_SIZE; - write_cmd->base.cmd_id = CQ_DISPATCH_CMD_REMOTE_WRITE; - write_cmd->write_from_remote.data = data; - write_cmd->write_from_remote.noc_xy_addr = noc_xy_addr; - write_cmd->write_from_remote.addr = addr; - }; - CQPrefetchCmd *relay_write_dst = this->reserve_space(sizeof(CQPrefetchCmd)); - CQDispatchCmd *write_cmd_dst = this->reserve_space(sizeof(CQDispatchCmd)); - - if constexpr (hugepage_write) { - alignas(MEMCPY_ALIGNMENT) CQPrefetchCmd relay_write; - alignas(MEMCPY_ALIGNMENT) CQDispatchCmd write_cmd; - initialize_cross_prefetch_write(&relay_write, 
&write_cmd); - this->memcpy(relay_write_dst, &relay_write, sizeof(CQPrefetchCmd)); - this->memcpy(write_cmd_dst, &write_cmd, sizeof(CQDispatchCmd)); - } else { - initialize_cross_prefetch_write(write_cmd_dst); - } - } - void add_prefetch_relay_linear(uint32_t noc_xy_addr, uint32_t lengthB, uint32_t addr) { uint32_t increment_sizeB = align(sizeof(CQPrefetchCmd), this->pcie_alignment); auto initialize_relay_linear_cmd = [&](CQPrefetchCmd *relay_linear_cmd) { diff --git a/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp b/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp index e939d055eac..2b80765c827 100644 --- a/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp +++ b/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp @@ -218,12 +218,6 @@ void process_write_host_h(uint32_t& block_noc_writes_to_clear, uint32_t block_ne cmd_ptr = data_ptr; } -inline void process_remote_write_h() { - volatile tt_l1_ptr CQDispatchCmd *cmd = (volatile tt_l1_ptr CQDispatchCmd *)cmd_ptr; - noc_inline_dw_write(get_noc_addr_helper(cmd->write_from_remote.noc_xy_addr, cmd->write_from_remote.addr), cmd->write_from_remote.data); - cmd_ptr += sizeof(CQDispatchCmd); -} - void process_exec_buf_end_h() { if (split_prefetch) { invalidate_l1_cache(); @@ -884,15 +878,6 @@ static inline bool process_cmd_d(uint32_t &cmd_ptr, uint32_t* l1_cache, uint32_t } break; - case CQ_DISPATCH_CMD_REMOTE_WRITE: - DPRINT << "cmd_remote_write\n"; - if (is_d_variant && !is_h_variant) { - // Relay write to dispatch_h, which will issue it on local chip - relay_to_next_cb(cmd_ptr, sizeof(CQDispatchCmd), block_noc_writes_to_clear, block_next_start_addr); - } - cmd_ptr += sizeof(CQDispatchCmd); - break; - case CQ_DISPATCH_CMD_SET_WRITE_OFFSET: DPRINT << "write offset: " << cmd->set_write_offset.offset0 << " " << @@ -948,10 +933,6 @@ static inline bool process_cmd_h(uint32_t &cmd_ptr, uint32_t& block_noc_writes_t DPRINT << "dispatch_h exec_buf_end\n"; process_exec_buf_end_h(); break; - case CQ_DISPATCH_CMD_REMOTE_WRITE: - DPRINT << 
"cmd_remote_write\n"; - process_remote_write_h(); - break; case CQ_DISPATCH_CMD_TERMINATE: DPRINT << "dispatch_h terminate\n"; cmd_ptr += sizeof(CQDispatchCmd); diff --git a/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp b/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp index a647cda3ad9..a2fd7a25789 100644 --- a/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp +++ b/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp @@ -531,16 +531,6 @@ uint32_t process_relay_paged_cmd_large(uint32_t cmd_ptr, return CQ_PREFETCH_CMD_BARE_MIN_SIZE; } -inline uint32_t process_prefetch_h_wait_cmd(uint32_t cmd_ptr) { - volatile CQPrefetchCmd tt_l1_ptr *cmd = (volatile CQPrefetchCmd tt_l1_ptr *)(cmd_ptr + sizeof(CQPrefetchHToPrefetchDHeader)); - volatile tt_l1_ptr uint32_t* event_addr = - reinterpret_cast(cmd->event_wait.sync_event_addr); - do { - invalidate_l1_cache(); - } while (*event_addr < cmd->event_wait.sync_event); - return CQ_PREFETCH_CMD_BARE_MIN_SIZE + sizeof(CQPrefetchHToPrefetchDHeader); -} - // This fn prefetches data from DRAM memory and writes data to the dispatch core. 
// Reading from DRAM has the following characteristics: // - latency is moderately high ~400 cycles on WH @@ -1181,10 +1171,6 @@ bool process_cmd(uint32_t& cmd_ptr, stride = process_debug_cmd(cmd_ptr); break; - case CQ_PREFETCH_CMD_WAIT_FOR_EVENT: - stride = CQ_PREFETCH_CMD_BARE_MIN_SIZE; - break; - case CQ_PREFETCH_CMD_TERMINATE: //DPRINT << "prefetch terminating_" << is_h_variant << is_d_variant << ENDL(); ASSERT(!exec_buf); @@ -1319,11 +1305,6 @@ void kernel_main_h() { volatile CQPrefetchCmd tt_l1_ptr *cmd = (volatile CQPrefetchCmd tt_l1_ptr *)(cmd_ptr + sizeof(CQPrefetchHToPrefetchDHeader)); uint32_t cmd_id = cmd->base.cmd_id; - if (cmd_id == CQ_PREFETCH_CMD_WAIT_FOR_EVENT) { - // prefetch_h will stop execution until it recieves an event update from the - // dispatch_d core assigned to the other CQ - uint32_t stride = process_prefetch_h_wait_cmd(cmd_ptr); - } // Infer that an exec_buf command is to be executed based on the stall state. bool is_exec_buf = (stall_state == STALLED); cmd_ptr = process_relay_inline_all(cmd_ptr, fence, is_exec_buf);