diff --git a/tt_metal/impl/dispatch/command_queue.cpp b/tt_metal/impl/dispatch/command_queue.cpp
index 5605e0df97f5..bb00b09b0a8f 100644
--- a/tt_metal/impl/dispatch/command_queue.cpp
+++ b/tt_metal/impl/dispatch/command_queue.cpp
@@ -1029,6 +1029,12 @@ void EnqueueProgramCommand::assemble_device_commands(
                     read_length = max_paged_length_per_sub_cmd;
                     write_length = read_length;
                 }
+                if (!kernel_bins_dispatch_subcmds.empty() && !kernel_bins_dispatch_subcmds.back().empty()) {
+                    auto& back = kernel_bins_dispatch_subcmds.back().back();
+                    if (back.noc_xy_addr != noc_encoding) {
+                        back.flags = CQ_DISPATCH_CMD_PACKED_WRITE_LARGE_FLAG_UNLINK;
+                    }
+                }
                 kernel_bins_dispatch_subcmds.back().emplace_back(CQDispatchWritePackedLargeSubCmd{
                     .noc_xy_addr = noc_encoding,
                     .addr = kernel_config_buffer_offset,
@@ -1050,9 +1056,12 @@ void EnqueueProgramCommand::assemble_device_commands(
                 }
             }
         }
-        // Unlink the last subcmd of the current core range
-        if (!write_linear) {
-            kernel_bins_dispatch_subcmds.back().back().flags |= CQ_DISPATCH_CMD_PACKED_WRITE_LARGE_FLAG_UNLINK;
+    }
+    // Unlink the last subcmd of the current core range
+    for (auto& subcmd_list : kernel_bins_dispatch_subcmds) {
+        if (!subcmd_list.empty()) {
+            subcmd_list.back().flags |= CQ_DISPATCH_CMD_PACKED_WRITE_LARGE_FLAG_UNLINK;
+        }
     }
 
     uint32_t pcie_alignment = hal.get_alignment(HalMemType::HOST);
diff --git a/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp b/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp
index c4544bba211c..57d76e7fadae 100644
--- a/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp
+++ b/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp
@@ -694,6 +694,7 @@ void process_write_packed_large(
     CQDispatchWritePackedLargeSubCmd* sub_cmd_ptr = (CQDispatchWritePackedLargeSubCmd*)l1_cache;
 
     bool init_state = true;
+    bool must_barrier = true;
     while (count != 0) {
         uint32_t dst_addr = sub_cmd_ptr->addr + local_write_offset;
         uint32_t length = sub_cmd_ptr->length;
@@ -706,7 +707,8 @@ void process_write_packed_large(
             mcasts += num_dests * writes;
            noc_nonposted_writes_acked[noc_index] = mcasts;
             writes = 0;
-            noc_async_write_barrier();
+            if (must_barrier)
+                noc_async_write_barrier();
         };
 
         // Only re-init state after we have unlinked the last transaction
@@ -717,8 +719,10 @@ void process_write_packed_large(
             uint32_t dst_noc = sub_cmd_ptr->noc_xy_addr;
             // TODO: Linking should be set to true once atomic txn is handled properly
             cq_noc_async_write_init_state(0, get_noc_addr_helper(dst_noc, dst_addr));
+            must_barrier = true;
         }
+
         sub_cmd_ptr++;
 
         while (length != 0) {
@@ -751,6 +755,7 @@ void process_write_packed_large(
                     xfer_size = available_data;
                     wait_for_barrier();
                     cq_noc_async_write_with_state_any_len(data_ptr, dst_addr, xfer_size, num_dests);
+                    must_barrier = false;
                     writes++;
                 } else {
                     xfer_size = length;
@@ -761,6 +766,7 @@ void process_write_packed_large(
                         uint32_t dst_addr2 = dst_addr;
                         wait_for_barrier();
                         cq_noc_async_write_with_state(src_addr, dst_addr2, NOC_MAX_BURST_SIZE, num_dests);
+                        must_barrier = false;
                         writes++;
                         src_addr += NOC_MAX_BURST_SIZE;
                         dst_addr2 += NOC_MAX_BURST_SIZE;
@@ -782,9 +788,12 @@ void process_write_packed_large(
                     cq_noc_async_write_with_state(
                         data_ptr + data_offset, dst_addr + data_offset, rem_xfer_size, num_dests);
                     writes++;
+                    must_barrier = true;
+                    // Later writes must barrier.
                 } else {
                     wait_for_barrier();
                     cq_noc_async_write_with_state_any_len(data_ptr, dst_addr, xfer_size, num_dests);
+                    must_barrier = false;
                     writes++;
                 }
             }
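
The command_queue.cpp side of the change can be read as two rules: (1) when the next subcmd targets a different NOC destination, unlink the previous subcmd so a linked chain never spans destinations, and (2) the last subcmd of every subcmd list is unlinked unconditionally rather than only when !write_linear. Below is a minimal stand-alone sketch of those two rules; the struct and flag are pared-down stand-ins for the real tt-metal definitions, not the actual types.

#include <cstdint>
#include <cstdio>
#include <vector>

// Pared-down stand-ins for the real tt-metal struct and flag (illustration only).
constexpr uint8_t CQ_DISPATCH_CMD_PACKED_WRITE_LARGE_FLAG_UNLINK = 0x1;

struct CQDispatchWritePackedLargeSubCmd {
    uint32_t noc_xy_addr;
    uint32_t addr;
    uint32_t length;
    uint8_t flags = 0;
};

using SubCmdList = std::vector<CQDispatchWritePackedLargeSubCmd>;

// Rule 1 (first hunk): before appending a subcmd for a new NOC destination,
// unlink the previous subcmd so linked transactions never cross destinations.
void append_subcmd(std::vector<SubCmdList>& lists, uint32_t noc_encoding, uint32_t addr, uint32_t length) {
    if (lists.empty()) {
        lists.emplace_back();
    }
    if (!lists.back().empty() && lists.back().back().noc_xy_addr != noc_encoding) {
        lists.back().back().flags = CQ_DISPATCH_CMD_PACKED_WRITE_LARGE_FLAG_UNLINK;
    }
    lists.back().push_back({noc_encoding, addr, length});
}

// Rule 2 (second hunk): once assembly is done, unlink the tail of every list.
void unlink_list_tails(std::vector<SubCmdList>& lists) {
    for (auto& subcmd_list : lists) {
        if (!subcmd_list.empty()) {
            subcmd_list.back().flags |= CQ_DISPATCH_CMD_PACKED_WRITE_LARGE_FLAG_UNLINK;
        }
    }
}

int main() {
    std::vector<SubCmdList> lists;
    append_subcmd(lists, /*noc_encoding=*/0xA, 0x1000, 256);
    append_subcmd(lists, 0xA, 0x2000, 256); // same dest: previous stays linked
    append_subcmd(lists, 0xB, 0x3000, 256); // new dest: previous gets unlinked
    unlink_list_tails(lists);
    for (const auto& cmd : lists.front()) {
        std::printf("dest=%x flags=%u\n", (unsigned)cmd.noc_xy_addr, (unsigned)cmd.flags);
    }
}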
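On the kernel side, must_barrier elides redundant noc_async_write_barrier() calls: a barrier is only needed on a fresh write state or after a write whose trailing part was issued past a prior barrier. The toy model below traces that state machine; the print stubs stand in for the real cq_noc_async_write_* / noc_async_write_barrier calls, and the "wrapped write" case is an inference from the dispatch-buffer handling around the hunk at line 782, not a verbatim transcription of the kernel.

#include <cstdio>

// Illustrative model of the must_barrier state machine in process_write_packed_large.
static bool must_barrier = true;

// Mirrors the patched wait_for_barrier lambda: the barrier is skipped when the
// preceding write is already known not to need ordering (must_barrier == false).
void wait_for_barrier() {
    if (must_barrier) {
        std::puts("  noc_async_write_barrier");
    }
}

// A write issued in one piece: the next wait can be elided.
void write_whole(const char* what) {
    wait_for_barrier();
    std::printf("  write %s\n", what);
    must_barrier = false;
}

// A write split at the circular-buffer wrap: its tail is issued after the
// barrier check, so the next wait must barrier again ("Later writes must barrier").
void write_wrapped(const char* what) {
    wait_for_barrier();
    std::printf("  write %s (head)\n", what);
    std::printf("  write %s (tail)\n", what);
    must_barrier = true;
}

// Mirrors the re-init path: a fresh destination always barriers first.
void init_write_state() {
    std::puts("  cq_noc_async_write_init_state");
    must_barrier = true;
}

int main() {
    std::puts("subcmd 0 (new dest):");
    init_write_state();
    write_whole("bin A");
    std::puts("subcmd 1 (same dest, linked):");
    write_whole("bin B"); // barrier elided here
    std::puts("subcmd 2 (wraps the dispatch buffer):");
    write_wrapped("bin C");
    std::puts("subcmd 3:");
    write_whole("bin D"); // barrier required again after the wrap
}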