From 8825f5a46d78e77049efd327bf42b26edca7f5e8 Mon Sep 17 00:00:00 2001 From: asaigal Date: Thu, 25 Jul 2024 02:23:49 +0000 Subject: [PATCH] #11092: Remove need for orphan writes in dispatch write functions - process_write_linear, process_write_paged and process_write_packed_large will now write up to the fence --- tt_metal/impl/dispatch/kernels/cq_common.hpp | 13 ++ .../impl/dispatch/kernels/cq_dispatch.cpp | 126 +++++------------- 2 files changed, 47 insertions(+), 92 deletions(-) diff --git a/tt_metal/impl/dispatch/kernels/cq_common.hpp b/tt_metal/impl/dispatch/kernels/cq_common.hpp index 44d60a0712e..79d96b9b5fc 100644 --- a/tt_metal/impl/dispatch/kernels/cq_common.hpp +++ b/tt_metal/impl/dispatch/kernels/cq_common.hpp @@ -110,6 +110,19 @@ void cq_noc_async_write_with_state(uint32_t src_addr, uint64_t dst_addr, uint32_ } } +// More generic version of cq_noc_async_write_with_state: Allows writing an arbitrary amount of data, when the NOC config (dst_noc, +// VC..) has been specified. +FORCE_INLINE +void cq_noc_async_write_with_state_any_len(uint32_t src_addr, uint64_t dst_addr, uint32_t size = 0, uint32_t ndests = 1) { + while(size > NOC_MAX_BURST_SIZE) { + cq_noc_async_write_with_state(src_addr, dst_addr, NOC_MAX_BURST_SIZE, ndests); + src_addr += NOC_MAX_BURST_SIZE; + dst_addr += NOC_MAX_BURST_SIZE; + size -= NOC_MAX_BURST_SIZE; + } + cq_noc_async_write_with_state(src_addr, dst_addr, size, ndests); +} + template FORCE_INLINE void cq_noc_async_write_init_state(uint32_t src_addr, uint64_t dst_addr, uint32_t size = 0) { diff --git a/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp b/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp index 4274ef46293..6b166ddf49c 100644 --- a/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp +++ b/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp @@ -384,45 +384,19 @@ void process_write_linear(uint32_t num_mcast_dests) { cq_noc_async_write_init_state(0, get_noc_addr_helper(dst_noc, dst_addr)); while (length != 0) { - uint32_t xfer_size = 
(length > dispatch_cb_page_size) ? dispatch_cb_page_size : length; - // "Reserve" pages for the next write from this block - block_noc_writes_to_clear[rd_block_idx]++; - // Get a page if needed - if (data_ptr + xfer_size > cb_fence) { - // Check for block completion + // More data needs to be written, but we've exhausted the CB. Acquire more pages. + if (cb_fence == data_ptr) { if (cb_fence == block_next_start_addr[rd_block_idx]) { - uint32_t orphan_size = cb_fence - data_ptr; - // No more writes from this block. Decrement the number of writes - // since they were all accounted for. - block_noc_writes_to_clear[rd_block_idx] -= (orphan_size == 0); - - // Check for dispatch_cb wrap if (rd_block_idx == dispatch_cb_blocks - 1) { - if (orphan_size != 0) { - if constexpr (multicast) { - cq_noc_async_write_with_state( - data_ptr, dst_addr, orphan_size, num_mcast_dests); - } else { - cq_noc_async_write_with_state(data_ptr, dst_addr, orphan_size); - } - noc_nonposted_writes_num_issued[noc_index]++; - noc_nonposted_writes_acked[noc_index] += num_mcast_dests; - length -= orphan_size; - xfer_size -= orphan_size; - dst_addr += orphan_size; - // All writes from this block have completed. - orphan_size = 0; - } cb_fence = dispatch_cb_base; data_ptr = dispatch_cb_base; } move_rd_to_next_block(block_noc_writes_to_clear, rd_block_idx); - // Next write will be from next block. "Reserve" pages for it. 
- block_noc_writes_to_clear[rd_block_idx] += (orphan_size == 0); } // Wait for dispatcher to supply a page (this won't go beyond the buffer end) uint32_t n_pages = cb_acquire_pages( cb_fence, block_next_start_addr, rd_block_idx); + cb_fence += n_pages * dispatch_cb_page_size; // Release pages for prefetcher @@ -433,14 +407,20 @@ void process_write_linear(uint32_t num_mcast_dests) { dispatch_cb_blocks, dispatch_cb_pages_per_block>(block_noc_writes_to_clear, wr_block_idx); } + // Transfer size is min(remaining_length, data_available_in_cb) + uint32_t available_data = cb_fence - data_ptr; + uint32_t xfer_size = length > available_data ? available_data : length; if constexpr (multicast) { - cq_noc_async_write_with_state(data_ptr, dst_addr, xfer_size, num_mcast_dests); + cq_noc_async_write_with_state_any_len(data_ptr, dst_addr, xfer_size, num_mcast_dests); } else { - cq_noc_async_write_with_state(data_ptr, dst_addr, xfer_size); + cq_noc_async_write_with_state_any_len(data_ptr, dst_addr, xfer_size); } - noc_nonposted_writes_num_issued[noc_index]++; - noc_nonposted_writes_acked[noc_index] += num_mcast_dests; + // Increment counters based on the number of packets that were written + uint32_t num_noc_packets_written = (xfer_size + NOC_MAX_BURST_SIZE - 1) / NOC_MAX_BURST_SIZE; + noc_nonposted_writes_num_issued[noc_index] += num_noc_packets_written; + noc_nonposted_writes_acked[noc_index] += num_mcast_dests * num_noc_packets_written; + block_noc_writes_to_clear[rd_block_idx] += num_noc_packets_written; length -= xfer_size; data_ptr += xfer_size; dst_addr += xfer_size; @@ -480,42 +460,19 @@ void process_write_paged() { while (write_length != 0) { // TODO #7360: Have more performant handling when page_size > dispatch_cb_page_size by not doing multiple writes // for one buffer page - uint32_t xfer_size = - page_size > dispatch_cb_page_size ? 
min(dispatch_cb_page_size, page_size - dst_addr_offset) : page_size; - uint64_t dst = addr_gen.get_noc_addr( - page_id, dst_addr_offset); - // "Reserve" pages for the next write from this block - block_noc_writes_to_clear[rd_block_idx]++; - // Get a Dispatch page if needed - if (data_ptr + xfer_size > cb_fence) { - // Check for block completion + // More data needs to be written, but we've exhausted the CB. Acquire more pages. + if (cb_fence == data_ptr) { if (cb_fence == block_next_start_addr[rd_block_idx]) { - uint32_t orphan_size = cb_fence - data_ptr; - // No more writes from this block. Decrement the number of writes - // since they were all accounted for. - block_noc_writes_to_clear[rd_block_idx] -= (orphan_size == 0); - // Check for dispatch_cb wrap if (rd_block_idx == dispatch_cb_blocks - 1) { - if (orphan_size != 0) { - noc_async_write(data_ptr, dst, orphan_size); - write_length -= orphan_size; - xfer_size -= orphan_size; - dst_addr_offset += orphan_size; - // All writes from this block have completed. - orphan_size = 0; - } cb_fence = dispatch_cb_base; data_ptr = dispatch_cb_base; - dst = addr_gen.get_noc_addr(page_id, dst_addr_offset); } move_rd_to_next_block(block_noc_writes_to_clear, rd_block_idx); - // Next write will be from next block. "Reserve" pages for it. - block_noc_writes_to_clear[rd_block_idx] += (orphan_size == 0); } - // Wait for dispatcher to supply a page (this won't go beyond the buffer end) uint32_t n_pages = cb_acquire_pages( cb_fence, block_next_start_addr, rd_block_idx); + cb_fence += n_pages * dispatch_cb_page_size; // Release pages for prefetcher @@ -526,9 +483,15 @@ void process_write_paged() { dispatch_cb_blocks, dispatch_cb_pages_per_block>(block_noc_writes_to_clear, wr_block_idx); } + // Transfer size is min(remaining_length, data_available_in_cb) + uint32_t available_data = cb_fence - data_ptr; + uint32_t xfer_size = (page_size - dst_addr_offset) > available_data ? 
available_data : (page_size - dst_addr_offset); + uint64_t dst = addr_gen.get_noc_addr( + page_id, dst_addr_offset); - noc_async_write(data_ptr, dst, xfer_size); - + noc_async_write(data_ptr, dst, xfer_size); + // Increment counters based on the number of packets that were written + block_noc_writes_to_clear[rd_block_idx] += (xfer_size + NOC_MAX_BURST_SIZE - 1) / NOC_MAX_BURST_SIZE; // If paged write is not completed for a page (dispatch_cb_page_size < page_size) then add offset, otherwise // incr page_id. if (dst_addr_offset + xfer_size < page_size) { @@ -713,51 +676,30 @@ void process_write_packed_large() { cq_noc_async_write_with_state(0, get_noc_addr_helper(dst_noc, dst_addr)); while (length != 0) { - uint32_t xfer_size = (length > dispatch_cb_page_size) ? dispatch_cb_page_size : length; - // "Reserve" pages for the next write from this block - writes++; - mcasts += num_dests; - // Get a page if needed - if (data_ptr + xfer_size > cb_fence) { - // Check for block completion + // More data needs to be written, but we've exhausted the CB. Acquire more pages. + if (data_ptr == cb_fence) { if (cb_fence == block_next_start_addr[rd_block_idx]) { - // No more writes from this block. Decrement the number of writes - // since they were all accounted for. - uint32_t orphan_size = cb_fence - data_ptr; - writes -= (orphan_size == 0); - mcasts -= (orphan_size == 0) * num_dests; - // Check for dispatch_cb wrap if (rd_block_idx == dispatch_cb_blocks - 1) { - ASSERT(cb_fence == dispatch_cb_end); - if (orphan_size != 0) { - cq_noc_async_write_with_state(data_ptr, dst_addr, orphan_size, num_dests); - length -= orphan_size; - xfer_size -= orphan_size; - dst_addr += orphan_size; - // All writes from this block have completed. 
- orphan_size = 0; - } cb_fence = dispatch_cb_base; data_ptr = dispatch_cb_base; } - + // Block completion - account for all writes issued for this block before moving to next block_noc_writes_to_clear[rd_block_idx] += writes; noc_nonposted_writes_num_issued[noc_index] += writes; writes = 0; move_rd_to_next_block(block_noc_writes_to_clear, rd_block_idx); - // Next write will be from next block. "Reserve" pages for it. - writes += (orphan_size == 0); - mcasts += (orphan_size == 0) * num_dests; } - - // Wait for dispatcher to supply a page (this won't go beyond the buffer end) uint32_t n_pages = cb_acquire_pages( cb_fence, block_next_start_addr, rd_block_idx); cb_fence += n_pages * dispatch_cb_page_size; } - - cq_noc_async_write_with_state(data_ptr, dst_addr, xfer_size, num_dests); - + // Transfer size is min(remaining_length, data_available_in_cb) + uint32_t available_data = cb_fence - data_ptr; + uint32_t xfer_size = (length > available_data) ? available_data : length; + cq_noc_async_write_with_state_any_len(data_ptr, dst_addr, xfer_size, num_dests); + uint32_t num_noc_packets_written = (xfer_size + NOC_MAX_BURST_SIZE - 1) / NOC_MAX_BURST_SIZE; + writes += num_noc_packets_written; + mcasts += num_dests * num_noc_packets_written; length -= xfer_size; data_ptr += xfer_size; dst_addr += xfer_size;