From 38217c0cad5570044d48d9dc7fce91ffbf18d732 Mon Sep 17 00:00:00 2001 From: asaigal Date: Thu, 25 Jul 2024 02:23:49 +0000 Subject: [PATCH] #11092: Remove need for orphan writes in dispatch write fns --- tt_metal/impl/dispatch/kernels/cq_common.hpp | 13 ++ .../impl/dispatch/kernels/cq_dispatch.cpp | 123 ++++-------------- 2 files changed, 39 insertions(+), 97 deletions(-) diff --git a/tt_metal/impl/dispatch/kernels/cq_common.hpp b/tt_metal/impl/dispatch/kernels/cq_common.hpp index 44d60a0712e3..79d96b9b5fcf 100644 --- a/tt_metal/impl/dispatch/kernels/cq_common.hpp +++ b/tt_metal/impl/dispatch/kernels/cq_common.hpp @@ -110,6 +110,19 @@ void cq_noc_async_write_with_state(uint32_t src_addr, uint64_t dst_addr, uint32_ } } +// More generic version of cq_noc_async_write_with_state: Allows writing an arbitrary amount of data, when the NOC config (dst_noc, +// VC..) has been specified. +FORCE_INLINE +void cq_noc_async_write_with_state_any_len(uint32_t src_addr, uint64_t dst_addr, uint32_t size = 0, uint32_t ndests = 1) { + while(size > NOC_MAX_BURST_SIZE) { + cq_noc_async_write_with_state(src_addr, dst_addr, NOC_MAX_BURST_SIZE, ndests); + src_addr += NOC_MAX_BURST_SIZE; + dst_addr += NOC_MAX_BURST_SIZE; + size -= NOC_MAX_BURST_SIZE; + } + cq_noc_async_write_with_state(src_addr, dst_addr, size, ndests); +} + template FORCE_INLINE void cq_noc_async_write_init_state(uint32_t src_addr, uint64_t dst_addr, uint32_t size = 0) { diff --git a/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp b/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp index 4274ef46293d..851aa96d5dbd 100644 --- a/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp +++ b/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp @@ -384,63 +384,38 @@ void process_write_linear(uint32_t num_mcast_dests) { cq_noc_async_write_init_state(0, get_noc_addr_helper(dst_noc, dst_addr)); while (length != 0) { - uint32_t xfer_size = (length > dispatch_cb_page_size) ? 
dispatch_cb_page_size : length; - // "Reserve" pages for the next write from this block - block_noc_writes_to_clear[rd_block_idx]++; - // Get a page if needed - if (data_ptr + xfer_size > cb_fence) { - // Check for block completion + if (cb_fence == data_ptr) { if (cb_fence == block_next_start_addr[rd_block_idx]) { - uint32_t orphan_size = cb_fence - data_ptr; - // No more writes from this block. Decrement the number of writes - // since they were all accounted for. - block_noc_writes_to_clear[rd_block_idx] -= (orphan_size == 0); - - // Check for dispatch_cb wrap if (rd_block_idx == dispatch_cb_blocks - 1) { - if (orphan_size != 0) { - if constexpr (multicast) { - cq_noc_async_write_with_state( - data_ptr, dst_addr, orphan_size, num_mcast_dests); - } else { - cq_noc_async_write_with_state(data_ptr, dst_addr, orphan_size); - } - noc_nonposted_writes_num_issued[noc_index]++; - noc_nonposted_writes_acked[noc_index] += num_mcast_dests; - length -= orphan_size; - xfer_size -= orphan_size; - dst_addr += orphan_size; - // All writes from this block have completed. - orphan_size = 0; - } cb_fence = dispatch_cb_base; data_ptr = dispatch_cb_base; } move_rd_to_next_block(block_noc_writes_to_clear, rd_block_idx); - // Next write will be from next block. "Reserve" pages for it. - block_noc_writes_to_clear[rd_block_idx] += (orphan_size == 0); } - // Wait for dispatcher to supply a page (this won't go beyond the buffer end) uint32_t n_pages = cb_acquire_pages( cb_fence, block_next_start_addr, rd_block_idx); + cb_fence += n_pages * dispatch_cb_page_size; - // Release pages for prefetcher - // Since we gate how much we acquire to < 1/4 the buffer, this should be called enough cb_block_release_pages< upstream_noc_xy, upstream_dispatch_cb_sem_id, dispatch_cb_blocks, dispatch_cb_pages_per_block>(block_noc_writes_to_clear, wr_block_idx); } + uint32_t available_data = cb_fence - data_ptr; + uint32_t xfer_size = length > available_data ? 
available_data : length; + uint64_t dst = get_noc_addr_helper(dst_noc, dst_addr); if constexpr (multicast) { - cq_noc_async_write_with_state(data_ptr, dst_addr, xfer_size, num_mcast_dests); + cq_noc_async_write_with_state_any_len(data_ptr, dst_addr, xfer_size, num_mcast_dests); } else { - cq_noc_async_write_with_state(data_ptr, dst_addr, xfer_size); + cq_noc_async_write_with_state_any_len(data_ptr, dst_addr, xfer_size); } - noc_nonposted_writes_num_issued[noc_index]++; - noc_nonposted_writes_acked[noc_index] += num_mcast_dests; + uint32_t num_noc_packets_written = (xfer_size + NOC_MAX_BURST_SIZE - 1) / NOC_MAX_BURST_SIZE; + noc_nonposted_writes_num_issued[noc_index] += num_noc_packets_written; + noc_nonposted_writes_acked[noc_index] += num_mcast_dests * num_noc_packets_written; + block_noc_writes_to_clear[rd_block_idx] += num_noc_packets_written; length -= xfer_size; data_ptr += xfer_size; dst_addr += xfer_size; @@ -480,55 +455,32 @@ void process_write_paged() { while (write_length != 0) { // TODO #7360: Have more performant handling when page_size > dispatch_cb_page_size by not doing multiple writes // for one buffer page - uint32_t xfer_size = - page_size > dispatch_cb_page_size ? min(dispatch_cb_page_size, page_size - dst_addr_offset) : page_size; - uint64_t dst = addr_gen.get_noc_addr( - page_id, dst_addr_offset); - // "Reserve" pages for the next write from this block - block_noc_writes_to_clear[rd_block_idx]++; - // Get a Dispatch page if needed - if (data_ptr + xfer_size > cb_fence) { - // Check for block completion + if (cb_fence == data_ptr) { if (cb_fence == block_next_start_addr[rd_block_idx]) { - uint32_t orphan_size = cb_fence - data_ptr; - // No more writes from this block. Decrement the number of writes - // since they were all accounted for. 
- block_noc_writes_to_clear[rd_block_idx] -= (orphan_size == 0); - // Check for dispatch_cb wrap if (rd_block_idx == dispatch_cb_blocks - 1) { - if (orphan_size != 0) { - noc_async_write(data_ptr, dst, orphan_size); - write_length -= orphan_size; - xfer_size -= orphan_size; - dst_addr_offset += orphan_size; - // All writes from this block have completed. - orphan_size = 0; - } cb_fence = dispatch_cb_base; data_ptr = dispatch_cb_base; - dst = addr_gen.get_noc_addr(page_id, dst_addr_offset); } move_rd_to_next_block(block_noc_writes_to_clear, rd_block_idx); - // Next write will be from next block. "Reserve" pages for it. - block_noc_writes_to_clear[rd_block_idx] += (orphan_size == 0); } - - // Wait for dispatcher to supply a page (this won't go beyond the buffer end) uint32_t n_pages = cb_acquire_pages( cb_fence, block_next_start_addr, rd_block_idx); + cb_fence += n_pages * dispatch_cb_page_size; - // Release pages for prefetcher - // Since we gate how much we acquire to < 1/4 the buffer, this should be called enough cb_block_release_pages< upstream_noc_xy, upstream_dispatch_cb_sem_id, dispatch_cb_blocks, dispatch_cb_pages_per_block>(block_noc_writes_to_clear, wr_block_idx); } + uint32_t available_data = cb_fence - data_ptr; + uint32_t xfer_size = (page_size - dst_addr_offset) > available_data ? available_data : (page_size - dst_addr_offset); + uint64_t dst = addr_gen.get_noc_addr( + page_id, dst_addr_offset); - noc_async_write(data_ptr, dst, xfer_size); - + noc_async_write(data_ptr, dst, xfer_size); + block_noc_writes_to_clear[rd_block_idx] += (xfer_size + NOC_MAX_BURST_SIZE - 1) / NOC_MAX_BURST_SIZE; // If paged write is not completed for a page (dispatch_cb_page_size < page_size) then add offset, otherwise // incr page_id. 
if (dst_addr_offset + xfer_size < page_size) { @@ -713,51 +665,28 @@ void process_write_packed_large() { cq_noc_async_write_with_state(0, get_noc_addr_helper(dst_noc, dst_addr)); while (length != 0) { - uint32_t xfer_size = (length > dispatch_cb_page_size) ? dispatch_cb_page_size : length; - // "Reserve" pages for the next write from this block - writes++; - mcasts += num_dests; - // Get a page if needed - if (data_ptr + xfer_size > cb_fence) { - // Check for block completion + if (data_ptr == cb_fence) { if (cb_fence == block_next_start_addr[rd_block_idx]) { - // No more writes from this block. Decrement the number of writes - // since they were all accounted for. - uint32_t orphan_size = cb_fence - data_ptr; - writes -= (orphan_size == 0); - mcasts -= (orphan_size == 0) * num_dests; - // Check for dispatch_cb wrap if (rd_block_idx == dispatch_cb_blocks - 1) { - ASSERT(cb_fence == dispatch_cb_end); - if (orphan_size != 0) { - cq_noc_async_write_with_state(data_ptr, dst_addr, orphan_size, num_dests); - length -= orphan_size; - xfer_size -= orphan_size; - dst_addr += orphan_size; - // All writes from this block have completed. - orphan_size = 0; - } cb_fence = dispatch_cb_base; data_ptr = dispatch_cb_base; } - block_noc_writes_to_clear[rd_block_idx] += writes; noc_nonposted_writes_num_issued[noc_index] += writes; writes = 0; move_rd_to_next_block(block_noc_writes_to_clear, rd_block_idx); - // Next write will be from next block. "Reserve" pages for it. - writes += (orphan_size == 0); - mcasts += (orphan_size == 0) * num_dests; } - - // Wait for dispatcher to supply a page (this won't go beyond the buffer end) uint32_t n_pages = cb_acquire_pages( cb_fence, block_next_start_addr, rd_block_idx); cb_fence += n_pages * dispatch_cb_page_size; } - cq_noc_async_write_with_state(data_ptr, dst_addr, xfer_size, num_dests); - + uint32_t available_data = cb_fence - data_ptr; + uint32_t xfer_size = (length > available_data) ? 
available_data : length; + cq_noc_async_write_with_state_any_len(data_ptr, dst_addr, xfer_size, num_dests); + uint32_t num_noc_packets_written = (xfer_size + NOC_MAX_BURST_SIZE - 1) / NOC_MAX_BURST_SIZE; + writes += num_noc_packets_written; + mcasts += num_dests * num_noc_packets_written; length -= xfer_size; data_ptr += xfer_size; dst_addr += xfer_size;