From 907be2429ee13879434ad0f2b8c219218b16fb14 Mon Sep 17 00:00:00 2001 From: Aditya Saigal <129097327+tt-asaigal@users.noreply.github.com> Date: Tue, 6 Aug 2024 13:35:30 -0400 Subject: [PATCH] #11092: Remove need for orphan writes in dispatch write functions (#11093) - process_write_linear, process_write_paged and process_write_packed_large will now write up to the fence --- tt_metal/impl/dispatch/kernels/cq_common.hpp | 18 +++ .../impl/dispatch/kernels/cq_dispatch.cpp | 139 ++++++------------ 2 files changed, 60 insertions(+), 97 deletions(-) diff --git a/tt_metal/impl/dispatch/kernels/cq_common.hpp b/tt_metal/impl/dispatch/kernels/cq_common.hpp index 44d60a0712e..6b0227f4eae 100644 --- a/tt_metal/impl/dispatch/kernels/cq_common.hpp +++ b/tt_metal/impl/dispatch/kernels/cq_common.hpp @@ -15,6 +15,11 @@ uint32_t round_up_pow2(uint32_t v, uint32_t pow2_size) { return (v + (pow2_size - 1)) & ~(pow2_size - 1); } +FORCE_INLINE +uint32_t div_up(uint32_t n, uint32_t d) { + return (n + d - 1) / d; +} + FORCE_INLINE uint32_t wrap_ge(uint32_t a, uint32_t b) { @@ -110,6 +115,19 @@ void cq_noc_async_write_with_state(uint32_t src_addr, uint64_t dst_addr, uint32_ } } +// More generic version of cq_noc_async_write_with_state: Allows writing an arbitrary amount of data, when the NOC config (dst_noc, +// VC..) has been specified. 
+FORCE_INLINE +void cq_noc_async_write_with_state_any_len(uint32_t src_addr, uint64_t dst_addr, uint32_t size = 0, uint32_t ndests = 1) { + while(size > NOC_MAX_BURST_SIZE) { + cq_noc_async_write_with_state(src_addr, dst_addr, NOC_MAX_BURST_SIZE, ndests); + src_addr += NOC_MAX_BURST_SIZE; + dst_addr += NOC_MAX_BURST_SIZE; + size -= NOC_MAX_BURST_SIZE; + } + cq_noc_async_write_with_state(src_addr, dst_addr, size, ndests); +} + template FORCE_INLINE void cq_noc_async_write_init_state(uint32_t src_addr, uint64_t dst_addr, uint32_t size = 0) { diff --git a/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp b/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp index 4274ef46293..351d88ae1e1 100644 --- a/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp +++ b/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp @@ -206,16 +206,14 @@ void process_write_host_h() { length -= last_chunk_size; xfer_size -= last_chunk_size; host_completion_queue_write_addr = pcie_noc_xy | completion_queue_write_addr; - block_noc_writes_to_clear[rd_block_idx]+=(last_chunk_size + NOC_MAX_BURST_SIZE - 1) / NOC_MAX_BURST_SIZE; + block_noc_writes_to_clear[rd_block_idx]+= div_up(last_chunk_size, NOC_MAX_BURST_SIZE); } noc_async_write(data_ptr, host_completion_queue_write_addr, xfer_size); // This will update the write ptr on device and host // We flush to ensure the ptr has been read out of l1 before we update it again completion_queue_push_back(npages); noc_async_writes_flushed(); - block_noc_writes_to_clear[rd_block_idx] += - (xfer_size + NOC_MAX_BURST_SIZE - 1) / - NOC_MAX_BURST_SIZE; + block_noc_writes_to_clear[rd_block_idx] += div_up(xfer_size, NOC_MAX_BURST_SIZE); length -= xfer_size; data_ptr += xfer_size; @@ -384,45 +382,19 @@ void process_write_linear(uint32_t num_mcast_dests) { cq_noc_async_write_init_state(0, get_noc_addr_helper(dst_noc, dst_addr)); while (length != 0) { - uint32_t xfer_size = (length > dispatch_cb_page_size) ? 
dispatch_cb_page_size : length; - // "Reserve" pages for the next write from this block - block_noc_writes_to_clear[rd_block_idx]++; - // Get a page if needed - if (data_ptr + xfer_size > cb_fence) { - // Check for block completion + // More data needs to be written, but we've exhausted the CB. Acquire more pages. + if (cb_fence == data_ptr) { if (cb_fence == block_next_start_addr[rd_block_idx]) { - uint32_t orphan_size = cb_fence - data_ptr; - // No more writes from this block. Decrement the number of writes - // since they were all accounted for. - block_noc_writes_to_clear[rd_block_idx] -= (orphan_size == 0); - - // Check for dispatch_cb wrap if (rd_block_idx == dispatch_cb_blocks - 1) { - if (orphan_size != 0) { - if constexpr (multicast) { - cq_noc_async_write_with_state( - data_ptr, dst_addr, orphan_size, num_mcast_dests); - } else { - cq_noc_async_write_with_state(data_ptr, dst_addr, orphan_size); - } - noc_nonposted_writes_num_issued[noc_index]++; - noc_nonposted_writes_acked[noc_index] += num_mcast_dests; - length -= orphan_size; - xfer_size -= orphan_size; - dst_addr += orphan_size; - // All writes from this block have completed. - orphan_size = 0; - } cb_fence = dispatch_cb_base; data_ptr = dispatch_cb_base; } move_rd_to_next_block(block_noc_writes_to_clear, rd_block_idx); - // Next write will be from next block. "Reserve" pages for it. 
- block_noc_writes_to_clear[rd_block_idx] += (orphan_size == 0); } // Wait for dispatcher to supply a page (this won't go beyond the buffer end) uint32_t n_pages = cb_acquire_pages( cb_fence, block_next_start_addr, rd_block_idx); + cb_fence += n_pages * dispatch_cb_page_size; // Release pages for prefetcher @@ -433,14 +405,20 @@ void process_write_linear(uint32_t num_mcast_dests) { dispatch_cb_blocks, dispatch_cb_pages_per_block>(block_noc_writes_to_clear, wr_block_idx); } + // Transfer size is min(remaining_length, data_available_in_cb) + uint32_t available_data = cb_fence - data_ptr; + uint32_t xfer_size = length > available_data ? available_data : length; if constexpr (multicast) { - cq_noc_async_write_with_state(data_ptr, dst_addr, xfer_size, num_mcast_dests); + cq_noc_async_write_with_state_any_len(data_ptr, dst_addr, xfer_size, num_mcast_dests); } else { - cq_noc_async_write_with_state(data_ptr, dst_addr, xfer_size); + cq_noc_async_write_with_state_any_len(data_ptr, dst_addr, xfer_size); } - noc_nonposted_writes_num_issued[noc_index]++; - noc_nonposted_writes_acked[noc_index] += num_mcast_dests; + // Increment counters based on the number of packets that were written + uint32_t num_noc_packets_written = div_up(xfer_size, NOC_MAX_BURST_SIZE); + noc_nonposted_writes_num_issued[noc_index] += num_noc_packets_written; + noc_nonposted_writes_acked[noc_index] += num_mcast_dests * num_noc_packets_written; + block_noc_writes_to_clear[rd_block_idx] += num_noc_packets_written; length -= xfer_size; data_ptr += xfer_size; dst_addr += xfer_size; @@ -480,42 +458,19 @@ void process_write_paged() { while (write_length != 0) { // TODO #7360: Have more performant handling when page_size > dispatch_cb_page_size by not doing multiple writes // for one buffer page - uint32_t xfer_size = - page_size > dispatch_cb_page_size ? 
min(dispatch_cb_page_size, page_size - dst_addr_offset) : page_size; - uint64_t dst = addr_gen.get_noc_addr( - page_id, dst_addr_offset); - // "Reserve" pages for the next write from this block - block_noc_writes_to_clear[rd_block_idx]++; - // Get a Dispatch page if needed - if (data_ptr + xfer_size > cb_fence) { - // Check for block completion + // More data needs to be written, but we've exhausted the CB. Acquire more pages. + if (cb_fence == data_ptr) { if (cb_fence == block_next_start_addr[rd_block_idx]) { - uint32_t orphan_size = cb_fence - data_ptr; - // No more writes from this block. Decrement the number of writes - // since they were all accounted for. - block_noc_writes_to_clear[rd_block_idx] -= (orphan_size == 0); - // Check for dispatch_cb wrap if (rd_block_idx == dispatch_cb_blocks - 1) { - if (orphan_size != 0) { - noc_async_write(data_ptr, dst, orphan_size); - write_length -= orphan_size; - xfer_size -= orphan_size; - dst_addr_offset += orphan_size; - // All writes from this block have completed. - orphan_size = 0; - } cb_fence = dispatch_cb_base; data_ptr = dispatch_cb_base; - dst = addr_gen.get_noc_addr(page_id, dst_addr_offset); } move_rd_to_next_block(block_noc_writes_to_clear, rd_block_idx); - // Next write will be from next block. "Reserve" pages for it. - block_noc_writes_to_clear[rd_block_idx] += (orphan_size == 0); } - // Wait for dispatcher to supply a page (this won't go beyond the buffer end) uint32_t n_pages = cb_acquire_pages( cb_fence, block_next_start_addr, rd_block_idx); + cb_fence += n_pages * dispatch_cb_page_size; // Release pages for prefetcher @@ -526,12 +481,22 @@ void process_write_paged() { dispatch_cb_blocks, dispatch_cb_pages_per_block>(block_noc_writes_to_clear, wr_block_idx); } + // Transfer size is min(remaining_length, data_available_in_cb) + uint32_t available_data = cb_fence - data_ptr; + uint32_t remaining_page_size = page_size - dst_addr_offset; + uint32_t xfer_size = remaining_page_size > available_data ? 
available_data : remaining_page_size; + // Cap the transfer size to the NOC packet size - use of One Packet NOC API (better performance + // than writing a generic amount of data) + xfer_size = xfer_size > NOC_MAX_BURST_SIZE ? NOC_MAX_BURST_SIZE : xfer_size; + uint64_t dst = addr_gen.get_noc_addr( + page_id, dst_addr_offset); - noc_async_write(data_ptr, dst, xfer_size); - + noc_async_write(data_ptr, dst, xfer_size); + block_noc_writes_to_clear[rd_block_idx]++; // If paged write is not completed for a page (dispatch_cb_page_size < page_size) then add offset, otherwise // incr page_id. - if (dst_addr_offset + xfer_size < page_size) { + if (xfer_size < remaining_page_size) { + // The above evaluates to: dst_addr_offset + xfer_size < page_size, but this saves a redundant calculation. dst_addr_offset += xfer_size; } else { page_id++; @@ -713,51 +678,30 @@ void process_write_packed_large() { cq_noc_async_write_with_state(0, get_noc_addr_helper(dst_noc, dst_addr)); while (length != 0) { - uint32_t xfer_size = (length > dispatch_cb_page_size) ? dispatch_cb_page_size : length; - // "Reserve" pages for the next write from this block - writes++; - mcasts += num_dests; - // Get a page if needed - if (data_ptr + xfer_size > cb_fence) { - // Check for block completion + // More data needs to be written, but we've exhausted the CB. Acquire more pages. + if (data_ptr == cb_fence) { if (cb_fence == block_next_start_addr[rd_block_idx]) { - // No more writes from this block. Decrement the number of writes - // since they were all accounted for. 
- uint32_t orphan_size = cb_fence - data_ptr; - writes -= (orphan_size == 0); - mcasts -= (orphan_size == 0) * num_dests; - // Check for dispatch_cb wrap if (rd_block_idx == dispatch_cb_blocks - 1) { - ASSERT(cb_fence == dispatch_cb_end); - if (orphan_size != 0) { - cq_noc_async_write_with_state(data_ptr, dst_addr, orphan_size, num_dests); - length -= orphan_size; - xfer_size -= orphan_size; - dst_addr += orphan_size; - // All writes from this block have completed. - orphan_size = 0; - } cb_fence = dispatch_cb_base; data_ptr = dispatch_cb_base; } - + // Block completion - account for all writes issued for this block before moving to next block_noc_writes_to_clear[rd_block_idx] += writes; noc_nonposted_writes_num_issued[noc_index] += writes; + mcasts += num_dests * writes; writes = 0; move_rd_to_next_block(block_noc_writes_to_clear, rd_block_idx); - // Next write will be from next block. "Reserve" pages for it. - writes += (orphan_size == 0); - mcasts += (orphan_size == 0) * num_dests; } - - // Wait for dispatcher to supply a page (this won't go beyond the buffer end) uint32_t n_pages = cb_acquire_pages( cb_fence, block_next_start_addr, rd_block_idx); cb_fence += n_pages * dispatch_cb_page_size; } - - cq_noc_async_write_with_state(data_ptr, dst_addr, xfer_size, num_dests); - + // Transfer size is min(remaining_length, data_available_in_cb) + uint32_t available_data = cb_fence - data_ptr; + uint32_t xfer_size = (length > available_data) ? 
available_data : length; + cq_noc_async_write_with_state_any_len(data_ptr, dst_addr, xfer_size, num_dests); + uint32_t num_noc_packets_written = div_up(xfer_size, NOC_MAX_BURST_SIZE); + writes += num_noc_packets_written; length -= xfer_size; data_ptr += xfer_size; dst_addr += xfer_size; @@ -767,6 +711,7 @@ void process_write_packed_large() { // Releasing here requires the sub_cmds to be read into local memory above block_noc_writes_to_clear[rd_block_idx] += writes; noc_nonposted_writes_num_issued[noc_index] += writes; + mcasts += num_dests * writes; writes = 0; // Handle padded size and potential wrap