From 538fe9ba07da0de01e3dff27480d4701f91deb70 Mon Sep 17 00:00:00 2001 From: Austin Ho Date: Fri, 20 Dec 2024 07:38:48 +0000 Subject: [PATCH] #0: Add compile time function for automatically selecting the most optimized interleaved addr gen --- tt_metal/hw/inc/dataflow_api.h | 97 +++++++++++++------ tt_metal/hw/inc/utils/utils.h | 4 +- .../data_movement/common/kernels/common.hpp | 5 +- 3 files changed, 74 insertions(+), 32 deletions(-) diff --git a/tt_metal/hw/inc/dataflow_api.h b/tt_metal/hw/inc/dataflow_api.h index 70d8bc6f5f8..dcd31fbaf81 100644 --- a/tt_metal/hw/inc/dataflow_api.h +++ b/tt_metal/hw/inc/dataflow_api.h @@ -713,34 +713,6 @@ std::uint64_t get_noc_addr(std::uint32_t addr, uint8_t noc = noc_index) { return NOC_XY_ADDR(my_x[noc], my_y[noc], addr); } -/** - * Initiates an asynchronous read from a specified source node located at NOC - * coordinates (x,y) at a local address (encoded as a uint64_t using \a - * get_noc_addr function). The destination is in L1 memory on the Tensix core - * executing this function call. Also, see \a noc_async_read_barrier. - * - * The source node can be either a DRAM bank, a Tensix core or a PCIe controller. - * - * Return value: None - * - * | Argument | Description | Data type | Valid range | required | - * |-------------------|----------------------------------------------------|-----------|------------------------------------------|----------| - * | src_noc_addr | Encoding of the source NOC location (x,y)+address | uint64_t | DOX-TODO(ref to explain valid - * coords) | Yes | | dst_local_l1_addr | Address in local L1 memory | uint32_t | 0..1MB - * | Yes | | size | Size of data transfer in bytes | uint32_t | 0..1MB | Yes | - */ -inline void noc_async_read( - std::uint64_t src_noc_addr, std::uint32_t dst_local_l1_addr, std::uint32_t size, uint8_t noc = noc_index) { - /* - Read requests - use static VC - Read responses - assigned VCs dynamically - */ - WAYPOINT("NARW"); - DEBUG_SANITIZE_NOC_READ_TRANSACTION(noc, src_noc_addr, dst_local_l1_addr, size); - ncrisc_noc_fast_read_any_len(noc, read_cmd_buf, src_noc_addr, dst_local_l1_addr, size); - WAYPOINT("NARD"); -} - // TODO: write docs // this issues only a single packet with size <= NOC_MAX_BURST_SIZE (ie maximum packet size) FORCE_INLINE @@ -776,6 +748,39 @@ void noc_async_read_one_packet( WAYPOINT("NAOD"); } +/** + * Initiates an asynchronous read from a specified source node located at NOC + * coordinates (x,y) at a local address (encoded as a uint64_t using \a + * get_noc_addr function). The destination is in L1 memory on the Tensix core + * executing this function call. Also, see \a noc_async_read_barrier. + * + * The source node can be either a DRAM bank, a Tensix core or a PCIe controller. + * + * Return value: None + * + * | Argument | Description | Data type | Valid range | required | + * |-------------------|----------------------------------------------------|-----------|------------------------------------------|----------| + * | src_noc_addr | Encoding of the source NOC location (x,y)+address | uint64_t | DOX-TODO(ref to explain valid + * coords) | Yes | | dst_local_l1_addr | Address in local L1 memory | uint32_t | 0..1MB + * | Yes | | size | Size of data transfer in bytes | uint32_t | 0..1MB | Yes | + */ +template +inline void noc_async_read( + std::uint64_t src_noc_addr, std::uint32_t dst_local_l1_addr, std::uint32_t size, uint8_t noc = noc_index) { + /* + Read requests - use static VC + Read responses - assigned VCs dynamically + */ + if constexpr (max_page_size <= NOC_MAX_BURST_SIZE) { + noc_async_read_one_packet(src_noc_addr, dst_local_l1_addr, size, noc); + } else { + WAYPOINT("NARW"); + DEBUG_SANITIZE_NOC_READ_TRANSACTION(noc, src_noc_addr, dst_local_l1_addr, size); + ncrisc_noc_fast_read_any_len(noc, read_cmd_buf, src_noc_addr, dst_local_l1_addr, size); + WAYPOINT("NARD"); + } +} + // TODO: write docs // this issues only a single packet with size <= NOC_MAX_BURST_SIZE (ie maximum packet size) FORCE_INLINE @@ -1364,6 +1369,23 @@ FORCE_INLINE std::uint64_t get_noc_addr( return s.get_noc_addr(id, offset, noc); } +template +FORCE_INLINE std::uint64_t get_noc_addr( + const uint32_t id, const InterleavedPow2AddrGenFast& s, uint32_t offset = 0, uint8_t noc = noc_index) { + /* + Alternative API for getting the noc address when we are reading using a swizzled + layout. This version assumes bank unit size is a power of 2 and less than or equal to NOC_MAX_BURST_SIZE. + For arbitrary bank unit size, use get_noc_addr(const uint32_t id, const InterleavedOffset s) + + id: Unique id for the bank_unit you want to read, assuming row major order. We use this to compute the + bank for this unit of data. + + InterleavedPow2AddrGenFast: Check struct for attribute definitions. + */ + + return s.get_noc_addr(id, offset, noc); +} + template FORCE_INLINE void noc_async_read_page( const uint32_t id, @@ -1415,7 +1437,7 @@ template inline void noc_async_write( std::uint32_t src_local_l1_addr, std::uint64_t dst_noc_addr, std::uint32_t size, uint8_t noc = noc_index) { if constexpr (max_page_size <= NOC_MAX_BURST_SIZE) { - noc_async_write_one_packet(src_local_l1_addr, dst_noc_addr, size); + noc_async_write_one_packet(src_local_l1_addr, dst_noc_addr, size, noc); } else { WAYPOINT("NAWW"); DEBUG_SANITIZE_NOC_WRITE_TRANSACTION(noc, dst_noc_addr, src_local_l1_addr, size); @@ -2015,3 +2037,20 @@ uint64_t get_noc_addr_from_bank_id(uint32_t bank_id, uint32_t bank_address_offse } return (noc_addr << NOC_ADDR_COORD_SHIFT) | (bank_address_offset); } + +template +FORCE_INLINE auto get_interleaved_addr_gen(uint32_t base_addr) { + constexpr bool is_pow_2 = is_power_of_2(page_size); + if constexpr (is_pow_2) { + constexpr uint32_t log2_page_size = __builtin_ctz(page_size); + if constexpr (page_size <= NOC_MAX_BURST_SIZE) { + return InterleavedPow2AddrGenFast{ + .bank_base_address = base_addr, .log_base_2_of_page_size = log2_page_size}; + } else { + return InterleavedPow2AddrGen{ + .bank_base_address = base_addr, .log_base_2_of_page_size = log2_page_size}; + } + } else { + return InterleavedAddrGen{.bank_base_address = base_addr, .page_size = page_size}; + } +} diff --git a/tt_metal/hw/inc/utils/utils.h b/tt_metal/hw/inc/utils/utils.h index 1573aa64f8d..fed8be8748a 100644 --- a/tt_metal/hw/inc/utils/utils.h +++ b/tt_metal/hw/inc/utils/utils.h @@ -4,6 +4,8 @@ #pragma once -inline __attribute__((always_inline)) uint32_t align(uint32_t addr, uint32_t alignment) { +inline __attribute__((always_inline)) constexpr uint32_t align(uint32_t addr, uint32_t alignment) { return ((addr - 1) | (alignment - 1)) + 1; } + +inline __attribute__((always_inline)) constexpr bool is_power_of_2(uint32_t n) { return (n & (n - 1)) == 0; } diff --git a/ttnn/cpp/ttnn/operations/data_movement/common/kernels/common.hpp b/ttnn/cpp/ttnn/operations/data_movement/common/kernels/common.hpp index 34cf4e3eb3b..3ae2b66b4d1 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/common/kernels/common.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/common/kernels/common.hpp @@ -20,10 +20,11 @@ FORCE_INLINE void enhanced_noc_async_read( const uint64_t src_noc_addr, const uint32_t dst_l1_addr, const uint32_t bytes) { // If you do not know the max_transfer_size at compile time write 0 to it. // only reads is true if we ONLY use noc_async_read and all calls to tt_memmove have use_read_datamover as True - if constexpr (((max_transfer_size < NOC_MAX_BURST_SIZE) && (max_transfer_size != 0)) || only_reads) { + if constexpr (only_reads) { noc_async_read_one_packet(src_noc_addr, dst_l1_addr, bytes); } else { - noc_async_read(src_noc_addr, dst_l1_addr, bytes); + noc_async_read( + src_noc_addr, dst_l1_addr, bytes); } }