Skip to content

Commit

Permalink
#0: Add compile time function for automatically selecting the most op…
Browse files Browse the repository at this point in the history
…timized interleaved addr gen
  • Loading branch information
tt-aho committed Dec 21, 2024
1 parent 2391f3b commit 538fe9b
Show file tree
Hide file tree
Showing 3 changed files with 74 additions and 32 deletions.
97 changes: 68 additions & 29 deletions tt_metal/hw/inc/dataflow_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -713,34 +713,6 @@ std::uint64_t get_noc_addr(std::uint32_t addr, uint8_t noc = noc_index) {
return NOC_XY_ADDR(my_x[noc], my_y[noc], addr);
}

/**
 * Initiates an asynchronous read from a specified source node located at NOC
 * coordinates (x,y) at a local address (encoded as a uint64_t using \a
 * get_noc_addr function). The destination is in L1 memory on the Tensix core
 * executing this function call. Also, see \a noc_async_read_barrier.
 *
 * The source node can be either a DRAM bank, a Tensix core or a PCIe controller.
 *
 * Return value: None
 *
 * | Argument          | Description                                        | Data type | Valid range                           | required |
 * |-------------------|----------------------------------------------------|-----------|---------------------------------------|----------|
 * | src_noc_addr      | Encoding of the source NOC location (x,y)+address  | uint64_t  | DOX-TODO(ref to explain valid coords) | Yes      |
 * | dst_local_l1_addr | Address in local L1 memory                         | uint32_t  | 0..1MB                                | Yes      |
 * | size              | Size of data transfer in bytes                     | uint32_t  | 0..1MB                                | Yes      |
 * | noc               | NOC passed to the read primitive (default noc_index) | uint8_t | TODO confirm valid values             | No       |
 */
inline void noc_async_read(
std::uint64_t src_noc_addr, std::uint32_t dst_local_l1_addr, std::uint32_t size, uint8_t noc = noc_index) {
/*
Read requests - use static VC
Read responses - assigned VCs dynamically
*/
// NOTE(review): WAYPOINT("NARW") / WAYPOINT("NARD") bracket the read; presumably debug progress markers - confirm.
WAYPOINT("NARW");
// Argument check macro (presumably active only in debug builds - confirm), run before the read is issued.
DEBUG_SANITIZE_NOC_READ_TRANSACTION(noc, src_noc_addr, dst_local_l1_addr, size);
// Issue the read through the any-length primitive using the read command buffer.
ncrisc_noc_fast_read_any_len(noc, read_cmd_buf, src_noc_addr, dst_local_l1_addr, size);
WAYPOINT("NARD");
}

// TODO: write docs
// this issues only a single packet with size <= NOC_MAX_BURST_SIZE (ie maximum packet size)
FORCE_INLINE
Expand Down Expand Up @@ -776,6 +748,39 @@ void noc_async_read_one_packet(
WAYPOINT("NAOD");
}

/**
 * Initiates an asynchronous read from a specified source node located at NOC
 * coordinates (x,y) at a local address (encoded as a uint64_t using \a
 * get_noc_addr function). The destination is in L1 memory on the Tensix core
 * executing this function call. Also, see \a noc_async_read_barrier.
 *
 * The source node can be either a DRAM bank, a Tensix core or a PCIe controller.
 *
 * The \a max_page_size template argument selects the implementation at compile
 * time: when it is <= NOC_MAX_BURST_SIZE the call is forwarded to
 * \a noc_async_read_one_packet; otherwise (including the default value of
 * NOC_MAX_BURST_SIZE + 1) the any-length read primitive is used.
 *
 * Return value: None
 *
 * | Argument                    | Description                                        | Data type | Valid range                           | required |
 * |-----------------------------|----------------------------------------------------|-----------|---------------------------------------|----------|
 * | src_noc_addr                | Encoding of the source NOC location (x,y)+address  | uint64_t  | DOX-TODO(ref to explain valid coords) | Yes      |
 * | dst_local_l1_addr           | Address in local L1 memory                         | uint32_t  | 0..1MB                                | Yes      |
 * | size                        | Size of data transfer in bytes                     | uint32_t  | 0..1MB                                | Yes      |
 * | noc                         | NOC passed to the read primitive (default noc_index) | uint8_t | TODO confirm valid values             | No       |
 * | max_page_size (template)    | Compile-time bound used to pick the read path      | uint32_t  | Any uint32_t value                    | No       |
 */
template <uint32_t max_page_size = NOC_MAX_BURST_SIZE + 1>
inline void noc_async_read(
    std::uint64_t src_noc_addr, std::uint32_t dst_local_l1_addr, std::uint32_t size, uint8_t noc = noc_index) {
    // Read requests use a static VC; read responses are assigned VCs dynamically.
    constexpr bool needs_any_len_path = max_page_size > NOC_MAX_BURST_SIZE;

    if constexpr (needs_any_len_path) {
        // General path: no compile-time guarantee that the transfer fits in one packet.
        WAYPOINT("NARW");
        DEBUG_SANITIZE_NOC_READ_TRANSACTION(noc, src_noc_addr, dst_local_l1_addr, size);
        ncrisc_noc_fast_read_any_len(noc, read_cmd_buf, src_noc_addr, dst_local_l1_addr, size);
        WAYPOINT("NARD");
    } else {
        // Caller promised every transfer is at most NOC_MAX_BURST_SIZE: use the single-packet read.
        noc_async_read_one_packet(src_noc_addr, dst_local_l1_addr, size, noc);
    }
}

// TODO: write docs
// this issues only a single packet with size <= NOC_MAX_BURST_SIZE (ie maximum packet size)
FORCE_INLINE
Expand Down Expand Up @@ -1364,6 +1369,23 @@ FORCE_INLINE std::uint64_t get_noc_addr(
return s.get_noc_addr(id, offset, noc);
}

/**
 * Alternative API for getting the NOC address when reading using a swizzled
 * layout. This overload assumes the bank unit size is a power of 2 and less
 * than or equal to NOC_MAX_BURST_SIZE. For an arbitrary bank unit size, use
 * the get_noc_addr overload that takes a general interleaved address generator.
 *
 * id:     Unique id for the bank unit you want to read, assuming row major
 *         order. It is used to compute the bank for this unit of data.
 * s:      InterleavedPow2AddrGenFast instance; check the struct for attribute
 *         definitions.
 * offset: Forwarded unchanged to s.get_noc_addr (defaults to 0).
 * noc:    Forwarded unchanged to s.get_noc_addr (defaults to noc_index).
 *
 * Returns whatever s.get_noc_addr(id, offset, noc) yields, as a 64-bit value.
 */
template <bool DRAM>
FORCE_INLINE std::uint64_t get_noc_addr(
    const uint32_t id, const InterleavedPow2AddrGenFast<DRAM>& s, uint32_t offset = 0, uint8_t noc = noc_index) {
    // Pure forwarding wrapper: the address generator does all of the work.
    const std::uint64_t noc_addr = s.get_noc_addr(id, offset, noc);
    return noc_addr;
}

template <bool DRAM>
FORCE_INLINE void noc_async_read_page(
const uint32_t id,
Expand Down Expand Up @@ -1415,7 +1437,7 @@ template <uint32_t max_page_size = NOC_MAX_BURST_SIZE + 1>
inline void noc_async_write(
std::uint32_t src_local_l1_addr, std::uint64_t dst_noc_addr, std::uint32_t size, uint8_t noc = noc_index) {
if constexpr (max_page_size <= NOC_MAX_BURST_SIZE) {
noc_async_write_one_packet(src_local_l1_addr, dst_noc_addr, size);
noc_async_write_one_packet(src_local_l1_addr, dst_noc_addr, size, noc);
} else {
WAYPOINT("NAWW");
DEBUG_SANITIZE_NOC_WRITE_TRANSACTION(noc, dst_noc_addr, src_local_l1_addr, size);
Expand Down Expand Up @@ -2015,3 +2037,20 @@ uint64_t get_noc_addr_from_bank_id(uint32_t bank_id, uint32_t bank_address_offse
}
return (noc_addr << NOC_ADDR_COORD_SHIFT) | (bank_address_offset);
}

/**
 * Builds, at compile time, the interleaved address generator best suited to
 * the given page size:
 *
 *   - page_size is a power of 2 and <= NOC_MAX_BURST_SIZE -> InterleavedPow2AddrGenFast<DRAM>
 *   - page_size is a power of 2 and >  NOC_MAX_BURST_SIZE -> InterleavedPow2AddrGen<DRAM>
 *   - any other page_size                                 -> InterleavedAddrGen<DRAM>
 *
 * Template parameters:
 *   DRAM:      forwarded as the template argument of the selected generator.
 *   page_size: page size known at compile time; must be non-zero.
 *
 * base_addr: stored in the generator's bank_base_address field.
 *
 * The return type depends on page_size, so bind the result with `auto`.
 */
template <bool DRAM, uint32_t page_size>
FORCE_INLINE auto get_interleaved_addr_gen(uint32_t base_addr) {
    // A zero page size would satisfy the bit-trick power-of-2 test and then reach
    // __builtin_ctz(0), whose result is undefined. Reject it with a clear message.
    static_assert(page_size != 0, "get_interleaved_addr_gen: page_size must be non-zero");

    constexpr bool is_pow_2 = is_power_of_2(page_size);
    if constexpr (is_pow_2) {
        // For a power of 2, the count of trailing zero bits equals log2(page_size).
        constexpr uint32_t log2_page_size = __builtin_ctz(page_size);
        if constexpr (page_size <= NOC_MAX_BURST_SIZE) {
            return InterleavedPow2AddrGenFast<DRAM>{
                .bank_base_address = base_addr, .log_base_2_of_page_size = log2_page_size};
        } else {
            return InterleavedPow2AddrGen<DRAM>{
                .bank_base_address = base_addr, .log_base_2_of_page_size = log2_page_size};
        }
    } else {
        return InterleavedAddrGen<DRAM>{.bank_base_address = base_addr, .page_size = page_size};
    }
}
4 changes: 3 additions & 1 deletion tt_metal/hw/inc/utils/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

#pragma once

// Rounds addr up to the next multiple of alignment; a value that is already a
// multiple is returned unchanged. The result is a multiple of alignment only
// when alignment is a power of 2, because (alignment - 1) is used as a low-bit
// mask. addr == 0 yields 0 (the unsigned arithmetic wraps around).
// constexpr so it can also be evaluated at compile time.
inline __attribute__((always_inline)) constexpr uint32_t align(uint32_t addr, uint32_t alignment) {
    return ((addr - 1) | (alignment - 1)) + 1;
}

// Returns true iff n has exactly one bit set, i.e. n is 1, 2, 4, 8, ...
// Zero is explicitly rejected: the bare (n & (n - 1)) == 0 test would accept it.
inline __attribute__((always_inline)) constexpr bool is_power_of_2(uint32_t n) {
    return n != 0 && (n & (n - 1)) == 0;
}
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,11 @@ FORCE_INLINE void enhanced_noc_async_read(
const uint64_t src_noc_addr, const uint32_t dst_l1_addr, const uint32_t bytes) {
// If you do not know the max_transfer_size at compile time write 0 to it.
// only reads is true if we ONLY use noc_async_read and all calls to tt_memmove have use_read_datamover as True
if constexpr (((max_transfer_size < NOC_MAX_BURST_SIZE) && (max_transfer_size != 0)) || only_reads) {
if constexpr (only_reads) {
noc_async_read_one_packet(src_noc_addr, dst_l1_addr, bytes);
} else {
noc_async_read(src_noc_addr, dst_l1_addr, bytes);
noc_async_read<max_transfer_size == 0 ? NOC_MAX_BURST_SIZE + 1 : max_transfer_size>(
src_noc_addr, dst_l1_addr, bytes);
}
}

Expand Down

0 comments on commit 538fe9b

Please sign in to comment.