diff --git a/tt_metal/hw/firmware/src/brisc.cc b/tt_metal/hw/firmware/src/brisc.cc index 320b779d936..5554f2edcf3 100644 --- a/tt_metal/hw/firmware/src/brisc.cc +++ b/tt_metal/hw/firmware/src/brisc.cc @@ -20,7 +20,6 @@ #include "tools/profiler/kernel_profiler.hpp" #include "dev_msgs.h" #include "risc_attribs.h" -#include "generated_bank_to_noc_coord_mapping.h" #include "circular_buffer.h" #include "circular_buffer_init.h" #include "dataflow_api.h" @@ -67,6 +66,13 @@ uint32_t tt_l1_ptr *rta_l1_base __attribute__((used)); uint32_t tt_l1_ptr *crta_l1_base __attribute__((used)); uint32_t tt_l1_ptr *sem_l1_base[ProgrammableCoreType::COUNT] __attribute__((used)); +// These arrays are stored in local memory of FW, but primarily used by the kernel which shares +// FW symbols. Hence mark these as 'used' so that FW compiler doesn't optimize it out. +uint16_t dram_bank_to_noc_xy[NUM_NOCS][NUM_DRAM_BANKS] __attribute__((used)); +uint16_t l1_bank_to_noc_xy[NUM_NOCS][NUM_L1_BANKS] __attribute__((used)); +int32_t bank_to_dram_offset[NUM_DRAM_BANKS] __attribute__((used)); +int32_t bank_to_l1_offset[NUM_L1_BANKS] __attribute__((used)); + #define MEM_MOVER_VIEW_IRAM_BASE_ADDR (0x4 << 12) #if defined(PROFILE_KERNEL) @@ -343,6 +349,8 @@ int main() { do_crt1((uint32_t*)MEM_BRISC_INIT_LOCAL_L1_BASE_SCRATCH); + noc_bank_table_init(MEM_BANK_TO_NOC_SCRATCH); + mailboxes->launch_msg_rd_ptr = 0; // Initialize the rdptr to 0 noc_index = 0; risc_init(); diff --git a/tt_metal/hw/firmware/src/erisc.cc b/tt_metal/hw/firmware/src/erisc.cc index dcf1ffc60a7..44d760a069c 100644 --- a/tt_metal/hw/firmware/src/erisc.cc +++ b/tt_metal/hw/firmware/src/erisc.cc @@ -5,7 +5,6 @@ #include "ethernet/dataflow_api.h" #include "ethernet/tunneling.h" #include "firmware_common.h" -#include "generated_bank_to_noc_coord_mapping.h" #include "noc_parameters.h" #include "risc_attribs.h" #include "tools/profiler/kernel_profiler.hpp" @@ -34,6 +33,13 @@ uint32_t tt_l1_ptr *rta_l1_base __attribute__((used)); uint32_t tt_l1_ptr *crta_l1_base __attribute__((used)); uint32_t tt_l1_ptr *sem_l1_base[ProgrammableCoreType::COUNT] __attribute__((used)); +// These arrays are stored in local memory of FW, but primarily used by the kernel which shares +// FW symbols. Hence mark these as 'used' so that FW compiler doesn't optimize it out. +uint16_t dram_bank_to_noc_xy[NUM_NOCS][NUM_DRAM_BANKS] __attribute__((used)); +uint16_t l1_bank_to_noc_xy[NUM_NOCS][NUM_L1_BANKS] __attribute__((used)); +int32_t bank_to_dram_offset[NUM_DRAM_BANKS] __attribute__((used)); +int32_t bank_to_l1_offset[NUM_L1_BANKS] __attribute__((used)); + void __attribute__((noinline)) Application(void) { WAYPOINT("I"); @@ -43,6 +49,8 @@ void __attribute__((noinline)) Application(void) { rtos_context_switch_ptr = (void (*)())RtosTable[0]; + noc_bank_table_init(eth_l1_mem::address_map::ERISC_MEM_BANK_TO_NOC_SCRATCH); + risc_init(); noc_init(MEM_NOC_ATOMIC_RET_VAL_ADDR); diff --git a/tt_metal/hw/firmware/src/idle_erisc.cc b/tt_metal/hw/firmware/src/idle_erisc.cc index 4e027e0dd7f..455629e95c7 100644 --- a/tt_metal/hw/firmware/src/idle_erisc.cc +++ b/tt_metal/hw/firmware/src/idle_erisc.cc @@ -19,7 +19,6 @@ #include "tools/profiler/kernel_profiler.hpp" #include "dev_msgs.h" #include "risc_attribs.h" -#include "generated_bank_to_noc_coord_mapping.h" #include "circular_buffer.h" #include "dataflow_api.h" @@ -42,6 +41,13 @@ uint32_t tt_l1_ptr *sem_l1_base[ProgrammableCoreType::COUNT] __attribute__((used uint8_t my_x[NUM_NOCS] __attribute__((used)); uint8_t my_y[NUM_NOCS] __attribute__((used)); +// These arrays are stored in local memory of FW, but primarily used by the kernel which shares +// FW symbols. Hence mark these as 'used' so that FW compiler doesn't optimize it out. +uint16_t dram_bank_to_noc_xy[NUM_NOCS][NUM_DRAM_BANKS] __attribute__((used)); +uint16_t l1_bank_to_noc_xy[NUM_NOCS][NUM_L1_BANKS] __attribute__((used)); +int32_t bank_to_dram_offset[NUM_DRAM_BANKS] __attribute__((used)); +int32_t bank_to_l1_offset[NUM_L1_BANKS] __attribute__((used)); + //c_tensix_core core; tt_l1_ptr mailboxes_t * const mailboxes = (tt_l1_ptr mailboxes_t *)(MEM_IERISC_MAILBOX_BASE); @@ -101,6 +107,8 @@ int main() { do_crt1((uint32_t *)MEM_IERISC_INIT_LOCAL_L1_BASE_SCRATCH); uint32_t heartbeat = 0; + noc_bank_table_init(MEM_IERISC_BANK_TO_NOC_SCRATCH); + risc_init(); mailboxes->slave_sync.all = RUN_SYNC_MSG_ALL_SLAVES_DONE; diff --git a/tt_metal/hw/firmware/src/ncrisc.cc b/tt_metal/hw/firmware/src/ncrisc.cc index fb3c6e566b3..ba91c04713b 100644 --- a/tt_metal/hw/firmware/src/ncrisc.cc +++ b/tt_metal/hw/firmware/src/ncrisc.cc @@ -11,7 +11,6 @@ #include "firmware_common.h" #include "tools/profiler/kernel_profiler.hpp" #include "risc_attribs.h" -#include "generated_bank_to_noc_coord_mapping.h" #include "circular_buffer.h" #include "circular_buffer_init.h" @@ -40,6 +39,13 @@ uint32_t tt_l1_ptr *rta_l1_base __attribute__((used)); uint32_t tt_l1_ptr *crta_l1_base __attribute__((used)); uint32_t tt_l1_ptr *sem_l1_base[ProgrammableCoreType::COUNT] __attribute__((used)); +// These arrays are stored in local memory of FW, but primarily used by the kernel which shares +// FW symbols. Hence mark these as 'used' so that FW compiler doesn't optimize it out. +uint16_t dram_bank_to_noc_xy[NUM_NOCS][NUM_DRAM_BANKS] __attribute__((used)); +int32_t bank_to_dram_offset[NUM_DRAM_BANKS] __attribute__((used)); +uint16_t l1_bank_to_noc_xy[NUM_NOCS][NUM_L1_BANKS] __attribute__((used)); +int32_t bank_to_l1_offset[NUM_L1_BANKS] __attribute__((used)); + #if defined(PROFILE_KERNEL) namespace kernel_profiler { uint32_t wIndex __attribute__((used)); @@ -79,6 +85,8 @@ int main(int argc, char *argv[]) { do_crt1((uint32_t tt_l1_ptr *)MEM_NCRISC_INIT_LOCAL_L1_BASE_SCRATCH); + noc_bank_table_init(MEM_BANK_TO_NOC_SCRATCH); + risc_init(); // If NCRISC has IRAM it needs to halt before BRISC copies data from L1 to IRAM diff --git a/tt_metal/hw/firmware/src/slave_idle_erisc.cc b/tt_metal/hw/firmware/src/slave_idle_erisc.cc index 164313f27df..8e0b4500a7a 100644 --- a/tt_metal/hw/firmware/src/slave_idle_erisc.cc +++ b/tt_metal/hw/firmware/src/slave_idle_erisc.cc @@ -11,7 +11,6 @@ #include "firmware_common.h" #include "tools/profiler/kernel_profiler.hpp" #include "risc_attribs.h" -#include "generated_bank_to_noc_coord_mapping.h" #include "circular_buffer.h" #include "debug/waypoint.h" diff --git a/tt_metal/hw/inc/blackhole/dev_mem_map.h b/tt_metal/hw/inc/blackhole/dev_mem_map.h index 4f68f18e9af..3ef1012727a 100644 --- a/tt_metal/hw/inc/blackhole/dev_mem_map.h +++ b/tt_metal/hw/inc/blackhole/dev_mem_map.h @@ -41,6 +41,11 @@ #define MEM_NCRISC_LOCAL_SIZE (8 * 1024) #define MEM_TRISC_LOCAL_SIZE (4 * 1024) +// Memory for (dram/l1)_bank_to_noc_xy arrays, size needs to be atleast 2 * NUM_NOCS * (NUM_DRAM_BANKS + NUM_L1_BANKS) +#define MEM_BANK_TO_NOC_XY_SIZE 1024 +// Memory for bank_to_dram_offset and bank_to_l1_offset arrays, size needs to be atleast 4 * (NUM_DRAM_BANKS + NUM_L1_BANKS) +#define MEM_BANK_OFFSET_SIZE 1024 + ///////////// // Firmware/kernel code holes #define MEM_BRISC_FIRMWARE_SIZE (5 * 1024 + 128) @@ -91,6 +96,9 @@ #define MEM_TRISC1_INIT_LOCAL_L1_BASE_SCRATCH (MEM_TRISC0_INIT_LOCAL_L1_BASE_SCRATCH + MEM_TRISC_LOCAL_SIZE) #define MEM_TRISC2_INIT_LOCAL_L1_BASE_SCRATCH (MEM_TRISC1_INIT_LOCAL_L1_BASE_SCRATCH + MEM_TRISC_LOCAL_SIZE) +#define MEM_BANK_TO_NOC_SCRATCH (MEM_TRISC2_INIT_LOCAL_L1_BASE_SCRATCH + MEM_TRISC_LOCAL_SIZE) +#define MEM_BANK_TO_NOC_SIZE (MEM_BANK_TO_NOC_XY_SIZE + MEM_BANK_OFFSET_SIZE) + ///////////// // Stack info // Increasing the stack size comes at the expense of less local memory for globals @@ -130,6 +138,9 @@ #define MEM_IERISC_STACK_BASE (MEM_LOCAL_BASE + MEM_IERISC_LOCAL_SIZE - MEM_IERISC_STACK_SIZE) #define MEM_SLAVE_IERISC_STACK_BASE (MEM_LOCAL_BASE + MEM_SLAVE_IERISC_LOCAL_SIZE - MEM_SLAVE_IERISC_STACK_SIZE) +#define MEM_IERISC_BANK_TO_NOC_SCRATCH (MEM_SLAVE_IERISC_INIT_LOCAL_L1_BASE_SCRATCH + MEM_SLAVE_IERISC_LOCAL_SIZE) +#define MEM_IERISC_BANK_TO_NOC_SIZE (MEM_BANK_TO_NOC_XY_SIZE + MEM_BANK_OFFSET_SIZE) + ///////////// // Padding/alignment restriction needed in linker scripts for erisc #define MEM_IERISC_KERNEL_PAD 32 diff --git a/tt_metal/hw/inc/blackhole/eth_l1_address_map.h b/tt_metal/hw/inc/blackhole/eth_l1_address_map.h index e99d13af3d4..05d071dfdb4 100644 --- a/tt_metal/hw/inc/blackhole/eth_l1_address_map.h +++ b/tt_metal/hw/inc/blackhole/eth_l1_address_map.h @@ -26,6 +26,13 @@ struct address_map { static constexpr std::int32_t DATA_BUFFER_SIZE_ETH = 4 * 1024; static constexpr std::int32_t DATA_BUFFER_SIZE_NOC = 16 * 1024; static constexpr std::int32_t DATA_BUFFER_SIZE = 24 * 1024; + // Memory for (dram/l1)_bank_to_noc_xy arrays, size needs to be atleast 2 * NUM_NOCS * (NUM_DRAM_BANKS + + // NUM_L1_BANKS) + static constexpr std::int32_t ERISC_MEM_BANK_TO_NOC_XY_SIZE = 1024; + // Memory for bank_to_dram_offset and bank_to_l1_offset arrays, size needs to be atleast 4 * (NUM_DRAM_BANKS + + // NUM_L1_BANKS) + static constexpr std::int32_t ERISC_MEM_BANK_OFFSET_SIZE = 1024; + // Kernel config buffer is WIP // Size is presently based on the old sizes of the RTAs + CB config + Sems static constexpr std::int32_t ERISC_L1_KERNEL_CONFIG_SIZE = 96 * 4 + 8 * 16; @@ -65,6 +72,12 @@ struct address_map { static_assert((ERISC_L1_UNRESERVED_BASE % 32) == 0); + // This scratch address is same as ERISC_L1_UNRESERVED_BASE, as the scratch space is used to copy data during + // runtime build, and is unused once FW copies the data to local memory during FW initialization. + static constexpr std::int32_t ERISC_MEM_BANK_TO_NOC_SCRATCH = + (ERISC_L1_KERNEL_CONFIG_BASE + ERISC_L1_KERNEL_CONFIG_SIZE + 31) & ~31; + static constexpr std::int32_t ERISC_MEM_BANK_TO_NOC_SIZE = ERISC_MEM_BANK_TO_NOC_XY_SIZE + ERISC_MEM_BANK_OFFSET_SIZE; + static constexpr std::int32_t LAUNCH_ERISC_APP_FLAG = L1_EPOCH_Q_BASE + 4; // BIDIR Tunneling Kernel Space diff --git a/tt_metal/hw/inc/dataflow_api.h b/tt_metal/hw/inc/dataflow_api.h index f5ee832f60b..59f6fc28963 100644 --- a/tt_metal/hw/inc/dataflow_api.h +++ b/tt_metal/hw/inc/dataflow_api.h @@ -10,9 +10,7 @@ #include "chlkc_unpack_tile_dims.h" #define DATA_FORMATS_DEFINED #endif -#if __has_include("generated_bank_to_noc_coord_mapping.h") -#include "generated_bank_to_noc_coord_mapping.h" -#endif +#include #include @@ -37,9 +35,15 @@ constexpr uint8_t proc_type = static_cast diff --git a/tt_metal/hw/inc/firmware_common.h b/tt_metal/hw/inc/firmware_common.h index c292a7261a8..9f051b32abb 100644 --- a/tt_metal/hw/inc/firmware_common.h +++ b/tt_metal/hw/inc/firmware_common.h @@ -13,39 +13,17 @@ #include "dev_mem_map.h" #include "hostdevcommon/kernel_structs.h" #include "dev_msgs.h" +#include "noc/noc_parameters.h" +#include "debug/dprint.h" + +extern uint16_t dram_bank_to_noc_xy[NUM_NOCS][NUM_DRAM_BANKS]; +extern int32_t bank_to_dram_offset[NUM_DRAM_BANKS]; +extern uint16_t l1_bank_to_noc_xy[NUM_NOCS][NUM_L1_BANKS]; +extern int32_t bank_to_l1_offset[NUM_L1_BANKS]; extern void kernel_init(uint32_t kernel_init); extern void kernel_launch(uint32_t kernel_base_addr); - -inline void l1_to_local_mem_copy(uint32_t* dst, uint32_t tt_l1_ptr* src, int32_t len) { -#pragma GCC unroll 0 - while (len >= 3) { - auto v0 = src[0], v1 = src[1], v2 = src[2]; - // 1) Make sure the optimizer does not think this is memcpy by - // hiding the pointer bookkeeping in an asm. - // 2) The scheduler doesn't know the above loads have 6 cycle - // latency. We emit the 3 bookkeeping adds as a single block - // in the load shadow before the stores. The optimizer will - // not be able to move these. - // 3) We don't need early clobbers here because of the +r - // constraint -- early clobbers would pessimize. - asm inline( - "addi %0,%0,3*%3\n\t" - "addi %1,%1,3*%3\n\t" - "addi %2,%2,-3" - : "+r"(src), "+r"(dst), "+r"(len) - : "i"(sizeof(v0))); - dst[-3] = v0, dst[-2] = v1, dst[-1] = v2; - } - // There are 0, 1 or 2 words of residue. This is smaller than a loop. - // We get smaller code layout by expecting the conditions to be true. - if (__builtin_expect(len >= 1, true)) { - dst[0] = src[0]; - if (__builtin_expect(len >= 2, true)) { - dst[1] = src[1]; - } - } -} +void l1_to_local_mem_copy(uint32_t* dst, uint32_t tt_l1_ptr* src, int32_t len); inline void do_crt1(uint32_t tt_l1_ptr* data_image) { // Clear bss. @@ -59,6 +37,18 @@ inline void do_crt1(uint32_t tt_l1_ptr* data_image) { l1_to_local_mem_copy(__ldm_data_start, data_image, __ldm_data_end - __ldm_data_start); } +inline void noc_bank_table_init(uint64_t mem_bank_to_noc_addr) { + int32_t dram_to_noc_size_bytes = sizeof(dram_bank_to_noc_xy); + l1_to_local_mem_copy((uint*)dram_bank_to_noc_xy, (uint tt_l1_ptr*)mem_bank_to_noc_addr, dram_to_noc_size_bytes >> 2); + int32_t l1_to_noc_size_bytes = sizeof(l1_bank_to_noc_xy); + l1_to_local_mem_copy((uint*)l1_bank_to_noc_xy, (uint tt_l1_ptr*)(mem_bank_to_noc_addr + dram_to_noc_size_bytes), l1_to_noc_size_bytes >> 2); + + int32_t dram_offsets_size_bytes = sizeof(bank_to_dram_offset); + l1_to_local_mem_copy((uint*)bank_to_dram_offset, (uint tt_l1_ptr*)(mem_bank_to_noc_addr + dram_to_noc_size_bytes + l1_to_noc_size_bytes), dram_offsets_size_bytes >> 2); + int32_t l1_offsets_size_bytes = sizeof(bank_to_l1_offset); + l1_to_local_mem_copy((uint*)bank_to_l1_offset, (uint tt_l1_ptr*)(mem_bank_to_noc_addr + dram_to_noc_size_bytes + l1_to_noc_size_bytes + dram_offsets_size_bytes), l1_offsets_size_bytes >> 2); +} + FORCE_INLINE uint32_t firmware_config_init( tt_l1_ptr mailboxes_t* const mailboxes, uint32_t core_type_index, uint32_t dispatch_class) { diff --git a/tt_metal/hw/inc/grayskull/dev_mem_map.h b/tt_metal/hw/inc/grayskull/dev_mem_map.h index ba2077838c2..d7d829e7392 100644 --- a/tt_metal/hw/inc/grayskull/dev_mem_map.h +++ b/tt_metal/hw/inc/grayskull/dev_mem_map.h @@ -40,15 +40,20 @@ #define MEM_NCRISC_LOCAL_SIZE (4 * 1024) #define MEM_TRISC_LOCAL_SIZE (2 * 1024) +// Memory for (dram/l1)_bank_to_noc_xy arrays, size needs to be atleast 2 * NUM_NOCS * (NUM_DRAM_BANKS + NUM_L1_BANKS) +#define MEM_BANK_TO_NOC_XY_SIZE 1024 +// Memory for bank_to_dram_offset and bank_to_l1_offset arrays, size needs to be atleast 4 * (NUM_DRAM_BANKS + NUM_L1_BANKS) +#define MEM_BANK_OFFSET_SIZE 1024 + #define NCRISC_HAS_IRAM 1 #define MEM_NCRISC_IRAM_BASE 0xFFC00000 #define MEM_NCRISC_IRAM_SIZE (16 * 1024) ///////////// // Firmware/kernel code holes -#define MEM_BRISC_FIRMWARE_SIZE (5 * 1024 + 416) +#define MEM_BRISC_FIRMWARE_SIZE (5 * 1024 + 624) // TODO: perhaps put NCRISC FW in the scratch area and free 1.5K after init (GS/WH) -#define MEM_NCRISC_FIRMWARE_SIZE 1616 +#define MEM_NCRISC_FIRMWARE_SIZE 1824 #define MEM_TRISC0_FIRMWARE_SIZE 1536 #define MEM_TRISC1_FIRMWARE_SIZE 1536 #define MEM_TRISC2_FIRMWARE_SIZE 1536 @@ -100,6 +105,9 @@ #define MEM_TRISC1_INIT_LOCAL_L1_BASE_SCRATCH (MEM_TRISC0_INIT_LOCAL_L1_BASE_SCRATCH + MEM_TRISC_LOCAL_SIZE) #define MEM_TRISC2_INIT_LOCAL_L1_BASE_SCRATCH (MEM_TRISC1_INIT_LOCAL_L1_BASE_SCRATCH + MEM_TRISC_LOCAL_SIZE) +#define MEM_BANK_TO_NOC_SCRATCH (MEM_TRISC2_INIT_LOCAL_L1_BASE_SCRATCH + MEM_TRISC_LOCAL_SIZE) +#define MEM_BANK_TO_NOC_SIZE (MEM_BANK_TO_NOC_XY_SIZE + MEM_BANK_OFFSET_SIZE) + ///////////// // Stack info // Increasing the stack size comes at the expense of less local memory for globals @@ -125,5 +133,7 @@ #define MEM_IERISC_MAP_END 0 #define MEM_IERISC_INIT_LOCAL_L1_BASE_SCRATCH 0 #define MEM_IERISC_STACK_SIZE 0 +#define MEM_IERISC_BANK_TO_NOC_SCRATCH 0 +#define MEM_IERISC_BANK_TO_NOC_SIZE 0 #define MEM_IERISC_KERNEL_PAD 0 diff --git a/tt_metal/hw/inc/grayskull/eth_l1_address_map.h b/tt_metal/hw/inc/grayskull/eth_l1_address_map.h index 0ad8580b15b..26332938fcb 100644 --- a/tt_metal/hw/inc/grayskull/eth_l1_address_map.h +++ b/tt_metal/hw/inc/grayskull/eth_l1_address_map.h @@ -37,6 +37,8 @@ struct address_map { static constexpr std::int32_t ERISC_L1_UNRESERVED_SIZE = 0; static constexpr std::int32_t ERISC_L1_TUNNEL_BUFFER_SIZE = 0; + static constexpr std::int32_t ERISC_MEM_BANK_TO_NOC_SCRATCH = 0; + static constexpr std::int32_t ERISC_MEM_BANK_TO_NOC_SIZE = 0; static constexpr std::uint32_t RETRAIN_COUNT_ADDR = 0x1EDC; static constexpr std::uint32_t RETRAIN_FORCE_ADDR = 0x1EFC; diff --git a/tt_metal/hw/inc/wormhole/dev_mem_map.h b/tt_metal/hw/inc/wormhole/dev_mem_map.h index c107c20d4b9..0d9e1dd932c 100644 --- a/tt_metal/hw/inc/wormhole/dev_mem_map.h +++ b/tt_metal/hw/inc/wormhole/dev_mem_map.h @@ -41,13 +41,18 @@ #define MEM_NCRISC_LOCAL_SIZE (4 * 1024) #define MEM_TRISC_LOCAL_SIZE (2 * 1024) +// Memory for (dram/l1)_bank_to_noc_xy arrays, size needs to be atleast 2 * NUM_NOCS * (NUM_DRAM_BANKS + NUM_L1_BANKS) +#define MEM_BANK_TO_NOC_XY_SIZE 1024 +// Memory for bank_to_dram_offset and bank_to_l1_offset arrays, size needs to be atleast 4 * (NUM_DRAM_BANKS + NUM_L1_BANKS) +#define MEM_BANK_OFFSET_SIZE 1024 + #define NCRISC_HAS_IRAM 1 #define MEM_NCRISC_IRAM_BASE 0xFFC00000 #define MEM_NCRISC_IRAM_SIZE (16 * 1024) ///////////// // Firmware/kernel code holes -#define MEM_BRISC_FIRMWARE_SIZE (5 * 1024 + 64) +#define MEM_BRISC_FIRMWARE_SIZE (5 * 1024 + 256) // TODO: perhaps put NCRISC FW in the scratch area and free 1.5K after init (GS/WH) #define MEM_NCRISC_FIRMWARE_SIZE 1536 #define MEM_TRISC0_FIRMWARE_SIZE 1536 @@ -102,6 +107,9 @@ #define MEM_TRISC1_INIT_LOCAL_L1_BASE_SCRATCH (MEM_TRISC0_INIT_LOCAL_L1_BASE_SCRATCH + MEM_TRISC_LOCAL_SIZE) #define MEM_TRISC2_INIT_LOCAL_L1_BASE_SCRATCH (MEM_TRISC1_INIT_LOCAL_L1_BASE_SCRATCH + MEM_TRISC_LOCAL_SIZE) +#define MEM_BANK_TO_NOC_SCRATCH (MEM_TRISC2_INIT_LOCAL_L1_BASE_SCRATCH + MEM_TRISC_LOCAL_SIZE) +#define MEM_BANK_TO_NOC_SIZE (MEM_BANK_TO_NOC_XY_SIZE + MEM_BANK_OFFSET_SIZE) + ///////////// // Stack info // Increasing the stack size comes at the expense of less local memory for globals @@ -137,6 +145,10 @@ #define MEM_IERISC_STACK_SIZE 1024 #define MEM_IERISC_STACK_BASE (MEM_LOCAL_BASE + MEM_IERISC_LOCAL_SIZE - MEM_IERISC_STACK_SIZE) +#define MEM_IERISC_BANK_TO_NOC_SCRATCH (MEM_IERISC_INIT_LOCAL_L1_BASE_SCRATCH + MEM_IERISC_LOCAL_SIZE) +#define MEM_IERISC_BANK_TO_NOC_SIZE (MEM_BANK_TO_NOC_XY_SIZE + MEM_BANK_OFFSET_SIZE) + + ///////////// // Padding/alignment restriction needed in linker scripts for erisc #define MEM_IERISC_KERNEL_PAD 32 diff --git a/tt_metal/hw/inc/wormhole/eth_l1_address_map.h b/tt_metal/hw/inc/wormhole/eth_l1_address_map.h index 68e67eb9248..39d41601bef 100644 --- a/tt_metal/hw/inc/wormhole/eth_l1_address_map.h +++ b/tt_metal/hw/inc/wormhole/eth_l1_address_map.h @@ -26,6 +26,11 @@ struct address_map { static constexpr std::int32_t DATA_BUFFER_SIZE_ETH = 4 * 1024; static constexpr std::int32_t DATA_BUFFER_SIZE_NOC = 16 * 1024; static constexpr std::int32_t DATA_BUFFER_SIZE = 24 * 1024; + // Memory for (dram/l1)_bank_to_noc_xy arrays, size needs to be atleast 2 * NUM_NOCS * (NUM_DRAM_BANKS + NUM_L1_BANKS) + static constexpr std::int32_t ERISC_MEM_BANK_TO_NOC_XY_SIZE = 1024; + // Memory for bank_to_dram_offset and bank_to_l1_offset arrays, size needs to be atleast 4 * (NUM_DRAM_BANKS + NUM_L1_BANKS) + static constexpr std::int32_t ERISC_MEM_BANK_OFFSET_SIZE = 1024; + // Kernel config buffer is WIP // Size is presently based on the old sizes of the RTAs + CB config + Sems static constexpr std::int32_t ERISC_L1_KERNEL_CONFIG_SIZE = 96 * 4 + 8 * 16; @@ -65,6 +70,12 @@ struct address_map { static_assert((ERISC_L1_UNRESERVED_BASE % 32) == 0); + // This scratch address is same as ERISC_L1_UNRESERVED_BASE, as the scratch space is used to copy data during + // runtime build, and is unused once FW copies the data to local memory during FW initialization. + static constexpr std::int32_t ERISC_MEM_BANK_TO_NOC_SCRATCH = + (ERISC_L1_KERNEL_CONFIG_BASE + ERISC_L1_KERNEL_CONFIG_SIZE + 31) & ~31; + static constexpr std::int32_t ERISC_MEM_BANK_TO_NOC_SIZE = ERISC_MEM_BANK_TO_NOC_XY_SIZE + ERISC_MEM_BANK_OFFSET_SIZE; + static constexpr std::int32_t LAUNCH_ERISC_APP_FLAG = L1_EPOCH_Q_BASE + 4; // BIDIR Tunneling Kernel Space diff --git a/tt_metal/hw/toolchain/substitutes.cpp b/tt_metal/hw/toolchain/substitutes.cpp index a4e5feb40a0..45764316f8c 100644 --- a/tt_metal/hw/toolchain/substitutes.cpp +++ b/tt_metal/hw/toolchain/substitutes.cpp @@ -37,3 +37,34 @@ extern "C" void wzerorange(uint32_t* start, uint32_t* end) { start[-1] = 0; } } + +// Let the LTO decide if this needs to be inline. +void l1_to_local_mem_copy(uint32_t* dst, uint32_t __attribute__((rvtt_l1_ptr))* src, int32_t len) { +#pragma GCC unroll 0 + while (len >= 3) { + auto v0 = src[0], v1 = src[1], v2 = src[2]; + // 1) Make sure the optimizer does not think this is memcpy by + // hiding the pointer bookkeeping in an asm. + // 2) The scheduler doesn't know the above loads have 6 cycle + // latency. We emit the 3 bookkeeping adds as a single block + // in the load shadow before the stores. The optimizer will + // not be able to move these. + // 3) We don't need early clobbers here because of the +r + // constraint -- early clobbers would pessimize. + asm inline( + "addi %0,%0,3*%3\n\t" + "addi %1,%1,3*%3\n\t" + "addi %2,%2,-3" + : "+r"(src), "+r"(dst), "+r"(len) + : "i"(sizeof(v0))); + dst[-3] = v0, dst[-2] = v1, dst[-1] = v2; + } + // There are 0, 1 or 2 words of residue. This is smaller than a loop. + // We get smaller code layout by expecting the conditions to be true. + if (__builtin_expect(len >= 1, true)) { + dst[0] = src[0]; + if (__builtin_expect(len >= 2, true)) { + dst[1] = src[1]; + } + } +} diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp index fe6953d4c23..c5a3c75660e 100644 --- a/tt_metal/impl/device/device.cpp +++ b/tt_metal/impl/device/device.cpp @@ -7,7 +7,6 @@ #include "tt_metal/device.hpp" #include "common/core_coord.hpp" #include "tt_metal/host_api.hpp" -#include "tt_metal/jit_build/genfiles.hpp" #include "tt_metal/impl/device/device.hpp" #include "tt_metal/impl/trace/trace.hpp" #include "tt_metal/common/core_descriptor.hpp" @@ -28,6 +27,7 @@ #include "tt_metal/impl/sub_device/sub_device_types.hpp" #include "tt_metal/tt_stl/span.hpp" #include "tt_metal/types.hpp" +#include "noc/noc_parameters.h" // FIXME: ARCH_NAME specific #include "eth_l1_address_map.h" @@ -407,13 +407,36 @@ void Device::build_firmware() { log_debug(tt::LogMetal, "Building base firmware for device {}", this->id_); ZoneScoped; - this->generate_device_headers(this->build_env_.get_out_firmware_root_path()); jit_build_set(this->firmware_build_states_, nullptr); } +void Device::initialize_device_bank_to_noc_tables(const HalProgrammableCoreType &core_type, CoreCoord phys_core) +{ + const uint32_t dram_to_noc_sz_in_bytes = dram_bank_to_noc_xy_.size() * sizeof(uint16_t); + const uint32_t l1_to_noc_sz_in_bytes = l1_bank_to_noc_xy_.size() * sizeof(uint16_t); + const uint32_t dram_offset_sz_in_bytes = dram_bank_offset_map_.size() * sizeof(int32_t); + const uint32_t l1_offset_sz_in_bytes = l1_bank_offset_map_.size() * sizeof(int32_t); + + const uint64_t mem_bank_to_noc_addr = hal.get_dev_addr(core_type, HalL1MemAddrType::BANK_TO_NOC_SCRATCH); + const uint32_t mem_bank_to_noc_size = hal.get_dev_size(core_type, HalL1MemAddrType::BANK_TO_NOC_SCRATCH); + + TT_ASSERT((dram_to_noc_sz_in_bytes + l1_to_noc_sz_in_bytes + dram_offset_sz_in_bytes + l1_offset_sz_in_bytes) <= mem_bank_to_noc_size, + "Size of bank_to_noc table is greater than available space"); + + tt::Cluster::instance().write_core(&dram_bank_to_noc_xy_[0], dram_to_noc_sz_in_bytes, tt_cxy_pair(this->id(), phys_core), mem_bank_to_noc_addr); + uint64_t l1_noc_addr = mem_bank_to_noc_addr + dram_to_noc_sz_in_bytes; + tt::Cluster::instance().write_core(&l1_bank_to_noc_xy_[0], l1_to_noc_sz_in_bytes, tt_cxy_pair(this->id(), phys_core), l1_noc_addr); + + uint64_t dram_offset_addr = l1_noc_addr + l1_to_noc_sz_in_bytes; + tt::Cluster::instance().write_core(&dram_bank_offset_map_[0], dram_offset_sz_in_bytes, tt_cxy_pair(this->id(), phys_core), dram_offset_addr); + uint64_t l1_offset_addr = dram_offset_addr + dram_offset_sz_in_bytes; + tt::Cluster::instance().write_core(&l1_bank_offset_map_[0], l1_offset_sz_in_bytes, tt_cxy_pair(this->id(), phys_core), l1_offset_addr); +} + void Device::initialize_firmware(const HalProgrammableCoreType &core_type, CoreCoord phys_core, launch_msg_t *launch_msg, go_msg_t* go_msg) { ZoneScoped; + this->initialize_device_bank_to_noc_tables(core_type, phys_core); uint32_t core_type_idx = hal.get_programmable_core_type_index(core_type); uint32_t processor_class_count = hal.get_processor_classes_count(core_type); @@ -2948,6 +2971,7 @@ bool Device::initialize(const uint8_t num_hw_cqs, size_t l1_small_size, size_t t this->initialize_cluster(); this->initialize_default_sub_device_state(l1_small_size, trace_region_size, l1_bank_remap); this->initialize_build(); + this->generate_device_bank_to_noc_tables(); // For minimal setup, don't initialize FW, watcher, dprint. They won't work if we're attaching to a hung chip. if (minimal) @@ -3553,37 +3577,48 @@ void Device::MarkAllocationsSafe() { tt::tt_metal::allocator::mark_allocations_safe(*this->get_initialized_allocator()); } -void Device::generate_device_headers(const std::string &path) const +void Device::generate_device_bank_to_noc_tables() { const size_t num_dram_banks = this->num_banks(BufferType::DRAM); - const size_t num_dram_banks_pow2 = std::pow(2, std::ceil(std::log2(num_dram_banks))); std::vector dram_noc_coord_per_bank(num_dram_banks); - std::vector dram_offsets_per_bank(num_dram_banks); + dram_bank_offset_map_.clear(); + dram_bank_offset_map_.resize(num_dram_banks); for (unsigned bank_id = 0; bank_id < num_dram_banks; bank_id++) { dram_noc_coord_per_bank[bank_id] = this->dram_core_from_dram_channel(this->dram_channel_from_bank_id(bank_id)); - dram_offsets_per_bank[bank_id] = this->bank_offset(BufferType::DRAM, bank_id); + dram_bank_offset_map_[bank_id] = this->bank_offset(BufferType::DRAM, bank_id); } const size_t num_l1_banks = this->num_banks(BufferType::L1); - const size_t num_l1_banks_pow2 = std::pow(2, std::ceil(std::log2(num_l1_banks))); std::vector l1_noc_coord_per_bank(num_l1_banks); - std::vector l1_offset_per_bank(num_l1_banks); + l1_bank_offset_map_.clear(); + l1_bank_offset_map_.resize(num_l1_banks); for (unsigned bank_id = 0; bank_id < num_l1_banks; bank_id++) { l1_noc_coord_per_bank[bank_id] = this->worker_core_from_logical_core(this->logical_core_from_bank_id(bank_id)); - l1_offset_per_bank[bank_id] = this->bank_offset(BufferType::L1, bank_id); + l1_bank_offset_map_[bank_id] = this->bank_offset(BufferType::L1, bank_id); } const metal_SocDescriptor& soc_d = tt::Cluster::instance().get_soc_desc(this->id()); - // Generate header file in proper location - jit_build_genfiles_bank_to_noc_coord_descriptor ( - path, - soc_d.grid_size, - dram_noc_coord_per_bank, - dram_offsets_per_bank, - l1_noc_coord_per_bank, - l1_offset_per_bank, - this->get_allocator_alignment() - ); + dram_bank_to_noc_xy_.clear(); + dram_bank_to_noc_xy_.reserve(tt::tt_metal::hal.get_num_nocs() * dram_noc_coord_per_bank.size()); + for (unsigned int noc = 0; noc < tt::tt_metal::hal.get_num_nocs(); noc++) { + for (unsigned int bank_id = 0; bank_id < dram_noc_coord_per_bank.size(); bank_id++) { + uint16_t noc_x = tt::tt_metal::hal.noc_coordinate(noc, soc_d.grid_size.x, dram_noc_coord_per_bank[bank_id].x); + uint16_t noc_y = tt::tt_metal::hal.noc_coordinate(noc, soc_d.grid_size.y, dram_noc_coord_per_bank[bank_id].y); + uint16_t xy = ((noc_y << NOC_ADDR_NODE_ID_BITS) | noc_x) << NOC_COORD_REG_OFFSET; + dram_bank_to_noc_xy_.push_back(xy); + } + } + + l1_bank_to_noc_xy_.clear(); + l1_bank_to_noc_xy_.reserve(tt::tt_metal::hal.get_num_nocs() * l1_noc_coord_per_bank.size()); + for (unsigned int noc = 0; noc < tt::tt_metal::hal.get_num_nocs(); noc++) { + for (unsigned int bank_id = 0; bank_id < l1_noc_coord_per_bank.size(); bank_id++) { + uint16_t noc_x = tt::tt_metal::hal.noc_coordinate(noc, soc_d.grid_size.x, l1_noc_coord_per_bank[bank_id].x); + uint16_t noc_y = tt::tt_metal::hal.noc_coordinate(noc, soc_d.grid_size.y, l1_noc_coord_per_bank[bank_id].y); + uint16_t xy = ((noc_y << NOC_ADDR_NODE_ID_BITS) | noc_x) << NOC_COORD_REG_OFFSET; + l1_bank_to_noc_xy_.push_back(xy); + } + } } size_t Device::get_device_kernel_defines_hash() { diff --git a/tt_metal/impl/device/device.hpp b/tt_metal/impl/device/device.hpp index 045a1097aac..616a831e046 100644 --- a/tt_metal/impl/device/device.hpp +++ b/tt_metal/impl/device/device.hpp @@ -231,7 +231,7 @@ class Device { // machine inf float sfpu_inf() const; - void generate_device_headers(const std::string &path) const; + void generate_device_bank_to_noc_tables(); const JitBuildEnv& build_env() const { return this->build_env_; } const string build_firmware_target_path(uint32_t programmable_core, uint32_t processor_class, int i) const; const string build_kernel_target_path(uint32_t programmable_core, uint32_t processor_class, int i, const string& kernel_name) const; @@ -259,6 +259,7 @@ class Device { void initialize_build(); void initialize_device_kernel_defines(); void build_firmware(); + void initialize_device_bank_to_noc_tables(const HalProgrammableCoreType &core_type, CoreCoord phys_core); void initialize_firmware(const HalProgrammableCoreType &core_type, CoreCoord phys_core, launch_msg_t *launch_msg, go_msg_t* go_msg); void reset_cores(); void initialize_and_launch_firmware(); @@ -396,6 +397,11 @@ class Device { SubDeviceManagerId next_sub_device_manager_id_ = {0}; SubDeviceManagerId default_sub_device_manager_id_ = {0}; detail::SubDeviceManager *default_sub_device_manager_ = nullptr; + + std::vector dram_bank_offset_map_; + std::vector l1_bank_offset_map_; + std::vector dram_bank_to_noc_xy_; + std::vector l1_bank_to_noc_xy_; }; } // namespace v0 diff --git a/tt_metal/impl/kernels/kernel.cpp b/tt_metal/impl/kernels/kernel.cpp index d21d2c1735d..a3f67470d21 100644 --- a/tt_metal/impl/kernels/kernel.cpp +++ b/tt_metal/impl/kernels/kernel.cpp @@ -335,7 +335,6 @@ void ComputeKernel::set_build_options(JitBuildOptions &build_options) const { void DataMovementKernel::generate_binaries(Device *device, JitBuildOptions &build_options) const { jit_build_genfiles_kernel_include(device->build_env(), *this, this->kernel_src_); - device->generate_device_headers(build_options.path); uint32_t tensix_core_type = hal.get_programmable_core_type_index(this->get_kernel_programmable_core_type()); uint32_t dm_class_idx = magic_enum::enum_integer(HalProcessorClassType::DM); int riscv_id = static_cast::type>(this->config_.processor); @@ -344,7 +343,6 @@ void DataMovementKernel::generate_binaries(Device *device, JitBuildOptions &buil void EthernetKernel::generate_binaries(Device *device, JitBuildOptions &build_options) const { jit_build_genfiles_kernel_include(device->build_env(), *this, this->kernel_src_); - device->generate_device_headers(build_options.path); uint32_t erisc_core_type = hal.get_programmable_core_type_index(this->get_kernel_programmable_core_type()); uint32_t dm_class_idx = magic_enum::enum_integer(HalProcessorClassType::DM); int erisc_id = magic_enum::enum_integer(this->config_.processor); diff --git a/tt_metal/jit_build/build.hpp b/tt_metal/jit_build/build.hpp index 45c153439f0..ccd4a7860d2 100644 --- a/tt_metal/jit_build/build.hpp +++ b/tt_metal/jit_build/build.hpp @@ -50,7 +50,6 @@ class JitBuildEnv { tt::ARCH get_arch() const { return arch_; } const string& get_root_path() const { return root_; } const string& get_out_root_path() const { return out_root_; } - const string& get_out_firmware_root_path() const { return out_firmware_root_; } const string& get_out_kernel_root_path() const { return out_kernel_root_; } private: diff --git a/tt_metal/jit_build/genfiles.cpp b/tt_metal/jit_build/genfiles.cpp index a008db74e1e..ab920c1d1b0 100644 --- a/tt_metal/jit_build/genfiles.cpp +++ b/tt_metal/jit_build/genfiles.cpp @@ -451,128 +451,4 @@ void jit_build_genfiles_descriptors(const JitBuildEnv& env, JitBuildOptions& opt } } -std::string generate_bank_to_noc_coord_descriptor_string( - tt_xy_pair grid_size, - std::vector& dram_bank_map, - std::vector& dram_bank_offset_map, - std::vector& l1_bank_map, - std::vector& l1_bank_offset_map, - uint32_t allocator_alignment) { - stringstream ss; - - ss << "// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc." << endl; - ss << "//" << endl; - ss << "// SPDX-License-Identifier: Apache-2.0" << endl; - ss << endl; - ss << "/*" << endl; - ss << " * This file is autogenerated by tt-metal runtime" << endl; - ss << " * DO NOT EDIT" << endl; - ss << " * This file contains values that are visible to the device compiled code." << endl; - ss << " * CAREFUL: when included in the FW_BUILD, it defines global variables." << endl; - ss << " * When included in KERNEL_BUILD, it declares global variables." << endl; - ss << " */" << endl; - ss << endl; - ss << "#pragma once" << endl; - ss << endl; - ss << "#include " << endl; - ss << endl; - - ss << "static_assert(NUM_NOCS == 2);" << endl; - ss << endl; - - ss << "#ifdef KERNEL_BUILD" << endl; - ss << endl; - ss << "extern uint16_t dram_bank_to_noc_xy[NUM_NOCS][NUM_DRAM_BANKS];" << endl; - ss << "extern int32_t bank_to_dram_offset[NUM_DRAM_BANKS];" << endl; - ss << "extern uint16_t l1_bank_to_noc_xy[NUM_NOCS][NUM_L1_BANKS];" << endl; - ss << "extern int32_t bank_to_l1_offset[NUM_L1_BANKS];" << endl; - - ss << endl; - ss << "#else // !KERNEL_BUILD (FW_BUILD)" << endl; - ss << endl; - - ss << "uint16_t dram_bank_to_noc_xy[NUM_NOCS][NUM_DRAM_BANKS] __attribute__((used)) = {" << endl; - for (unsigned int noc = 0; noc < 2; noc++) { - ss << " {" - << "\t// noc=" << noc << endl; - for (unsigned int bank_id = 0; bank_id < dram_bank_map.size(); bank_id++) { - uint16_t noc_x = tt::tt_metal::hal.noc_coordinate(noc, grid_size.x, dram_bank_map[bank_id].x); - uint16_t noc_y = tt::tt_metal::hal.noc_coordinate(noc, grid_size.y, dram_bank_map[bank_id].y); - ss << " (((" << noc_y << " << NOC_ADDR_NODE_ID_BITS) | " << noc_x << ") << NOC_COORD_REG_OFFSET)," - << "\t// NOC_X=" << noc_x << " NOC_Y=" << noc_y << endl; - } - ss << " }," << endl; - } - ss << "};" << endl; - ss << endl; - ss << "int32_t bank_to_dram_offset[NUM_DRAM_BANKS] __attribute__((used)) = {" << endl; - for (unsigned int bank_id = 0; bank_id < dram_bank_map.size(); bank_id++) { - ss << " " << dram_bank_offset_map[bank_id] << "," << endl; - } - ss << "};" << endl; - ss << endl; - - ss << "uint16_t l1_bank_to_noc_xy[NUM_NOCS][NUM_L1_BANKS] __attribute__((used)) = {" << endl; - for (unsigned int noc = 0; noc < 2; noc++) { - ss << " {" - << "\t// noc=" << noc << endl; - for (unsigned int bank_id = 0; bank_id < l1_bank_map.size(); bank_id++) { - uint16_t noc_x = tt::tt_metal::hal.noc_coordinate(noc, grid_size.x, l1_bank_map[bank_id].x); - uint16_t noc_y = tt::tt_metal::hal.noc_coordinate(noc, grid_size.y, l1_bank_map[bank_id].y); - ss << " (((" << noc_y << " << NOC_ADDR_NODE_ID_BITS) | " << noc_x << ") << NOC_COORD_REG_OFFSET)," - << "\t// NOC_X=" << noc_x << " NOC_Y=" << noc_y << endl; - } - ss << " }," << endl; - } - ss << "};" << endl; - ss << endl; - ss << "int32_t bank_to_l1_offset[NUM_L1_BANKS] __attribute__((used)) = {" << endl; - for (unsigned int bank_id = 0; bank_id < l1_bank_map.size(); bank_id++) { - ss << " " << l1_bank_offset_map[bank_id] << "," << endl; - } - ss << "};" << endl; - ss << endl; - - ss << "#endif // FW_BUILD" << endl; - - return ss.str(); -} -void jit_build_genfiles_bank_to_noc_coord_descriptor( - const string& path, - tt_xy_pair grid_size, - std::vector& dram_bank_map, - std::vector& dram_bank_offset_map, - std::vector& l1_bank_map, - std::vector& l1_bank_offset_map, - uint32_t allocator_alignment) { - string output_string = generate_bank_to_noc_coord_descriptor_string( - grid_size, - dram_bank_map, - dram_bank_offset_map, - l1_bank_map, - l1_bank_offset_map, - allocator_alignment); - - fs::create_directories(path + "/brisc"); - ofstream file_stream_br(path + "/brisc/generated_bank_to_noc_coord_mapping.h"); - file_stream_br << output_string; - file_stream_br.close(); - fs::create_directories(path + "/ncrisc"); - ofstream file_stream_nc(path + "/ncrisc/generated_bank_to_noc_coord_mapping.h"); - file_stream_nc << output_string; - file_stream_nc.close(); - fs::create_directories(path + "/erisc"); - ofstream file_stream_ec(path + "/erisc/generated_bank_to_noc_coord_mapping.h"); - file_stream_ec << output_string; - file_stream_ec.close(); - fs::create_directories(path + "/idle_erisc"); - ofstream file_stream_iec(path + "/idle_erisc/generated_bank_to_noc_coord_mapping.h"); - file_stream_iec << output_string; - file_stream_iec.close(); - fs::create_directories(path + "/slave_idle_erisc"); - ofstream file_stream_siec(path + "/slave_idle_erisc/generated_bank_to_noc_coord_mapping.h"); - file_stream_siec << output_string; - file_stream_siec.close(); -} - } // namespace tt::tt_metal diff --git a/tt_metal/jit_build/genfiles.hpp b/tt_metal/jit_build/genfiles.hpp index 4dee07a44ab..c21459daabd 100644 --- a/tt_metal/jit_build/genfiles.hpp +++ b/tt_metal/jit_build/genfiles.hpp @@ -21,15 +21,6 @@ void jit_build_genfiles_kernel_include( void jit_build_genfiles_triscs_src( const JitBuildEnv& env, const JitBuildSettings& settings, const KernelSource& kernel_src); -void jit_build_genfiles_bank_to_noc_coord_descriptor( - const std::string& path, - tt_xy_pair grid_size, - std::vector& dram_bank_map, - std::vector& dram_bank_offset_map, - std::vector& l1_bank_map, - std::vector& l1_bank_offset_map, - uint32_t allocator_alignment); - void jit_build_genfiles_descriptors(const JitBuildEnv& env, JitBuildOptions& options); } // namespace tt::tt_metal diff --git a/tt_metal/llrt/blackhole/bh_hal_active_eth.cpp b/tt_metal/llrt/blackhole/bh_hal_active_eth.cpp index 021f58f1075..2fe01d1cd57 100644 --- a/tt_metal/llrt/blackhole/bh_hal_active_eth.cpp +++ b/tt_metal/llrt/blackhole/bh_hal_active_eth.cpp @@ -46,6 +46,8 @@ HalCoreInfoType create_active_eth_mem_map() { GET_ETH_MAILBOX_ADDRESS_HOST(launch_msg_rd_ptr); mem_map_bases[static_cast(HalL1MemAddrType::FW_VERSION_ADDR)] = eth_l1_mem::address_map::FW_VERSION_ADDR; + mem_map_bases[static_cast(HalL1MemAddrType::BANK_TO_NOC_SCRATCH)] = + eth_l1_mem::address_map::ERISC_MEM_BANK_TO_NOC_SCRATCH; std::vector mem_map_sizes; mem_map_sizes.resize(static_cast(HalL1MemAddrType::COUNT)); @@ -65,6 +67,8 @@ HalCoreInfoType create_active_eth_mem_map() { mem_map_sizes[static_cast(HalL1MemAddrType::GO_MSG)] = sizeof(go_msg_t); mem_map_sizes[static_cast(HalL1MemAddrType::LAUNCH_MSG_BUFFER_RD_PTR)] = sizeof(std::uint32_t); mem_map_sizes[static_cast(HalL1MemAddrType::FW_VERSION_ADDR)] = sizeof(std::uint32_t); + mem_map_sizes[static_cast(HalL1MemAddrType::BANK_TO_NOC_SCRATCH)] = + eth_l1_mem::address_map::ERISC_MEM_BANK_TO_NOC_SIZE; std::vector> processor_classes(NumEthDispatchClasses - 1); std::vector processor_types(1); diff --git a/tt_metal/llrt/blackhole/bh_hal_idle_eth.cpp b/tt_metal/llrt/blackhole/bh_hal_idle_eth.cpp index f7f91ed7f44..72ba9e91a22 100644 --- a/tt_metal/llrt/blackhole/bh_hal_idle_eth.cpp +++ b/tt_metal/llrt/blackhole/bh_hal_idle_eth.cpp @@ -49,6 +49,7 @@ HalCoreInfoType create_idle_eth_mem_map() { mem_map_bases[static_cast(HalL1MemAddrType::GO_MSG)] = GET_IERISC_MAILBOX_ADDRESS_HOST(go_message); mem_map_bases[static_cast(HalL1MemAddrType::LAUNCH_MSG_BUFFER_RD_PTR)] = GET_IERISC_MAILBOX_ADDRESS_HOST(launch_msg_rd_ptr); + mem_map_bases[static_cast(HalL1MemAddrType::BANK_TO_NOC_SCRATCH)] = MEM_IERISC_BANK_TO_NOC_SCRATCH; std::vector mem_map_sizes; mem_map_sizes.resize(static_cast(HalL1MemAddrType::COUNT)); @@ -66,6 +67,7 @@ HalCoreInfoType create_idle_eth_mem_map() { ; mem_map_sizes[static_cast(HalL1MemAddrType::GO_MSG)] = sizeof(go_msg_t); mem_map_sizes[static_cast(HalL1MemAddrType::LAUNCH_MSG_BUFFER_RD_PTR)] = sizeof(std::uint32_t); + mem_map_sizes[static_cast(HalL1MemAddrType::BANK_TO_NOC_SCRATCH)] = MEM_IERISC_BANK_TO_NOC_SIZE; std::vector> processor_classes(NumEthDispatchClasses); std::vector processor_types(1); diff --git a/tt_metal/llrt/blackhole/bh_hal_tensix.cpp b/tt_metal/llrt/blackhole/bh_hal_tensix.cpp index d0414dcfbc0..eb17f10bf11 100644 --- a/tt_metal/llrt/blackhole/bh_hal_tensix.cpp +++ b/tt_metal/llrt/blackhole/bh_hal_tensix.cpp @@ -46,6 +46,7 @@ HalCoreInfoType create_tensix_mem_map() { mem_map_bases[static_cast(HalL1MemAddrType::LAUNCH_MSG_BUFFER_RD_PTR)] = GET_MAILBOX_ADDRESS_HOST(launch_msg_rd_ptr); mem_map_bases[static_cast(HalL1MemAddrType::LOCAL)] = MEM_LOCAL_BASE; + mem_map_bases[static_cast(HalL1MemAddrType::BANK_TO_NOC_SCRATCH)] = MEM_BANK_TO_NOC_SCRATCH; std::vector mem_map_sizes; mem_map_sizes.resize(static_cast(HalL1MemAddrType::COUNT)); @@ -62,6 +63,7 @@ HalCoreInfoType create_tensix_mem_map() { mem_map_sizes[static_cast(HalL1MemAddrType::GO_MSG)] = sizeof(go_msg_t); mem_map_sizes[static_cast(HalL1MemAddrType::LAUNCH_MSG_BUFFER_RD_PTR)] = sizeof(uint32_t); mem_map_sizes[static_cast(HalL1MemAddrType::LOCAL)] = MEM_TRISC_LOCAL_SIZE; // TRISC, BRISC, or NCRISC? + mem_map_sizes[static_cast(HalL1MemAddrType::BANK_TO_NOC_SCRATCH)] = MEM_BANK_TO_NOC_SIZE; std::vector> processor_classes(NumTensixDispatchClasses); std::vector processor_types; diff --git a/tt_metal/llrt/grayskull/gs_hal.cpp b/tt_metal/llrt/grayskull/gs_hal.cpp index 5477beeec65..71a889179b8 100644 --- a/tt_metal/llrt/grayskull/gs_hal.cpp +++ b/tt_metal/llrt/grayskull/gs_hal.cpp @@ -61,6 +61,7 @@ void Hal::initialize_gs() { mem_map_bases[static_cast(HalL1MemAddrType::LAUNCH_MSG_BUFFER_RD_PTR)] = GET_MAILBOX_ADDRESS_HOST(launch_msg_rd_ptr); mem_map_bases[static_cast(HalL1MemAddrType::LOCAL)] = MEM_LOCAL_BASE; + mem_map_bases[static_cast(HalL1MemAddrType::BANK_TO_NOC_SCRATCH)] = MEM_BANK_TO_NOC_SCRATCH; std::vector mem_map_sizes; mem_map_sizes.resize(static_cast(HalL1MemAddrType::COUNT)); @@ -77,6 +78,7 @@ void Hal::initialize_gs() { mem_map_sizes[static_cast(HalL1MemAddrType::GO_MSG)] = sizeof(go_msg_t); mem_map_sizes[static_cast(HalL1MemAddrType::LAUNCH_MSG_BUFFER_RD_PTR)] = sizeof(uint32_t); mem_map_sizes[static_cast(HalL1MemAddrType::LOCAL)] = MEM_TRISC_LOCAL_SIZE; // TRISC, BRISC, or NCRISC? + mem_map_sizes[static_cast(HalL1MemAddrType::BANK_TO_NOC_SCRATCH)] = MEM_BANK_TO_NOC_SIZE; std::vector> processor_classes(NumTensixDispatchClasses); std::vector processor_types; diff --git a/tt_metal/llrt/hal.hpp b/tt_metal/llrt/hal.hpp index f7da19e2f97..80e88002696 100644 --- a/tt_metal/llrt/hal.hpp +++ b/tt_metal/llrt/hal.hpp @@ -51,6 +51,7 @@ enum class HalL1MemAddrType : uint8_t { LAUNCH_MSG_BUFFER_RD_PTR, FW_VERSION_ADDR, // Really only applicable to active eth core right now LOCAL, + BANK_TO_NOC_SCRATCH, COUNT // Keep this last so it always indicates number of enum options }; diff --git a/tt_metal/llrt/wormhole/wh_hal_active_eth.cpp b/tt_metal/llrt/wormhole/wh_hal_active_eth.cpp index 0d1241020c5..c0af4cc0bd7 100644 --- a/tt_metal/llrt/wormhole/wh_hal_active_eth.cpp +++ b/tt_metal/llrt/wormhole/wh_hal_active_eth.cpp @@ -43,6 +43,8 @@ HalCoreInfoType create_active_eth_mem_map() { GET_ETH_MAILBOX_ADDRESS_HOST(launch_msg_rd_ptr); mem_map_bases[static_cast(HalL1MemAddrType::FW_VERSION_ADDR)] = eth_l1_mem::address_map::FW_VERSION_ADDR; + mem_map_bases[static_cast(HalL1MemAddrType::BANK_TO_NOC_SCRATCH)] = + eth_l1_mem::address_map::ERISC_MEM_BANK_TO_NOC_SCRATCH; std::vector mem_map_sizes; mem_map_sizes.resize(static_cast(HalL1MemAddrType::COUNT)); @@ -62,6 +64,7 @@ HalCoreInfoType create_active_eth_mem_map() { mem_map_sizes[static_cast(HalL1MemAddrType::GO_MSG)] = sizeof(go_msg_t); mem_map_sizes[static_cast(HalL1MemAddrType::LAUNCH_MSG_BUFFER_RD_PTR)] = sizeof(uint32_t); mem_map_sizes[static_cast(HalL1MemAddrType::FW_VERSION_ADDR)] = sizeof(std::uint32_t); + mem_map_sizes[static_cast(HalL1MemAddrType::BANK_TO_NOC_SCRATCH)] = eth_l1_mem::address_map::ERISC_MEM_BANK_TO_NOC_SIZE; std::vector> processor_classes(NumEthDispatchClasses); std::vector processor_types(1); diff --git a/tt_metal/llrt/wormhole/wh_hal_idle_eth.cpp b/tt_metal/llrt/wormhole/wh_hal_idle_eth.cpp index a2ce00faf43..6a5b617a3d2 100644 --- a/tt_metal/llrt/wormhole/wh_hal_idle_eth.cpp +++ b/tt_metal/llrt/wormhole/wh_hal_idle_eth.cpp @@ -49,6 +49,7 @@ HalCoreInfoType create_idle_eth_mem_map() { mem_map_bases[static_cast(HalL1MemAddrType::GO_MSG)] = GET_IERISC_MAILBOX_ADDRESS_HOST(go_message); mem_map_bases[static_cast(HalL1MemAddrType::LAUNCH_MSG_BUFFER_RD_PTR)] = GET_IERISC_MAILBOX_ADDRESS_HOST(launch_msg_rd_ptr); + mem_map_bases[static_cast(HalL1MemAddrType::BANK_TO_NOC_SCRATCH)] = MEM_IERISC_BANK_TO_NOC_SCRATCH; std::vector mem_map_sizes; mem_map_sizes.resize(static_cast(HalL1MemAddrType::COUNT)); @@ -66,6 +67,7 @@ HalCoreInfoType create_idle_eth_mem_map() { ; mem_map_sizes[static_cast(HalL1MemAddrType::GO_MSG)] = sizeof(go_msg_t); mem_map_sizes[static_cast(HalL1MemAddrType::LAUNCH_MSG_BUFFER_RD_PTR)] = sizeof(std::uint32_t); + mem_map_sizes[static_cast(HalL1MemAddrType::BANK_TO_NOC_SCRATCH)] = MEM_IERISC_BANK_TO_NOC_SIZE; std::vector> processor_classes(NumEthDispatchClasses); std::vector processor_types(1); diff --git a/tt_metal/llrt/wormhole/wh_hal_tensix.cpp b/tt_metal/llrt/wormhole/wh_hal_tensix.cpp index 7de8185bacb..e4d6c42981e 100644 --- a/tt_metal/llrt/wormhole/wh_hal_tensix.cpp +++ b/tt_metal/llrt/wormhole/wh_hal_tensix.cpp @@ -47,6 +47,7 @@ HalCoreInfoType create_tensix_mem_map() { mem_map_bases[static_cast(HalL1MemAddrType::LAUNCH_MSG_BUFFER_RD_PTR)] = GET_MAILBOX_ADDRESS_HOST(launch_msg_rd_ptr); mem_map_bases[static_cast(HalL1MemAddrType::LOCAL)] = MEM_LOCAL_BASE; + mem_map_bases[static_cast(HalL1MemAddrType::BANK_TO_NOC_SCRATCH)] = MEM_BANK_TO_NOC_SCRATCH; std::vector mem_map_sizes; mem_map_sizes.resize(static_cast(HalL1MemAddrType::COUNT)); @@ -63,6 +64,7 @@ HalCoreInfoType create_tensix_mem_map() { mem_map_sizes[static_cast(HalL1MemAddrType::GO_MSG)] = sizeof(go_msg_t); mem_map_sizes[static_cast(HalL1MemAddrType::LAUNCH_MSG_BUFFER_RD_PTR)] = sizeof(std::uint32_t); mem_map_sizes[static_cast(HalL1MemAddrType::LOCAL)] = MEM_TRISC_LOCAL_SIZE; // TRISC, BRISC, or NCRISC? + mem_map_sizes[static_cast(HalL1MemAddrType::BANK_TO_NOC_SCRATCH)] = MEM_BANK_TO_NOC_SIZE; std::vector> processor_classes(NumTensixDispatchClasses); std::vector processor_types;