diff --git a/tt_metal/hostdevcommon/common_runtime_address_map.h b/tt_metal/hostdevcommon/common_runtime_address_map.h index 3625a069934..7de1b07fbe6 100644 --- a/tt_metal/hostdevcommon/common_runtime_address_map.h +++ b/tt_metal/hostdevcommon/common_runtime_address_map.h @@ -8,6 +8,7 @@ #include "common_values.hpp" #include "dev_mem_map.h" #include "noc/noc_parameters.h" +#include "hostdevcommon/profiler_common.h" /* * This file contains addresses that are visible to both host and device compiled code. @@ -20,46 +21,23 @@ constexpr static std::uint32_t DRAM_ALIGNMENT = NOC_DRAM_READ_ALIGNMENT_BYTES >= constexpr static std::uint32_t DRAM_BARRIER_SIZE = ((sizeof(uint32_t) + DRAM_ALIGNMENT - 1) / DRAM_ALIGNMENT) * DRAM_ALIGNMENT; constexpr static std::uint32_t DRAM_UNRESERVED_BASE = DRAM_BARRIER_BASE + DRAM_BARRIER_SIZE; // Start of unreserved space -constexpr static std::uint32_t L1_ALIGNMENT = NOC_L1_READ_ALIGNMENT_BYTES >= NOC_L1_WRITE_ALIGNMENT_BYTES ? NOC_L1_READ_ALIGNMENT_BYTES : NOC_L1_WRITE_ALIGNMENT_BYTES; - // Take max alignment to satisfy NoC rd/wr constraints // Tensix/Eth -> PCIe/DRAM src and dst addrs must be L1_ALIGNMENT aligned // PCIe/DRAM -> Tensix/Eth src and dst addrs must be DRAM_ALIGNMENT aligned // Tensix/Eth <-> Tensix/Eth src and dst addrs must be L1_ALIGNMENT aligned constexpr static std::uint32_t ALLOCATOR_ALIGNMENT = DRAM_ALIGNMENT >= L1_ALIGNMENT ? DRAM_ALIGNMENT : L1_ALIGNMENT; -// TODO: these could be moved to even lower addresses -- 5 RISC-V hexes combined don't need 100 KB -constexpr static std::uint32_t PROFILER_L1_MARKER_UINT32_SIZE = 2; -constexpr static std::uint32_t PROFILER_L1_MARKER_BYTES_SIZE = PROFILER_L1_MARKER_UINT32_SIZE * sizeof(uint32_t); - -constexpr static std::uint32_t PROFILER_L1_PROGRAM_ID_COUNT = 2; -constexpr static std::uint32_t PROFILER_L1_GUARANTEED_MARKER_COUNT = 4; - -constexpr static std::uint32_t PROFILER_L1_OPTIONAL_MARKER_COUNT = 250; -constexpr static std::uint32_t PROFILER_L1_OP_MIN_OPTIONAL_MARKER_COUNT = 2; - -constexpr static std::uint32_t PROFILER_L1_VECTOR_SIZE = (PROFILER_L1_OPTIONAL_MARKER_COUNT + PROFILER_L1_GUARANTEED_MARKER_COUNT + PROFILER_L1_PROGRAM_ID_COUNT) * PROFILER_L1_MARKER_UINT32_SIZE; -constexpr static std::uint32_t PROFILER_L1_BUFFER_SIZE = PROFILER_L1_VECTOR_SIZE * sizeof(uint32_t); - -constexpr static std::uint32_t PROFILER_L1_BUFFER_BR = MEM_MAP_END; -constexpr static std::uint32_t PROFILER_L1_BUFFER_NC = PROFILER_L1_BUFFER_BR + PROFILER_L1_BUFFER_SIZE; -constexpr static std::uint32_t PROFILER_L1_BUFFER_T0 = PROFILER_L1_BUFFER_NC + PROFILER_L1_BUFFER_SIZE; -constexpr static std::uint32_t PROFILER_L1_BUFFER_T1 = PROFILER_L1_BUFFER_T0 + PROFILER_L1_BUFFER_SIZE; -constexpr static std::uint32_t PROFILER_L1_BUFFER_T2 = PROFILER_L1_BUFFER_T1 + PROFILER_L1_BUFFER_SIZE; - -constexpr static std::uint32_t PROFILER_L1_END_ADDRESS = PROFILER_L1_BUFFER_T2 + PROFILER_L1_BUFFER_SIZE; - +// TODO: move these out of the memory map into profiler code constexpr static std::uint32_t PROFILER_OP_SUPPORT_COUNT = 1000; -constexpr static std::uint32_t PROFILER_FULL_HOST_VECTOR_SIZE_PER_RISC = PROFILER_L1_MARKER_UINT32_SIZE * (PROFILER_L1_PROGRAM_ID_COUNT + PROFILER_L1_GUARANTEED_MARKER_COUNT + PROFILER_L1_OP_MIN_OPTIONAL_MARKER_COUNT) * PROFILER_OP_SUPPORT_COUNT; +constexpr static std::uint32_t PROFILER_FULL_HOST_VECTOR_SIZE_PER_RISC = kernel_profiler::PROFILER_L1_MARKER_UINT32_SIZE * (kernel_profiler::PROFILER_L1_PROGRAM_ID_COUNT + kernel_profiler::PROFILER_L1_GUARANTEED_MARKER_COUNT + kernel_profiler::PROFILER_L1_OP_MIN_OPTIONAL_MARKER_COUNT) * PROFILER_OP_SUPPORT_COUNT; constexpr static std::uint32_t PROFILER_FULL_HOST_BUFFER_SIZE_PER_RISC = PROFILER_FULL_HOST_VECTOR_SIZE_PER_RISC * sizeof(uint32_t); -constexpr static std::uint32_t PROFILER_RISC_COUNT = 5; -static_assert (PROFILER_FULL_HOST_BUFFER_SIZE_PER_RISC > PROFILER_L1_BUFFER_SIZE); +static_assert (PROFILER_FULL_HOST_BUFFER_SIZE_PER_RISC > kernel_profiler::PROFILER_L1_BUFFER_SIZE); // Kernel config buffer is WIP // Size is presently based on the old sizes of the RTAs + CB config + Sems // plus some extra space freed up in the mem map -constexpr static std::uint32_t L1_KERNEL_CONFIG_BASE = PROFILER_L1_END_ADDRESS; +constexpr static std::uint32_t L1_KERNEL_CONFIG_BASE = MEM_MAP_END; constexpr static std::uint32_t L1_KERNEL_CONFIG_SIZE = 4 * 1024 + 256 + 128 + 512; constexpr static std::uint32_t IDLE_ERISC_L1_KERNEL_CONFIG_BASE = 32 * 1024; @@ -67,11 +45,7 @@ constexpr static std::uint32_t IDLE_ERISC_L1_KERNEL_CONFIG_BASE = 32 * 1024; constexpr static std::uint32_t NUM_CIRCULAR_BUFFERS = 32; constexpr static std::uint32_t UINT32_WORDS_PER_CIRCULAR_BUFFER_CONFIG = 4; -constexpr static std::uint32_t PROFILER_L1_CONTROL_VECTOR_SIZE = 32; -constexpr static std::uint32_t PROFILER_L1_CONTROL_BUFFER_SIZE = PROFILER_L1_CONTROL_VECTOR_SIZE * sizeof(uint32_t); -constexpr static std::uint32_t PROFILER_L1_BUFFER_CONTROL = L1_KERNEL_CONFIG_BASE + L1_KERNEL_CONFIG_SIZE; - -constexpr static std::uint32_t L1_UNRESERVED_BASE = ((PROFILER_L1_BUFFER_CONTROL + PROFILER_L1_CONTROL_BUFFER_SIZE - 1) | (DRAM_ALIGNMENT - 1)) + 1; +constexpr static std::uint32_t L1_UNRESERVED_BASE = ((L1_KERNEL_CONFIG_BASE + L1_KERNEL_CONFIG_SIZE - 1) | (DRAM_ALIGNMENT - 1)) + 1; constexpr static std::uint32_t ERISC_L1_UNRESERVED_BASE = L1_UNRESERVED_BASE; // Start of unreserved space diff --git a/tt_metal/hostdevcommon/profiler_common.h b/tt_metal/hostdevcommon/profiler_common.h index ce4a7325f18..2cc3cf6dd2b 100644 --- a/tt_metal/hostdevcommon/profiler_common.h +++ b/tt_metal/hostdevcommon/profiler_common.h @@ -24,18 +24,16 @@ namespace kernel_profiler{ enum ControlBuffer { - HOST_BUFFER_END_INDEX_BR, + HOST_BUFFER_END_INDEX_BR_ER, HOST_BUFFER_END_INDEX_NC, HOST_BUFFER_END_INDEX_T0, HOST_BUFFER_END_INDEX_T1, HOST_BUFFER_END_INDEX_T2, - HOST_BUFFER_END_INDEX_ER, - DEVICE_BUFFER_END_INDEX_BR, + DEVICE_BUFFER_END_INDEX_BR_ER, DEVICE_BUFFER_END_INDEX_NC, DEVICE_BUFFER_END_INDEX_T0, DEVICE_BUFFER_END_INDEX_T1, DEVICE_BUFFER_END_INDEX_T2, - DEVICE_BUFFER_END_INDEX_ER, FW_RESET_H, FW_RESET_L, DRAM_PROFILER_ADDRESS, @@ -47,6 +45,15 @@ namespace kernel_profiler{ PROFILER_DONE, }; - + // TODO: use data types in profile_msg_t rather than addresses/sizes + constexpr static std::uint32_t PROFILER_L1_CONTROL_VECTOR_SIZE = 32; + constexpr static std::uint32_t PROFILER_L1_CONTROL_BUFFER_SIZE = PROFILER_L1_CONTROL_VECTOR_SIZE * sizeof(uint32_t); + constexpr static std::uint32_t PROFILER_L1_MARKER_UINT32_SIZE = 2; + constexpr static std::uint32_t PROFILER_L1_PROGRAM_ID_COUNT = 2; + constexpr static std::uint32_t PROFILER_L1_GUARANTEED_MARKER_COUNT = 4; + constexpr static std::uint32_t PROFILER_L1_OPTIONAL_MARKER_COUNT = 250; + constexpr static std::uint32_t PROFILER_L1_OP_MIN_OPTIONAL_MARKER_COUNT = 2; + constexpr static std::uint32_t PROFILER_L1_VECTOR_SIZE = (PROFILER_L1_OPTIONAL_MARKER_COUNT + PROFILER_L1_GUARANTEED_MARKER_COUNT + PROFILER_L1_PROGRAM_ID_COUNT) * PROFILER_L1_MARKER_UINT32_SIZE; + constexpr static std::uint32_t PROFILER_L1_BUFFER_SIZE = PROFILER_L1_VECTOR_SIZE * sizeof(uint32_t); } diff --git a/tt_metal/hw/inc/blackhole/dev_mem_map.h b/tt_metal/hw/inc/blackhole/dev_mem_map.h index 6e02af6069d..828f2c33c75 100644 --- a/tt_metal/hw/inc/blackhole/dev_mem_map.h +++ b/tt_metal/hw/inc/blackhole/dev_mem_map.h @@ -53,7 +53,8 @@ #define MEM_L1_BARRIER 12 #define MEM_MAILBOX_BASE 16 // Magic size must be big enough to hold dev_msgs_t. static_asserts will fire if this is too small -#define MEM_MAILBOX_END (MEM_MAILBOX_BASE + 1356) +#define MEM_MAILBOX_SIZE 5 * 4 * 512 + 4 * 32 + 1364 +#define MEM_MAILBOX_END (MEM_MAILBOX_BASE + MEM_MAILBOX_SIZE) #define MEM_IERISC_MAILBOX_BASE 1024 #define MEM_IERISC_MAILBOX_END (MEM_IERISC_MAILBOX_BASE + 128) #define MEM_ZEROS_BASE ((MEM_MAILBOX_END + 31) & ~31) diff --git a/tt_metal/hw/inc/blackhole/eth_l1_address_map.h b/tt_metal/hw/inc/blackhole/eth_l1_address_map.h index 7990e70cf8c..f2d7ac382d6 100644 --- a/tt_metal/hw/inc/blackhole/eth_l1_address_map.h +++ b/tt_metal/hw/inc/blackhole/eth_l1_address_map.h @@ -60,16 +60,9 @@ struct address_map { // erisc early exit functionality re-uses mailboxes_t::ncrisc_halt_msg_t::stack_save memory static constexpr std::int32_t ERISC_MEM_MAILBOX_STACK_SAVE = ERISC_MEM_MAILBOX_BASE + 4; - // Kernel config buffer is WIP - // Size is presently based on the old sizes of the RTAs + CB config + Sems - static constexpr std::uint32_t PROFILER_L1_BUFFER_ER = ERISC_MEM_MAILBOX_BASE + 288 + 256 + 16; - static constexpr std::uint32_t PROFILER_L1_BUFFER_CONTROL = PROFILER_L1_BUFFER_ER + PROFILER_L1_BUFFER_SIZE; - - static constexpr std::int32_t ERISC_L1_KERNEL_CONFIG_BASE = PROFILER_L1_BUFFER_CONTROL + PROFILER_L1_CONTROL_BUFFER_SIZE; - - static_assert((PROFILER_L1_BUFFER_ER % 32) == 0); - static_assert((PROFILER_L1_BUFFER_CONTROL % 32) == 0); + static constexpr std::uint32_t ERISC_MEM_MAILBOX_END = ERISC_MEM_MAILBOX_BASE + 288 + 256 + 16 + (32 + 512) * 4; + static constexpr std::int32_t ERISC_L1_KERNEL_CONFIG_BASE = ERISC_MEM_MAILBOX_END; static constexpr std::int32_t ERISC_L1_UNRESERVED_BASE = ERISC_L1_KERNEL_CONFIG_BASE + ERISC_L1_KERNEL_CONFIG_SIZE; static constexpr std::int32_t ERISC_L1_UNRESERVED_SIZE = MAX_L1_LOADING_SIZE - ERISC_L1_UNRESERVED_BASE; diff --git a/tt_metal/hw/inc/blackhole/noc/noc_parameters.h b/tt_metal/hw/inc/blackhole/noc/noc_parameters.h index 8959919939f..fe3536744a2 100644 --- a/tt_metal/hw/inc/blackhole/noc/noc_parameters.h +++ b/tt_metal/hw/inc/blackhole/noc/noc_parameters.h @@ -43,4 +43,6 @@ #define NOC_DRAM_READ_ALIGNMENT_BYTES 64 #define NOC_DRAM_WRITE_ALIGNMENT_BYTES 16 +#define L1_ALIGNMENT (static_cast(NOC_L1_READ_ALIGNMENT_BYTES >= NOC_L1_WRITE_ALIGNMENT_BYTES ? NOC_L1_READ_ALIGNMENT_BYTES : NOC_L1_WRITE_ALIGNMENT_BYTES)) + #endif diff --git a/tt_metal/hw/inc/dev_msgs.h b/tt_metal/hw/inc/dev_msgs.h index 371680eb197..72a54f59a98 100644 --- a/tt_metal/hw/inc/dev_msgs.h +++ b/tt_metal/hw/inc/dev_msgs.h @@ -13,7 +13,7 @@ #include "core_config.h" #include "noc/noc_parameters.h" #include "dev_mem_map.h" -#include "eth_l1_address_map.h" +#include "hostdevcommon/profiler_common.h" // TODO: move these to processor specific files #if defined(COMPILE_FOR_ERISC) @@ -195,6 +195,12 @@ struct debug_ring_buf_msg_t { uint32_t data[DEBUG_RING_BUFFER_ELEMENTS]; }; +struct debug_stack_usage_t { + volatile uint16_t max_usage[DebugNumUniqueRiscs]; + volatile uint16_t watcher_kernel_id[DebugNumUniqueRiscs]; + volatile uint16_t pad[16 - DebugNumUniqueRiscs * 2]; +}; + constexpr static std::uint32_t DPRINT_BUFFER_SIZE = 204; // per thread // TODO: when device specific headers specify number of processors // (and hal abstracts them on host), get these from there @@ -204,25 +210,14 @@ constexpr static std::uint32_t DPRINT_BUFFERS_COUNT = 1; constexpr static std::uint32_t DPRINT_BUFFERS_COUNT = 5; #endif -// TODO: w/ the hal, this can come from core specific defines -constexpr static std::uint32_t MAX_RISCV_PER_CORE = 5; - -struct dprint_buf_msg_t { - uint8_t data[DPRINT_BUFFERS_COUNT][DPRINT_BUFFER_SIZE]; - uint32_t pad; // to 1024 bytes -}; - -struct debug_stack_usage_t { - volatile uint16_t max_usage[DebugNumUniqueRiscs]; - volatile uint16_t watcher_kernel_id[DebugNumUniqueRiscs]; - volatile uint16_t pad[16 - DebugNumUniqueRiscs * 2]; -}; - enum watcher_enable_msg_t { WatcherDisabled = 2, WatcherEnabled = 3, }; +// TODO: w/ the hal, this can come from core specific defines +constexpr static std::uint32_t MAX_RISCV_PER_CORE = 5; + struct watcher_msg_t { volatile uint32_t enable; struct debug_waypoint_msg_t debug_waypoint[MAX_RISCV_PER_CORE]; @@ -234,13 +229,40 @@ struct watcher_msg_t { struct debug_ring_buf_msg_t debug_ring_buf; }; +struct dprint_buf_msg_t { + uint8_t data[DPRINT_BUFFERS_COUNT][DPRINT_BUFFER_SIZE]; + uint32_t pad; // to 1024 bytes +}; + + +// NOC aligment max from BH +static constexpr uint32_t TT_ARCH_MAX_NOC_WRITE_ALIGNMENT = 16; + +// TODO: when device specific headers specify number of processors +// (and hal abstracts them on host), get these from there (same as above for dprint) +#if defined(COMPILE_FOR_ERISC) || defined (COMPILE_FOR_IDLE_ERISC) +static constexpr uint32_t PROFILER_RISC_COUNT = 1; +#else +static constexpr uint32_t PROFILER_RISC_COUNT = 5; +#endif + +static constexpr uint32_t LAUNCH_NOC_ALIGMENT_PAD_COUNT = 1; +static constexpr uint32_t PROFILER_NOC_ALIGMENT_PAD_COUNT = 2; + +struct profiler_msg_t { + uint32_t control_vector[kernel_profiler::PROFILER_L1_CONTROL_VECTOR_SIZE]; + uint32_t buffer[PROFILER_RISC_COUNT][kernel_profiler::PROFILER_L1_VECTOR_SIZE]; +}; + struct mailboxes_t { struct ncrisc_halt_msg_t ncrisc_halt; struct slave_sync_msg_t slave_sync; - uint32_t pad; + uint32_t pads_1[LAUNCH_NOC_ALIGMENT_PAD_COUNT]; struct launch_msg_t launch; struct watcher_msg_t watcher; struct dprint_buf_msg_t dprint_buf; + uint32_t pads_2[PROFILER_NOC_ALIGMENT_PAD_COUNT]; + struct profiler_msg_t profiler; }; // Watcher struct needs to be 32b-divisible, since we need to write it from host using write_hex_vec_to_core(). @@ -249,15 +271,26 @@ static_assert(sizeof(kernel_config_msg_t) % sizeof(uint32_t) == 0); #ifndef TENSIX_FIRMWARE // Validate assumptions on mailbox layout on host compile -static_assert((MEM_MAILBOX_BASE + offsetof(mailboxes_t, launch)) % 32 == 0); -static_assert((eth_l1_mem::address_map::ERISC_MEM_MAILBOX_BASE + offsetof(mailboxes_t, launch)) % 32 == 0); +// Constexpr definitions allow for printing of breaking values at compile time #ifdef NCRISC_HAS_IRAM // These are only used in ncrisc-halt.S static_assert(MEM_MAILBOX_BASE + offsetof(mailboxes_t, slave_sync.ncrisc) == MEM_SLAVE_RUN_MAILBOX_ADDRESS); static_assert( MEM_MAILBOX_BASE + offsetof(mailboxes_t, ncrisc_halt.stack_save) == MEM_NCRISC_HALT_STACK_MAILBOX_ADDRESS); #endif +#if defined(COMPILE_FOR_ERISC) || defined (COMPILE_FOR_IDLE_ERISC) +static_assert( eth_l1_mem::address_map::ERISC_MEM_MAILBOX_BASE + sizeof(mailboxes_t) < eth_l1_mem::address_map::ERISC_MEM_MAILBOX_END); +static constexpr uint32_t ETH_LAUNCH_CHECK = (eth_l1_mem::address_map::ERISC_MEM_MAILBOX_BASE + offsetof(mailboxes_t, launch)) % TT_ARCH_MAX_NOC_WRITE_ALIGNMENT; +static constexpr uint32_t ETH_PROFILER_CHECK = (eth_l1_mem::address_map::ERISC_MEM_MAILBOX_BASE + offsetof(mailboxes_t, profiler)) % TT_ARCH_MAX_NOC_WRITE_ALIGNMENT; +static_assert( ETH_LAUNCH_CHECK == 0); +static_assert( ETH_PROFILER_CHECK == 0); +#else static_assert(MEM_MAILBOX_BASE + sizeof(mailboxes_t) < MEM_MAILBOX_END); +static constexpr uint32_t TENSIX_LAUNCH_CHECK = (MEM_MAILBOX_BASE + offsetof(mailboxes_t, launch)) % TT_ARCH_MAX_NOC_WRITE_ALIGNMENT; +static constexpr uint32_t TENSIX_PROFILER_CHECK = (MEM_MAILBOX_BASE + offsetof(mailboxes_t, profiler)) % TT_ARCH_MAX_NOC_WRITE_ALIGNMENT; +static_assert( TENSIX_LAUNCH_CHECK == 0); +static_assert( TENSIX_PROFILER_CHECK == 0); +#endif #endif struct eth_word_t { diff --git a/tt_metal/hw/inc/grayskull/dev_mem_map.h b/tt_metal/hw/inc/grayskull/dev_mem_map.h index 32ef00986e0..c5956136366 100644 --- a/tt_metal/hw/inc/grayskull/dev_mem_map.h +++ b/tt_metal/hw/inc/grayskull/dev_mem_map.h @@ -55,7 +55,8 @@ #define MEM_L1_BARRIER 12 #define MEM_MAILBOX_BASE 16 // Magic size must be big enough to hold dev_msgs_t. static_asserts will fire if this is too small -#define MEM_MAILBOX_END (MEM_MAILBOX_BASE + 1356) +#define MEM_MAILBOX_SIZE 5 * 2 * 1024 + 128 + 1364 +#define MEM_MAILBOX_END (MEM_MAILBOX_BASE + MEM_MAILBOX_SIZE) #define MEM_IERISC_MAILBOX_BASE 0 #define MEM_IERISC_MAILBOX_END 0 #define MEM_ZEROS_BASE ((MEM_MAILBOX_END + 31) & ~31) diff --git a/tt_metal/hw/inc/grayskull/eth_l1_address_map.h b/tt_metal/hw/inc/grayskull/eth_l1_address_map.h index 90011ca7aa1..fe2993815c6 100644 --- a/tt_metal/hw/inc/grayskull/eth_l1_address_map.h +++ b/tt_metal/hw/inc/grayskull/eth_l1_address_map.h @@ -39,7 +39,5 @@ struct address_map { static constexpr std::int32_t ERISC_L1_UNRESERVED_SIZE = 0; static constexpr std::int32_t ERISC_L1_TUNNEL_BUFFER_SIZE = 0; - static constexpr std::uint32_t PROFILER_L1_BUFFER_ER = 0; - static constexpr std::uint32_t PROFILER_L1_BUFFER_CONTROL = 0; }; } // namespace llk diff --git a/tt_metal/hw/inc/grayskull/noc/noc_parameters.h b/tt_metal/hw/inc/grayskull/noc/noc_parameters.h index 6d05cd49ce1..26acbdd3160 100644 --- a/tt_metal/hw/inc/grayskull/noc/noc_parameters.h +++ b/tt_metal/hw/inc/grayskull/noc/noc_parameters.h @@ -34,4 +34,6 @@ #define NOC_DRAM_READ_ALIGNMENT_BYTES 32 #define NOC_DRAM_WRITE_ALIGNMENT_BYTES 16 +#define L1_ALIGNMENT (static_cast(NOC_L1_READ_ALIGNMENT_BYTES >= NOC_L1_WRITE_ALIGNMENT_BYTES ? NOC_L1_READ_ALIGNMENT_BYTES : NOC_L1_WRITE_ALIGNMENT_BYTES)) + #endif diff --git a/tt_metal/hw/inc/wormhole/dev_mem_map.h b/tt_metal/hw/inc/wormhole/dev_mem_map.h index a4b8489cf00..89a40666601 100644 --- a/tt_metal/hw/inc/wormhole/dev_mem_map.h +++ b/tt_metal/hw/inc/wormhole/dev_mem_map.h @@ -57,7 +57,8 @@ #define MEM_L1_BARRIER 12 #define MEM_MAILBOX_BASE 16 // Magic size must be big enough to hold dev_msgs_t. static_asserts will fire if this is too small -#define MEM_MAILBOX_END (MEM_MAILBOX_BASE + 1356) +#define MEM_MAILBOX_SIZE 5 * 4 * 512 + 4 * 32 + 1364 +#define MEM_MAILBOX_END (MEM_MAILBOX_BASE + MEM_MAILBOX_SIZE) #define MEM_IERISC_MAILBOX_BASE 1024 #define MEM_IERISC_MAILBOX_END (MEM_IERISC_MAILBOX_BASE + 128) #define MEM_ZEROS_BASE ((MEM_MAILBOX_END + 31) & ~31) diff --git a/tt_metal/hw/inc/wormhole/eth_l1_address_map.h b/tt_metal/hw/inc/wormhole/eth_l1_address_map.h index 59469ab1c4e..cbd5b6f6f53 100644 --- a/tt_metal/hw/inc/wormhole/eth_l1_address_map.h +++ b/tt_metal/hw/inc/wormhole/eth_l1_address_map.h @@ -6,7 +6,7 @@ #include -#include "tt_metal/hostdevcommon/common_runtime_address_map.h" +#include "noc/noc_parameters.h" namespace eth_l1_mem { @@ -60,14 +60,9 @@ struct address_map { // erisc early exit functionality re-uses mailboxes_t::ncrisc_halt_msg_t::stack_save memory static constexpr std::int32_t ERISC_MEM_MAILBOX_STACK_SAVE = ERISC_MEM_MAILBOX_BASE + 4; - static constexpr std::uint32_t PROFILER_L1_BUFFER_ER = ERISC_MEM_MAILBOX_BASE + 288 + 256 + 16; - static constexpr std::uint32_t PROFILER_L1_BUFFER_CONTROL = PROFILER_L1_BUFFER_ER + PROFILER_L1_BUFFER_SIZE; - - static constexpr std::int32_t ERISC_L1_KERNEL_CONFIG_BASE = PROFILER_L1_BUFFER_CONTROL + PROFILER_L1_CONTROL_BUFFER_SIZE; - - static_assert((PROFILER_L1_BUFFER_ER % 32) == 0); - static_assert((PROFILER_L1_BUFFER_CONTROL % 32) == 0); + static constexpr std::uint32_t ERISC_MEM_MAILBOX_END = ERISC_MEM_MAILBOX_BASE + 288 + 256 + 16 + (512 + 32) * 4; + static constexpr std::int32_t ERISC_L1_KERNEL_CONFIG_BASE = ERISC_MEM_MAILBOX_END; static constexpr std::int32_t ERISC_L1_UNRESERVED_BASE = ERISC_L1_KERNEL_CONFIG_BASE + ERISC_L1_KERNEL_CONFIG_SIZE; static constexpr std::int32_t ERISC_L1_UNRESERVED_SIZE = MAX_L1_LOADING_SIZE - ERISC_L1_UNRESERVED_BASE; diff --git a/tt_metal/hw/inc/wormhole/noc/noc_parameters.h b/tt_metal/hw/inc/wormhole/noc/noc_parameters.h index ee0c0b861d1..936193533d2 100644 --- a/tt_metal/hw/inc/wormhole/noc/noc_parameters.h +++ b/tt_metal/hw/inc/wormhole/noc/noc_parameters.h @@ -46,4 +46,6 @@ #define NOC_DRAM_READ_ALIGNMENT_BYTES 32 #define NOC_DRAM_WRITE_ALIGNMENT_BYTES 16 +#define L1_ALIGNMENT (static_cast(NOC_L1_READ_ALIGNMENT_BYTES >= NOC_L1_WRITE_ALIGNMENT_BYTES ? NOC_L1_READ_ALIGNMENT_BYTES : NOC_L1_WRITE_ALIGNMENT_BYTES)) + #endif diff --git a/tt_metal/impl/allocator/allocator_types.hpp b/tt_metal/impl/allocator/allocator_types.hpp index 596ff0f2b55..91134bd6e9f 100644 --- a/tt_metal/impl/allocator/allocator_types.hpp +++ b/tt_metal/impl/allocator/allocator_types.hpp @@ -57,7 +57,8 @@ enum class MemoryAllocator { L1_BANKING = 1, }; -constexpr static std::uint32_t STORAGE_ONLY_RESERVED_SIZE = ((MEM_MAILBOX_END + ALLOCATOR_ALIGNMENT - 1) / ALLOCATOR_ALIGNMENT) * ALLOCATOR_ALIGNMENT; +constexpr static std::uint32_t STORAGE_ONLY_RESERVED_SIZE = ((MEM_MAILBOX_BASE + ALLOCATOR_ALIGNMENT - 1) / ALLOCATOR_ALIGNMENT) * ALLOCATOR_ALIGNMENT; + // Storage only cores only need to reserve mailbox space to hold barriers constexpr static std::uint32_t STORAGE_ONLY_UNRESERVED_BASE = STORAGE_ONLY_RESERVED_SIZE; diff --git a/tt_metal/impl/dispatch/kernels/cq_common.hpp b/tt_metal/impl/dispatch/kernels/cq_common.hpp index f398ce45074..53f91b16a44 100644 --- a/tt_metal/impl/dispatch/kernels/cq_common.hpp +++ b/tt_metal/impl/dispatch/kernels/cq_common.hpp @@ -261,10 +261,15 @@ FORCE_INLINE void cb_wait_all_pages(uint32_t n) { volatile tt_l1_ptr uint32_t* sem_addr = reinterpret_cast(get_semaphore(sem_id)); + + // Downstream component sets the MSB as a terminate bit + // Mask that off to avoid a race between the sem count and terminate + n &= 0x7fffffff; + WAYPOINT("TAPW"); do { invalidate_l1_cache(); - } while ((*sem_addr) != n); + } while ((*sem_addr & 0x7fffffff) != n); // mask off terminate bit WAYPOINT("TAPD"); } diff --git a/tt_metal/llrt/blackhole/bh_hal_active_eth_mem_map.cpp b/tt_metal/llrt/blackhole/bh_hal_active_eth_mem_map.cpp index a168a04cc7d..2ee55149640 100644 --- a/tt_metal/llrt/blackhole/bh_hal_active_eth_mem_map.cpp +++ b/tt_metal/llrt/blackhole/bh_hal_active_eth_mem_map.cpp @@ -35,6 +35,7 @@ HalCoreInfoType create_active_eth_mem_map() { mem_map_bases[hv(HalMemAddrType::LAUNCH)] = GET_ETH_MAILBOX_ADDRESS_HOST(launch); mem_map_bases[hv(HalMemAddrType::WATCHER)] = GET_ETH_MAILBOX_ADDRESS_HOST(watcher); mem_map_bases[hv(HalMemAddrType::DPRINT)] = GET_ETH_MAILBOX_ADDRESS_HOST(dprint_buf); + mem_map_bases[hv(HalMemAddrType::PROFILER)] = GET_ETH_MAILBOX_ADDRESS_HOST(profiler); mem_map_bases[hv(HalMemAddrType::KERNEL_CONFIG)] = eth_l1_mem::address_map::ERISC_L1_KERNEL_CONFIG_BASE; mem_map_bases[hv(HalMemAddrType::UNRESERVED)] = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; @@ -44,6 +45,7 @@ HalCoreInfoType create_active_eth_mem_map() { mem_map_sizes[hv(HalMemAddrType::LAUNCH)] = sizeof(launch_msg_t); mem_map_sizes[hv(HalMemAddrType::WATCHER)] = sizeof(watcher_msg_t); mem_map_sizes[hv(HalMemAddrType::DPRINT)] = sizeof(dprint_buf_msg_t); + mem_map_sizes[hv(HalMemAddrType::PROFILER)] = sizeof(profiler_msg_t); mem_map_sizes[hv(HalMemAddrType::KERNEL_CONFIG)] = eth_l1_mem::address_map::ERISC_L1_KERNEL_CONFIG_SIZE; mem_map_sizes[hv(HalMemAddrType::UNRESERVED)] = eth_l1_mem::address_map::MAX_SIZE - eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; diff --git a/tt_metal/llrt/blackhole/bh_hal_idle_eth_mem_map.cpp b/tt_metal/llrt/blackhole/bh_hal_idle_eth_mem_map.cpp index ffd51aa0153..5a42cdb6e06 100644 --- a/tt_metal/llrt/blackhole/bh_hal_idle_eth_mem_map.cpp +++ b/tt_metal/llrt/blackhole/bh_hal_idle_eth_mem_map.cpp @@ -34,6 +34,7 @@ HalCoreInfoType create_idle_eth_mem_map() { mem_map_bases[hv(HalMemAddrType::LAUNCH)] = GET_IERISC_MAILBOX_ADDRESS_HOST(launch); mem_map_bases[hv(HalMemAddrType::WATCHER)] = GET_IERISC_MAILBOX_ADDRESS_HOST(watcher); mem_map_bases[hv(HalMemAddrType::DPRINT)] = GET_IERISC_MAILBOX_ADDRESS_HOST(dprint_buf); + mem_map_bases[hv(HalMemAddrType::PROFILER)] = GET_IERISC_MAILBOX_ADDRESS_HOST(profiler); mem_map_bases[hv(HalMemAddrType::KERNEL_CONFIG)] = IDLE_ERISC_L1_KERNEL_CONFIG_BASE; mem_map_bases[hv(HalMemAddrType::UNRESERVED)] = ERISC_L1_UNRESERVED_BASE; @@ -43,6 +44,7 @@ HalCoreInfoType create_idle_eth_mem_map() { mem_map_sizes[hv(HalMemAddrType::LAUNCH)] = sizeof(launch_msg_t); mem_map_sizes[hv(HalMemAddrType::WATCHER)] = sizeof(watcher_msg_t); mem_map_sizes[hv(HalMemAddrType::DPRINT)] = sizeof(dprint_buf_msg_t); + mem_map_sizes[hv(HalMemAddrType::PROFILER)] = sizeof(profiler_msg_t); mem_map_sizes[hv(HalMemAddrType::KERNEL_CONFIG)] = L1_KERNEL_CONFIG_SIZE; // TODO: this is wrong, need idle eth specific value mem_map_sizes[hv(HalMemAddrType::UNRESERVED)] = MEM_ETH_SIZE - ERISC_L1_UNRESERVED_BASE; diff --git a/tt_metal/llrt/blackhole/bh_hal_tensix_mem_map.cpp b/tt_metal/llrt/blackhole/bh_hal_tensix_mem_map.cpp index c96dcae10b7..e55d223985f 100644 --- a/tt_metal/llrt/blackhole/bh_hal_tensix_mem_map.cpp +++ b/tt_metal/llrt/blackhole/bh_hal_tensix_mem_map.cpp @@ -32,6 +32,7 @@ HalCoreInfoType create_tensix_mem_map() { mem_map_bases[hv(HalMemAddrType::LAUNCH)] = GET_MAILBOX_ADDRESS_HOST(launch); mem_map_bases[hv(HalMemAddrType::WATCHER)] = GET_MAILBOX_ADDRESS_HOST(watcher); mem_map_bases[hv(HalMemAddrType::DPRINT)] = GET_MAILBOX_ADDRESS_HOST(dprint_buf); + mem_map_bases[hv(HalMemAddrType::PROFILER)] = GET_MAILBOX_ADDRESS_HOST(profiler); mem_map_bases[hv(HalMemAddrType::KERNEL_CONFIG)] = L1_KERNEL_CONFIG_BASE; mem_map_bases[hv(HalMemAddrType::UNRESERVED)] = L1_UNRESERVED_BASE; @@ -41,6 +42,7 @@ HalCoreInfoType create_tensix_mem_map() { mem_map_sizes[hv(HalMemAddrType::LAUNCH)] = sizeof(launch_msg_t); mem_map_sizes[hv(HalMemAddrType::WATCHER)] = sizeof(watcher_msg_t); mem_map_sizes[hv(HalMemAddrType::DPRINT)] = sizeof(dprint_buf_msg_t); + mem_map_sizes[hv(HalMemAddrType::PROFILER)] = sizeof(profiler_msg_t); mem_map_sizes[hv(HalMemAddrType::KERNEL_CONFIG)] = L1_KERNEL_CONFIG_SIZE; mem_map_sizes[hv(HalMemAddrType::UNRESERVED)] = MEM_L1_SIZE - L1_UNRESERVED_BASE; diff --git a/tt_metal/llrt/grayskull/gs_hal.cpp b/tt_metal/llrt/grayskull/gs_hal.cpp index 853e54c1141..275d24ff895 100644 --- a/tt_metal/llrt/grayskull/gs_hal.cpp +++ b/tt_metal/llrt/grayskull/gs_hal.cpp @@ -37,6 +37,7 @@ void Hal::initialize_gs() { mem_map_bases[hv(HalMemAddrType::LAUNCH)] = GET_MAILBOX_ADDRESS_HOST(launch); mem_map_bases[hv(HalMemAddrType::WATCHER)] = GET_MAILBOX_ADDRESS_HOST(watcher); mem_map_bases[hv(HalMemAddrType::DPRINT)] = GET_MAILBOX_ADDRESS_HOST(dprint_buf); + mem_map_bases[hv(HalMemAddrType::PROFILER)] = GET_MAILBOX_ADDRESS_HOST(profiler); mem_map_bases[hv(HalMemAddrType::KERNEL_CONFIG)] = L1_KERNEL_CONFIG_BASE; mem_map_bases[hv(HalMemAddrType::UNRESERVED)] = L1_UNRESERVED_BASE; @@ -46,6 +47,7 @@ void Hal::initialize_gs() { mem_map_sizes[hv(HalMemAddrType::LAUNCH)] = sizeof(launch_msg_t); mem_map_sizes[hv(HalMemAddrType::WATCHER)] = sizeof(watcher_msg_t); mem_map_sizes[hv(HalMemAddrType::DPRINT)] = sizeof(dprint_buf_msg_t); + mem_map_sizes[hv(HalMemAddrType::PROFILER)] = sizeof(profiler_msg_t); mem_map_sizes[hv(HalMemAddrType::KERNEL_CONFIG)] = L1_KERNEL_CONFIG_SIZE; mem_map_sizes[hv(HalMemAddrType::UNRESERVED)] = MEM_L1_SIZE - L1_UNRESERVED_BASE; diff --git a/tt_metal/llrt/wormhole/wh_hal_active_eth_mem_map.cpp b/tt_metal/llrt/wormhole/wh_hal_active_eth_mem_map.cpp index 751f4e06e13..f500d069ae2 100644 --- a/tt_metal/llrt/wormhole/wh_hal_active_eth_mem_map.cpp +++ b/tt_metal/llrt/wormhole/wh_hal_active_eth_mem_map.cpp @@ -35,6 +35,7 @@ HalCoreInfoType create_active_eth_mem_map() { mem_map_bases[hv(HalMemAddrType::LAUNCH)] = GET_ETH_MAILBOX_ADDRESS_HOST(launch); mem_map_bases[hv(HalMemAddrType::WATCHER)] = GET_ETH_MAILBOX_ADDRESS_HOST(watcher); mem_map_bases[hv(HalMemAddrType::DPRINT)] = GET_ETH_MAILBOX_ADDRESS_HOST(dprint_buf); + mem_map_bases[hv(HalMemAddrType::PROFILER)] = GET_ETH_MAILBOX_ADDRESS_HOST(profiler); mem_map_bases[hv(HalMemAddrType::KERNEL_CONFIG)] = eth_l1_mem::address_map::ERISC_L1_KERNEL_CONFIG_BASE; mem_map_bases[hv(HalMemAddrType::UNRESERVED)] = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; @@ -44,6 +45,7 @@ HalCoreInfoType create_active_eth_mem_map() { mem_map_sizes[hv(HalMemAddrType::LAUNCH)] = sizeof(launch_msg_t); mem_map_sizes[hv(HalMemAddrType::WATCHER)] = sizeof(watcher_msg_t); mem_map_sizes[hv(HalMemAddrType::DPRINT)] = sizeof(dprint_buf_msg_t); + mem_map_sizes[hv(HalMemAddrType::PROFILER)] = sizeof(profiler_msg_t); mem_map_sizes[hv(HalMemAddrType::KERNEL_CONFIG)] = eth_l1_mem::address_map::ERISC_L1_KERNEL_CONFIG_SIZE; mem_map_sizes[hv(HalMemAddrType::UNRESERVED)] = eth_l1_mem::address_map::MAX_SIZE - eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; diff --git a/tt_metal/llrt/wormhole/wh_hal_idle_eth_mem_map.cpp b/tt_metal/llrt/wormhole/wh_hal_idle_eth_mem_map.cpp index 9e9c8afaa19..6b15ad70305 100644 --- a/tt_metal/llrt/wormhole/wh_hal_idle_eth_mem_map.cpp +++ b/tt_metal/llrt/wormhole/wh_hal_idle_eth_mem_map.cpp @@ -34,6 +34,7 @@ HalCoreInfoType create_idle_eth_mem_map() { mem_map_bases[hv(HalMemAddrType::LAUNCH)] = GET_IERISC_MAILBOX_ADDRESS_HOST(launch); mem_map_bases[hv(HalMemAddrType::WATCHER)] = GET_IERISC_MAILBOX_ADDRESS_HOST(watcher); mem_map_bases[hv(HalMemAddrType::DPRINT)] = GET_IERISC_MAILBOX_ADDRESS_HOST(dprint_buf); + mem_map_bases[hv(HalMemAddrType::PROFILER)] = GET_IERISC_MAILBOX_ADDRESS_HOST(profiler); mem_map_bases[hv(HalMemAddrType::KERNEL_CONFIG)] = IDLE_ERISC_L1_KERNEL_CONFIG_BASE; mem_map_bases[hv(HalMemAddrType::UNRESERVED)] = ERISC_L1_UNRESERVED_BASE; @@ -43,6 +44,7 @@ HalCoreInfoType create_idle_eth_mem_map() { mem_map_sizes[hv(HalMemAddrType::LAUNCH)] = sizeof(launch_msg_t); mem_map_sizes[hv(HalMemAddrType::WATCHER)] = sizeof(watcher_msg_t); mem_map_sizes[hv(HalMemAddrType::DPRINT)] = sizeof(dprint_buf_msg_t); + mem_map_sizes[hv(HalMemAddrType::PROFILER)] = sizeof(profiler_msg_t); mem_map_sizes[hv(HalMemAddrType::KERNEL_CONFIG)] = L1_KERNEL_CONFIG_SIZE; // TODO: this is wrong, need idle eth specific value mem_map_sizes[hv(HalMemAddrType::UNRESERVED)] = MEM_ETH_SIZE - ERISC_L1_UNRESERVED_BASE; diff --git a/tt_metal/llrt/wormhole/wh_hal_tensix_mem_map.cpp b/tt_metal/llrt/wormhole/wh_hal_tensix_mem_map.cpp index e104f6fce82..596688fd528 100644 --- a/tt_metal/llrt/wormhole/wh_hal_tensix_mem_map.cpp +++ b/tt_metal/llrt/wormhole/wh_hal_tensix_mem_map.cpp @@ -32,6 +32,7 @@ HalCoreInfoType create_tensix_mem_map() { mem_map_bases[hv(HalMemAddrType::LAUNCH)] = GET_MAILBOX_ADDRESS_HOST(launch); mem_map_bases[hv(HalMemAddrType::WATCHER)] = GET_MAILBOX_ADDRESS_HOST(watcher); mem_map_bases[hv(HalMemAddrType::DPRINT)] = GET_MAILBOX_ADDRESS_HOST(dprint_buf); + mem_map_bases[hv(HalMemAddrType::PROFILER)] = GET_MAILBOX_ADDRESS_HOST(profiler); mem_map_bases[hv(HalMemAddrType::KERNEL_CONFIG)] = L1_KERNEL_CONFIG_BASE; mem_map_bases[hv(HalMemAddrType::UNRESERVED)] = L1_UNRESERVED_BASE; @@ -41,6 +42,7 @@ HalCoreInfoType create_tensix_mem_map() { mem_map_sizes[hv(HalMemAddrType::LAUNCH)] = sizeof(launch_msg_t); mem_map_sizes[hv(HalMemAddrType::WATCHER)] = sizeof(watcher_msg_t); mem_map_sizes[hv(HalMemAddrType::DPRINT)] = sizeof(dprint_buf_msg_t); + mem_map_sizes[hv(HalMemAddrType::PROFILER)] = sizeof(profiler_msg_t); mem_map_sizes[hv(HalMemAddrType::KERNEL_CONFIG)] = L1_KERNEL_CONFIG_SIZE; mem_map_sizes[hv(HalMemAddrType::UNRESERVED)] = MEM_L1_SIZE - L1_UNRESERVED_BASE; diff --git a/tt_metal/programming_examples/profiler/test_full_buffer/test_full_buffer.cpp b/tt_metal/programming_examples/profiler/test_full_buffer/test_full_buffer.cpp index 997d84ea541..b3e7a629d6d 100644 --- a/tt_metal/programming_examples/profiler/test_full_buffer/test_full_buffer.cpp +++ b/tt_metal/programming_examples/profiler/test_full_buffer/test_full_buffer.cpp @@ -5,6 +5,7 @@ #include "tt_metal/host_api.hpp" #include "tt_metal/detail/tt_metal.hpp" #include "tt_metal/impl/device/device.hpp" +#include "tt_metal/hostdevcommon/profiler_common.h" using namespace tt; @@ -48,7 +49,7 @@ void RunFillUpAllBuffers(tt_metal::Device *device, int loop_count, bool fast_dis if (fast_dispatch) { - for (int i = 0; i < PROFILER_OP_SUPPORT_COUNT * PROFILER_L1_GUARANTEED_MARKER_COUNT / loop_count; i++) + for (int i = 0; i < PROFILER_OP_SUPPORT_COUNT * kernel_profiler::PROFILER_L1_GUARANTEED_MARKER_COUNT / loop_count; i++) { EnqueueProgram(device->command_queue(), program, false); } diff --git a/tt_metal/tools/profiler/kernel_profiler.hpp b/tt_metal/tools/profiler/kernel_profiler.hpp index 0072e5151c7..108ba6ab8f2 100644 --- a/tt_metal/tools/profiler/kernel_profiler.hpp +++ b/tt_metal/tools/profiler/kernel_profiler.hpp @@ -16,6 +16,8 @@ #include "hostdevcommon/profiler_common.h" #include "risc_attribs.h" +#include "dev_msgs.h" + #define DO_PRAGMA(x) _Pragma (#x) #define Stringize( L ) #L @@ -25,6 +27,8 @@ #define PROFILER_MSG __FILE__ "," $Line ",KERNEL_PROFILER" #define PROFILER_MSG_NAME( name ) name "," PROFILER_MSG +#define SrcLocNameToHash( name ) DO_PRAGMA(message(PROFILER_MSG_NAME(name))); auto constexpr hash = kernel_profiler::Hash16_CT(PROFILER_MSG_NAME( name )); + #if defined(PROFILE_KERNEL) && ( !defined(DISPATCH_KERNEL) || (defined(DISPATCH_KERNEL) && defined(COMPILE_FOR_NCRISC) && (PROFILE_KERNEL == PROFILER_OPT_DO_DISPATCH_CORES))) namespace kernel_profiler{ @@ -38,33 +42,27 @@ namespace kernel_profiler{ constexpr int WALL_CLOCK_HIGH_INDEX = 1; constexpr int WALL_CLOCK_LOW_INDEX = 0; + volatile tt_l1_ptr uint32_t *profiler_control_buffer = + reinterpret_cast(GET_MAILBOX_ADDRESS_DEV(profiler.control_vector)); + + volatile tt_l1_ptr uint32_t (*profiler_data_buffer)[kernel_profiler::PROFILER_L1_VECTOR_SIZE] = + reinterpret_cast(GET_MAILBOX_ADDRESS_DEV(profiler.buffer)); + #if defined(COMPILE_FOR_BRISC) - constexpr uint32_t profilerBuffer = PROFILER_L1_BUFFER_BR; - constexpr uint32_t deviceBufferEndIndex = DEVICE_BUFFER_END_INDEX_BR; - volatile tt_l1_ptr uint32_t *profiler_control_buffer = reinterpret_cast(PROFILER_L1_BUFFER_CONTROL); + constexpr uint32_t myRiscID = 0; extern uint16_t core_flat_id; #elif defined(COMPILE_FOR_ERISC) - constexpr uint32_t profilerBuffer = eth_l1_mem::address_map::PROFILER_L1_BUFFER_ER; - constexpr uint32_t deviceBufferEndIndex = DEVICE_BUFFER_END_INDEX_ER; - volatile tt_l1_ptr uint32_t *profiler_control_buffer = reinterpret_cast(eth_l1_mem::address_map::PROFILER_L1_BUFFER_CONTROL); + constexpr uint32_t myRiscID = 0; extern uint16_t core_flat_id; #elif defined(COMPILE_FOR_NCRISC) - constexpr uint32_t profilerBuffer = PROFILER_L1_BUFFER_NC; - constexpr uint32_t deviceBufferEndIndex = DEVICE_BUFFER_END_INDEX_NC; - volatile tt_l1_ptr uint32_t *profiler_control_buffer = reinterpret_cast(PROFILER_L1_BUFFER_CONTROL); + constexpr uint32_t myRiscID = 1; extern uint16_t core_flat_id; #elif COMPILE_FOR_TRISC == 0 - constexpr uint32_t profilerBuffer = PROFILER_L1_BUFFER_T0; - constexpr uint32_t deviceBufferEndIndex = DEVICE_BUFFER_END_INDEX_T0; - volatile tt_l1_ptr uint32_t *profiler_control_buffer = reinterpret_cast(PROFILER_L1_BUFFER_CONTROL); + constexpr uint32_t myRiscID = 2; #elif COMPILE_FOR_TRISC == 1 - constexpr uint32_t profilerBuffer = PROFILER_L1_BUFFER_T1; - constexpr uint32_t deviceBufferEndIndex = DEVICE_BUFFER_END_INDEX_T1; - volatile tt_l1_ptr uint32_t *profiler_control_buffer = reinterpret_cast(PROFILER_L1_BUFFER_CONTROL); + constexpr uint32_t myRiscID = 3; #elif COMPILE_FOR_TRISC == 2 - constexpr uint32_t profilerBuffer = PROFILER_L1_BUFFER_T2; - constexpr uint32_t deviceBufferEndIndex = DEVICE_BUFFER_END_INDEX_T2; - volatile tt_l1_ptr uint32_t *profiler_control_buffer = reinterpret_cast(PROFILER_L1_BUFFER_CONTROL); + constexpr uint32_t myRiscID = 4; #endif constexpr uint32_t Hash32_CT( const char * str, size_t n, uint32_t basis = UINT32_C( 2166136261 ) ) { @@ -77,8 +75,6 @@ namespace kernel_profiler{ return ((res & 0xFFFF) ^ ((res & 0xFFFF0000) >> 16)) & 0xFFFF; } -#define SrcLocNameToHash( name ) DO_PRAGMA(message(PROFILER_MSG_NAME(name))); auto constexpr hash = kernel_profiler::Hash16_CT(PROFILER_MSG_NAME( name )); - __attribute__((noinline)) void init_profiler(uint16_t briscKernelID = 0, uint16_t ncriscKernelID = 0, uint16_t triscsKernelID = 0) { wIndex = CUSTOM_MARKERS; @@ -94,84 +90,34 @@ namespace kernel_profiler{ uint32_t runCounter = profiler_control_buffer[RUN_COUNTER]; profiler_control_buffer[PROFILER_DONE] = 0; -#if defined(COMPILE_FOR_ERISC) - volatile tt_l1_ptr uint32_t *eriscBuffer = reinterpret_cast(eth_l1_mem::address_map::PROFILER_L1_BUFFER_ER); - if (runCounter == 0) { core_flat_id = noc_xy_to_profiler_flat_id[my_x[0]][my_y[0]]; - for (int i = ID_HH; i < GUARANTEED_MARKER_1_H; i ++) + for (uint32_t riscID = 0; riscID < PROFILER_RISC_COUNT; riscID ++) { - eriscBuffer[i] = 0; - } + for (uint32_t i = ID_HH; i < GUARANTEED_MARKER_1_H; i ++) + { + profiler_data_buffer[riscID][i] = 0; + } - eriscBuffer [ID_LH] = ((core_flat_id & 0xFF) << 3) | 0; + profiler_data_buffer[riscID][ID_LH] = ((core_flat_id & 0xFF) << 3) | riscID; + } profiler_control_buffer[NOC_X] = my_x[0]; profiler_control_buffer[NOC_Y] = my_y[0]; profiler_control_buffer[FLAT_ID] = core_flat_id; } - for (int i = GUARANTEED_MARKER_1_H; i < CUSTOM_MARKERS; i ++) - { - //TODO(MO): Clean up magic numbers - eriscBuffer[i] = 0x80000000; - } - - eriscBuffer [ID_LL] = (runCounter & 0xFFFF) | (eriscBuffer [ID_LL] & 0xFFFF0000); - -#endif //ERISC_INIT -#if defined(COMPILE_FOR_BRISC) - - volatile tt_l1_ptr uint32_t *briscBuffer = reinterpret_cast(PROFILER_L1_BUFFER_BR); - volatile tt_l1_ptr uint32_t *ncriscBuffer = reinterpret_cast(PROFILER_L1_BUFFER_NC); - volatile tt_l1_ptr uint32_t *trisc0Buffer = reinterpret_cast(PROFILER_L1_BUFFER_T0); - volatile tt_l1_ptr uint32_t *trisc1Buffer = reinterpret_cast(PROFILER_L1_BUFFER_T1); - volatile tt_l1_ptr uint32_t *trisc2Buffer = reinterpret_cast(PROFILER_L1_BUFFER_T2); - - if (runCounter == 0) + for (uint32_t riscID = 0; riscID < PROFILER_RISC_COUNT; riscID ++) { - core_flat_id = noc_xy_to_profiler_flat_id[my_x[0]][my_y[0]]; - - for (int i = ID_HH; i < GUARANTEED_MARKER_1_H; i ++) + for (uint32_t i = GUARANTEED_MARKER_1_H; i < CUSTOM_MARKERS; i ++) { - briscBuffer[i] = 0; - ncriscBuffer[i] = 0; - trisc0Buffer[i] = 0; - trisc1Buffer[i] = 0; - trisc2Buffer[i] = 0; + //TODO(MO): Clean up magic numbers + profiler_data_buffer[riscID][i] = 0x80000000; } - - briscBuffer [ID_LH] = ((core_flat_id & 0xFF) << 3) | 0; - ncriscBuffer[ID_LH] = ((core_flat_id & 0xFF) << 3) | 1; - trisc0Buffer[ID_LH] = ((core_flat_id & 0xFF) << 3) | 2; - trisc1Buffer[ID_LH] = ((core_flat_id & 0xFF) << 3) | 3; - trisc2Buffer[ID_LH] = ((core_flat_id & 0xFF) << 3) | 4; - - profiler_control_buffer[NOC_X] = my_x[0]; - profiler_control_buffer[NOC_Y] = my_y[0]; - profiler_control_buffer[FLAT_ID] = core_flat_id; - } - - for (int i = GUARANTEED_MARKER_1_H; i < CUSTOM_MARKERS; i ++) - { - //TODO(MO): Clean up magic numbers - briscBuffer[i] = 0x80000000; - ncriscBuffer[i] = 0x80000000; - trisc0Buffer[i] = 0x80000000; - trisc1Buffer[i] = 0x80000000; - trisc2Buffer[i] = 0x80000000; + profiler_data_buffer[riscID][ID_LL] = (runCounter & 0xFFFF) | (profiler_data_buffer[riscID][ID_LL] & 0xFFFF0000); } - - briscBuffer [ID_LL] = (runCounter & 0xFFFF) | (briscBuffer [ID_LL] & 0xFFFF0000); - ncriscBuffer[ID_LL] = (runCounter & 0xFFFF) | (ncriscBuffer[ID_LL] & 0xFFFF0000); - trisc0Buffer[ID_LL] = (runCounter & 0xFFFF) | (trisc0Buffer[ID_LL] & 0xFFFF0000); - trisc1Buffer[ID_LL] = (runCounter & 0xFFFF) | (trisc1Buffer[ID_LL] & 0xFFFF0000); - trisc2Buffer[ID_LL] = (runCounter & 0xFFFF) | (trisc2Buffer[ID_LL] & 0xFFFF0000); - - -#endif //BRISC_INIT #endif } @@ -187,19 +133,17 @@ namespace kernel_profiler{ inline __attribute__((always_inline)) void mark_time_at_index_inlined(uint32_t index, uint32_t timer_id) { - volatile tt_l1_ptr uint32_t *buffer = reinterpret_cast(kernel_profiler::profilerBuffer); volatile tt_reg_ptr uint32_t *p_reg = reinterpret_cast (RISCV_DEBUG_REG_WALL_CLOCK_L); - buffer[index] = 0x80000000 | ((timer_id & 0x7FFFF) << 12) | (p_reg[WALL_CLOCK_HIGH_INDEX] & 0xFFF); - buffer[index+1] = p_reg[WALL_CLOCK_LOW_INDEX]; + profiler_data_buffer[myRiscID][index] = 0x80000000 | ((timer_id & 0x7FFFF) << 12) | (p_reg[WALL_CLOCK_HIGH_INDEX] & 0xFFF); + profiler_data_buffer[myRiscID][index+1] = p_reg[WALL_CLOCK_LOW_INDEX]; } inline __attribute__((always_inline)) void mark_padding() { if (wIndex < PROFILER_L1_VECTOR_SIZE) { - volatile tt_l1_ptr uint32_t *buffer = reinterpret_cast(kernel_profiler::profilerBuffer); - buffer[wIndex] = 0x80000000; - buffer[wIndex+1] = 0; + profiler_data_buffer[myRiscID][wIndex] = 0x80000000; + profiler_data_buffer[myRiscID][wIndex+1] = 0; wIndex += PROFILER_L1_MARKER_UINT32_SIZE; } } @@ -212,25 +156,10 @@ namespace kernel_profiler{ inline __attribute__((always_inline)) void set_host_counter(uint32_t counterValue) { -#if defined(COMPILE_FOR_ERISC) - volatile tt_l1_ptr uint32_t *eriscBuffer = reinterpret_cast(eth_l1_mem::address_map::PROFILER_L1_BUFFER_ER); - - eriscBuffer[ID_LL] = (counterValue << 16) | (eriscBuffer[ID_LL] & 0xFFFF); -#endif - -#if defined(COMPILE_FOR_BRISC) - volatile tt_l1_ptr uint32_t *briscBuffer = reinterpret_cast(PROFILER_L1_BUFFER_BR); - volatile tt_l1_ptr uint32_t *ncriscBuffer = reinterpret_cast(PROFILER_L1_BUFFER_NC); - volatile tt_l1_ptr uint32_t *trisc0Buffer = reinterpret_cast(PROFILER_L1_BUFFER_T0); - volatile tt_l1_ptr uint32_t *trisc1Buffer = reinterpret_cast(PROFILER_L1_BUFFER_T1); - volatile tt_l1_ptr uint32_t *trisc2Buffer = reinterpret_cast(PROFILER_L1_BUFFER_T2); - - briscBuffer[ID_LL] = (counterValue << 16) | (briscBuffer[ID_LL] & 0xFFFF); - ncriscBuffer[ID_LL] = (counterValue << 16) | (ncriscBuffer[ID_LL] & 0xFFFF); - trisc0Buffer[ID_LL] = (counterValue << 16) | (trisc0Buffer[ID_LL] & 0xFFFF); - trisc1Buffer[ID_LL] = (counterValue << 16) | (trisc1Buffer[ID_LL] & 0xFFFF); - trisc2Buffer[ID_LL] = (counterValue << 16) | (trisc2Buffer[ID_LL] & 0xFFFF); -#endif + for (uint32_t riscID = 0; riscID < PROFILER_RISC_COUNT; riscID ++) + { + profiler_data_buffer[riscID][ID_LL] = (counterValue << 16) | (profiler_data_buffer[riscID][ID_LL] & 0xFFFF); + } } inline __attribute__((always_inline)) void risc_finished_profiling() @@ -241,9 +170,8 @@ namespace kernel_profiler{ { if (wIndex < PROFILER_L1_VECTOR_SIZE) { - volatile tt_l1_ptr uint32_t *buffer = reinterpret_cast(kernel_profiler::profilerBuffer); - buffer[wIndex] = 0x80000000 | ((get_sum_id(sumIDs[i]) & 0x7FFFF) << 12); - buffer[wIndex + 1] = sums[i]; + profiler_data_buffer[myRiscID][wIndex] = 0x80000000 | ((get_sum_id(sumIDs[i]) & 0x7FFFF) << 12); + profiler_data_buffer[myRiscID][wIndex + 1] = sums[i]; wIndex += PROFILER_L1_MARKER_UINT32_SIZE; } } @@ -253,7 +181,7 @@ namespace kernel_profiler{ { mark_padding(); } - profiler_control_buffer[kernel_profiler::deviceBufferEndIndex] = wIndex; + profiler_control_buffer[kernel_profiler::DEVICE_BUFFER_END_INDEX_BR_ER + myRiscID] = wIndex; } __attribute__((noinline)) void finish_profiler() @@ -264,50 +192,15 @@ namespace kernel_profiler{ return; } uint32_t pageSize = - PROFILER_FULL_HOST_BUFFER_SIZE_PER_RISC * PROFILER_RISC_COUNT * profiler_core_count_per_dram; + PROFILER_FULL_HOST_BUFFER_SIZE_PER_RISC * MAX_RISCV_PER_CORE * profiler_core_count_per_dram; while (!profiler_control_buffer[DRAM_PROFILER_ADDRESS]); uint32_t dram_profiler_address = profiler_control_buffer[DRAM_PROFILER_ADDRESS]; -#if defined(COMPILE_FOR_ERISC) - int hostIndex = HOST_BUFFER_END_INDEX_ER; - int deviceIndex = DEVICE_BUFFER_END_INDEX_ER; - uint32_t currEndIndex = - profiler_control_buffer[deviceIndex] + - profiler_control_buffer[hostIndex]; - - uint32_t dram_offset = - (core_flat_id % profiler_core_count_per_dram) * PROFILER_RISC_COUNT * PROFILER_FULL_HOST_BUFFER_SIZE_PER_RISC + - profiler_control_buffer[hostIndex] * sizeof(uint32_t); - - const InterleavedAddrGen s = { - .bank_base_address = dram_profiler_address, - .page_size = pageSize - }; - - if ( currEndIndex < PROFILER_FULL_HOST_VECTOR_SIZE_PER_RISC) - { - uint64_t dram_bank_dst_noc_addr = s.get_noc_addr(core_flat_id / profiler_core_count_per_dram, dram_offset); - - noc_async_write( - eth_l1_mem::address_map::PROFILER_L1_BUFFER_ER, - dram_bank_dst_noc_addr, - profiler_control_buffer[deviceIndex] * sizeof(uint32_t)); - - profiler_control_buffer[hostIndex] = currEndIndex; - } - else - { - mark_dropped_timestamps(hostIndex); - } -#endif -#if defined(COMPILE_FOR_BRISC) - int hostIndex; - int deviceIndex; - for (hostIndex = kernel_profiler::HOST_BUFFER_END_INDEX_BR, deviceIndex = kernel_profiler::DEVICE_BUFFER_END_INDEX_BR; - (hostIndex <= kernel_profiler::HOST_BUFFER_END_INDEX_T2) && (deviceIndex <= kernel_profiler::DEVICE_BUFFER_END_INDEX_T2); - hostIndex++, deviceIndex++) + for (uint32_t riscID = 0; riscID < PROFILER_RISC_COUNT; riscID ++) { + int hostIndex = riscID; + int deviceIndex = kernel_profiler::DEVICE_BUFFER_END_INDEX_BR_ER + riscID; if (profiler_control_buffer[deviceIndex]) { uint32_t currEndIndex = @@ -320,7 +213,7 @@ namespace kernel_profiler{ if (currEndIndex <= PROFILER_FULL_HOST_VECTOR_SIZE_PER_RISC) { dram_offset = - (core_flat_id % profiler_core_count_per_dram) * PROFILER_RISC_COUNT * PROFILER_FULL_HOST_BUFFER_SIZE_PER_RISC + + (core_flat_id % profiler_core_count_per_dram) * MAX_RISCV_PER_CORE * PROFILER_FULL_HOST_BUFFER_SIZE_PER_RISC + hostIndex * PROFILER_FULL_HOST_BUFFER_SIZE_PER_RISC + profiler_control_buffer[hostIndex] * sizeof(uint32_t); @@ -333,7 +226,7 @@ namespace kernel_profiler{ { dram_offset = (core_flat_id % profiler_core_count_per_dram) * - PROFILER_RISC_COUNT * PROFILER_FULL_HOST_BUFFER_SIZE_PER_RISC + + MAX_RISCV_PER_CORE * PROFILER_FULL_HOST_BUFFER_SIZE_PER_RISC + hostIndex * PROFILER_FULL_HOST_BUFFER_SIZE_PER_RISC; send_size = CUSTOM_MARKERS * sizeof(uint32_t); @@ -354,14 +247,14 @@ namespace kernel_profiler{ uint64_t dram_bank_dst_noc_addr = s.get_noc_addr(core_flat_id / profiler_core_count_per_dram, dram_offset); noc_async_write( - PROFILER_L1_BUFFER_BR + hostIndex * PROFILER_L1_BUFFER_SIZE, + reinterpret_cast(profiler_data_buffer[hostIndex]), dram_bank_dst_noc_addr, send_size); } profiler_control_buffer[deviceIndex] = 0; } } -#endif + noc_async_write_barrier(); profiler_control_buffer[RUN_COUNTER] ++; profiler_control_buffer[PROFILER_DONE] = 1; @@ -377,14 +270,14 @@ namespace kernel_profiler{ core_flat_id = noc_xy_to_profiler_flat_id[my_x[0]][my_y[0]]; uint32_t dram_offset = - (core_flat_id % profiler_core_count_per_dram) * PROFILER_RISC_COUNT * PROFILER_FULL_HOST_BUFFER_SIZE_PER_RISC + - HOST_BUFFER_END_INDEX_NC * PROFILER_FULL_HOST_BUFFER_SIZE_PER_RISC + - profiler_control_buffer[HOST_BUFFER_END_INDEX_NC] * sizeof(uint32_t); + (core_flat_id % profiler_core_count_per_dram) * MAX_RISCV_PER_CORE * PROFILER_FULL_HOST_BUFFER_SIZE_PER_RISC + + (HOST_BUFFER_END_INDEX_BR_ER + myRiscID) * PROFILER_FULL_HOST_BUFFER_SIZE_PER_RISC + + profiler_control_buffer[HOST_BUFFER_END_INDEX_BR_ER + myRiscID] * sizeof(uint32_t); while (!profiler_control_buffer[DRAM_PROFILER_ADDRESS]); const InterleavedAddrGen s = { .bank_base_address = profiler_control_buffer[DRAM_PROFILER_ADDRESS], - .page_size = PROFILER_FULL_HOST_BUFFER_SIZE_PER_RISC * PROFILER_RISC_COUNT * profiler_core_count_per_dram + .page_size = PROFILER_FULL_HOST_BUFFER_SIZE_PER_RISC * MAX_RISCV_PER_CORE * profiler_core_count_per_dram }; uint64_t dram_bank_dst_noc_addr = s.get_noc_addr(core_flat_id / profiler_core_count_per_dram, dram_offset); @@ -392,21 +285,21 @@ namespace kernel_profiler{ mark_time_at_index_inlined(wIndex, get_end_timer_id(hash)); wIndex += PROFILER_L1_MARKER_UINT32_SIZE; - uint32_t currEndIndex = profiler_control_buffer[HOST_BUFFER_END_INDEX_NC] + wIndex; + uint32_t currEndIndex = profiler_control_buffer[HOST_BUFFER_END_INDEX_BR_ER + myRiscID] + wIndex; if ( currEndIndex <= PROFILER_FULL_HOST_VECTOR_SIZE_PER_RISC) { noc_async_write( - PROFILER_L1_BUFFER_NC, + reinterpret_cast(profiler_data_buffer[myRiscID]), dram_bank_dst_noc_addr, wIndex * sizeof(uint32_t)); - profiler_control_buffer[HOST_BUFFER_END_INDEX_NC] = currEndIndex; + profiler_control_buffer[HOST_BUFFER_END_INDEX_BR_ER + myRiscID] = currEndIndex; } else { - mark_dropped_timestamps(HOST_BUFFER_END_INDEX_NC); + mark_dropped_timestamps(HOST_BUFFER_END_INDEX_BR_ER + myRiscID); } wIndex = CUSTOM_MARKERS; diff --git a/tt_metal/tools/profiler/profiler.cpp b/tt_metal/tools/profiler/profiler.cpp index 11a1390f28a..0a6b11dc9cd 100644 --- a/tt_metal/tools/profiler/profiler.cpp +++ b/tt_metal/tools/profiler/profiler.cpp @@ -14,6 +14,7 @@ #include "tools/profiler/common.hpp" #include "hostdevcommon/profiler_common.h" #include "llrt/rtoptions.hpp" +#include "dev_msgs.h" #include "tt_metal/third_party/tracy/public/tracy/Tracy.hpp" #include "tt_metal/impl/device/device.hpp" @@ -29,53 +30,52 @@ void DeviceProfiler::readRiscProfilerResults( ZoneScoped; - std::pair deviceCore = {device_id,worker_core}; + HalProgrammableCoreType CoreType; + int riscCount; + profiler_msg_t *profiler_msg; const metal_SocDescriptor& soc_d = tt::Cluster::instance().get_soc_desc(device_id); - uint32_t coreFlatID = soc_d.physical_routing_to_profiler_flat_id.at(worker_core); - uint32_t startIndex = coreFlatID * PROFILER_RISC_COUNT * PROFILER_FULL_HOST_VECTOR_SIZE_PER_RISC; - - vector control_buffer; - auto ethCores = soc_d.get_physical_ethernet_cores() ; - - std::vector riscEndIndices; - if (std::find(ethCores.begin(), ethCores.end(), worker_core) == ethCores.end()) { - control_buffer = tt::llrt::read_hex_vec_from_core( - device_id, - worker_core, - PROFILER_L1_BUFFER_CONTROL, - PROFILER_L1_CONTROL_BUFFER_SIZE); - - riscEndIndices.push_back(kernel_profiler::HOST_BUFFER_END_INDEX_BR); - riscEndIndices.push_back(kernel_profiler::HOST_BUFFER_END_INDEX_NC); - riscEndIndices.push_back(kernel_profiler::HOST_BUFFER_END_INDEX_T0); - riscEndIndices.push_back(kernel_profiler::HOST_BUFFER_END_INDEX_T1); - riscEndIndices.push_back(kernel_profiler::HOST_BUFFER_END_INDEX_T2); + profiler_msg = hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalMemAddrType::PROFILER); + CoreType = HalProgrammableCoreType::TENSIX; + riscCount = 5; } else { - control_buffer = tt::llrt::read_hex_vec_from_core( - device_id, - worker_core, - eth_l1_mem::address_map::PROFILER_L1_BUFFER_CONTROL, - PROFILER_L1_CONTROL_BUFFER_SIZE); - - riscEndIndices.push_back(kernel_profiler::HOST_BUFFER_END_INDEX_ER); + profiler_msg = hal.get_dev_addr(HalProgrammableCoreType::ACTIVE_ETH, HalMemAddrType::PROFILER); + CoreType = HalProgrammableCoreType::ACTIVE_ETH; + riscCount = 1; } - if ((control_buffer[kernel_profiler::HOST_BUFFER_END_INDEX_BR] == 0) && - (control_buffer[kernel_profiler::HOST_BUFFER_END_INDEX_NC] == 0) && - (control_buffer[kernel_profiler::HOST_BUFFER_END_INDEX_ER] == 0)) + uint32_t coreFlatID = soc_d.physical_routing_to_profiler_flat_id.at(worker_core); + uint32_t startIndex = coreFlatID * MAX_RISCV_PER_CORE * PROFILER_FULL_HOST_VECTOR_SIZE_PER_RISC; + + vector control_buffer = tt::llrt::read_hex_vec_from_core( + device_id, + worker_core, + reinterpret_cast(profiler_msg->control_vector), + kernel_profiler::PROFILER_L1_CONTROL_BUFFER_SIZE); + + if ((control_buffer[kernel_profiler::HOST_BUFFER_END_INDEX_BR_ER] == 0) && + (control_buffer[kernel_profiler::HOST_BUFFER_END_INDEX_NC] == 0)) { return; } int riscNum = 0; - for (auto riscEndIndex : riscEndIndices) { + for (int riscEndIndex = 0; riscEndIndex < riscCount; riscEndIndex ++ ) { uint32_t bufferEndIndex = control_buffer[riscEndIndex]; + uint32_t riscType; + if(CoreType == HalProgrammableCoreType::TENSIX) + { + riscType = riscEndIndex; + } + else + { + riscType = 5; + } if (bufferEndIndex > 0) { uint32_t bufferRiscShift = riscNum * PROFILER_FULL_HOST_VECTOR_SIZE_PER_RISC + startIndex; @@ -95,7 +95,7 @@ void DeviceProfiler::readRiscProfilerResults( uint32_t opTime_H = 0; uint32_t opTime_L = 0; - for (int index = bufferRiscShift; index < (bufferRiscShift + bufferEndIndex); index += PROFILER_L1_MARKER_UINT32_SIZE) + for (int index = bufferRiscShift; index < (bufferRiscShift + bufferEndIndex); index += kernel_profiler::PROFILER_L1_MARKER_UINT32_SIZE) { if (!newRunStart && profile_buffer[index] == 0 && profile_buffer[index + 1] == 0) { @@ -156,7 +156,7 @@ void DeviceProfiler::readRiscProfilerResults( device_id, worker_core, coreFlatID, - riscEndIndex, + riscType, 0, marker, (uint64_t(time_H) << 32) | time_L); @@ -175,7 +175,7 @@ void DeviceProfiler::readRiscProfilerResults( device_id, worker_core, coreFlatID, - riscEndIndex, + riscType, sum, marker, (uint64_t(time_H) << 32) | time_L); @@ -186,14 +186,15 @@ void DeviceProfiler::readRiscProfilerResults( riscNum ++; } - std::vector control_buffer_reset(PROFILER_L1_CONTROL_VECTOR_SIZE, 0); + std::vector control_buffer_reset(kernel_profiler::PROFILER_L1_CONTROL_VECTOR_SIZE, 0); control_buffer_reset[kernel_profiler::DRAM_PROFILER_ADDRESS] = output_dram_buffer->address(); + profiler_msg = hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalMemAddrType::PROFILER); tt::llrt::write_hex_vec_to_core( device_id, worker_core, control_buffer_reset, - PROFILER_L1_BUFFER_CONTROL); + reinterpret_cast(profiler_msg->control_vector)); } void DeviceProfiler::firstTimestamp(uint64_t timestamp) diff --git a/tt_metal/tools/profiler/sync/sync_kernel.cpp b/tt_metal/tools/profiler/sync/sync_kernel.cpp index 12e1abe72cd..0ef1ff110ae 100644 --- a/tt_metal/tools/profiler/sync/sync_kernel.cpp +++ b/tt_metal/tools/profiler/sync/sync_kernel.cpp @@ -7,8 +7,13 @@ void kernel_main() { DeviceZoneScopedMainN("SYNC-MAIN"); volatile tt_reg_ptr uint32_t *p_reg = reinterpret_cast (RISCV_DEBUG_REG_WALL_CLOCK_L); - volatile tt_l1_ptr uint32_t *profiler_control_buffer = reinterpret_cast(PROFILER_L1_BUFFER_CONTROL); - volatile tt_l1_ptr uint32_t *briscBuffer = reinterpret_cast(PROFILER_L1_BUFFER_BR + kernel_profiler::CUSTOM_MARKERS * sizeof(uint32_t)); + + volatile tt_l1_ptr uint32_t *profiler_control_buffer = + reinterpret_cast(((mailboxes_t *)MEM_MAILBOX_BASE)->profiler.control_vector); + + constexpr uint32_t briscIndex = 0; + volatile tt_l1_ptr uint32_t *briscBuffer = + reinterpret_cast(&((mailboxes_t *)MEM_MAILBOX_BASE)->profiler.buffer[briscIndex][kernel_profiler::CUSTOM_MARKERS]); uint32_t syncTimeBufferIndex = 0; diff --git a/tt_metal/tools/profiler/tt_metal_profiler.cpp b/tt_metal/tools/profiler/tt_metal_profiler.cpp index a2d83d50e06..30cef848831 100644 --- a/tt_metal/tools/profiler/tt_metal_profiler.cpp +++ b/tt_metal/tools/profiler/tt_metal_profiler.cpp @@ -5,11 +5,13 @@ #include #include +#include "llrt/hal.hpp" #include "tt_metal/host_api.hpp" #include "impl/debug/dprint_server.hpp" #include "tools/profiler/profiler.hpp" #include "hostdevcommon/profiler_common.h" +#include "dev_msgs.h" #include "tt_metal/detail/tt_metal.hpp" @@ -63,24 +65,24 @@ void setControlBuffer(uint32_t device_id, std::vector& control_buffer) auto ethCores = soc_d.get_physical_ethernet_cores() ; for (auto &core : soc_d.physical_routing_to_profiler_flat_id) { + profiler_msg_t *profiler_msg; + // TODO: clean this up when HAL is more complete (one lookup w/ type) if (std::find(ethCores.begin(), ethCores.end(), core.first) == ethCores.end()) { //Tensix - tt::llrt::write_hex_vec_to_core( - device_id, - core.first, - control_buffer, - PROFILER_L1_BUFFER_CONTROL); + profiler_msg = hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalMemAddrType::PROFILER); } else { //ETH - tt::llrt::write_hex_vec_to_core( - device_id, - core.first, - control_buffer, - eth_l1_mem::address_map::PROFILER_L1_BUFFER_CONTROL); + profiler_msg = hal.get_dev_addr(HalProgrammableCoreType::ACTIVE_ETH, HalMemAddrType::PROFILER); } + + tt::llrt::write_hex_vec_to_core( + device_id, + core.first, + control_buffer, + reinterpret_cast(profiler_msg->control_vector)); } #endif } @@ -129,13 +131,16 @@ void syncDeviceHost(Device *device, CoreCoord logical_core, std::shared_ptr writeTimes(sampleCount); + profiler_msg_t *profiler_msg = device->get_dev_addr(core, HalMemAddrType::PROFILER); + uint64_t control_addr = reinterpret_cast(&profiler_msg->control_vector[kernel_profiler::FW_RESET_L]); for (int i = 0; i < sampleCount; i++) { ZoneScopedC(tracy::Color::Tomato2); std::this_thread::sleep_for(std::chrono::milliseconds(millisecond_wait)); int64_t writeStart = TracyGetCpuTime(); uint32_t sinceStart = writeStart - hostStartTime; - tt::Cluster::instance().write_reg(&sinceStart, tt_cxy_pair(device_id, core) , PROFILER_L1_BUFFER_CONTROL + kernel_profiler::FW_RESET_L * sizeof(uint32_t)); + + tt::Cluster::instance().write_reg(&sinceStart, tt_cxy_pair(device_id, core), control_addr); writeTimes[i] = (TracyGetCpuTime() - writeStart); } @@ -153,10 +158,14 @@ void syncDeviceHost(Device *device, CoreCoord logical_core, std::shared_ptr(&profiler_msg->buffer[briscIndex][kernel_profiler::CUSTOM_MARKERS]); + vector sync_times = tt::llrt::read_hex_vec_from_core( device_id, core, - PROFILER_L1_BUFFER_BR + kernel_profiler::CUSTOM_MARKERS * sizeof(uint32_t), + addr, (sampleCount + 1) * 2 * sizeof(uint32_t)); uint32_t preDeviceTime = 0; @@ -258,7 +267,7 @@ void syncDeviceHost(Device *device, CoreCoord logical_core, std::shared_ptrid(); - std::vector control_buffer(PROFILER_L1_CONTROL_VECTOR_SIZE, 0); + std::vector control_buffer(kernel_profiler::PROFILER_L1_CONTROL_VECTOR_SIZE, 0); setControlBuffer (device_id, control_buffer); } @@ -310,7 +319,7 @@ void InitDeviceProfiler(Device *device){ tt_metal_device_profiler_map.at(device_id).output_dram_buffer = tt_metal::CreateBuffer(dram_config); } - std::vector control_buffer(PROFILER_L1_CONTROL_VECTOR_SIZE, 0); + std::vector control_buffer(kernel_profiler::PROFILER_L1_CONTROL_VECTOR_SIZE, 0); control_buffer[kernel_profiler::DRAM_PROFILER_ADDRESS] = tt_metal_device_profiler_map.at(device_id).output_dram_buffer->address(); setControlBuffer (device_id, control_buffer); @@ -402,11 +411,12 @@ void DumpDeviceProfileResults(Device *device, std::vector &worker_cor for (const CoreCoord& core : tt::get_logical_dispatch_cores(device_id, device_num_hw_cqs, dispatch_core_type)) { const auto curr_core = device->physical_core_from_logical_core(core, dispatch_core_type); + profiler_msg_t *profiler_msg = device->get_dev_addr(curr_core, HalMemAddrType::PROFILER); vector control_buffer = tt::llrt::read_hex_vec_from_core( device_id, curr_core, - PROFILER_L1_BUFFER_CONTROL, - PROFILER_L1_CONTROL_BUFFER_SIZE); + reinterpret_cast(profiler_msg->control_vector), + kernel_profiler::PROFILER_L1_CONTROL_BUFFER_SIZE); if (control_buffer[kernel_profiler::PROFILER_DONE] == 0) { unfinishedCore = curr_core; @@ -420,11 +430,13 @@ void DumpDeviceProfileResults(Device *device, std::vector &worker_cor } for (const CoreCoord& core : tt::Cluster::instance().get_soc_desc(device_id).physical_ethernet_cores) { + const auto curr_core = device->physical_core_from_logical_core(core, CoreType::ETH); + profiler_msg_t *profiler_msg = device->get_dev_addr(curr_core, HalMemAddrType::PROFILER); vector control_buffer = tt::llrt::read_hex_vec_from_core( device_id, core, - eth_l1_mem::address_map::PROFILER_L1_BUFFER_CONTROL, - PROFILER_L1_CONTROL_BUFFER_SIZE); + reinterpret_cast(profiler_msg->control_vector), + kernel_profiler::PROFILER_L1_CONTROL_BUFFER_SIZE); if (control_buffer[kernel_profiler::PROFILER_DONE] == 0) { unfinishedCore = core;