From 03e6cffa2376bf47963e60b2f118a395b2aa0362 Mon Sep 17 00:00:00 2001 From: Mo Date: Wed, 2 Oct 2024 15:56:56 +0000 Subject: [PATCH] #6748: Remove profiler flat id lookup table --- tt_metal/hw/firmware/src/brisc.cc | 1 - tt_metal/hw/firmware/src/erisc.cc | 1 - tt_metal/hw/firmware/src/idle_erisc.cc | 1 - tt_metal/hw/firmware/src/ncrisc.cc | 1 - tt_metal/jit_build/genfiles.cpp | 27 ------------------- tt_metal/tools/profiler/kernel_profiler.hpp | 20 ++++++-------- tt_metal/tools/profiler/tt_metal_profiler.cpp | 1 + 7 files changed, 9 insertions(+), 43 deletions(-) diff --git a/tt_metal/hw/firmware/src/brisc.cc b/tt_metal/hw/firmware/src/brisc.cc index 090d589102d..278ad9929d8 100644 --- a/tt_metal/hw/firmware/src/brisc.cc +++ b/tt_metal/hw/firmware/src/brisc.cc @@ -76,7 +76,6 @@ namespace kernel_profiler { uint32_t stackSize __attribute__((used)); uint32_t sums[SUM_COUNT] __attribute__((used)); uint32_t sumIDs[SUM_COUNT] __attribute__((used)); - uint16_t core_flat_id __attribute__((used)); } #endif diff --git a/tt_metal/hw/firmware/src/erisc.cc b/tt_metal/hw/firmware/src/erisc.cc index 63713e49b28..461d5ddf66f 100644 --- a/tt_metal/hw/firmware/src/erisc.cc +++ b/tt_metal/hw/firmware/src/erisc.cc @@ -21,7 +21,6 @@ namespace kernel_profiler { uint32_t stackSize __attribute__((used)); uint32_t sums[SUM_COUNT] __attribute__((used)); uint32_t sumIDs[SUM_COUNT] __attribute__((used)); - uint16_t core_flat_id __attribute__((used)); } #endif diff --git a/tt_metal/hw/firmware/src/idle_erisc.cc b/tt_metal/hw/firmware/src/idle_erisc.cc index a92b35c8d41..f92c81b111c 100644 --- a/tt_metal/hw/firmware/src/idle_erisc.cc +++ b/tt_metal/hw/firmware/src/idle_erisc.cc @@ -57,7 +57,6 @@ namespace kernel_profiler { uint32_t stackSize __attribute__((used)); uint32_t sums[SUM_COUNT] __attribute__((used)); uint32_t sumIDs[SUM_COUNT] __attribute__((used)); - uint16_t core_flat_id __attribute__((used)); } #endif diff --git a/tt_metal/hw/firmware/src/ncrisc.cc 
b/tt_metal/hw/firmware/src/ncrisc.cc index 84c5f50e593..48b735afd9a 100644 --- a/tt_metal/hw/firmware/src/ncrisc.cc +++ b/tt_metal/hw/firmware/src/ncrisc.cc @@ -45,7 +45,6 @@ namespace kernel_profiler { uint32_t stackSize __attribute__((used)); uint32_t sums[SUM_COUNT] __attribute__((used)); uint32_t sumIDs[SUM_COUNT] __attribute__((used)); - uint16_t core_flat_id __attribute__((used)); } #endif diff --git a/tt_metal/jit_build/genfiles.cpp b/tt_metal/jit_build/genfiles.cpp index 21f60d98f08..90791cf9a90 100644 --- a/tt_metal/jit_build/genfiles.cpp +++ b/tt_metal/jit_build/genfiles.cpp @@ -581,7 +581,6 @@ std::string generate_bank_to_noc_coord_descriptor_string( ss << "#if defined(PROFILE_KERNEL) && (defined(COMPILE_FOR_BRISC) || defined(COMPILE_FOR_NCRISC) || " "defined(COMPILE_FOR_ERISC))" << endl; - ss << "extern uint8_t noc_xy_to_profiler_flat_id[noc_size_x][noc_size_y];" << endl; ss << "extern uint16_t profiler_core_count_per_dram;" << endl; ss << "#endif" << endl; #endif @@ -613,38 +612,12 @@ std::string generate_bank_to_noc_coord_descriptor_string( ss << endl; #if defined(TRACY_ENABLE) - /* - * This part is adding the 2D array for sharing the flat IDs soc descriptor has assigned to every NOC coordinate, - * and the ceiled number of cores per DRAM banks. - * - * The logic of flat ID assignment can be optimized to lower NOC traffic. With this design the heuristic can be - * implemented in host and device just does look up to the table. - * - * For DRAM banks in particular, integer division of flat_id/core_count_per_dram gives the dram bank id and the - * modulo is the offset. 
- * */ ss << "#if defined(PROFILE_KERNEL) && (defined(COMPILE_FOR_BRISC) || defined(COMPILE_FOR_NCRISC) || " "defined(COMPILE_FOR_ERISC))" << endl; ss << "uint16_t profiler_core_count_per_dram __attribute__((used)) = "; ss << core_count_per_dram << ";" << endl; ss << endl; - - ss << "uint8_t noc_xy_to_profiler_flat_id[noc_size_x][noc_size_y] __attribute__((used)) = {" << endl; - for (unsigned int x = 0; x < grid_size.x; x++) { - ss << " {" << endl; - for (unsigned int y = 0; y < grid_size.y; y++) { - CoreCoord core = {x, y}; - if (profiler_flat_id_map.find(core) == profiler_flat_id_map.end()) { - ss << " " << 255 << "," << endl; - } else { - ss << " " << profiler_flat_id_map.at(core) << "," << endl; - } - } - ss << " }," << endl; - } - ss << "};" << endl; - ss << endl; ss << "#endif" << endl; #endif diff --git a/tt_metal/tools/profiler/kernel_profiler.hpp b/tt_metal/tools/profiler/kernel_profiler.hpp index 0b50594d080..fb353b4a153 100644 --- a/tt_metal/tools/profiler/kernel_profiler.hpp +++ b/tt_metal/tools/profiler/kernel_profiler.hpp @@ -50,13 +50,10 @@ namespace kernel_profiler{ #if defined(COMPILE_FOR_BRISC) constexpr uint32_t myRiscID = 0; - extern uint16_t core_flat_id; #elif defined(COMPILE_FOR_ERISC) constexpr uint32_t myRiscID = 0; - extern uint16_t core_flat_id; #elif defined(COMPILE_FOR_NCRISC) constexpr uint32_t myRiscID = 1; - extern uint16_t core_flat_id; #elif COMPILE_FOR_TRISC == 0 constexpr uint32_t myRiscID = 2; #elif COMPILE_FOR_TRISC == 1 @@ -92,21 +89,16 @@ namespace kernel_profiler{ if (runCounter == 0) { - core_flat_id = noc_xy_to_profiler_flat_id[my_x[0]][my_y[0]]; - for (uint32_t riscID = 0; riscID < PROFILER_RISC_COUNT; riscID ++) { for (uint32_t i = ID_HH; i < GUARANTEED_MARKER_1_H; i ++) { profiler_data_buffer[riscID][i] = 0; } - - profiler_data_buffer[riscID][ID_LH] = ((core_flat_id & 0xFF) << 3) | riscID; } profiler_control_buffer[NOC_X] = my_x[0]; profiler_control_buffer[NOC_Y] = my_y[0]; - profiler_control_buffer[FLAT_ID] = 
core_flat_id; } for (uint32_t riscID = 0; riscID < PROFILER_RISC_COUNT; riscID ++) @@ -199,10 +191,11 @@ namespace kernel_profiler{ PROFILER_FULL_HOST_BUFFER_SIZE_PER_RISC * MAX_RISCV_PER_CORE * profiler_core_count_per_dram; while (!profiler_control_buffer[DRAM_PROFILER_ADDRESS]); - uint32_t dram_profiler_address = profiler_control_buffer[DRAM_PROFILER_ADDRESS]; + uint32_t core_flat_id = profiler_control_buffer[FLAT_ID]; for (uint32_t riscID = 0; riscID < PROFILER_RISC_COUNT; riscID ++) { + profiler_data_buffer[riscID][ID_LH] = ((core_flat_id & 0xFF) << 3) | riscID; int hostIndex = riscID; int deviceIndex = kernel_profiler::DEVICE_BUFFER_END_INDEX_BR_ER + riscID; if (profiler_control_buffer[deviceIndex]) @@ -244,7 +237,7 @@ namespace kernel_profiler{ if (do_noc){ const InterleavedAddrGen s = { - .bank_base_address = dram_profiler_address, + .bank_base_address = profiler_control_buffer[DRAM_PROFILER_ADDRESS], .page_size = pageSize }; @@ -271,14 +264,17 @@ namespace kernel_profiler{ SrcLocNameToHash("PROFILER-NOC-QUICK-SEND"); mark_time_at_index_inlined(wIndex, hash); wIndex += PROFILER_L1_MARKER_UINT32_SIZE; - core_flat_id = noc_xy_to_profiler_flat_id[my_x[0]][my_y[0]]; + + while (!profiler_control_buffer[DRAM_PROFILER_ADDRESS]); + uint32_t core_flat_id = profiler_control_buffer[FLAT_ID]; + + profiler_data_buffer[myRiscID][ID_LH] = ((core_flat_id & 0xFF) << 3) | myRiscID; uint32_t dram_offset = (core_flat_id % profiler_core_count_per_dram) * MAX_RISCV_PER_CORE * PROFILER_FULL_HOST_BUFFER_SIZE_PER_RISC + (HOST_BUFFER_END_INDEX_BR_ER + myRiscID) * PROFILER_FULL_HOST_BUFFER_SIZE_PER_RISC + profiler_control_buffer[HOST_BUFFER_END_INDEX_BR_ER + myRiscID] * sizeof(uint32_t); - while (!profiler_control_buffer[DRAM_PROFILER_ADDRESS]); const InterleavedAddrGen s = { .bank_base_address = profiler_control_buffer[DRAM_PROFILER_ADDRESS], .page_size = PROFILER_FULL_HOST_BUFFER_SIZE_PER_RISC * MAX_RISCV_PER_CORE * profiler_core_count_per_dram diff --git 
a/tt_metal/tools/profiler/tt_metal_profiler.cpp b/tt_metal/tools/profiler/tt_metal_profiler.cpp index 251c6416d33..ac99eb69db5 100644 --- a/tt_metal/tools/profiler/tt_metal_profiler.cpp +++ b/tt_metal/tools/profiler/tt_metal_profiler.cpp @@ -78,6 +78,7 @@ void setControlBuffer(uint32_t device_id, std::vector<uint32_t>& control_buffer) profiler_msg = hal.get_dev_addr(HalProgrammableCoreType::ACTIVE_ETH, HalMemAddrType::PROFILER); } + control_buffer[kernel_profiler::FLAT_ID] = core.second; tt::llrt::write_hex_vec_to_core( device_id, core.first,