From 595932a31b43447cad82ebdfd83d784ab82c5b05 Mon Sep 17 00:00:00 2001 From: David Ma Date: Thu, 26 Sep 2024 22:05:12 +0000 Subject: [PATCH] #7844: Re-implement NOC xfer size recording to be more efficient Now keeps histograms in L1, overloading dprint buffers, and is read out using watcher dump tool. This way neither dprint nor watcher needs to be active to measure noc transfers. --- tt_metal/hostdevcommon/dprint_common.h | 3 +- tt_metal/hw/inc/debug/dprint.h | 3 - tt_metal/hw/inc/debug/dprint_buffer.h | 2 + tt_metal/hw/inc/debug/noc_logging.h | 30 ++++++ tt_metal/hw/inc/debug/sanitize_noc.h | 15 +-- tt_metal/impl/CMakeLists.txt | 1 + tt_metal/impl/debug/debug_helpers.hpp | 65 ++++++++++++ tt_metal/impl/debug/dprint_server.cpp | 96 +++-------------- tt_metal/impl/debug/noc_logging.cpp | 106 +++++++++++++++++++ tt_metal/impl/debug/noc_logging.hpp | 12 +++ tt_metal/impl/device/device_pool.cpp | 1 + tt_metal/impl/device/device_pool.hpp | 1 + tt_metal/jit_build/build.cpp | 4 + tt_metal/llrt/rtoptions.cpp | 4 +- tt_metal/llrt/rtoptions.hpp | 6 +- tt_metal/tools/watcher_dump/watcher_dump.cpp | 31 +++++- 16 files changed, 273 insertions(+), 107 deletions(-) create mode 100644 tt_metal/hw/inc/debug/noc_logging.h create mode 100644 tt_metal/impl/debug/debug_helpers.hpp create mode 100644 tt_metal/impl/debug/noc_logging.cpp create mode 100644 tt_metal/impl/debug/noc_logging.hpp diff --git a/tt_metal/hostdevcommon/dprint_common.h b/tt_metal/hostdevcommon/dprint_common.h index 9f9718bad57..a517c163cd4 100644 --- a/tt_metal/hostdevcommon/dprint_common.h +++ b/tt_metal/hostdevcommon/dprint_common.h @@ -43,7 +43,6 @@ enum DebugPrintHartIndex : unsigned int { DPRINT_PREFIX(WAIT) \ DPRINT_PREFIX(BFLOAT16) \ DPRINT_PREFIX(SETPRECISION) \ - DPRINT_PREFIX(NOC_LOG_XFER) \ DPRINT_PREFIX(FIXED) \ DPRINT_PREFIX(DEFAULTFLOAT) \ DPRINT_PREFIX(HEX) \ @@ -121,3 +120,5 @@ enum TypedU32_ARRAY_Format { }; static_assert(sizeof(DebugPrintMemLayout) == DPRINT_BUFFER_SIZE); +// We use DebugPrintMemLayout to hold noc xfer data, 32 buckets (one for each bit in noc xfer length field). +static_assert(sizeof(DebugPrintMemLayout().data) >= sizeof(uint32_t) * 8 * sizeof(uint32_t)); diff --git a/tt_metal/hw/inc/debug/dprint.h b/tt_metal/hw/inc/debug/dprint.h index 92650d15a2b..3d84439b317 100644 --- a/tt_metal/hw/inc/debug/dprint.h +++ b/tt_metal/hw/inc/debug/dprint.h @@ -82,7 +82,6 @@ struct HEX { char tmp; } ATTR_PACK; // Analog of cout << std::hex struct OCT { char tmp; } ATTR_PACK; // Analog of cout << std::oct struct DEC { char tmp; } ATTR_PACK; // Analog of cout << std::dec struct SETW { char w; SETW(char w) : w(w) {} } ATTR_PACK; // Analog of cout << std::setw() -struct NOC_LOG_XFER { uint32_t size; NOC_LOG_XFER(uint32_t sz) : size(sz) {} } ATTR_PACK; // For tracking noc transactions. struct U32_ARRAY { uint32_t* ptr; uint32_t len; U32_ARRAY(uint32_t* ptr, uint32_t len) : ptr(ptr), len(len) {} @@ -142,7 +141,6 @@ template<> uint8_t DebugPrintTypeToId() { return DPrintRAISE; } template<> uint8_t DebugPrintTypeToId() { return DPrintWAIT; } template<> uint8_t DebugPrintTypeToId() { return DPrintBFLOAT16; } template<> uint8_t DebugPrintTypeToId() { return DPrintSETPRECISION; } -template<> uint8_t DebugPrintTypeToId() { return DPrintNOC_LOG_XFER; } template<> uint8_t DebugPrintTypeToId() { return DPrintFIXED; } template<> uint8_t DebugPrintTypeToId() { return DPrintDEFAULTFLOAT; } template<> uint8_t DebugPrintTypeToId() { return DPrintHEX; } @@ -297,7 +295,6 @@ template DebugPrinter operator<< (DebugPrinter, HEX val); template DebugPrinter operator<< (DebugPrinter, OCT val); template DebugPrinter operator<< (DebugPrinter, DEC val); template DebugPrinter operator<< (DebugPrinter, SETPRECISION val); -template DebugPrinter operator<< (DebugPrinter, NOC_LOG_XFER val); template DebugPrinter operator<< (DebugPrinter, BF16 val); template DebugPrinter operator<< (DebugPrinter, F32 val); template DebugPrinter operator<< (DebugPrinter, U32 val); diff --git a/tt_metal/hw/inc/debug/dprint_buffer.h b/tt_metal/hw/inc/debug/dprint_buffer.h index 3abf80503ce..0bdb3afa67b 100644 --- a/tt_metal/hw/inc/debug/dprint_buffer.h +++ b/tt_metal/hw/inc/debug/dprint_buffer.h @@ -7,6 +7,8 @@ #include "tt_metal/hostdevcommon/dprint_common.h" #include +#include "hostdevcommon/dprint_common.h" + // Returns the buffer address for current thread+core. Differs for NC/BR/ER/TR0-2. inline uint8_t* get_debug_print_buffer() { #if defined(COMPILE_FOR_NCRISC) diff --git a/tt_metal/hw/inc/debug/noc_logging.h b/tt_metal/hw/inc/debug/noc_logging.h new file mode 100644 index 00000000000..91a117b3eee --- /dev/null +++ b/tt_metal/hw/inc/debug/noc_logging.h @@ -0,0 +1,30 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include "dprint_buffer.h" + +// Add option to skip noc logging for certain cores via a define. +#if defined(NOC_LOGGING_ENABLED) && !defined(SKIP_NOC_LOGGING) +void log_noc_xfer(uint32_t len) { + // Hijack print buffer for noc logging data. + volatile tt_l1_ptr uint32_t *buf_ptr = + (volatile tt_l1_ptr uint32_t *)(reinterpret_cast(get_debug_print_buffer())->data); + + int highest_bit_position = 0; + while (len >>= 1) highest_bit_position++; + + buf_ptr[highest_bit_position]++; +} + +#define LOG_LEN(l) log_noc_xfer(l); +#define LOG_READ_LEN_FROM_STATE(noc_id) LOG_LEN(NOC_CMD_BUF_READ_REG(noc_id, NCRISC_RD_CMD_BUF, NOC_AT_LEN_BE)); +#define LOG_WRITE_LEN_FROM_STATE(noc_id) LOG_LEN(NOC_CMD_BUF_READ_REG(noc_id, NCRISC_WR_CMD_BUF, NOC_AT_LEN_BE)); + +#else + +#define LOG_LEN(l) +#define LOG_READ_LEN_FROM_STATE(noc_id) +#define LOG_WRITE_LEN_FROM_STATE(noc_id) +#endif diff --git a/tt_metal/hw/inc/debug/sanitize_noc.h b/tt_metal/hw/inc/debug/sanitize_noc.h index 31832db14d0..de70b3d68ea 100644 --- a/tt_metal/hw/inc/debug/sanitize_noc.h +++ b/tt_metal/hw/inc/debug/sanitize_noc.h @@ -14,19 +14,8 @@ // #pragma once -#include "dprint.h" - -// Add the ability to skip NOC logging, we can't have the tunneling cores stalling waiting for the -// print server. -#if !defined(SKIP_NOC_LOGGING) -#define LOG_LEN(l) DPRINT << NOC_LOG_XFER(l); -#define LOG_READ_LEN_FROM_STATE(noc_id) LOG_LEN(NOC_CMD_BUF_READ_REG(noc_id, NCRISC_RD_CMD_BUF, NOC_AT_LEN_BE)); -#define LOG_WRITE_LEN_FROM_STATE(noc_id) LOG_LEN(NOC_CMD_BUF_READ_REG(noc_id, NCRISC_WR_CMD_BUF, NOC_AT_LEN_BE)); -#else -#define LOG_LEN(l) -#define LOG_READ_LEN_FROM_STATE(noc_id) -#define LOG_WRITE_LEN_FROM_STATE(noc_id) -#endif +// NOC logging enabled independently of watcher, need to include it here because it hooks into DEBUG_SANITIZE_NOC_* +#include "noc_logging.h" #if ( \ defined(COMPILE_FOR_BRISC) || defined(COMPILE_FOR_NCRISC) || defined(COMPILE_FOR_ERISC) || \ diff --git a/tt_metal/impl/CMakeLists.txt b/tt_metal/impl/CMakeLists.txt index 7d3029d4835..ab986d1229e 100644 --- a/tt_metal/impl/CMakeLists.txt +++ b/tt_metal/impl/CMakeLists.txt @@ -18,6 +18,7 @@ set(IMPL_SRC ${CMAKE_CURRENT_SOURCE_DIR}/dispatch/worker_config_buffer.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dispatch/data_collection.cpp ${CMAKE_CURRENT_SOURCE_DIR}/debug/dprint_server.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/debug/noc_logging.cpp ${CMAKE_CURRENT_SOURCE_DIR}/debug/watcher_server.cpp ${CMAKE_CURRENT_SOURCE_DIR}/debug/watcher_device_reader.cpp ${CMAKE_CURRENT_SOURCE_DIR}/trace/trace.cpp diff --git a/tt_metal/impl/debug/debug_helpers.hpp b/tt_metal/impl/debug/debug_helpers.hpp new file mode 100644 index 00000000000..4bfb1207ef1 --- /dev/null +++ b/tt_metal/impl/debug/debug_helpers.hpp @@ -0,0 +1,65 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include + +#include "hostdevcommon/dprint_common.h" +#include "tt_metal/impl/device/device.hpp" + +// Helper function for comparing CoreDescriptors for using in sets. +struct CoreDescriptorComparator { + bool operator()(const CoreDescriptor &x, const CoreDescriptor &y) const { + if (x.coord == y.coord) { + return x.type < y.type; + } else { + return x.coord < y.coord; + } + } +}; +#define CoreDescriptorSet std::set + +// Helper function to get CoreDescriptors for all debug-relevant cores on device. +static CoreDescriptorSet GetAllCores(Device *device) { + CoreDescriptorSet all_cores; + // The set of all printable cores is Tensix + Eth cores + CoreCoord logical_grid_size = device->logical_grid_size(); + for (uint32_t x = 0; x < logical_grid_size.x; x++) { + for (uint32_t y = 0; y < logical_grid_size.y; y++) { + all_cores.insert({{x, y}, CoreType::WORKER}); + } + } + for (const auto& logical_core : device->get_active_ethernet_cores()) { + all_cores.insert({logical_core, CoreType::ETH}); + } + for (const auto& logical_core : device->get_inactive_ethernet_cores()) { + all_cores.insert({logical_core, CoreType::ETH}); + } + + return all_cores; +} + +// Helper function to get CoreDescriptors for all cores that are used for dispatch. Should be a subset of +// GetAllCores(). +static CoreDescriptorSet GetDispatchCores(Device* device) { + CoreDescriptorSet dispatch_cores; + unsigned num_cqs = device->num_hw_cqs(); + CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(device->id()); + tt::log_warning("Dispatch Core Type = {}", dispatch_core_type); + for (auto logical_core : tt::get_logical_dispatch_cores(device->id(), num_cqs, dispatch_core_type)) { + dispatch_cores.insert({logical_core, dispatch_core_type}); + } + return dispatch_cores; +} + +inline uint64_t GetDprintBufAddr(Device *device, const CoreCoord &phys_core, int risc_id) { + + dprint_buf_msg_t *buf = device->get_dev_addr(phys_core, HalMemAddrType::DPRINT); + return reinterpret_cast(buf->data[risc_id]); +} + +inline int GetNumRiscs(const CoreDescriptor &core) { + return (core.type == CoreType::ETH)? DPRINT_NRISCVS_ETH : DPRINT_NRISCVS; +} diff --git a/tt_metal/impl/debug/dprint_server.cpp b/tt_metal/impl/debug/dprint_server.cpp index 7545a85caa7..2cee32b104d 100644 --- a/tt_metal/impl/debug/dprint_server.cpp +++ b/tt_metal/impl/debug/dprint_server.cpp @@ -16,6 +16,7 @@ #include "tt_metal/common/logger.hpp" #include "dprint_server.hpp" +#include "debug_helpers.hpp" #include "llrt/tt_cluster.hpp" #include "llrt/rtoptions.hpp" @@ -43,18 +44,6 @@ namespace { static string logfile_path = "generated/dprint/"; -// Helper function for comparing CoreDescriptors for using in sets. -struct CoreDescriptorComparator { - bool operator()(const CoreDescriptor &x, const CoreDescriptor &y) const { - if (x.coord == y.coord) { - return x.type < y.type; - } else { - return x.coord < y.coord; - } - } -}; -#define CoreDescriptorSet set - static inline float bfloat16_to_float(uint16_t bfloat_val) { uint32_t uint32_data = ((uint32_t)bfloat_val) << 16; float f; @@ -87,49 +76,6 @@ static std::string GetRiscName(CoreType core_type, int hart_id) { return fmt::format("UNKNOWN_RISC_ID({})", hart_id); } -static inline uint64_t GetBaseAddr(Device *device, const CoreCoord &phys_core, int hart_id) { - - dprint_buf_msg_t *buf = device->get_dev_addr(phys_core, HalMemAddrType::DPRINT); - - return reinterpret_cast(buf->data[hart_id]); -} - -static inline int GetNumRiscs(const CoreDescriptor &core) { - return (core.type == CoreType::ETH)? DPRINT_NRISCVS_ETH : DPRINT_NRISCVS; -} - -// Helper function to get all (logical) printable cores on a device -static CoreDescriptorSet get_all_printable_cores(Device *device) { - CoreDescriptorSet all_printable_cores; - // The set of all printable cores is Tensix + Eth cores - CoreCoord logical_grid_size = device->logical_grid_size(); - for (uint32_t x = 0; x < logical_grid_size.x; x++) { - for (uint32_t y = 0; y < logical_grid_size.y; y++) { - all_printable_cores.insert({{x, y}, CoreType::WORKER}); - } - } - for (const auto& logical_core : device->get_active_ethernet_cores()) { - all_printable_cores.insert({logical_core, CoreType::ETH}); - } - for (const auto& logical_core : device->get_inactive_ethernet_cores()) { - all_printable_cores.insert({logical_core, CoreType::ETH}); - } - - return all_printable_cores; -} - -// Helper function to get all (logical) printable cores that are used for dispatch. Should be a subset of -// get_all_printable_cores(). -static CoreDescriptorSet get_dispatch_printable_cores(Device* device) { - CoreDescriptorSet printable_dispatch_cores; - unsigned num_cqs = tt::llrt::OptionsG.get_num_hw_cqs(); - CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(device->id()); - for (auto logical_core : tt::get_logical_dispatch_cores(device->id(), num_cqs, dispatch_core_type)) { - printable_dispatch_cores.insert({logical_core, dispatch_core_type}); - } - return printable_dispatch_cores; -} - // A null stream for when the print server is muted. class NullBuffer : public std::streambuf { public: @@ -195,8 +141,6 @@ struct DebugPrintServerContext { std::ofstream* outfile_ = nullptr; // non-cout std::ostream* stream_ = nullptr; // either == outfile_ or is &cout - std::ofstream* noc_log_ = nullptr; - std::map noc_xfer_counts; // For printing each riscs dprint to a separate file, a map from {device id, core coord x, y, hard index} to files. std::map, std::ofstream *> risc_to_stream_; @@ -369,7 +313,7 @@ static void PrintTypedUint32Array(ostream& stream, int setwidth, uint32_t raw_el // Used for debug print server startup sequence. void WriteInitMagic(Device *device, const CoreCoord& phys_core, int hart_id, bool enabled) { // compute the buffer address for the requested hart - uint64_t base_addr = GetBaseAddr(device, phys_core, hart_id); + uint64_t base_addr = GetDprintBufAddr(device, phys_core, hart_id); // TODO(AP): this could use a cleanup - need a different mechanism to know if a kernel is running on device. // Force wait for first kernel launch by first writing a non-zero and waiting for a zero. @@ -384,7 +328,7 @@ void WriteInitMagic(Device *device, const CoreCoord& phys_core, int hart_id, boo // Note that this is not a bulletproof way to bootstrap the print server (TODO(AP)) bool CheckInitMagicCleared(Device *device, const CoreCoord& phys_core, int hart_id) { // compute the buffer address for the requested hart - uint32_t base_addr = GetBaseAddr(device, phys_core, hart_id); + uint32_t base_addr = GetDprintBufAddr(device, phys_core, hart_id); vector initbuf = { DEBUG_PRINT_SERVER_STARTING_MAGIC }; auto result = tt::llrt::read_hex_vec_from_core(device->id(), phys_core, base_addr, 4); @@ -414,7 +358,6 @@ DebugPrintServerContext::DebugPrintServerContext() { outfile_ = new std::ofstream(file_name); } stream_ = outfile_ ? outfile_ : &cout; - noc_log_ = new std::ofstream("noc_log.csv"); stop_print_server_ = false; mute_print_server_ = false; @@ -447,10 +390,6 @@ DebugPrintServerContext::~DebugPrintServerContext() { key_and_stream.second->close(); delete key_and_stream.second; } - for (auto &size_and_count : noc_xfer_counts) - *noc_log_ << size_and_count.first << "," << size_and_count.second << "\n"; - noc_log_->close(); - delete noc_log_; inst = nullptr; } // ~DebugPrintServerContext @@ -480,15 +419,15 @@ void DebugPrintServerContext::AttachDevice(Device* device) { // A set of all valid printable cores, used for checking the user input. Note that the coords // here are physical. - CoreDescriptorSet all_printable_cores = get_all_printable_cores(device); - CoreDescriptorSet dispatch_printable_cores = get_dispatch_printable_cores(device); + CoreDescriptorSet all_cores = GetAllCores(device); + CoreDescriptorSet dispatch_cores = GetDispatchCores(device); // Initialize all print buffers on all cores on the device to have print disabled magic. We // will then write print enabled magic for only the cores the user has specified to monitor. // This way in the kernel code (dprint.h) we can detect whether the magic value is present and // skip prints entirely to prevent kernel code from hanging waiting for the print buffer to be // flushed from the host. - for (auto &logical_core : all_printable_cores) { + for (auto &logical_core : all_cores) { CoreCoord phys_core = device->physical_core_from_logical_core(logical_core); for (int hart_index = 0; hart_index < GetNumRiscs(logical_core); hart_index++) { WriteInitMagic(device, phys_core, hart_index, false); @@ -508,7 +447,7 @@ void DebugPrintServerContext::AttachDevice(Device* device) { if (tt::llrt::OptionsG.get_feature_all_cores(tt::llrt::RunTimeDebugFeatureDprint, core_type) == tt::llrt::RunTimeDebugClassAll) { // Print from all cores of the given type, cores returned here are guaranteed to be valid. - for (CoreDescriptor logical_core : all_printable_cores) { + for (CoreDescriptor logical_core : all_cores) { if (logical_core.type == core_type) print_cores_sanitized.push_back(logical_core); } @@ -520,7 +459,7 @@ void DebugPrintServerContext::AttachDevice(Device* device) { } else if ( tt::llrt::OptionsG.get_feature_all_cores(tt::llrt::RunTimeDebugFeatureDprint, core_type) == tt::llrt::RunTimeDebugClassDispatch) { - for (CoreDescriptor logical_core : dispatch_printable_cores) { + for (CoreDescriptor logical_core : dispatch_cores) { if (logical_core.type == core_type) print_cores_sanitized.push_back(logical_core); } @@ -533,8 +472,8 @@ void DebugPrintServerContext::AttachDevice(Device* device) { tt::llrt::OptionsG.get_feature_all_cores(tt::llrt::RunTimeDebugFeatureDprint, core_type) == tt::llrt::RunTimeDebugClassWorker) { // For worker cores, take all cores and remove dispatch cores. - for (CoreDescriptor logical_core : all_printable_cores) { - if (dispatch_printable_cores.find(logical_core) == dispatch_printable_cores.end()) { + for (CoreDescriptor logical_core : all_cores) { + if (dispatch_cores.find(logical_core) == dispatch_cores.end()) { if (logical_core.type == core_type) print_cores_sanitized.push_back(logical_core); } @@ -560,7 +499,7 @@ void DebugPrintServerContext::AttachDevice(Device* device) { } catch (std::runtime_error& error) { valid_logical_core = false; } - if (valid_logical_core && all_printable_cores.count({logical_core, core_type}) > 0) { + if (valid_logical_core && all_cores.count({logical_core, core_type}) > 0) { print_cores_sanitized.push_back({logical_core, core_type}); log_info( tt::LogMetal, @@ -632,7 +571,7 @@ void DebugPrintServerContext::DetachDevice(Device* device) { // Check if rpos < wpos, indicating unprocessed prints. constexpr int eightbytes = 8; - uint32_t base_addr = GetBaseAddr(device, phys_core, risc_id); + uint32_t base_addr = GetDprintBufAddr(device, phys_core, risc_id); auto from_dev = tt::llrt::read_hex_vec_from_core(chip_id, phys_core, base_addr, eightbytes); uint32_t wpos = from_dev[0], rpos = from_dev[1]; if (rpos < wpos) { @@ -654,8 +593,8 @@ void DebugPrintServerContext::DetachDevice(Device* device) { log_info(tt::LogMetal, "DPRINT Server dettached device {}", device->id()); // When detaching a device, disable prints on it. - CoreDescriptorSet all_printable_cores = get_all_printable_cores(device); - for (auto &logical_core : all_printable_cores) { + CoreDescriptorSet all_cores = GetAllCores(device); + for (auto &logical_core : all_cores) { CoreCoord phys_core = device->physical_core_from_logical_core(logical_core); for (int hart_index = 0; hart_index < GetNumRiscs(logical_core); hart_index++) { WriteInitMagic(device, phys_core, hart_index, false); @@ -691,7 +630,7 @@ bool DebugPrintServerContext::PeekOneHartNonBlocking( return false; // compute the buffer address for the requested hart - uint32_t base_addr = GetBaseAddr(device, phys_core, hart_id); + uint32_t base_addr = GetDprintBufAddr(device, phys_core, hart_id); chip_id_t chip_id = device->id(); // Device is incrementing wpos @@ -824,11 +763,6 @@ bool DebugPrintServerContext::PeekOneHartNonBlocking( stream << std::setprecision(*ptr); TT_ASSERT(sz == 1); break; - case DPrintNOC_LOG_XFER: - if (tt::llrt::OptionsG.get_dprint_noc_transfers()) - noc_xfer_counts[*reinterpret_cast(ptr)]++; - TT_ASSERT(sz == 4); - break; case DPrintFIXED: stream << std::fixed; TT_ASSERT(sz == 1); diff --git a/tt_metal/impl/debug/noc_logging.cpp b/tt_metal/impl/debug/noc_logging.cpp new file mode 100644 index 00000000000..2d07fa593fe --- /dev/null +++ b/tt_metal/impl/debug/noc_logging.cpp @@ -0,0 +1,106 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include "noc_logging.hpp" + +#include +#include +#include +//#include +#include + +#include "debug_helpers.hpp" +#include "hostdevcommon/dprint_common.h" +#include "tt_metal/impl/device/device.hpp" + +// 32 buckets to match the number of bits in uint32_t lengths on device +#define NOC_DATA_SIZE sizeof(uint32_t) * 8 +using noc_data_t = std::array; + +namespace tt { + +static string logfile_path = "generated/noc_data/"; +void PrintNocData(noc_data_t noc_data, string file_name) { + std::filesystem::path output_dir(tt::llrt::OptionsG.get_root_dir() + logfile_path); + std::filesystem::create_directories(output_dir); + std::string filename = tt::llrt::OptionsG.get_root_dir() + logfile_path + file_name; + std::ofstream outfile(filename); + + for (uint32_t idx = 0; idx < NOC_DATA_SIZE; idx++) { + uint64_t lower = 1UL << idx; + uint64_t upper = 1UL << (idx + 1); + outfile << fmt::format("[{},{}): {}\n", lower, upper, noc_data[idx]); + } + + outfile.close(); +} + +void DumpCoreNocData(Device *device, const CoreDescriptor &logical_core, noc_data_t &noc_data) { + CoreCoord phys_core = device->physical_core_from_logical_core(logical_core); + for (int risc_id = 0; risc_id < GetNumRiscs(logical_core); risc_id++) { + // Read out the DPRINT buffer, we stored our data in the "data field" + uint64_t addr = GetDprintBufAddr(device, phys_core, risc_id); + auto from_dev = tt::llrt::read_hex_vec_from_core(device->id(), phys_core, addr, DPRINT_BUFFER_SIZE); + DebugPrintMemLayout* l = reinterpret_cast(from_dev.data()); + uint32_t *data = reinterpret_cast(l->data); + + // Append the data for this core to existing data + for (int idx = 0; idx < NOC_DATA_SIZE; idx++) { + noc_data[idx] += data[idx]; + } + } +} + +void DumpDeviceNocData(Device *device, noc_data_t &noc_data, noc_data_t &dispatch_noc_data) { + // Need to treat dispatch cores and normal cores separately, so keep track of which cores are dispatch. + CoreDescriptorSet dispatch_cores = GetDispatchCores(device); + + // Now go through all cores on the device, and dump noc data for them. + CoreDescriptorSet all_cores = GetAllCores(device); + for (const CoreDescriptor &logical_core : all_cores) { + if (dispatch_cores.count(logical_core)) { + DumpCoreNocData(device, logical_core, dispatch_noc_data); + } else { + DumpCoreNocData(device, logical_core, noc_data); + } + } +} + +void DumpNocData(std::vector devices) { + // Skip if feature is not enabled + if (!tt::llrt::OptionsG.get_record_noc_transfers()) + return; + + noc_data_t noc_data = {}, dispatch_noc_data = {}; + for (Device *device : devices) { + log_info("Dumping noc data for Device {}...", device->id()); + DumpDeviceNocData(device, noc_data, dispatch_noc_data); + } + + PrintNocData(noc_data, "noc_data.txt"); + PrintNocData(dispatch_noc_data, "dispatch_noc_data.txt"); +} + +void ClearNocData(Device *device) { + // Skip if feature is not enabled + if (!tt::llrt::OptionsG.get_record_noc_transfers()) + return; + + // This feature is incomatible with dprint since they share memory space + TT_FATAL( + tt::llrt::OptionsG.get_feature_enabled(tt::llrt::RunTimeDebugFeatureDprint) == false, + "NOC transfer recording is incompatible with DPRINT"); + + CoreDescriptorSet all_cores = GetAllCores(device); + for (const CoreDescriptor &logical_core : all_cores) { + CoreCoord phys_core = device->physical_core_from_logical_core(logical_core); + for (int risc_id = 0; risc_id < GetNumRiscs(logical_core); risc_id++) { + uint64_t addr = GetDprintBufAddr(device, phys_core, risc_id); + vector initbuf = vector(DPRINT_BUFFER_SIZE / sizeof(uint32_t), 0); + tt::llrt::write_hex_vec_to_core(device->id(), phys_core, initbuf, addr); + } + } +} + +} // namespace tt diff --git a/tt_metal/impl/debug/noc_logging.hpp b/tt_metal/impl/debug/noc_logging.hpp new file mode 100644 index 00000000000..cfaa03957b8 --- /dev/null +++ b/tt_metal/impl/debug/noc_logging.hpp @@ -0,0 +1,12 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "tt_metal/impl/device/device.hpp" + +namespace tt { +void ClearNocData(Device *device); +void DumpNocData(std::vector devices); +} diff --git a/tt_metal/impl/device/device_pool.cpp b/tt_metal/impl/device/device_pool.cpp index 1c9b264c33e..9caf77961fc 100644 --- a/tt_metal/impl/device/device_pool.cpp +++ b/tt_metal/impl/device/device_pool.cpp @@ -159,6 +159,7 @@ void DevicePool::initialize_device(Device* dev) const { TT_ASSERT(dev->num_hw_cqs() == 1, "num_hw_cqs must be 1 in slow dispatch"); } + ClearNocData(dev); DprintServerAttach(dev); watcher_init(dev); diff --git a/tt_metal/impl/device/device_pool.hpp b/tt_metal/impl/device/device_pool.hpp index 3fcf99cbb60..7c8d7b0453a 100644 --- a/tt_metal/impl/device/device_pool.hpp +++ b/tt_metal/impl/device/device_pool.hpp @@ -5,6 +5,7 @@ #pragma once #include "impl/debug/dprint_server.hpp" +#include "impl/debug/noc_logging.hpp" #include "impl/debug/watcher_server.hpp" #include "tt_metal/impl/device/device.hpp" #include "tt_metal/third_party/umd/device/tt_cluster_descriptor.h" diff --git a/tt_metal/jit_build/build.cpp b/tt_metal/jit_build/build.cpp index 5a208e67993..e7450d2f5d5 100644 --- a/tt_metal/jit_build/build.cpp +++ b/tt_metal/jit_build/build.cpp @@ -108,6 +108,10 @@ void JitBuildEnv::init(uint32_t build_key, tt::ARCH arch) { this->defines_ += "-DDEBUG_PRINT_ENABLED "; } + if (tt::llrt::OptionsG.get_record_noc_transfers()) { + this->defines_ += "-DNOC_LOGGING_ENABLED "; + } + if (tt::llrt::OptionsG.get_kernels_nullified()) { this->defines_ += "-DDEBUG_NULL_KERNELS "; } diff --git a/tt_metal/llrt/rtoptions.cpp b/tt_metal/llrt/rtoptions.cpp index 15879dfa76a..00494a636b7 100644 --- a/tt_metal/llrt/rtoptions.cpp +++ b/tt_metal/llrt/rtoptions.cpp @@ -212,9 +212,9 @@ void RunTimeOptions::ParseFeatureEnv(RunTimeDebugFeatures feature) { if (core_type_and_cores.second.size() > 0) feature_targets[feature].enabled = true; - const char *print_noc_xfers = std::getenv("TT_METAL_DPRINT_NOC_TRANSFER_DATA"); + const char *print_noc_xfers = std::getenv("TT_METAL_RECORD_NOC_TRANSFER_DATA"); if (print_noc_xfers != nullptr) - dprint_noc_transfer_data = true; + record_noc_transfer_data = true; }; void RunTimeOptions::ParseFeatureCoreRange( diff --git a/tt_metal/llrt/rtoptions.hpp b/tt_metal/llrt/rtoptions.hpp index 4f2c1841a5a..8bcfb7acadb 100644 --- a/tt_metal/llrt/rtoptions.hpp +++ b/tt_metal/llrt/rtoptions.hpp @@ -98,7 +98,7 @@ class RunTimeOptions { bool watcher_append = false; bool watcher_auto_unpause = false; bool watcher_noinline = false; - bool dprint_noc_transfer_data = false; + bool record_noc_transfer_data = false; TargetSelection feature_targets[RunTimeDebugFeatureCount]; @@ -221,8 +221,8 @@ class RunTimeOptions { feature_targets[feature] = targets; } - inline bool get_dprint_noc_transfers() { return dprint_noc_transfer_data; } - inline void set_dprint_noc_transfers(bool val) { dprint_noc_transfer_data = val; } + inline bool get_record_noc_transfers() { return record_noc_transfer_data; } + inline void set_record_noc_transfers(bool val) { record_noc_transfer_data = val; } inline bool get_validate_kernel_binaries() { return validate_kernel_binaries; } inline void set_validate_kernel_binaries(bool val) { validate_kernel_binaries = val; } diff --git a/tt_metal/tools/watcher_dump/watcher_dump.cpp b/tt_metal/tools/watcher_dump/watcher_dump.cpp index 45d5bf63cac..1d56357824e 100644 --- a/tt_metal/tools/watcher_dump/watcher_dump.cpp +++ b/tt_metal/tools/watcher_dump/watcher_dump.cpp @@ -5,6 +5,7 @@ #include #include "tt_metal/host_api.hpp" #include "impl/debug/watcher_server.hpp" +#include "impl/debug/noc_logging.hpp" #include "impl/dispatch/debug_tools.hpp" using namespace tt; @@ -15,7 +16,14 @@ using std::vector; string output_dir_name = "generated/watcher/"; string logfile_name = "cq_dump.txt"; -void dump_data(vector& device_ids, bool dump_watcher, bool dump_cqs, bool dump_cqs_raw_data, int num_hw_cqs) { +void dump_data( + vector& device_ids, + bool dump_watcher, + bool dump_cqs, + bool dump_cqs_raw_data, + bool dump_noc_xfers, + bool eth_dispatch, + int num_hw_cqs) { // Don't clear L1, this way we can dump the state. llrt::OptionsG.set_clear_l1(false); @@ -33,13 +41,15 @@ void dump_data(vector& device_ids, bool dump_watcher, bool dump_cqs, b cout << "Dumping Watcher Log into: " << watcher_get_log_file_name() << endl; // Only look at user-specified devices + vector devices; for (unsigned id : device_ids) { string cq_fname = cq_dir.string() + fmt::format("device_{}_completion_q.txt", id); std::ofstream cq_file = std::ofstream(cq_fname); string iq_fname = cq_dir.string() + fmt::format("device_{}_issue_q.txt", id); std::ofstream iq_file = std::ofstream(iq_fname); // Minimal setup, since we'll be attaching to a potentially hanging chip. - auto* device = tt::tt_metal::CreateDeviceMinimal(id, num_hw_cqs, DispatchCoreType::WORKER); + Device* device = tt::tt_metal::CreateDeviceMinimal(id, num_hw_cqs, eth_dispatch ? DispatchCoreType::ETH : DispatchCoreType::WORKER); + devices.push_back(device); if (dump_cqs) { std::unique_ptr sysmem_manager = std::make_unique(id, num_hw_cqs); @@ -55,6 +65,10 @@ void dump_data(vector& device_ids, bool dump_watcher, bool dump_cqs, b watcher_read_kernel_ids_from_file(); watcher_dump(); } + + // Dump noc data if requested + if (dump_noc_xfers) + DumpNocData(devices); } void print_usage(const char* exec_name) { @@ -69,6 +83,10 @@ void print_usage(const char* exec_name) { cout << "\t-w, --dump-watcher: Dump watcher data, available data depends on whether watcher was enabled for " "original program." << endl; + cout << "\t--dump-noc-transfer-data: Dump NOC transfer data. Data is only available if previous run had " + "TT_METAL_RECORD_NOC_TRANSFER_DATA defined." + << endl; + cout << "\t--eth-dispatch: Assume eth dispatch, should match previous run." << endl; } int main(int argc, char* argv[]) { @@ -81,7 +99,7 @@ int main(int argc, char* argv[]) { } // Go through user args, handle accordingly. - bool dump_watcher = false, dump_cqs = false, dump_cqs_raw_data = false; + bool dump_watcher = false, dump_cqs = false, dump_cqs_raw_data = false, dump_noc_xfers = false, eth_dispatch = false; int num_hw_cqs = 1; for (int idx = 1; idx < argc; idx++) { string s(argv[idx]); @@ -116,6 +134,11 @@ int main(int argc, char* argv[]) { dump_cqs = true; } else if (s == "--dump-cqs-data") { dump_cqs_raw_data = true; + } else if (s == "--dump-noc-transfer-data") { + tt::llrt::OptionsG.set_record_noc_transfers(true); + dump_noc_xfers = true; + } else if (s == "--eth-dispatch") { + eth_dispatch = true; } else { cout << "Error: unrecognized command line argument: " << s << endl; print_usage(argv[0]); @@ -124,6 +147,6 @@ int main(int argc, char* argv[]) { } // Call dump function with user config. - dump_data(device_ids, dump_watcher, dump_cqs, dump_cqs_raw_data, num_hw_cqs); + dump_data(device_ids, dump_watcher, dump_cqs, dump_cqs_raw_data, dump_noc_xfers, eth_dispatch, num_hw_cqs); std::cout << "Watcher dump tool finished." << std::endl; }