Skip to content

Commit

Permalink
#7844: Re-implement NOC xfer size recording to be more efficient
Browse files Browse the repository at this point in the history
Now keeps histograms in L1, overloading dprint buffers, and is read out
using watcher dump tool. This way neither dprint nor watcher needs to be
active to measure noc transfers.
  • Loading branch information
tt-dma committed Oct 2, 2024
1 parent 9ef05d8 commit 595932a
Show file tree
Hide file tree
Showing 16 changed files with 273 additions and 107 deletions.
3 changes: 2 additions & 1 deletion tt_metal/hostdevcommon/dprint_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@ enum DebugPrintHartIndex : unsigned int {
DPRINT_PREFIX(WAIT) \
DPRINT_PREFIX(BFLOAT16) \
DPRINT_PREFIX(SETPRECISION) \
DPRINT_PREFIX(NOC_LOG_XFER) \
DPRINT_PREFIX(FIXED) \
DPRINT_PREFIX(DEFAULTFLOAT) \
DPRINT_PREFIX(HEX) \
Expand Down Expand Up @@ -121,3 +120,5 @@ enum TypedU32_ARRAY_Format {
};

static_assert(sizeof(DebugPrintMemLayout) == DPRINT_BUFFER_SIZE);
// We use DebugPrintMemLayout to hold noc xfer data, 32 buckets (one for each bit in noc xfer length field).
static_assert(sizeof(DebugPrintMemLayout().data) >= sizeof(uint32_t) * 8 * sizeof(uint32_t));
3 changes: 0 additions & 3 deletions tt_metal/hw/inc/debug/dprint.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,6 @@ struct HEX { char tmp; } ATTR_PACK; // Analog of cout << std::hex
struct OCT { char tmp; } ATTR_PACK; // Analog of cout << std::oct
struct DEC { char tmp; } ATTR_PACK; // Analog of cout << std::dec
struct SETW { char w; SETW(char w) : w(w) {} } ATTR_PACK; // Analog of cout << std::setw()
struct NOC_LOG_XFER { uint32_t size; NOC_LOG_XFER(uint32_t sz) : size(sz) {} } ATTR_PACK; // For tracking noc transactions.
struct U32_ARRAY {
uint32_t* ptr; uint32_t len;
U32_ARRAY(uint32_t* ptr, uint32_t len) : ptr(ptr), len(len) {}
Expand Down Expand Up @@ -142,7 +141,6 @@ template<> uint8_t DebugPrintTypeToId<RAISE>() { return DPrintRAISE; }
template<> uint8_t DebugPrintTypeToId<WAIT>() { return DPrintWAIT; }
template<> uint8_t DebugPrintTypeToId<BF16>() { return DPrintBFLOAT16; }
template<> uint8_t DebugPrintTypeToId<SETPRECISION>() { return DPrintSETPRECISION; }
template<> uint8_t DebugPrintTypeToId<NOC_LOG_XFER>() { return DPrintNOC_LOG_XFER; }
template<> uint8_t DebugPrintTypeToId<FIXED>() { return DPrintFIXED; }
template<> uint8_t DebugPrintTypeToId<DEFAULTFLOAT>() { return DPrintDEFAULTFLOAT; }
template<> uint8_t DebugPrintTypeToId<HEX>() { return DPrintHEX; }
Expand Down Expand Up @@ -297,7 +295,6 @@ template DebugPrinter operator<< <HEX>(DebugPrinter, HEX val);
template DebugPrinter operator<< <OCT>(DebugPrinter, OCT val);
template DebugPrinter operator<< <DEC>(DebugPrinter, DEC val);
template DebugPrinter operator<< <SETPRECISION>(DebugPrinter, SETPRECISION val);
template DebugPrinter operator<< <NOC_LOG_XFER>(DebugPrinter, NOC_LOG_XFER val);
template DebugPrinter operator<< <BF16>(DebugPrinter, BF16 val);
template DebugPrinter operator<< <F32>(DebugPrinter, F32 val);
template DebugPrinter operator<< <U32>(DebugPrinter, U32 val);
Expand Down
2 changes: 2 additions & 0 deletions tt_metal/hw/inc/debug/dprint_buffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
#include "tt_metal/hostdevcommon/dprint_common.h"
#include <dev_msgs.h>

#include "hostdevcommon/dprint_common.h"

// Returns the buffer address for current thread+core. Differs for NC/BR/ER/TR0-2.
inline uint8_t* get_debug_print_buffer() {
#if defined(COMPILE_FOR_NCRISC)
Expand Down
30 changes: 30 additions & 0 deletions tt_metal/hw/inc/debug/noc_logging.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

#pragma once
#include "dprint_buffer.h"

// Add option to skip noc logging for certain cores via a define.
#if defined(NOC_LOGGING_ENABLED) && !defined(SKIP_NOC_LOGGING)
void log_noc_xfer(uint32_t len) {
// Hijack print buffer for noc logging data.
volatile tt_l1_ptr uint32_t *buf_ptr =
(volatile tt_l1_ptr uint32_t *)(reinterpret_cast<DebugPrintMemLayout *>(get_debug_print_buffer())->data);

int highest_bit_position = 0;
while (len >>= 1) highest_bit_position++;

buf_ptr[highest_bit_position]++;
}

#define LOG_LEN(l) log_noc_xfer(l);
#define LOG_READ_LEN_FROM_STATE(noc_id) LOG_LEN(NOC_CMD_BUF_READ_REG(noc_id, NCRISC_RD_CMD_BUF, NOC_AT_LEN_BE));
#define LOG_WRITE_LEN_FROM_STATE(noc_id) LOG_LEN(NOC_CMD_BUF_READ_REG(noc_id, NCRISC_WR_CMD_BUF, NOC_AT_LEN_BE));

#else

#define LOG_LEN(l)
#define LOG_READ_LEN_FROM_STATE(noc_id)
#define LOG_WRITE_LEN_FROM_STATE(noc_id)
#endif
15 changes: 2 additions & 13 deletions tt_metal/hw/inc/debug/sanitize_noc.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,19 +14,8 @@
//
#pragma once

#include "dprint.h"

// Add the ability to skip NOC logging, we can't have the tunneling cores stalling waiting for the
// print server.
#if !defined(SKIP_NOC_LOGGING)
#define LOG_LEN(l) DPRINT << NOC_LOG_XFER(l);
#define LOG_READ_LEN_FROM_STATE(noc_id) LOG_LEN(NOC_CMD_BUF_READ_REG(noc_id, NCRISC_RD_CMD_BUF, NOC_AT_LEN_BE));
#define LOG_WRITE_LEN_FROM_STATE(noc_id) LOG_LEN(NOC_CMD_BUF_READ_REG(noc_id, NCRISC_WR_CMD_BUF, NOC_AT_LEN_BE));
#else
#define LOG_LEN(l)
#define LOG_READ_LEN_FROM_STATE(noc_id)
#define LOG_WRITE_LEN_FROM_STATE(noc_id)
#endif
// NOC logging enabled independently of watcher, need to include it here because it hooks into DEBUG_SANITIZE_NOC_*
#include "noc_logging.h"

#if ( \
defined(COMPILE_FOR_BRISC) || defined(COMPILE_FOR_NCRISC) || defined(COMPILE_FOR_ERISC) || \
Expand Down
1 change: 1 addition & 0 deletions tt_metal/impl/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ set(IMPL_SRC
${CMAKE_CURRENT_SOURCE_DIR}/dispatch/worker_config_buffer.cpp
${CMAKE_CURRENT_SOURCE_DIR}/dispatch/data_collection.cpp
${CMAKE_CURRENT_SOURCE_DIR}/debug/dprint_server.cpp
${CMAKE_CURRENT_SOURCE_DIR}/debug/noc_logging.cpp
${CMAKE_CURRENT_SOURCE_DIR}/debug/watcher_server.cpp
${CMAKE_CURRENT_SOURCE_DIR}/debug/watcher_device_reader.cpp
${CMAKE_CURRENT_SOURCE_DIR}/trace/trace.cpp
Expand Down
65 changes: 65 additions & 0 deletions tt_metal/impl/debug/debug_helpers.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include <set>

#include "hostdevcommon/dprint_common.h"
#include "tt_metal/impl/device/device.hpp"

// Helper function for comparing CoreDescriptors for using in sets.
struct CoreDescriptorComparator {
bool operator()(const CoreDescriptor &x, const CoreDescriptor &y) const {
if (x.coord == y.coord) {
return x.type < y.type;
} else {
return x.coord < y.coord;
}
}
};
#define CoreDescriptorSet std::set<CoreDescriptor, CoreDescriptorComparator>

// Helper function to get CoreDescriptors for all debug-relevant cores on device.
static CoreDescriptorSet GetAllCores(Device *device) {
CoreDescriptorSet all_cores;
// The set of all printable cores is Tensix + Eth cores
CoreCoord logical_grid_size = device->logical_grid_size();
for (uint32_t x = 0; x < logical_grid_size.x; x++) {
for (uint32_t y = 0; y < logical_grid_size.y; y++) {
all_cores.insert({{x, y}, CoreType::WORKER});
}
}
for (const auto& logical_core : device->get_active_ethernet_cores()) {
all_cores.insert({logical_core, CoreType::ETH});
}
for (const auto& logical_core : device->get_inactive_ethernet_cores()) {
all_cores.insert({logical_core, CoreType::ETH});
}

return all_cores;
}

// Helper function to get CoreDescriptors for all cores that are used for dispatch. Should be a subset of
// GetAllCores().
static CoreDescriptorSet GetDispatchCores(Device* device) {
CoreDescriptorSet dispatch_cores;
unsigned num_cqs = device->num_hw_cqs();
CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(device->id());
tt::log_warning("Dispatch Core Type = {}", dispatch_core_type);
for (auto logical_core : tt::get_logical_dispatch_cores(device->id(), num_cqs, dispatch_core_type)) {
dispatch_cores.insert({logical_core, dispatch_core_type});
}
return dispatch_cores;
}

inline uint64_t GetDprintBufAddr(Device *device, const CoreCoord &phys_core, int risc_id) {

dprint_buf_msg_t *buf = device->get_dev_addr<dprint_buf_msg_t *>(phys_core, HalMemAddrType::DPRINT);
return reinterpret_cast<uint64_t>(buf->data[risc_id]);
}

inline int GetNumRiscs(const CoreDescriptor &core) {
return (core.type == CoreType::ETH)? DPRINT_NRISCVS_ETH : DPRINT_NRISCVS;
}
96 changes: 15 additions & 81 deletions tt_metal/impl/debug/dprint_server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include "tt_metal/common/logger.hpp"

#include "dprint_server.hpp"
#include "debug_helpers.hpp"
#include "llrt/tt_cluster.hpp"
#include "llrt/rtoptions.hpp"

Expand Down Expand Up @@ -43,18 +44,6 @@ namespace {

static string logfile_path = "generated/dprint/";

// Helper function for comparing CoreDescriptors for using in sets.
struct CoreDescriptorComparator {
bool operator()(const CoreDescriptor &x, const CoreDescriptor &y) const {
if (x.coord == y.coord) {
return x.type < y.type;
} else {
return x.coord < y.coord;
}
}
};
#define CoreDescriptorSet set<CoreDescriptor, CoreDescriptorComparator>

static inline float bfloat16_to_float(uint16_t bfloat_val) {
uint32_t uint32_data = ((uint32_t)bfloat_val) << 16;
float f;
Expand Down Expand Up @@ -87,49 +76,6 @@ static std::string GetRiscName(CoreType core_type, int hart_id) {
return fmt::format("UNKNOWN_RISC_ID({})", hart_id);
}

static inline uint64_t GetBaseAddr(Device *device, const CoreCoord &phys_core, int hart_id) {

dprint_buf_msg_t *buf = device->get_dev_addr<dprint_buf_msg_t *>(phys_core, HalMemAddrType::DPRINT);

return reinterpret_cast<uint64_t>(buf->data[hart_id]);
}

static inline int GetNumRiscs(const CoreDescriptor &core) {
return (core.type == CoreType::ETH)? DPRINT_NRISCVS_ETH : DPRINT_NRISCVS;
}

// Helper function to get all (logical) printable cores on a device
static CoreDescriptorSet get_all_printable_cores(Device *device) {
CoreDescriptorSet all_printable_cores;
// The set of all printable cores is Tensix + Eth cores
CoreCoord logical_grid_size = device->logical_grid_size();
for (uint32_t x = 0; x < logical_grid_size.x; x++) {
for (uint32_t y = 0; y < logical_grid_size.y; y++) {
all_printable_cores.insert({{x, y}, CoreType::WORKER});
}
}
for (const auto& logical_core : device->get_active_ethernet_cores()) {
all_printable_cores.insert({logical_core, CoreType::ETH});
}
for (const auto& logical_core : device->get_inactive_ethernet_cores()) {
all_printable_cores.insert({logical_core, CoreType::ETH});
}

return all_printable_cores;
}

// Helper function to get all (logical) printable cores that are used for dispatch. Should be a subset of
// get_all_printable_cores().
static CoreDescriptorSet get_dispatch_printable_cores(Device* device) {
CoreDescriptorSet printable_dispatch_cores;
unsigned num_cqs = tt::llrt::OptionsG.get_num_hw_cqs();
CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(device->id());
for (auto logical_core : tt::get_logical_dispatch_cores(device->id(), num_cqs, dispatch_core_type)) {
printable_dispatch_cores.insert({logical_core, dispatch_core_type});
}
return printable_dispatch_cores;
}

// A null stream for when the print server is muted.
class NullBuffer : public std::streambuf {
public:
Expand Down Expand Up @@ -195,8 +141,6 @@ struct DebugPrintServerContext {

std::ofstream* outfile_ = nullptr; // non-cout
std::ostream* stream_ = nullptr; // either == outfile_ or is &cout
std::ofstream* noc_log_ = nullptr;
std::map<uint32_t, uint32_t> noc_xfer_counts;

// For printing each riscs dprint to a separate file, a map from {device id, core coord x, y, hard index} to files.
std::map<std::tuple<uint32_t, uint32_t, uint32_t, uint32_t>, std::ofstream *> risc_to_stream_;
Expand Down Expand Up @@ -369,7 +313,7 @@ static void PrintTypedUint32Array(ostream& stream, int setwidth, uint32_t raw_el
// Used for debug print server startup sequence.
void WriteInitMagic(Device *device, const CoreCoord& phys_core, int hart_id, bool enabled) {
// compute the buffer address for the requested hart
uint64_t base_addr = GetBaseAddr(device, phys_core, hart_id);
uint64_t base_addr = GetDprintBufAddr(device, phys_core, hart_id);

// TODO(AP): this could use a cleanup - need a different mechanism to know if a kernel is running on device.
// Force wait for first kernel launch by first writing a non-zero and waiting for a zero.
Expand All @@ -384,7 +328,7 @@ void WriteInitMagic(Device *device, const CoreCoord& phys_core, int hart_id, boo
// Note that this is not a bulletproof way to bootstrap the print server (TODO(AP))
bool CheckInitMagicCleared(Device *device, const CoreCoord& phys_core, int hart_id) {
// compute the buffer address for the requested hart
uint32_t base_addr = GetBaseAddr(device, phys_core, hart_id);
uint32_t base_addr = GetDprintBufAddr(device, phys_core, hart_id);

vector<uint32_t> initbuf = { DEBUG_PRINT_SERVER_STARTING_MAGIC };
auto result = tt::llrt::read_hex_vec_from_core(device->id(), phys_core, base_addr, 4);
Expand Down Expand Up @@ -414,7 +358,6 @@ DebugPrintServerContext::DebugPrintServerContext() {
outfile_ = new std::ofstream(file_name);
}
stream_ = outfile_ ? outfile_ : &cout;
noc_log_ = new std::ofstream("noc_log.csv");

stop_print_server_ = false;
mute_print_server_ = false;
Expand Down Expand Up @@ -447,10 +390,6 @@ DebugPrintServerContext::~DebugPrintServerContext() {
key_and_stream.second->close();
delete key_and_stream.second;
}
for (auto &size_and_count : noc_xfer_counts)
*noc_log_ << size_and_count.first << "," << size_and_count.second << "\n";
noc_log_->close();
delete noc_log_;
inst = nullptr;
} // ~DebugPrintServerContext

Expand Down Expand Up @@ -480,15 +419,15 @@ void DebugPrintServerContext::AttachDevice(Device* device) {

// A set of all valid printable cores, used for checking the user input. Note that the coords
// here are physical.
CoreDescriptorSet all_printable_cores = get_all_printable_cores(device);
CoreDescriptorSet dispatch_printable_cores = get_dispatch_printable_cores(device);
CoreDescriptorSet all_cores = GetAllCores(device);
CoreDescriptorSet dispatch_cores = GetDispatchCores(device);

// Initialize all print buffers on all cores on the device to have print disabled magic. We
// will then write print enabled magic for only the cores the user has specified to monitor.
// This way in the kernel code (dprint.h) we can detect whether the magic value is present and
// skip prints entirely to prevent kernel code from hanging waiting for the print buffer to be
// flushed from the host.
for (auto &logical_core : all_printable_cores) {
for (auto &logical_core : all_cores) {
CoreCoord phys_core = device->physical_core_from_logical_core(logical_core);
for (int hart_index = 0; hart_index < GetNumRiscs(logical_core); hart_index++) {
WriteInitMagic(device, phys_core, hart_index, false);
Expand All @@ -508,7 +447,7 @@ void DebugPrintServerContext::AttachDevice(Device* device) {
if (tt::llrt::OptionsG.get_feature_all_cores(tt::llrt::RunTimeDebugFeatureDprint, core_type) ==
tt::llrt::RunTimeDebugClassAll) {
// Print from all cores of the given type, cores returned here are guaranteed to be valid.
for (CoreDescriptor logical_core : all_printable_cores) {
for (CoreDescriptor logical_core : all_cores) {
if (logical_core.type == core_type)
print_cores_sanitized.push_back(logical_core);
}
Expand All @@ -520,7 +459,7 @@ void DebugPrintServerContext::AttachDevice(Device* device) {
} else if (
tt::llrt::OptionsG.get_feature_all_cores(tt::llrt::RunTimeDebugFeatureDprint, core_type) ==
tt::llrt::RunTimeDebugClassDispatch) {
for (CoreDescriptor logical_core : dispatch_printable_cores) {
for (CoreDescriptor logical_core : dispatch_cores) {
if (logical_core.type == core_type)
print_cores_sanitized.push_back(logical_core);
}
Expand All @@ -533,8 +472,8 @@ void DebugPrintServerContext::AttachDevice(Device* device) {
tt::llrt::OptionsG.get_feature_all_cores(tt::llrt::RunTimeDebugFeatureDprint, core_type) ==
tt::llrt::RunTimeDebugClassWorker) {
// For worker cores, take all cores and remove dispatch cores.
for (CoreDescriptor logical_core : all_printable_cores) {
if (dispatch_printable_cores.find(logical_core) == dispatch_printable_cores.end()) {
for (CoreDescriptor logical_core : all_cores) {
if (dispatch_cores.find(logical_core) == dispatch_cores.end()) {
if (logical_core.type == core_type)
print_cores_sanitized.push_back(logical_core);
}
Expand All @@ -560,7 +499,7 @@ void DebugPrintServerContext::AttachDevice(Device* device) {
} catch (std::runtime_error& error) {
valid_logical_core = false;
}
if (valid_logical_core && all_printable_cores.count({logical_core, core_type}) > 0) {
if (valid_logical_core && all_cores.count({logical_core, core_type}) > 0) {
print_cores_sanitized.push_back({logical_core, core_type});
log_info(
tt::LogMetal,
Expand Down Expand Up @@ -632,7 +571,7 @@ void DebugPrintServerContext::DetachDevice(Device* device) {

// Check if rpos < wpos, indicating unprocessed prints.
constexpr int eightbytes = 8;
uint32_t base_addr = GetBaseAddr(device, phys_core, risc_id);
uint32_t base_addr = GetDprintBufAddr(device, phys_core, risc_id);
auto from_dev = tt::llrt::read_hex_vec_from_core(chip_id, phys_core, base_addr, eightbytes);
uint32_t wpos = from_dev[0], rpos = from_dev[1];
if (rpos < wpos) {
Expand All @@ -654,8 +593,8 @@ void DebugPrintServerContext::DetachDevice(Device* device) {
log_info(tt::LogMetal, "DPRINT Server dettached device {}", device->id());

// When detaching a device, disable prints on it.
CoreDescriptorSet all_printable_cores = get_all_printable_cores(device);
for (auto &logical_core : all_printable_cores) {
CoreDescriptorSet all_cores = GetAllCores(device);
for (auto &logical_core : all_cores) {
CoreCoord phys_core = device->physical_core_from_logical_core(logical_core);
for (int hart_index = 0; hart_index < GetNumRiscs(logical_core); hart_index++) {
WriteInitMagic(device, phys_core, hart_index, false);
Expand Down Expand Up @@ -691,7 +630,7 @@ bool DebugPrintServerContext::PeekOneHartNonBlocking(
return false;

// compute the buffer address for the requested hart
uint32_t base_addr = GetBaseAddr(device, phys_core, hart_id);
uint32_t base_addr = GetDprintBufAddr(device, phys_core, hart_id);
chip_id_t chip_id = device->id();

// Device is incrementing wpos
Expand Down Expand Up @@ -824,11 +763,6 @@ bool DebugPrintServerContext::PeekOneHartNonBlocking(
stream << std::setprecision(*ptr);
TT_ASSERT(sz == 1);
break;
case DPrintNOC_LOG_XFER:
if (tt::llrt::OptionsG.get_dprint_noc_transfers())
noc_xfer_counts[*reinterpret_cast<uint32_t*>(ptr)]++;
TT_ASSERT(sz == 4);
break;
case DPrintFIXED:
stream << std::fixed;
TT_ASSERT(sz == 1);
Expand Down
Loading

0 comments on commit 595932a

Please sign in to comment.