From e891e9896ddfc33cc2257f83eac79f17be8bed48 Mon Sep 17 00:00:00 2001 From: Nick Smith <127986401+nsmithtt@users.noreply.github.com> Date: Mon, 16 Sep 2024 08:26:55 -0700 Subject: [PATCH] Better kernel debug support (#691) - Introduce debug singleton object used to hold debug environment information during runtime. This could be a good place to store new kinds of debug flags in the future for runtime. - Wire it up through ttrt. - loadKernelsFromDisk added to denote reloading of generated kernels from /tmp, from previous run, instead of loading them from the flatbuffer. - Name the kernels with program and location info from the MLIR graph. Now names look like /tmp/ttmlir_multiply_%5_tensix.cpp ^ ^ ^ Func name -/ | | | | Result Value Loc -/ | | Thread type/info [noc, eth, tensix] -/ --- runtime/CMakeLists.txt | 8 ++ runtime/include/tt/runtime/detail/debug.h | 41 ++++++++++ runtime/lib/CMakeLists.txt | 4 +- runtime/lib/common/CMakeLists.txt | 6 ++ runtime/lib/common/debug.cpp | 18 ++++ runtime/lib/ttmetal/command_queue.cpp | 82 +++++++++++++++++-- runtime/tools/python/CMakeLists.txt | 1 + runtime/tools/python/setup.py | 9 +- runtime/tools/python/ttrt/common/api.py | 11 +++ runtime/tools/python/ttrt/runtime/__init__.py | 1 + runtime/tools/python/ttrt/runtime/module.cpp | 11 +++ 11 files changed, 183 insertions(+), 9 deletions(-) create mode 100644 runtime/include/tt/runtime/detail/debug.h create mode 100644 runtime/lib/common/debug.cpp diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt index b42168d83..0dd1cd8d9 100644 --- a/runtime/CMakeLists.txt +++ b/runtime/CMakeLists.txt @@ -2,6 +2,14 @@ option(TTMLIR_ENABLE_RUNTIME_TESTS "Enable runtime tests" OFF) option(TT_RUNTIME_ENABLE_TTNN "Enable TTNN Runtime" ON) option(TT_RUNTIME_ENABLE_TTMETAL "Enable TTMetal Runtime" ON) +option(TT_RUNTIME_DEBUG "Enable debug tools in runtime" OFF) + +if (CMAKE_BUILD_TYPE STREQUAL "Debug") + set(TT_RUNTIME_DEBUG ON) +endif() +if (TT_RUNTIME_DEBUG) + add_definitions(-DTT_RUNTIME_DEBUG) +endif() add_subdirectory(lib) add_subdirectory(tools) diff --git a/runtime/include/tt/runtime/detail/debug.h b/runtime/include/tt/runtime/detail/debug.h new file mode 100644 index 000000000..7d98beced --- /dev/null +++ b/runtime/include/tt/runtime/detail/debug.h @@ -0,0 +1,41 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef TT_RUNTIME_DETAIL_DEBUG_H +#define TT_RUNTIME_DETAIL_DEBUG_H + +#include + +namespace tt::runtime::debug { + +struct Env { +#if defined(TT_RUNTIME_DEBUG) && TT_RUNTIME_DEBUG == 1 + static Env const & +#else + constexpr static Env +#endif + get(bool loadKernelsFromDisk = false) +#if defined(TT_RUNTIME_DEBUG) && TT_RUNTIME_DEBUG == 1 + ; +#else + { + return Env(false); + } +#endif + + bool loadKernelsFromDisk; + +private: + constexpr Env(bool loadKernelsFromDisk) + : loadKernelsFromDisk(loadKernelsFromDisk) {} +}; + +inline std::ostream &operator<<(std::ostream &os, Env const &env) { + os << "Env{loadKernelsFromDisk=" << env.loadKernelsFromDisk << "}"; + return os; +} + +} // namespace tt::runtime::debug + +#endif // TT_RUNTIME_DETAIL_DEBUG_H diff --git a/runtime/lib/CMakeLists.txt b/runtime/lib/CMakeLists.txt index 1792f24bf..d1fbe83a2 100644 --- a/runtime/lib/CMakeLists.txt +++ b/runtime/lib/CMakeLists.txt @@ -28,6 +28,7 @@ if (TTMLIR_ENABLE_RUNTIME AND (TT_RUNTIME_ENABLE_TTNN OR TT_RUNTIME_ENABLE_TTMET add_subdirectory(common) else() add_library(TTRuntimeSysDesc INTERFACE) + add_library(TTRuntimeDebug INTERFACE) endif() add_library(TTRuntime STATIC runtime.cpp) @@ -51,6 +52,7 @@ target_link_libraries(TTRuntime TTRuntimeSysDesc TTRuntimeTTNN TTRuntimeTTMetal + TTRuntimeDebug ) -add_dependencies(TTRuntime TTBinary TTRuntimeSysDesc FBS_GENERATION) +add_dependencies(TTRuntime TTBinary TTRuntimeSysDesc TTRuntimeDebug FBS_GENERATION) diff --git a/runtime/lib/common/CMakeLists.txt b/runtime/lib/common/CMakeLists.txt index 05c386393..96a2ae73e 100644 --- a/runtime/lib/common/CMakeLists.txt +++ b/runtime/lib/common/CMakeLists.txt @@ -7,3 +7,9 @@ target_include_directories(TTRuntimeSysDesc ) target_include_directories(TTRuntimeSysDesc PUBLIC "$") add_dependencies(TTRuntimeSysDesc tt-metal FBS_GENERATION) + +add_library(TTRuntimeDebug STATIC debug.cpp) +target_include_directories(TTRuntimeDebug + PUBLIC + ${PROJECT_SOURCE_DIR}/runtime/include +) diff --git a/runtime/lib/common/debug.cpp b/runtime/lib/common/debug.cpp new file mode 100644 index 000000000..b07d60901 --- /dev/null +++ b/runtime/lib/common/debug.cpp @@ -0,0 +1,18 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#include "tt/runtime/detail/debug.h" + +#if defined(TT_RUNTIME_DEBUG) && TT_RUNTIME_DEBUG == 1 + +namespace tt::runtime::debug { + +Env const &Env::get(bool loadKernelsFromDisk) { + static Env config(loadKernelsFromDisk); + return config; +} + +} // namespace tt::runtime::debug + +#endif diff --git a/runtime/lib/ttmetal/command_queue.cpp b/runtime/lib/ttmetal/command_queue.cpp index 95c7d6c4a..eae647cc4 100644 --- a/runtime/lib/ttmetal/command_queue.cpp +++ b/runtime/lib/ttmetal/command_queue.cpp @@ -4,6 +4,7 @@ #include +#include "tt/runtime/detail/debug.h" #include "tt/runtime/detail/ttmetal.h" #include "tt/runtime/runtime.h" #include "tt/runtime/utils.h" @@ -21,6 +22,7 @@ struct CQExecutor { std::unordered_map> events; ::tt::tt_metal::CommandQueue *cq; + char const *currentProgramName; CQExecutor(::tt::tt_metal::Device *device, std::size_t cq_id, std::vector const &inputs, @@ -29,7 +31,8 @@ struct CQExecutor { std::shared_ptr<::tt::tt_metal::Event> execute(::tt::target::metal::CommandQueue const *commandQueue); void execute(::tt::target::metal::Command const *command); - void execute(::tt::target::metal::EnqueueProgramCommand const *command); + void execute(::tt::target::metal::EnqueueProgramCommand const *command, + char const *debugInfo); void execute(::tt::target::metal::EnqueueWriteBufferCommand const *command); void execute(::tt::target::metal::EnqueueReadBufferCommand const *command); void execute(::tt::target::metal::CreateBufferCommand const *command); @@ -64,6 +67,8 @@ CQExecutor::CQExecutor(::tt::tt_metal::Device *device, std::size_t cq_id, std::shared_ptr<::tt::tt_metal::Event> CQExecutor::execute(::tt::target::metal::CommandQueue const *commandQueue) { + currentProgramName = commandQueue->name()->c_str(); + for (auto const &event : initEvents) { ::tt::tt_metal::EnqueueWaitForEvent(*cq, event); } @@ -83,7 +88,8 @@ CQExecutor::execute(::tt::target::metal::CommandQueue const *commandQueue) { void CQExecutor::execute(::tt::target::metal::Command const *command) { switch (command->type_type()) { case ::tt::target::metal::CommandType::EnqueueProgramCommand: { - execute(command->type_as_EnqueueProgramCommand()); + execute(command->type_as_EnqueueProgramCommand(), + command->debug_info()->c_str()); break; } case ::tt::target::metal::CommandType::EnqueueWriteBufferCommand: { @@ -132,8 +138,73 @@ void CQExecutor::execute(::tt::target::metal::Command const *command) { } } +static char const * +kernelSourceTypeString(::tt::target::metal::KernelSource const *kernelSource) { + switch (kernelSource->config_type()) { + case ::tt::target::metal::KernelConfig::NONE: { + break; + } + case ::tt::target::metal::KernelConfig::NocConfig: { + switch (kernelSource->config_as_NocConfig()->noc_index()) { + case tt::target::metal::NocIndex::Noc0: { + return "noc0"; + } + case tt::target::metal::NocIndex::Noc1: { + return "noc1"; + } + } + } + case ::tt::target::metal::KernelConfig::EthernetConfig: { + switch (kernelSource->config_as_EthernetConfig()->eth_type()) { + case tt::target::metal::EthType::Sender: { + return "ethSender"; + } + case tt::target::metal::EthType::Receiver: { + return "ethReceiver"; + } + } + } + case ::tt::target::metal::KernelConfig::TensixConfig: { + return "tensix"; + } + } + return "unknown"; +} + +static std::string parseLocFromDebugInfo(char const *programDebugInfo) { + if (!programDebugInfo) { + static int gUnknownId = 0; + return std::string("%unknown") + std::to_string(gUnknownId++); + } + std::string debugInfo(programDebugInfo); + std::size_t pos = debugInfo.find_first_of(' '); + if (pos == std::string::npos) { + return debugInfo; + } + return debugInfo.substr(0, pos); +} + +static std::string createKernelFilePath( + char const *currentProgramName, char const *programDebugInfo, + ::tt::target::metal::KernelSource const *kernelSource, + char const *prefix = "/tmp/ttmlir_", char const *extention = ".cpp") { + std::string path(prefix); + path += currentProgramName; + path += "_"; + path += parseLocFromDebugInfo(programDebugInfo); + path += "_"; + path += kernelSourceTypeString(kernelSource); + path += extention; + return path; +} + static void writeFile(std::string const &fileName, char const *data, std::size_t size) { + if (debug::Env::get().loadKernelsFromDisk) { + std::ifstream file(fileName); + assert(file.is_open() && "Kernel file not found"); + return; + } std::ofstream file(fileName); file.write(data, size); file.close(); @@ -313,9 +384,8 @@ static void processRuntimeArgs( } void CQExecutor::execute( - ::tt::target::metal::EnqueueProgramCommand const *command) { - static int gKernelId = 0; - + ::tt::target::metal::EnqueueProgramCommand const *command, + char const *debugInfo) { ::tt::tt_metal::Program program = ::tt::tt_metal::CreateProgram(); for (::tt::target::metal::KernelDesc const *kernelDesc : @@ -326,7 +396,7 @@ void CQExecutor::execute( // We need a new API to create a kernel from source string, or directly from // binary std::string fileName = - "/tmp/ttmlir_" + std::to_string(gKernelId++) + ".cpp"; + createKernelFilePath(currentProgramName, debugInfo, kernelSource); writeFile(fileName, kernelSource->source()->c_str(), kernelSource->source()->size()); CoreRangeSet coreRange = toCoreRangeSet(kernelDesc->core_range_set()); diff --git a/runtime/tools/python/CMakeLists.txt b/runtime/tools/python/CMakeLists.txt index e58bec029..4ee29ec19 100644 --- a/runtime/tools/python/CMakeLists.txt +++ b/runtime/tools/python/CMakeLists.txt @@ -10,6 +10,7 @@ add_custom_target(ttrt TT_RUNTIME_ENABLE_TTNN=${TT_RUNTIME_ENABLE_TTNN} TT_RUNTIME_ENABLE_TTMETAL=${TT_RUNTIME_ENABLE_TTMETAL} TT_RUNTIME_ENABLE_PERF_TRACE=${TT_RUNTIME_ENABLE_PERF_TRACE} + TT_RUNTIME_DEBUG=${TT_RUNTIME_DEBUG} TTMLIR_VERSION_MAJOR=${TTMLIR_VERSION_MAJOR} TTMLIR_VERSION_MINOR=${TTMLIR_VERSION_MINOR} TTMLIR_VERSION_PATCH=${TTMLIR_VERSION_PATCH} diff --git a/runtime/tools/python/setup.py b/runtime/tools/python/setup.py index 6f3f701a3..98ed63833 100644 --- a/runtime/tools/python/setup.py +++ b/runtime/tools/python/setup.py @@ -27,6 +27,7 @@ enable_ttnn = os.environ.get("TT_RUNTIME_ENABLE_TTNN", "OFF") == "ON" enable_ttmetal = os.environ.get("TT_RUNTIME_ENABLE_TTMETAL", "OFF") == "ON" enable_perf = os.environ.get("TT_RUNTIME_ENABLE_PERF_TRACE", "OFF") == "ON" +debug_runtime = os.environ.get("TT_RUNTIME_DEBUG", "OFF") == "ON" ext_modules = [ Pybind11Extension( @@ -65,6 +66,7 @@ if enable_ttnn or enable_ttmetal: runlibs += ["libdevice.so", "libnng.so.1", "libuv.so.1"] + linklibs += ["TTRuntimeDebug"] if enable_perf: runlibs += ["libtracy.so.0.10.0"] @@ -139,7 +141,7 @@ def package_files(directory): paths = [] - for (path, directories, filenames) in os.walk(directory): + for path, directories, filenames in os.walk(directory): for filename in filenames: paths.append(os.path.join("..", path, filename)) return paths @@ -182,7 +184,10 @@ def package_files(directory): f"{src_dir}/build/runtime/tools/python/ttrt/runtime", f"{metaldir}/lib", ], - define_macros=[("VERSION_INFO", __version__)], + define_macros=[ + ("VERSION_INFO", __version__), + ("TT_RUNTIME_DEBUG", "1" if debug_runtime else "0"), + ], ) ) diff --git a/runtime/tools/python/ttrt/common/api.py b/runtime/tools/python/ttrt/common/api.py index 0e536606e..3a12f2652 100644 --- a/runtime/tools/python/ttrt/common/api.py +++ b/runtime/tools/python/ttrt/common/api.py @@ -212,6 +212,14 @@ def initialize_apis(): help="--save-artifacts flag must be set, provides a directory path to save artifacts to", api_only=False, ) + API.Run.register_arg( + name="--load-kernels-from-disk", + type=bool, + default=False, + choices=[True, False], + help="Pickup the kernels from disk (/tmp) instead of the flatbuffer", + api_only=False, + ) # register all perf arguments API.Perf.register_arg( @@ -838,6 +846,9 @@ def _execute(binaries): self.logging.warning(f"no binaries found to run - returning early") return + debug_env = ttrt.runtime.DebugEnv.get(self.load_kernels_from_disk) + self.logging.debug(f"setting tt runtime debug env={debug_env}") + self.logging.debug(f"setting torch manual seed={self['seed']}") torch.manual_seed(self["seed"]) ttrt.runtime.set_compatible_runtime(binaries[0].fbb) diff --git a/runtime/tools/python/ttrt/runtime/__init__.py b/runtime/tools/python/ttrt/runtime/__init__.py index 981315d2f..74800e057 100644 --- a/runtime/tools/python/ttrt/runtime/__init__.py +++ b/runtime/tools/python/ttrt/runtime/__init__.py @@ -9,6 +9,7 @@ Tensor, DataType, DeviceRuntime, + DebugEnv, get_current_runtime, set_compatible_runtime, get_current_system_desc, diff --git a/runtime/tools/python/ttrt/runtime/module.cpp b/runtime/tools/python/ttrt/runtime/module.cpp index 0aa5d84db..8d8f6601d 100644 --- a/runtime/tools/python/ttrt/runtime/module.cpp +++ b/runtime/tools/python/ttrt/runtime/module.cpp @@ -2,6 +2,9 @@ // // SPDX-License-Identifier: Apache-2.0 +#include + +#include "tt/runtime/detail/debug.h" #include "tt/runtime/runtime.h" #include @@ -64,4 +67,12 @@ PYBIND11_MODULE(_C, m) { py::arg("executable"), py::arg("program_index"), py::arg("inputs"), py::arg("outputs"), "Submit a binary for execution"); m.def("wait", &tt::runtime::wait, py::arg("event")); + + py::class_(m, "DebugEnv") + .def_static("get", &tt::runtime::debug::Env::get) + .def("__str__", [](const tt::runtime::debug::Env &env) { + std::stringstream os; + os << env; + return os.str(); + }); }