From 4a4aa10a61a8f033c0f61ce57f3525048b8f3e48 Mon Sep 17 00:00:00 2001 From: Almeet Bhullar Date: Sat, 14 Dec 2024 01:15:44 +0000 Subject: [PATCH] Add option to enable BH hw cache invalidation and keep it default disabled --- README.md | 2 +- .../BlackholeBringUpProgrammingGuide.md | 11 ++++---- tt_metal/hw/firmware/src/brisc.cc | 2 +- tt_metal/hw/firmware/src/idle_erisc.cc | 2 +- tt_metal/hw/firmware/src/ncrisc.cc | 2 +- tt_metal/hw/firmware/src/slave_idle_erisc.cc | 2 +- tt_metal/hw/firmware/src/trisc.cc | 2 +- tt_metal/hw/inc/risc_common.h | 26 +++++++++++++------ tt_metal/jit_build/build.cpp | 4 +++ tt_metal/llrt/rtoptions.cpp | 2 ++ tt_metal/llrt/rtoptions.hpp | 6 +++++ 11 files changed, 42 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index 5ad22f0b7c5..d33ed5665d6 100644 --- a/README.md +++ b/README.md @@ -120,7 +120,7 @@ Get started with [simple kernels](https://docs.tenstorrent.com/tt-metalium/lates - [CNNs on TT Architectures](./tech_reports/CNNs/ttcnn.md) (updated Sept 6th) - [Ethernet and Multichip Basics](./tech_reports/EthernetMultichip/BasicEthernetGuide.md) (Updated Sept 20th) - [Collective Communication Library (CCL)](./tech_reports/EthernetMultichip/CclDeveloperGuide.md) (Updated Sept 20th) -- [Blackhole Bring-Up Programming Guide](./tech_reports/Blackhole/BlackholeBringUpProgrammingGuide.md) (Updated Oct 30th) +- [Blackhole Bring-Up Programming Guide](./tech_reports/Blackhole/BlackholeBringUpProgrammingGuide.md) (Updated Dec 18th) ## TT-Metalium Programming Examples diff --git a/tech_reports/Blackhole/BlackholeBringUpProgrammingGuide.md b/tech_reports/Blackhole/BlackholeBringUpProgrammingGuide.md index 9086695e9e6..85e0c7c53e4 100644 --- a/tech_reports/Blackhole/BlackholeBringUpProgrammingGuide.md +++ b/tech_reports/Blackhole/BlackholeBringUpProgrammingGuide.md @@ -66,20 +66,21 @@ Information relevant to programming Blackhole while it is being brought up. ### L1 Data Cache -Blackhole added a data cache in L1. Writing an address on one core and reading it from another only requires the reader to invalidate if the address was previously read. +Blackhole added a small (4 x 16B cachelines) write-through data cache in L1. Writing an address on one core and reading it from another only requires the reader to invalidate if the address was previously read. -Invalidating the cache can be done via calls to `invalidate_l1_cache()` +Invalidating the cache can be done via calls to `invalidate_l1_cache()`. Hardware can clear the cache at some randomized time interval but this is slower than explicitly invalidating the cache. By default the hardware timeout is disabled but can be enabled by setting env var `TT_METAL_ENABLE_HW_CACHE_INVALIDATION` The cache can be disabled through an env var: ``` export TT_METAL_DISABLE_L1_DATA_CACHE_RISCVS= ``` +where the values specify which riscs to disable cache on. ### Ethernet Cores -Runtime has not enabled access to second RISC-V on the ethernet cores yet. +Runtime has enabled access to second RISC-V on idle ethernet cores. -Fast dispatch can be run out of ethernet cores. +Support for Fast Dispatch out of idle ethernet cores is added but temporarily disabled while bringing up multi-chip ethernet support. ### DRAM @@ -87,7 +88,7 @@ Runtime has not enabled access to program RISC-V on DRAM yet. ### NoC -Non-rectangular multicast shapes have not been tested yet. +Non-rectangular multicast shapes and strided multicast has been brought up and tested. See gtest `DispatchFixture.DRAMtoL1MulticastExcludeRegionUpLeft` for example on usage. On previous architectures there are instances in kernels where NoC commands are issued without explicit flushes. These were causing ND mismatches or hangs on BH because data and semaphore signals were getting updated faster than NoC has a chance to service the command and are resolved by adding flushes. Previous architectures did not need this because of higher RISC to L1 latency compared to NoC latency. diff --git a/tt_metal/hw/firmware/src/brisc.cc b/tt_metal/hw/firmware/src/brisc.cc index e66e6351189..fbb36d3831c 100644 --- a/tt_metal/hw/firmware/src/brisc.cc +++ b/tt_metal/hw/firmware/src/brisc.cc @@ -345,7 +345,7 @@ inline void wait_ncrisc_trisc() { } int main() { - conditionally_disable_l1_cache(); + configure_l1_data_cache(); DIRTY_STACK_MEMORY(); WAYPOINT("I"); diff --git a/tt_metal/hw/firmware/src/idle_erisc.cc b/tt_metal/hw/firmware/src/idle_erisc.cc index 8577150848a..473625f3faa 100644 --- a/tt_metal/hw/firmware/src/idle_erisc.cc +++ b/tt_metal/hw/firmware/src/idle_erisc.cc @@ -102,7 +102,7 @@ inline void wait_slave_eriscs(uint32_t &heartbeat) { } int main() { - conditionally_disable_l1_cache(); + configure_l1_data_cache(); DIRTY_STACK_MEMORY(); WAYPOINT("I"); do_crt1((uint32_t *)MEM_IERISC_INIT_LOCAL_L1_BASE_SCRATCH); diff --git a/tt_metal/hw/firmware/src/ncrisc.cc b/tt_metal/hw/firmware/src/ncrisc.cc index 15691b7756e..f79713cc239 100644 --- a/tt_metal/hw/firmware/src/ncrisc.cc +++ b/tt_metal/hw/firmware/src/ncrisc.cc @@ -81,7 +81,7 @@ inline __attribute__((always_inline)) void signal_ncrisc_completion() { } int main(int argc, char *argv[]) { - conditionally_disable_l1_cache(); + configure_l1_data_cache(); DIRTY_STACK_MEMORY(); WAYPOINT("I"); diff --git a/tt_metal/hw/firmware/src/slave_idle_erisc.cc b/tt_metal/hw/firmware/src/slave_idle_erisc.cc index a7b8f52e8ef..aca818dd0d0 100644 --- a/tt_metal/hw/firmware/src/slave_idle_erisc.cc +++ b/tt_metal/hw/firmware/src/slave_idle_erisc.cc @@ -51,7 +51,7 @@ inline __attribute__((always_inline)) void signal_slave_idle_erisc_completion() } int main(int argc, char *argv[]) { - conditionally_disable_l1_cache(); + configure_l1_data_cache(); DIRTY_STACK_MEMORY(); WAYPOINT("I"); do_crt1((uint32_t *)MEM_SLAVE_IERISC_INIT_LOCAL_L1_BASE_SCRATCH); diff --git a/tt_metal/hw/firmware/src/trisc.cc b/tt_metal/hw/firmware/src/trisc.cc index dc1d7865bcb..9a268bf096e 100644 --- a/tt_metal/hw/firmware/src/trisc.cc +++ b/tt_metal/hw/firmware/src/trisc.cc @@ -76,7 +76,7 @@ constexpr bool cb_init_write = false; using namespace ckernel; int main(int argc, char *argv[]) { - conditionally_disable_l1_cache(); + configure_l1_data_cache(); DIRTY_STACK_MEMORY(); WAYPOINT("I"); diff --git a/tt_metal/hw/inc/risc_common.h b/tt_metal/hw/inc/risc_common.h index 4ebe6630846..c082861582c 100644 --- a/tt_metal/hw/inc/risc_common.h +++ b/tt_metal/hw/inc/risc_common.h @@ -176,14 +176,12 @@ inline __attribute__((always_inline)) void invalidate_l1_cache() { #endif } -// Disables Blackhole's L1 cache. Grayskull and Wormhole do not have L1 cache -// L1 cache can be disabled by setting `TT_METAL_DISABLE_L1_DATA_CACHE_RISCVS` env var -// export TT_METAL_DISABLE_L1_DATA_CACHE_RISCVS= -inline __attribute__((always_inline)) void conditionally_disable_l1_cache() { -#if defined(ARCH_BLACKHOLE) && defined(DISABLE_L1_DATA_CACHE) - // asm(R"ASM( - // csrrsi zero, 0x7c0, 0x8 - // )ASM"); +inline __attribute__((always_inline)) void configure_l1_data_cache() { +#if defined(ARCH_BLACKHOLE) +#if defined(DISABLE_L1_DATA_CACHE) + // Disables Blackhole's L1 cache. Grayskull and Wormhole do not have L1 cache + // L1 cache can be disabled by setting `TT_METAL_DISABLE_L1_DATA_CACHE_RISCVS` env var + // export TT_METAL_DISABLE_L1_DATA_CACHE_RISCVS= asm(R"ASM( .option push li t1, 0x1 @@ -192,6 +190,18 @@ inline __attribute__((always_inline)) void conditionally_disable_l1_cache() { .option pop )ASM" :: : "t1"); +#elif !defined(ENABLE_HW_CACHE_INVALIDATION) + // Disable gathering to stop HW from invalidating the data cache after 128 transactions + // This is default enabled + asm(R"ASM( + .option push + li t1, 0x1 + slli t1, t1, 18 + csrrs zero, 0x7c0, t1 + .option pop + )ASM" :: + : "t1"); +#endif #endif } diff --git a/tt_metal/jit_build/build.cpp b/tt_metal/jit_build/build.cpp index 8b5c30591b0..6e494b11948 100644 --- a/tt_metal/jit_build/build.cpp +++ b/tt_metal/jit_build/build.cpp @@ -156,6 +156,10 @@ void JitBuildEnv::init( " "; } + if (tt::llrt::RunTimeOptions::get_instance().get_hw_cache_invalidation_enabled()) { + this->defines_ += "-DENABLE_HW_CACHE_INVALIDATION "; + } + // Includes // TODO(pgk) this list is insane this->includes_ = string("") + "-I. " + "-I.. " + "-I" + this->root_ + " " + "-I" + this->root_ + "tt_metal " + diff --git a/tt_metal/llrt/rtoptions.cpp b/tt_metal/llrt/rtoptions.cpp index 741872c1eaf..f75a84b7320 100644 --- a/tt_metal/llrt/rtoptions.cpp +++ b/tt_metal/llrt/rtoptions.cpp @@ -126,6 +126,8 @@ RunTimeOptions::RunTimeOptions() { if (getenv("TT_METAL_SKIP_DELETING_BUILT_CACHE")) { this->skip_deleting_built_cache = true; } + + this->enable_hw_cache_invalidation = (std::getenv("TT_METAL_ENABLE_HW_CACHE_INVALIDATION") != nullptr); } const std::string& RunTimeOptions::get_root_dir() { diff --git a/tt_metal/llrt/rtoptions.hpp b/tt_metal/llrt/rtoptions.hpp index 99233ea51dc..a391d53d736 100644 --- a/tt_metal/llrt/rtoptions.hpp +++ b/tt_metal/llrt/rtoptions.hpp @@ -125,6 +125,10 @@ class RunTimeOptions { bool enable_dispatch_data_collection = false; + // HW can clear Blackhole's L1 data cache psuedo-randomly once every 128 transactions + // This option will enable this feature to help flush out whether there is a missing cache invalidation + bool enable_hw_cache_invalidation = false; + tt_metal::DispatchCoreConfig dispatch_core_config = tt_metal::DispatchCoreConfig{}; bool skip_deleting_built_cache = false; @@ -297,6 +301,8 @@ class RunTimeOptions { inline bool get_dispatch_data_collection_enabled() { return enable_dispatch_data_collection; } inline void set_dispatch_data_collection_enabled(bool enable) { enable_dispatch_data_collection = enable; } + inline bool get_hw_cache_invalidation_enabled() const { return this->enable_hw_cache_invalidation; } + inline tt_metal::DispatchCoreConfig get_dispatch_core_config() { return dispatch_core_config; } inline bool get_skip_deleting_built_cache() { return skip_deleting_built_cache; }