From b5d4b970def95f6680e5da2cf17630462ad89e8b Mon Sep 17 00:00:00 2001
From: Almeet Bhullar
Date: Thu, 16 May 2024 23:09:51 +0000
Subject: [PATCH] #8530: Pull static/dynamic tlb config out of tt_cluster.cpp and into tlb_config.cpp

---
 tt_metal/llrt/CMakeLists.txt |   1 +
 tt_metal/llrt/tlb_config.cpp | 175 +++++++++++++++++++++++++++++++++++
 tt_metal/llrt/tlb_config.hpp |  19 ++++
 tt_metal/llrt/tt_cluster.cpp | 122 +----------------------
 tt_metal/llrt/tt_cluster.hpp |   1 -
 5 files changed, 198 insertions(+), 120 deletions(-)
 create mode 100644 tt_metal/llrt/tlb_config.cpp
 create mode 100644 tt_metal/llrt/tlb_config.hpp

diff --git a/tt_metal/llrt/CMakeLists.txt b/tt_metal/llrt/CMakeLists.txt
index 2122f16e1bf..62e097abc6a 100644
--- a/tt_metal/llrt/CMakeLists.txt
+++ b/tt_metal/llrt/CMakeLists.txt
@@ -2,6 +2,7 @@
 set(LLRT_SRC
     ${CMAKE_CURRENT_SOURCE_DIR}/llrt.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/rtoptions.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/tlb_config.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/tt_cluster.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/tt_hexfile.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/tt_memory.cpp)
diff --git a/tt_metal/llrt/tlb_config.cpp b/tt_metal/llrt/tlb_config.cpp
new file mode 100644
index 00000000000..925def9d523
--- /dev/null
+++ b/tt_metal/llrt/tlb_config.cpp
@@ -0,0 +1,175 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "tlb_config.hpp"
+#include "device_data.hpp"
+
+#include "third_party/umd/device/blackhole_implementation.h"
+#include "third_party/umd/device/grayskull_implementation.h"
+#include "third_party/umd/device/wormhole_implementation.h"
+
+namespace ll_api {
+
+namespace grayskull {
+
+static constexpr uint32_t DYNAMIC_TLB_COUNT = 16;
+static constexpr unsigned int MEM_SMALL_READ_WRITE_TLB = DEVICE_DATA.TLB_BASE_INDEX_2M + 1;
+static constexpr unsigned int DYNAMIC_TLB_BASE_INDEX = DEVICE_DATA.MEM_LARGE_READ_TLB + 1;
+static constexpr uint32_t DYNAMIC_TLB_2M_SIZE = 0;
+static constexpr uint32_t DYNAMIC_TLB_16M_SIZE = tt::umd::grayskull::DYNAMIC_TLB_16M_SIZE;
+
+int32_t get_static_tlb_index(CoreCoord target) {
+    // Special handling for DRAM TLBs : return a 2MB TLB pointing to the start of the Epoch Cmd Queue Table
+    // The default 1MB TLB is not used for DRAM cores
+    // auto DRAM_TLB_IDX = std::find(DEVICE_DATA.DRAM_LOCATIONS.begin(), DEVICE_DATA.DRAM_LOCATIONS.end(), target);
+    // if (DRAM_TLB_IDX != DEVICE_DATA.DRAM_LOCATIONS.end()) {
+    //     return EPOCH_CMD_QUEUE_TLBS.at(DRAM_TLB_IDX - DEVICE_DATA.DRAM_LOCATIONS.begin());
+    // }
+    int flat_index = target.y * DEVICE_DATA.GRID_SIZE_X + target.x;
+    if (flat_index == 0) {
+        return -1;
+    }
+    return flat_index;
+}
+
+} // namespace grayskull
+
+namespace wormhole {
+
+static constexpr uint32_t DYNAMIC_TLB_COUNT = 16;
+static constexpr unsigned int MEM_SMALL_READ_WRITE_TLB = DEVICE_DATA.TLB_BASE_INDEX_2M + 1;
+static constexpr uint32_t DYNAMIC_TLB_BASE_INDEX = DEVICE_DATA.MEM_LARGE_READ_TLB + 1;
+static constexpr uint32_t DYNAMIC_TLB_2M_SIZE = 0;
+static constexpr uint32_t DYNAMIC_TLB_16M_SIZE = tt::umd::wormhole::DYNAMIC_TLB_16M_SIZE;
+
+int32_t get_static_tlb_index(CoreCoord target) {
+    bool is_eth_location =
+        std::find(std::cbegin(DEVICE_DATA.ETH_LOCATIONS), std::cend(DEVICE_DATA.ETH_LOCATIONS), target) !=
+        std::cend(DEVICE_DATA.ETH_LOCATIONS);
+    bool is_tensix_location =
+        std::find(std::cbegin(DEVICE_DATA.T6_X_LOCATIONS), std::cend(DEVICE_DATA.T6_X_LOCATIONS), target.x) !=
+            std::cend(DEVICE_DATA.T6_X_LOCATIONS) &&
+        std::find(std::cbegin(DEVICE_DATA.T6_Y_LOCATIONS), std::cend(DEVICE_DATA.T6_Y_LOCATIONS), target.y) !=
+            std::cend(DEVICE_DATA.T6_Y_LOCATIONS);
+    // implementation migrated from wormhole.py in `src/t6ifc/t6py/packages/tenstorrent/chip/wormhole.py` from tensix
+    // repo (t6py-wormhole-bringup branch)
+
+    // Special handling for DRAM TLBs : return a 2MB TLB pointing to the start of the Epoch Cmd Queue Table
+    // The default 1MB TLB is not used for DRAM cores
+    // auto DRAM_TLB_IDX = std::find(DEVICE_DATA.DRAM_LOCATIONS.begin(), DEVICE_DATA.DRAM_LOCATIONS.end(), target);
+    // if (DRAM_TLB_IDX != DEVICE_DATA.DRAM_LOCATIONS.end()) {
+    //     return EPOCH_CMD_QUEUE_TLBS.at(DRAM_TLB_IDX - DEVICE_DATA.DRAM_LOCATIONS.begin());
+    // }
+
+    if (is_eth_location) {
+        if (target.y == 6) {
+            target.y = 1;
+        }
+
+        if (target.x >= 5) {
+            target.x -= 1;
+        }
+        target.x -= 1;
+
+        int flat_index = target.y * 8 + target.x;
+        int tlb_index = flat_index;
+        return tlb_index;
+
+    } else if (is_tensix_location) {
+        if (target.x >= 5) {
+            target.x -= 1;
+        }
+        target.x -= 1;
+
+        if (target.y >= 6) {
+            target.y -= 1;
+        }
+        target.y -= 1;
+
+        int flat_index = target.y * 8 + target.x;
+
+        // All 80 get single 1MB TLB.
+        int tlb_index = DEVICE_DATA.ETH_LOCATIONS.size() + flat_index;
+
+        return tlb_index;
+    } else {
+        return -1;
+    }
+}
+
+} // namespace wormhole
+
+namespace blackhole {
+
+static constexpr uint32_t DYNAMIC_TLB_COUNT = 16;
+static constexpr unsigned int MEM_SMALL_READ_WRITE_TLB = DEVICE_DATA.TLB_BASE_INDEX_2M + 1;
+static constexpr uint32_t DYNAMIC_TLB_BASE_INDEX = DEVICE_DATA.MEM_LARGE_READ_TLB + 1;
+static constexpr uint32_t DYNAMIC_TLB_2M_SIZE = tt::umd::blackhole::DYNAMIC_TLB_2M_SIZE;
+static constexpr uint32_t DYNAMIC_TLB_16M_SIZE = 0;
+
+int32_t get_static_tlb_index(CoreCoord target) {
+    return -1;
+}
+
+} // namespace blackhole
+
+void configure_static_tlbs(tt::ARCH arch, chip_id_t mmio_device_id, const metal_SocDescriptor &sdesc, tt_device &device_driver) {
+    using get_static_tlb_index_ptr = std::int32_t (*)(tt_xy_pair);
+    get_static_tlb_index_ptr get_static_tlb_index;
+    uint32_t DYNAMIC_TLB_BASE_INDEX, DYNAMIC_TLB_COUNT, DYNAMIC_TLB_16M_SIZE, DYNAMIC_TLB_2M_SIZE;
+
+    switch (arch) {
+        case tt::ARCH::GRAYSKULL:
+            get_static_tlb_index = grayskull::get_static_tlb_index;
+            DYNAMIC_TLB_BASE_INDEX = grayskull::DYNAMIC_TLB_BASE_INDEX;
+            DYNAMIC_TLB_COUNT = grayskull::DYNAMIC_TLB_COUNT;
+            DYNAMIC_TLB_16M_SIZE = grayskull::DYNAMIC_TLB_16M_SIZE;
+            DYNAMIC_TLB_2M_SIZE = grayskull::DYNAMIC_TLB_2M_SIZE;
+            break;
+        case tt::ARCH::WORMHOLE:
+        case tt::ARCH::WORMHOLE_B0:
+            get_static_tlb_index = wormhole::get_static_tlb_index;
+            DYNAMIC_TLB_BASE_INDEX = wormhole::DYNAMIC_TLB_BASE_INDEX;
+            DYNAMIC_TLB_COUNT = wormhole::DYNAMIC_TLB_COUNT;
+            DYNAMIC_TLB_16M_SIZE = wormhole::DYNAMIC_TLB_16M_SIZE;
+            DYNAMIC_TLB_2M_SIZE = wormhole::DYNAMIC_TLB_2M_SIZE;
+            break;
+        case tt::ARCH::BLACKHOLE:
+            get_static_tlb_index = blackhole::get_static_tlb_index;
+            DYNAMIC_TLB_BASE_INDEX = blackhole::DYNAMIC_TLB_BASE_INDEX;
+            DYNAMIC_TLB_COUNT = blackhole::DYNAMIC_TLB_COUNT;
+            DYNAMIC_TLB_2M_SIZE = blackhole::DYNAMIC_TLB_2M_SIZE;
+            DYNAMIC_TLB_16M_SIZE = blackhole::DYNAMIC_TLB_16M_SIZE;
+            break;
+        default: TT_THROW("Configuring static TLBs is not supported for {}", tt::get_string(arch));
+    }
+
+    auto statically_mapped_cores = sdesc.workers;
+    statically_mapped_cores.insert(
+        statically_mapped_cores.end(), sdesc.ethernet_cores.begin(), sdesc.ethernet_cores.end());
+    std::int32_t address = 0;
+
+    // Setup static TLBs for all worker cores
+    for (auto &core : statically_mapped_cores) {
+        auto tlb_index = get_static_tlb_index(core);
+        device_driver.configure_tlb(mmio_device_id, core, tlb_index, address);
+    }
+    // Setup static TLBs for MMIO mapped data space
+    uint64_t peer_dram_offset = DEVICE_DATA.DRAM_CHANNEL_0_PEER2PEER_REGION_START;
+    for (uint32_t tlb_id = DYNAMIC_TLB_BASE_INDEX; tlb_id < DYNAMIC_TLB_BASE_INDEX + DYNAMIC_TLB_COUNT; tlb_id++) {
+        device_driver.configure_tlb(
+            mmio_device_id, CoreCoord(DEVICE_DATA.DRAM_CHANNEL_0_X, DEVICE_DATA.DRAM_CHANNEL_0_Y), tlb_id, peer_dram_offset);
+        // Align address space of 16MB TLB to 16MB boundary
+        peer_dram_offset += DYNAMIC_TLB_16M_SIZE;
+    }
+    device_driver.setup_core_to_tlb_map([get_static_tlb_index](CoreCoord core) { return get_static_tlb_index(core); });
+}
+
+std::unordered_map<std::string, std::int32_t> get_dynamic_tlb_config() {
+    std::unordered_map<std::string, std::int32_t> dynamic_tlb_config;
+    dynamic_tlb_config["REG_TLB"] = DEVICE_DATA.REG_TLB;
+    return dynamic_tlb_config;
+}
+
+} // namespace ll_api
diff --git a/tt_metal/llrt/tlb_config.hpp b/tt_metal/llrt/tlb_config.hpp
new file mode 100644
index 00000000000..b7b8d589f73
--- /dev/null
+++ b/tt_metal/llrt/tlb_config.hpp
@@ -0,0 +1,19 @@
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "third_party/umd/device/device_api.h"
+#include "tt_metal/common/tt_backend_api_types.hpp"
+#include "tt_metal/common/metal_soc_descriptor.h"
+
+#include <unordered_map>
+
+namespace ll_api {
+
+void configure_static_tlbs(tt::ARCH arch, chip_id_t mmio_device_id, const metal_SocDescriptor &sdesc, tt_device &device_driver);
+
+std::unordered_map<std::string, std::int32_t> get_dynamic_tlb_config();
+
+} // namespace ll_api
diff --git a/tt_metal/llrt/tt_cluster.cpp b/tt_metal/llrt/tt_cluster.cpp
index d350c03ca53..1e59341b9af 100644
--- a/tt_metal/llrt/tt_cluster.cpp
+++ b/tt_metal/llrt/tt_cluster.cpp
@@ -18,17 +18,7 @@
 #include "tools/profiler/profiler.hpp"
 #include "tt_metal/impl/debug/sanitize_noc_host.hpp"
 #include "tt_metal/llrt/rtoptions.hpp"
-
-#ifdef ARCH_GRAYSKULL
-static constexpr uint32_t DYNAMIC_TLB_COUNT = 16;
-static constexpr unsigned int MEM_SMALL_READ_WRITE_TLB = DEVICE_DATA.TLB_BASE_INDEX_2M + 1;
-static constexpr unsigned int DYNAMIC_TLB_BASE_INDEX = DEVICE_DATA.MEM_LARGE_READ_TLB + 1;
-
-#else
-static constexpr uint32_t DYNAMIC_TLB_COUNT = 16;
-static constexpr unsigned int MEM_SMALL_READ_WRITE_TLB = DEVICE_DATA.TLB_BASE_INDEX_2M + 1;
-static constexpr uint32_t DYNAMIC_TLB_BASE_INDEX = DEVICE_DATA.MEM_LARGE_READ_TLB + 1;
-#endif
+#include "tt_metal/llrt/tlb_config.hpp"

 namespace tt {

@@ -218,8 +208,7 @@ void Cluster::open_driver(chip_id_t mmio_device_id, const std::set<chip_id_t> &c
     // Silicon driver will attempt to open this many hugepages as channels, and assert if workload uses more than available.
     // Metal currently assigns 1 channel per device
     uint32_t num_host_mem_ch_per_mmio_device = controlled_device_ids.size();
-    std::unordered_map<std::string, std::int32_t> dynamic_tlb_config = {};
-    dynamic_tlb_config["REG_TLB"] = DEVICE_DATA.REG_TLB;
+    std::unordered_map<std::string, std::int32_t> dynamic_tlb_config = ll_api::get_dynamic_tlb_config();
     // This will remove harvested rows from the soc descriptor
     const bool perform_harvesting = true;
     const bool clean_system_resources = true;
@@ -248,111 +237,6 @@ void Cluster::open_driver(chip_id_t mmio_device_id, const std::set<chip_id_t> &c
     this->mmio_device_id_to_driver_[mmio_device_id] = std::move(device_driver);
 }

-#ifdef ARCH_WORMHOLE
-std::int32_t get_static_tlb_index(CoreCoord target) {
-    bool is_eth_location =
-        std::find(std::cbegin(DEVICE_DATA.ETH_LOCATIONS), std::cend(DEVICE_DATA.ETH_LOCATIONS), target) !=
-        std::cend(DEVICE_DATA.ETH_LOCATIONS);
-    bool is_tensix_location =
-        std::find(std::cbegin(DEVICE_DATA.T6_X_LOCATIONS), std::cend(DEVICE_DATA.T6_X_LOCATIONS), target.x) !=
-            std::cend(DEVICE_DATA.T6_X_LOCATIONS) &&
-        std::find(std::cbegin(DEVICE_DATA.T6_Y_LOCATIONS), std::cend(DEVICE_DATA.T6_Y_LOCATIONS), target.y) !=
-            std::cend(DEVICE_DATA.T6_Y_LOCATIONS);
-    // implementation migrated from wormhole.py in `src/t6ifc/t6py/packages/tenstorrent/chip/wormhole.py` from tensix
-    // repo (t6py-wormhole-bringup branch)
-
-    // Special handling for DRAM TLBs : return a 2MB TLB pointing to the start of the Epoch Cmd Queue Table
-    // The default 1MB TLB is not used for DRAM cores
-    // auto DRAM_TLB_IDX = std::find(DEVICE_DATA.DRAM_LOCATIONS.begin(), DEVICE_DATA.DRAM_LOCATIONS.end(), target);
-    // if (DRAM_TLB_IDX != DEVICE_DATA.DRAM_LOCATIONS.end()) {
-    //     return EPOCH_CMD_QUEUE_TLBS.at(DRAM_TLB_IDX - DEVICE_DATA.DRAM_LOCATIONS.begin());
-    // }
-
-    if (is_eth_location) {
-        if (target.y == 6) {
-            target.y = 1;
-        }
-
-        if (target.x >= 5) {
-            target.x -= 1;
-        }
-        target.x -= 1;
-
-        int flat_index = target.y * 8 + target.x;
-        int tlb_index = flat_index;
-        return tlb_index;
-
-    } else if (is_tensix_location) {
-        if (target.x >= 5) {
-            target.x -= 1;
-        }
-        target.x -= 1;
-
-        if (target.y >= 6) {
-            target.y -= 1;
-        }
-        target.y -= 1;
-
-        int flat_index = target.y * 8 + target.x;
-
-        // All 80 get single 1MB TLB.
-        int tlb_index = DEVICE_DATA.ETH_LOCATIONS.size() + flat_index;
-
-        return tlb_index;
-    } else {
-        return -1;
-    }
-}
-#endif
-
-#ifdef ARCH_GRAYSKULL
-std::int32_t get_static_tlb_index(CoreCoord target) {
-    // Special handling for DRAM TLBs : return a 2MB TLB pointing to the start of the Epoch Cmd Queue Table
-    // The default 1MB TLB is not used for DRAM cores
-    // auto DRAM_TLB_IDX = std::find(DEVICE_DATA.DRAM_LOCATIONS.begin(), DEVICE_DATA.DRAM_LOCATIONS.end(), target);
-    // if (DRAM_TLB_IDX != DEVICE_DATA.DRAM_LOCATIONS.end()) {
-    //     return EPOCH_CMD_QUEUE_TLBS.at(DRAM_TLB_IDX - DEVICE_DATA.DRAM_LOCATIONS.begin());
-    // }
-    int flat_index = target.y * DEVICE_DATA.GRID_SIZE_X + target.x;
-    if (flat_index == 0) {
-        return -1;
-    }
-    return flat_index;
-}
-#endif
-
-// TODO: pull tlb config into sep file similar to BBE
-#ifdef ARCH_BLACKHOLE
-std::int32_t get_static_tlb_index(CoreCoord target) {
-    return -1;
-}
-#endif
-
-void Cluster::configure_static_tlbs(chip_id_t mmio_device_id) const {
-    auto sdesc = get_soc_desc(mmio_device_id);
-    auto statically_mapped_cores = sdesc.workers;
-    statically_mapped_cores.insert(
-        statically_mapped_cores.end(), sdesc.ethernet_cores.begin(), sdesc.ethernet_cores.end());
-    std::int32_t address = 0;
-
-    // Setup static TLBs for all worker cores
-    for (auto &core : statically_mapped_cores) {
-        auto tlb_index = get_static_tlb_index(core);
-        this->get_driver(mmio_device_id).configure_tlb(mmio_device_id, core, tlb_index, address);
-    }
-    // Setup static TLBs for MMIO mapped data space
-    uint64_t peer_dram_offset = DEVICE_DATA.DRAM_CHANNEL_0_PEER2PEER_REGION_START;
-    for (uint32_t tlb_id = DYNAMIC_TLB_BASE_INDEX; tlb_id < DYNAMIC_TLB_BASE_INDEX + DYNAMIC_TLB_COUNT; tlb_id++) {
-        this->get_driver(mmio_device_id).configure_tlb(
-            mmio_device_id, CoreCoord(DEVICE_DATA.DRAM_CHANNEL_0_X, DEVICE_DATA.DRAM_CHANNEL_0_Y), tlb_id, peer_dram_offset);
-        // Align address space of 16MB TLB to 16MB boundary
-#ifndef ARCH_BLACKHOLE // TODO (abhullar): clean this up
-        peer_dram_offset += DEVICE_DATA.DYNAMIC_TLB_16M_SIZE;
-#endif
-    }
-    this->get_driver(mmio_device_id).setup_core_to_tlb_map([](CoreCoord core) { return get_static_tlb_index(core); });
-}
-
 void Cluster::start_driver(chip_id_t mmio_device_id, tt_device_params &device_params) const {
     device_params.init_device = true;

@@ -360,7 +244,7 @@ void Cluster::start_driver(chip_id_t mmio_device_id, tt_device_params &device_pa

     // static TLBs avoided for Blackhole bring up
     if (this->target_type_ == TargetDevice::Silicon && device_params.init_device && this->arch_ != tt::ARCH::BLACKHOLE) {
-        configure_static_tlbs(mmio_device_id);
+        ll_api::configure_static_tlbs(this->arch_, mmio_device_id, this->get_soc_desc(mmio_device_id), this->get_driver(mmio_device_id));
     }

     this->mmio_device_id_to_driver_.at(mmio_device_id)->start_device(device_params);
diff --git a/tt_metal/llrt/tt_cluster.hpp b/tt_metal/llrt/tt_cluster.hpp
index dc0a5eddf0a..a17662ffffb 100644
--- a/tt_metal/llrt/tt_cluster.hpp
+++ b/tt_metal/llrt/tt_cluster.hpp
@@ -185,7 +185,6 @@
     tt_device &get_driver(chip_id_t device_id) const;
     void get_metal_desc_from_tt_desc(const std::unordered_map<chip_id_t, tt_SocDescriptor> &input, const std::unordered_map<chip_id_t, uint32_t> &per_chip_id_harvesting_masks);
     tt_cxy_pair convert_physical_cxy_to_virtual(const tt_cxy_pair &physical_cxy) const;
-    void configure_static_tlbs(chip_id_t mmio_device_id) const;

     // Returns map of connected chip ids to active ethernet cores
     std::unordered_map<chip_id_t, std::vector<CoreCoord>> get_ethernet_cores_grouped_by_connected_chips(
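
Usage sketch (not part of the patch): the snippet below mirrors the call sequence Cluster::open_driver and Cluster::start_driver follow after this change, for reviewers reading the new ll_api surface in isolation. Driver construction is elided; `arch`, `mmio_device_id`, `sdesc`, `device_driver`, and `device_params` are illustrative stand-ins for Cluster state.

    #include "tt_metal/llrt/tlb_config.hpp"

    // Fetched before the silicon driver is constructed, since the driver
    // consumes the dynamic TLB map at construction time (see open_driver).
    std::unordered_map<std::string, std::int32_t> dynamic_tlb_config =
        ll_api::get_dynamic_tlb_config();

    // ... construct device_driver, passing dynamic_tlb_config ...

    // Static TLBs are programmed once per MMIO device before start_device();
    // skipped on Blackhole while bring-up is in progress (see start_driver).
    if (arch != tt::ARCH::BLACKHOLE) {
        ll_api::configure_static_tlbs(arch, mmio_device_id, sdesc, device_driver);
    }
    device_driver.start_device(device_params);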