From b09f62804db6b099ee9e3fb974fa7a7f6290a473 Mon Sep 17 00:00:00 2001 From: Almeet Bhullar Date: Thu, 16 May 2024 23:09:51 +0000 Subject: [PATCH] #8530: Pull static/dynamic tlb config out of tt_cluster.cpp and into tlb_config.cpp --- tt_metal/llrt/CMakeLists.txt | 1 + tt_metal/llrt/tlb_config.cpp | 175 +++++++++++++++++++++++++++++++++++ tt_metal/llrt/tlb_config.hpp | 19 ++++ tt_metal/llrt/tt_cluster.cpp | 122 +----------------------- tt_metal/llrt/tt_cluster.hpp | 1 - 5 files changed, 198 insertions(+), 120 deletions(-) create mode 100644 tt_metal/llrt/tlb_config.cpp create mode 100644 tt_metal/llrt/tlb_config.hpp diff --git a/tt_metal/llrt/CMakeLists.txt b/tt_metal/llrt/CMakeLists.txt index 2122f16e1bf..62e097abc6a 100644 --- a/tt_metal/llrt/CMakeLists.txt +++ b/tt_metal/llrt/CMakeLists.txt @@ -2,6 +2,7 @@ set(LLRT_SRC ${CMAKE_CURRENT_SOURCE_DIR}/llrt.cpp ${CMAKE_CURRENT_SOURCE_DIR}/rtoptions.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/tlb_config.cpp ${CMAKE_CURRENT_SOURCE_DIR}/tt_cluster.cpp ${CMAKE_CURRENT_SOURCE_DIR}/tt_hexfile.cpp ${CMAKE_CURRENT_SOURCE_DIR}/tt_memory.cpp) diff --git a/tt_metal/llrt/tlb_config.cpp b/tt_metal/llrt/tlb_config.cpp new file mode 100644 index 00000000000..925def9d523 --- /dev/null +++ b/tt_metal/llrt/tlb_config.cpp @@ -0,0 +1,175 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include "tlb_config.hpp" +#include "device_data.hpp" + +#include "third_party/umd/device/blackhole_implementation.h" +#include "third_party/umd/device/grayskull_implementation.h" +#include "third_party/umd/device/wormhole_implementation.h" + +namespace ll_api { + +namespace grayskull { + +static constexpr uint32_t DYNAMIC_TLB_COUNT = 16; +static constexpr unsigned int MEM_SMALL_READ_WRITE_TLB = DEVICE_DATA.TLB_BASE_INDEX_2M + 1; +static constexpr unsigned int DYNAMIC_TLB_BASE_INDEX = DEVICE_DATA.MEM_LARGE_READ_TLB + 1; +static constexpr uint32_t DYNAMIC_TLB_2M_SIZE = 0; +static constexpr uint32_t DYNAMIC_TLB_16M_SIZE = tt::umd::grayskull::DYNAMIC_TLB_16M_SIZE; + +int32_t get_static_tlb_index(CoreCoord target) { + // Special handling for DRAM TLBs : return a 2MB TLB pointing to the start of the Epoch Cmd Queue Table + // The default 1MB TLB is not used for DRAM cores + // auto DRAM_TLB_IDX = std::find(DEVICE_DATA.DRAM_LOCATIONS.begin(), DEVICE_DATA.DRAM_LOCATIONS.end(), target); + // if (DRAM_TLB_IDX != DEVICE_DATA.DRAM_LOCATIONS.end()) { + // return EPOCH_CMD_QUEUE_TLBS.at(DRAM_TLB_IDX - DEVICE_DATA.DRAM_LOCATIONS.begin()); + // } + int flat_index = target.y * DEVICE_DATA.GRID_SIZE_X + target.x; + if (flat_index == 0) { + return -1; + } + return flat_index; +} + +} // namespace grayskull + +namespace wormhole { + +static constexpr uint32_t DYNAMIC_TLB_COUNT = 16; +static constexpr unsigned int MEM_SMALL_READ_WRITE_TLB = DEVICE_DATA.TLB_BASE_INDEX_2M + 1; +static constexpr uint32_t DYNAMIC_TLB_BASE_INDEX = DEVICE_DATA.MEM_LARGE_READ_TLB + 1; +static constexpr uint32_t DYNAMIC_TLB_2M_SIZE = 0; +static constexpr uint32_t DYNAMIC_TLB_16M_SIZE = tt::umd::wormhole::DYNAMIC_TLB_16M_SIZE; + +int32_t get_static_tlb_index(CoreCoord target) { + bool is_eth_location = + std::find(std::cbegin(DEVICE_DATA.ETH_LOCATIONS), std::cend(DEVICE_DATA.ETH_LOCATIONS), target) != + std::cend(DEVICE_DATA.ETH_LOCATIONS); + bool 
is_tensix_location = + std::find(std::cbegin(DEVICE_DATA.T6_X_LOCATIONS), std::cend(DEVICE_DATA.T6_X_LOCATIONS), target.x) != + std::cend(DEVICE_DATA.T6_X_LOCATIONS) && + std::find(std::cbegin(DEVICE_DATA.T6_Y_LOCATIONS), std::cend(DEVICE_DATA.T6_Y_LOCATIONS), target.y) != + std::cend(DEVICE_DATA.T6_Y_LOCATIONS); + // implementation migrated from wormhole.py in `src/t6ifc/t6py/packages/tenstorrent/chip/wormhole.py` from tensix + // repo (t6py-wormhole-bringup branch) + + // Special handling for DRAM TLBs : return a 2MB TLB pointing to the start of the Epoch Cmd Queue Table + // The default 1MB TLB is not used for DRAM cores + // auto DRAM_TLB_IDX = std::find(DEVICE_DATA.DRAM_LOCATIONS.begin(), DEVICE_DATA.DRAM_LOCATIONS.end(), target); + // if (DRAM_TLB_IDX != DEVICE_DATA.DRAM_LOCATIONS.end()) { + // return EPOCH_CMD_QUEUE_TLBS.at(DRAM_TLB_IDX - DEVICE_DATA.DRAM_LOCATIONS.begin()); + // } + + if (is_eth_location) { + if (target.y == 6) { + target.y = 1; + } + + if (target.x >= 5) { + target.x -= 1; + } + target.x -= 1; + + int flat_index = target.y * 8 + target.x; + int tlb_index = flat_index; + return tlb_index; + + } else if (is_tensix_location) { + if (target.x >= 5) { + target.x -= 1; + } + target.x -= 1; + + if (target.y >= 6) { + target.y -= 1; + } + target.y -= 1; + + int flat_index = target.y * 8 + target.x; + + // All 80 get single 1MB TLB. 
+ int tlb_index = DEVICE_DATA.ETH_LOCATIONS.size() + flat_index; + + return tlb_index; + } else { + return -1; + } +} + +} // namespace wormhole + +namespace blackhole { + +static constexpr uint32_t DYNAMIC_TLB_COUNT = 16; +static constexpr unsigned int MEM_SMALL_READ_WRITE_TLB = DEVICE_DATA.TLB_BASE_INDEX_2M + 1; +static constexpr uint32_t DYNAMIC_TLB_BASE_INDEX = DEVICE_DATA.MEM_LARGE_READ_TLB + 1; +static constexpr uint32_t DYNAMIC_TLB_2M_SIZE = tt::umd::blackhole::DYNAMIC_TLB_2M_SIZE; +static constexpr uint32_t DYNAMIC_TLB_16M_SIZE = 0; + +int32_t get_static_tlb_index(CoreCoord target) { + return -1; +} + +} // namespace blackhole + +void configure_static_tlbs(tt::ARCH arch, chip_id_t mmio_device_id, const metal_SocDescriptor &sdesc, tt_device &device_driver) { + using get_static_tlb_index_ptr = std::int32_t (*)(tt_xy_pair); + get_static_tlb_index_ptr get_static_tlb_index; + uint32_t DYNAMIC_TLB_BASE_INDEX, DYNAMIC_TLB_COUNT, DYNAMIC_TLB_16M_SIZE, DYNAMIC_TLB_2M_SIZE; + + switch (arch) { + case tt::ARCH::GRAYSKULL: + get_static_tlb_index = grayskull::get_static_tlb_index; + DYNAMIC_TLB_BASE_INDEX = grayskull::DYNAMIC_TLB_BASE_INDEX; + DYNAMIC_TLB_COUNT = grayskull::DYNAMIC_TLB_COUNT; + DYNAMIC_TLB_16M_SIZE = grayskull::DYNAMIC_TLB_16M_SIZE; + DYNAMIC_TLB_2M_SIZE = grayskull::DYNAMIC_TLB_2M_SIZE; + break; + case tt::ARCH::WORMHOLE: + case tt::ARCH::WORMHOLE_B0: + get_static_tlb_index = wormhole::get_static_tlb_index; + DYNAMIC_TLB_BASE_INDEX = wormhole::DYNAMIC_TLB_BASE_INDEX; + DYNAMIC_TLB_COUNT = wormhole::DYNAMIC_TLB_COUNT; + DYNAMIC_TLB_16M_SIZE = wormhole::DYNAMIC_TLB_16M_SIZE; + DYNAMIC_TLB_2M_SIZE = wormhole::DYNAMIC_TLB_2M_SIZE; + break; + case tt::ARCH::BLACKHOLE: + get_static_tlb_index = blackhole::get_static_tlb_index; + DYNAMIC_TLB_BASE_INDEX = blackhole::DYNAMIC_TLB_BASE_INDEX; + DYNAMIC_TLB_COUNT = blackhole::DYNAMIC_TLB_COUNT; + DYNAMIC_TLB_2M_SIZE = blackhole::DYNAMIC_TLB_2M_SIZE; + DYNAMIC_TLB_16M_SIZE = blackhole::DYNAMIC_TLB_16M_SIZE; + break; 
+ default: TT_THROW("Configuring static TLBs is not supported for {}", tt::get_string(arch)); + } + + auto statically_mapped_cores = sdesc.workers; + statically_mapped_cores.insert( + statically_mapped_cores.end(), sdesc.ethernet_cores.begin(), sdesc.ethernet_cores.end()); + std::int32_t address = 0; + + // Setup static TLBs for all worker cores + for (auto &core : statically_mapped_cores) { + auto tlb_index = get_static_tlb_index(core); + device_driver.configure_tlb(mmio_device_id, core, tlb_index, address); + } + // Setup static TLBs for MMIO mapped data space + uint64_t peer_dram_offset = DEVICE_DATA.DRAM_CHANNEL_0_PEER2PEER_REGION_START; + for (uint32_t tlb_id = DYNAMIC_TLB_BASE_INDEX; tlb_id < DYNAMIC_TLB_BASE_INDEX + DYNAMIC_TLB_COUNT; tlb_id++) { + device_driver.configure_tlb( + mmio_device_id, CoreCoord(DEVICE_DATA.DRAM_CHANNEL_0_X, DEVICE_DATA.DRAM_CHANNEL_0_Y), tlb_id, peer_dram_offset); + // Align address space of 16MB TLB to 16MB boundary + peer_dram_offset += DYNAMIC_TLB_16M_SIZE; + } + device_driver.setup_core_to_tlb_map([get_static_tlb_index](CoreCoord core) { return get_static_tlb_index(core); }); +} + +std::unordered_map<std::string, std::int32_t> get_dynamic_tlb_config() { + std::unordered_map<std::string, std::int32_t> dynamic_tlb_config; + dynamic_tlb_config["REG_TLB"] = DEVICE_DATA.REG_TLB; + return dynamic_tlb_config; +} + +} // namespace ll_api diff --git a/tt_metal/llrt/tlb_config.hpp b/tt_metal/llrt/tlb_config.hpp new file mode 100644 index 00000000000..b7b8d589f73 --- /dev/null +++ b/tt_metal/llrt/tlb_config.hpp @@ -0,0 +1,19 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "third_party/umd/device/device_api.h" +#include "tt_metal/common/tt_backend_api_types.hpp" +#include "tt_metal/common/metal_soc_descriptor.h" + +#include <unordered_map> + +namespace ll_api { + +void configure_static_tlbs(tt::ARCH arch, chip_id_t mmio_device_id, const metal_SocDescriptor &sdesc, tt_device &device_driver); + +std::unordered_map<std::string, std::int32_t> get_dynamic_tlb_config(); + +} // namespace ll_api diff --git a/tt_metal/llrt/tt_cluster.cpp b/tt_metal/llrt/tt_cluster.cpp index d350c03ca53..1e59341b9af 100644 --- a/tt_metal/llrt/tt_cluster.cpp +++ b/tt_metal/llrt/tt_cluster.cpp @@ -18,17 +18,7 @@ #include "tools/profiler/profiler.hpp" #include "tt_metal/impl/debug/sanitize_noc_host.hpp" #include "tt_metal/llrt/rtoptions.hpp" - -#ifdef ARCH_GRAYSKULL -static constexpr uint32_t DYNAMIC_TLB_COUNT = 16; -static constexpr unsigned int MEM_SMALL_READ_WRITE_TLB = DEVICE_DATA.TLB_BASE_INDEX_2M + 1; -static constexpr unsigned int DYNAMIC_TLB_BASE_INDEX = DEVICE_DATA.MEM_LARGE_READ_TLB + 1; - -#else -static constexpr uint32_t DYNAMIC_TLB_COUNT = 16; -static constexpr unsigned int MEM_SMALL_READ_WRITE_TLB = DEVICE_DATA.TLB_BASE_INDEX_2M + 1; -static constexpr uint32_t DYNAMIC_TLB_BASE_INDEX = DEVICE_DATA.MEM_LARGE_READ_TLB + 1; -#endif +#include "tt_metal/llrt/tlb_config.hpp" namespace tt { @@ -218,8 +208,7 @@ void Cluster::open_driver(chip_id_t mmio_device_id, const std::set<chip_id_t> &c // Silicon driver will attempt to open this many hugepages as channels, and assert if workload uses more than available. 
// Metal currently uses assigns 1 channel per device uint32_t num_host_mem_ch_per_mmio_device = controlled_device_ids.size(); - std::unordered_map<std::string, std::int32_t> dynamic_tlb_config = {}; - dynamic_tlb_config["REG_TLB"] = DEVICE_DATA.REG_TLB; + std::unordered_map<std::string, std::int32_t> dynamic_tlb_config = ll_api::get_dynamic_tlb_config(); // This will remove harvested rows from the soc descriptor const bool perform_harvesting = true; const bool clean_system_resources = true; @@ -248,111 +237,6 @@ void Cluster::open_driver(chip_id_t mmio_device_id, const std::set<chip_id_t> &c this->mmio_device_id_to_driver_[mmio_device_id] = std::move(device_driver); } -#ifdef ARCH_WORMHOLE -std::int32_t get_static_tlb_index(CoreCoord target) { - bool is_eth_location = - std::find(std::cbegin(DEVICE_DATA.ETH_LOCATIONS), std::cend(DEVICE_DATA.ETH_LOCATIONS), target) != - std::cend(DEVICE_DATA.ETH_LOCATIONS); - bool is_tensix_location = - std::find(std::cbegin(DEVICE_DATA.T6_X_LOCATIONS), std::cend(DEVICE_DATA.T6_X_LOCATIONS), target.x) != - std::cend(DEVICE_DATA.T6_X_LOCATIONS) && - std::find(std::cbegin(DEVICE_DATA.T6_Y_LOCATIONS), std::cend(DEVICE_DATA.T6_Y_LOCATIONS), target.y) != - std::cend(DEVICE_DATA.T6_Y_LOCATIONS); - // implementation migrated from wormhole.py in `src/t6ifc/t6py/packages/tenstorrent/chip/wormhole.py` from tensix - // repo (t6py-wormhole-bringup branch) - - // Special handling for DRAM TLBs : return a 2MB TLB pointing to the start of the Epoch Cmd Queue Table - // The default 1MB TLB is not used for DRAM cores - // auto DRAM_TLB_IDX = std::find(DEVICE_DATA.DRAM_LOCATIONS.begin(), DEVICE_DATA.DRAM_LOCATIONS.end(), target); - // if (DRAM_TLB_IDX != DEVICE_DATA.DRAM_LOCATIONS.end()) { - // return EPOCH_CMD_QUEUE_TLBS.at(DRAM_TLB_IDX - DEVICE_DATA.DRAM_LOCATIONS.begin()); - // } - - if (is_eth_location) { - if (target.y == 6) { - target.y = 1; - } - - if (target.x >= 5) { - target.x -= 1; - } - target.x -= 1; - - int flat_index = target.y * 8 + target.x; - int tlb_index = flat_index; - return tlb_index; - - } 
else if (is_tensix_location) { - if (target.x >= 5) { - target.x -= 1; - } - target.x -= 1; - - if (target.y >= 6) { - target.y -= 1; - } - target.y -= 1; - - int flat_index = target.y * 8 + target.x; - - // All 80 get single 1MB TLB. - int tlb_index = DEVICE_DATA.ETH_LOCATIONS.size() + flat_index; - - return tlb_index; - } else { - return -1; - } -} -#endif - -#ifdef ARCH_GRAYSKULL -std::int32_t get_static_tlb_index(CoreCoord target) { - // Special handling for DRAM TLBs : return a 2MB TLB pointing to the start of the Epoch Cmd Queue Table - // The default 1MB TLB is not used for DRAM cores - // auto DRAM_TLB_IDX = std::find(DEVICE_DATA.DRAM_LOCATIONS.begin(), DEVICE_DATA.DRAM_LOCATIONS.end(), target); - // if (DRAM_TLB_IDX != DEVICE_DATA.DRAM_LOCATIONS.end()) { - // return EPOCH_CMD_QUEUE_TLBS.at(DRAM_TLB_IDX - DEVICE_DATA.DRAM_LOCATIONS.begin()); - // } - int flat_index = target.y * DEVICE_DATA.GRID_SIZE_X + target.x; - if (flat_index == 0) { - return -1; - } - return flat_index; -} -#endif - -// TODO: pull tlb config into sep file similar to BBE -#ifdef ARCH_BLACKHOLE -std::int32_t get_static_tlb_index(CoreCoord target) { - return -1; -} -#endif - -void Cluster::configure_static_tlbs(chip_id_t mmio_device_id) const { - auto sdesc = get_soc_desc(mmio_device_id); - auto statically_mapped_cores = sdesc.workers; - statically_mapped_cores.insert( - statically_mapped_cores.end(), sdesc.ethernet_cores.begin(), sdesc.ethernet_cores.end()); - std::int32_t address = 0; - - // Setup static TLBs for all worker cores - for (auto &core : statically_mapped_cores) { - auto tlb_index = get_static_tlb_index(core); - this->get_driver(mmio_device_id).configure_tlb(mmio_device_id, core, tlb_index, address); - } - // Setup static TLBs for MMIO mapped data space - uint64_t peer_dram_offset = DEVICE_DATA.DRAM_CHANNEL_0_PEER2PEER_REGION_START; - for (uint32_t tlb_id = DYNAMIC_TLB_BASE_INDEX; tlb_id < DYNAMIC_TLB_BASE_INDEX + DYNAMIC_TLB_COUNT; tlb_id++) { - 
this->get_driver(mmio_device_id).configure_tlb( - mmio_device_id, CoreCoord(DEVICE_DATA.DRAM_CHANNEL_0_X, DEVICE_DATA.DRAM_CHANNEL_0_Y), tlb_id, peer_dram_offset); - // Align address space of 16MB TLB to 16MB boundary -#ifndef ARCH_BLACKHOLE // TODO (abhullar): clean this up - peer_dram_offset += DEVICE_DATA.DYNAMIC_TLB_16M_SIZE; -#endif - } - this->get_driver(mmio_device_id).setup_core_to_tlb_map([](CoreCoord core) { return get_static_tlb_index(core); }); -} - void Cluster::start_driver(chip_id_t mmio_device_id, tt_device_params &device_params) const { device_params.init_device = true; @@ -360,7 +244,7 @@ void Cluster::start_driver(chip_id_t mmio_device_id, tt_device_params &device_pa // static TLBs avoided for Blackhole bring up if (this->target_type_ == TargetDevice::Silicon && device_params.init_device && this->arch_ != tt::ARCH::BLACKHOLE) { - configure_static_tlbs(mmio_device_id); + ll_api::configure_static_tlbs(this->arch_, mmio_device_id, this->get_soc_desc(mmio_device_id), this->get_driver(mmio_device_id)); } this->mmio_device_id_to_driver_.at(mmio_device_id)->start_device(device_params); diff --git a/tt_metal/llrt/tt_cluster.hpp b/tt_metal/llrt/tt_cluster.hpp index dc0a5eddf0a..a17662ffffb 100644 --- a/tt_metal/llrt/tt_cluster.hpp +++ b/tt_metal/llrt/tt_cluster.hpp @@ -185,7 +185,6 @@ class Cluster { tt_device &get_driver(chip_id_t device_id) const; void get_metal_desc_from_tt_desc(const std::unordered_map &input, const std::unordered_map &per_chip_id_harvesting_masks); tt_cxy_pair convert_physical_cxy_to_virtual(const tt_cxy_pair &physical_cxy) const; - void configure_static_tlbs(chip_id_t mmio_device_id) const; // Returns map of connected chip ids to active ethernet cores std::unordered_map> get_ethernet_cores_grouped_by_connected_chips(