diff --git a/device/CMakeLists.txt b/device/CMakeLists.txt index 151afff0..7eed3b49 100644 --- a/device/CMakeLists.txt +++ b/device/CMakeLists.txt @@ -26,6 +26,7 @@ target_sources( chip/local_chip.cpp chip/mock_chip.cpp chip/remote_chip.cpp + tt_device/tlb_manager.cpp cluster.cpp coordinate_manager.cpp cpuset_lib.cpp diff --git a/device/api/umd/device/architecture_implementation.h b/device/api/umd/device/architecture_implementation.h index 8efc165b..dbf78be3 100644 --- a/device/api/umd/device/architecture_implementation.h +++ b/device/api/umd/device/architecture_implementation.h @@ -65,6 +65,12 @@ class architecture_implementation { virtual const std::vector& get_t6_x_locations() const = 0; virtual const std::vector& get_t6_y_locations() const = 0; + // TLB related. Move other functions here as well. + virtual std::pair get_tlb_1m_base_and_count() const = 0; + virtual std::pair get_tlb_2m_base_and_count() const = 0; + virtual std::pair get_tlb_16m_base_and_count() const = 0; + virtual std::pair get_tlb_4g_base_and_count() const = 0; + virtual std::tuple multicast_workaround(xy_pair start, xy_pair end) const = 0; virtual tlb_configuration get_tlb_configuration(uint32_t tlb_index) const = 0; virtual std::pair get_tlb_data( diff --git a/device/api/umd/device/blackhole_implementation.h b/device/api/umd/device/blackhole_implementation.h index d2820403..0ac0c805 100644 --- a/device/api/umd/device/blackhole_implementation.h +++ b/device/api/umd/device/blackhole_implementation.h @@ -304,6 +304,18 @@ class blackhole_implementation : public architecture_implementation { const std::vector& get_t6_y_locations() const override { return blackhole::T6_Y_LOCATIONS; } + std::pair get_tlb_1m_base_and_count() const override { return {0, 0}; } + + std::pair get_tlb_2m_base_and_count() const override { + return {blackhole::TLB_BASE_2M, blackhole::TLB_COUNT_2M}; + } + + std::pair get_tlb_16m_base_and_count() const override { return {0, 0}; } + + std::pair get_tlb_4g_base_and_count() 
const override { + return {blackhole::TLB_BASE_4G, blackhole::TLB_COUNT_4G}; + } + std::tuple multicast_workaround(xy_pair start, xy_pair end) const override; tlb_configuration get_tlb_configuration(uint32_t tlb_index) const override; std::pair get_tlb_data(std::uint32_t tlb_index, const tlb_data& data) const override; diff --git a/device/api/umd/device/chip/local_chip.h b/device/api/umd/device/chip/local_chip.h index 4ad58cee..b2a8e201 100644 --- a/device/api/umd/device/chip/local_chip.h +++ b/device/api/umd/device/chip/local_chip.h @@ -9,6 +9,7 @@ #include "umd/device/chip/chip.h" namespace tt::umd { + class LocalChip : public Chip { public: LocalChip(tt_SocDescriptor soc_descriptor, int pci_device_id); diff --git a/device/api/umd/device/cluster.h b/device/api/umd/device/cluster.h index 6df72001..a363d018 100644 --- a/device/api/umd/device/cluster.h +++ b/device/api/umd/device/cluster.h @@ -619,6 +619,7 @@ class Cluster : public tt_device { // TODO: This should be accessible through public API, probably to be moved to tt_device. PCIDevice* get_pci_device(int device_id) const; TTDevice* get_tt_device(chip_id_t device_id) const; + TLBManager* get_tlb_manager(chip_id_t device_id) const; const tt_SocDescriptor& get_soc_descriptor(chip_id_t chip_id) const; // Existing API we want to remove. UMD is transitioning to use CoreCoord instead of tt_xy_pair. @@ -836,15 +837,6 @@ class Cluster : public tt_device { uint32_t* return_3 = nullptr, uint32_t* return_4 = nullptr); - // TODO: These will be moved to a dedicated class for TLB management - bool address_in_tlb_space( - uint64_t address, uint32_t size_in_bytes, int32_t tlb_index, uint64_t tlb_size, uint32_t chip); - bool is_tlb_mapped(tt_cxy_pair target); - bool is_tlb_mapped(tt_cxy_pair target, uint64_t address, uint32_t size_in_bytes); - // Note that these maps holds only entries for local PCIe chips. 
- std::map> tlb_config_map = {}; - std::unordered_map> map_core_to_tlb_per_chip = {}; - std::shared_ptr get_mutex(const std::string& tlb_name, int logical_device_id); virtual uint32_t get_harvested_noc_rows_for_chip( int logical_device_id); // Returns one-hot encoded harvesting mask for PCIe mapped chips @@ -914,8 +906,6 @@ class Cluster : public tt_device { std::unordered_set eth_cores = {}; std::unordered_set dram_cores = {}; - std::unordered_map dynamic_tlb_config = {}; - std::unordered_map dynamic_tlb_ordering_modes = {}; std::map, std::unordered_map>>> bcast_header_cache = {}; bool perform_harvesting_on_sdesc = false; bool use_ethernet_ordered_writes = true; diff --git a/device/api/umd/device/grayskull_implementation.h b/device/api/umd/device/grayskull_implementation.h index 2313415a..4e97a507 100644 --- a/device/api/umd/device/grayskull_implementation.h +++ b/device/api/umd/device/grayskull_implementation.h @@ -294,6 +294,20 @@ class grayskull_implementation : public architecture_implementation { const std::vector& get_t6_y_locations() const override { return grayskull::T6_Y_LOCATIONS; } + std::pair get_tlb_1m_base_and_count() const override { + return {grayskull::TLB_BASE_1M, grayskull::TLB_COUNT_1M}; + } + + std::pair get_tlb_2m_base_and_count() const override { + return {grayskull::TLB_BASE_2M, grayskull::TLB_COUNT_2M}; + } + + std::pair get_tlb_16m_base_and_count() const override { + return {grayskull::TLB_BASE_16M, grayskull::TLB_COUNT_16M}; + } + + std::pair get_tlb_4g_base_and_count() const override { return {0, 0}; } + std::tuple multicast_workaround(xy_pair start, xy_pair end) const override; tlb_configuration get_tlb_configuration(uint32_t tlb_index) const override; std::pair get_tlb_data(std::uint32_t tlb_index, const tlb_data& data) const override; diff --git a/device/api/umd/device/tt_device/tlb_manager.h b/device/api/umd/device/tt_device/tlb_manager.h new file mode 100644 index 00000000..3ba1f3a4 --- /dev/null +++ 
b/device/api/umd/device/tt_device/tlb_manager.h @@ -0,0 +1,51 @@ +/* + * SPDX-FileCopyrightText: (c) 2024 Tenstorrent Inc. + * + * SPDX-License-Identifier: Apache-2.0 + */ +#pragma once + +#include + +#include "umd/device/tt_xy_pair.h" +#include "umd/device/types/tlb.h" + +namespace tt { +class Writer; +} + +namespace tt::umd { + +class TTDevice; + +class TLBManager { +public: + TLBManager(TTDevice* tt_device); + + // TODO: Think about proper API which doesn't accept two cores. + // core should be in VIRTUAL coords, and translated_core should be in TRANSLATED coords. + void configure_tlb( + tt_xy_pair core, tt_xy_pair translated_core, int32_t tlb_index, uint64_t address, uint64_t ordering); + + void set_dynamic_tlb_config(std::string fallback_tlb_name, int32_t tlb_index); + void set_dynamic_tlb_config_ordering(std::string fallback_tlb_name, uint64_t ordering); + + bool address_in_tlb_space(uint64_t address, uint32_t size_in_bytes, int32_t tlb_index, uint64_t tlb_size); + bool is_tlb_mapped(tt_xy_pair core); + bool is_tlb_mapped(tt_xy_pair core, uint64_t address, uint32_t size_in_bytes); + + tt::Writer get_static_tlb_writer(tt_xy_pair core); + tlb_configuration get_tlb_configuration(tt_xy_pair core); + + // TODO: the following members will be moved to private once enough stuff is moved out of cluster. 
+ std::unordered_map tlb_config_map_; + std::unordered_map map_core_to_tlb_; + + std::unordered_map dynamic_tlb_config_; + std::unordered_map dynamic_tlb_ordering_modes_; + +private: + TTDevice* tt_device_; +}; + +} // namespace tt::umd diff --git a/device/api/umd/device/tt_device/tt_device.h b/device/api/umd/device/tt_device/tt_device.h index 9a35c8d8..3728112d 100644 --- a/device/api/umd/device/tt_device/tt_device.h +++ b/device/api/umd/device/tt_device/tt_device.h @@ -8,6 +8,7 @@ #include "umd/device/architecture_implementation.h" #include "umd/device/pci_device.hpp" +#include "umd/device/tt_device/tlb_manager.h" // TODO: Should be moved to blackhole_architecture_implementation.h // See /vendor_ip/synopsys/052021/bh_pcie_ctl_gen5/export/configuration/DWC_pcie_ctl.h @@ -28,6 +29,8 @@ struct dynamic_tlb { namespace tt::umd { +class TLBManager; + class TTDevice { public: /** @@ -39,6 +42,9 @@ class TTDevice { architecture_implementation *get_architecture_implementation(); PCIDevice *get_pci_device(); + TLBManager *get_tlb_manager(); + + tt::ARCH get_arch(); void detect_hang_read(uint32_t data_read = c_hang_read_value); @@ -114,6 +120,7 @@ class TTDevice { protected: std::unique_ptr pci_device_; std::unique_ptr architecture_impl_; + std::unique_ptr tlb_manager_; tt::ARCH arch; bool is_hardware_hung(); diff --git a/device/api/umd/device/tt_io.hpp b/device/api/umd/device/tt_io.hpp index 174903cb..99a7f4d8 100644 --- a/device/api/umd/device/tt_io.hpp +++ b/device/api/umd/device/tt_io.hpp @@ -23,7 +23,7 @@ class Cluster; * It is the caller's responsibility to manage the lifetime of Writer objects. 
*/ class Writer { - friend class tt::umd::Cluster; + friend class tt::umd::TLBManager; public: /** diff --git a/device/api/umd/device/wormhole_implementation.h b/device/api/umd/device/wormhole_implementation.h index 0df17669..8179614a 100644 --- a/device/api/umd/device/wormhole_implementation.h +++ b/device/api/umd/device/wormhole_implementation.h @@ -328,6 +328,20 @@ class wormhole_implementation : public architecture_implementation { const std::vector& get_t6_y_locations() const override { return wormhole::T6_Y_LOCATIONS; } + std::pair get_tlb_1m_base_and_count() const override { + return {wormhole::TLB_BASE_1M, wormhole::TLB_COUNT_1M}; + } + + std::pair get_tlb_2m_base_and_count() const override { + return {wormhole::TLB_BASE_2M, wormhole::TLB_COUNT_2M}; + } + + std::pair get_tlb_16m_base_and_count() const override { + return {wormhole::TLB_BASE_16M, wormhole::TLB_COUNT_16M}; + } + + std::pair get_tlb_4g_base_and_count() const override { return {0, 0}; } + std::tuple multicast_workaround(xy_pair start, xy_pair end) const override; tlb_configuration get_tlb_configuration(uint32_t tlb_index) const override; std::pair get_tlb_data(std::uint32_t tlb_index, const tlb_data& data) const override; diff --git a/device/chip/local_chip.cpp b/device/chip/local_chip.cpp index e56a23d6..1bed106c 100644 --- a/device/chip/local_chip.cpp +++ b/device/chip/local_chip.cpp @@ -6,14 +6,26 @@ #include "umd/device/chip/local_chip.h" +#include "umd/device/tt_device/tlb_manager.h" #include "umd/device/tt_device/tt_device.h" namespace tt::umd { LocalChip::LocalChip(tt_SocDescriptor soc_descriptor, int pci_device_id) : - Chip(soc_descriptor), tt_device_(TTDevice::create(pci_device_id)) {} + Chip(soc_descriptor), tt_device_(TTDevice::create(pci_device_id)) { + auto tlb_manager = tt_device_->get_tlb_manager(); + // Setup default dynamic tlbs. 
+ tlb_manager->set_dynamic_tlb_config( + "LARGE_READ_TLB", tt_device_->get_architecture_implementation()->get_mem_large_read_tlb()); + tlb_manager->set_dynamic_tlb_config( + "LARGE_WRITE_TLB", tt_device_->get_architecture_implementation()->get_mem_large_write_tlb()); + tlb_manager->set_dynamic_tlb_config("REG_TLB", tt_device_->get_architecture_implementation()->get_reg_tlb()); + tlb_manager->set_dynamic_tlb_config( + "SMALL_READ_WRITE_TLB", tt_device_->get_architecture_implementation()->get_small_read_write_tlb()); +} TTDevice* LocalChip::get_tt_device() { return tt_device_.get(); } bool LocalChip::is_mmio_capable() const { return true; } + } // namespace tt::umd diff --git a/device/cluster.cpp b/device/cluster.cpp index a592fe67..c7c7f629 100644 --- a/device/cluster.cpp +++ b/device/cluster.cpp @@ -46,6 +46,7 @@ #include "umd/device/hugepage.h" #include "umd/device/tt_cluster_descriptor.h" #include "umd/device/tt_core_coordinates.h" +#include "umd/device/tt_device/tlb_manager.h" #include "umd/device/tt_soc_descriptor.h" #include "umd/device/types/arch.h" #include "umd/device/types/tlb.h" @@ -143,41 +144,6 @@ std::unordered_map Cluster::get_virtual_soc_descrip return soc_descs; } -bool Cluster::address_in_tlb_space( - uint64_t address, uint32_t size_in_bytes, int32_t tlb_index, uint64_t tlb_size, std::uint32_t chip) { - const auto& tlb_map = tlb_config_map.at(chip); - const auto it = tlb_map.find(tlb_index); - if (it != tlb_map.end()) { - auto mapped_address = it->second; - return address >= mapped_address && (address + size_in_bytes <= mapped_address + tlb_size); - } - return false; -} - -bool Cluster::is_tlb_mapped(tt_cxy_pair target) { - if (map_core_to_tlb_per_chip.find(target.chip) == map_core_to_tlb_per_chip.end()) { - return false; - } - - auto& map_core_to_tlb = map_core_to_tlb_per_chip.at(target.chip); - tt_xy_pair target_core = tt_xy_pair(target.x, target.y); - - return map_core_to_tlb.find(target_core) != map_core_to_tlb.end(); -} - -bool 
Cluster::is_tlb_mapped(tt_cxy_pair target, uint64_t address, uint32_t size_in_bytes) { - if (!is_tlb_mapped(target)) { - return false; - } - - auto* dev = get_tt_device(target.chip); - - int32_t tlb_index = map_core_to_tlb_per_chip.at(target.chip).at(tt_xy_pair(target.x, target.y)); - tlb_configuration tlb_description = dev->get_architecture_implementation()->get_tlb_configuration(tlb_index); - - return address_in_tlb_space(address, size_in_bytes, tlb_index, tlb_description.size, target.chip); -} - void Cluster::initialize_interprocess_mutexes(int logical_device_id, bool cleanup_mutexes_in_shm) { // These mutexes are intended to be based on physical devices/pci-intf not logical. Set these up ahead of time here // (during device init) since its unsafe to modify shared state during multithreaded runtime. cleanup_mutexes_in_shm @@ -191,7 +157,7 @@ void Cluster::initialize_interprocess_mutexes(int logical_device_id, bool cleanu std::string mutex_name = ""; // Initialize Dynamic TLB mutexes - for (auto& tlb : dynamic_tlb_config) { + for (auto& tlb : get_tlb_manager(logical_device_id)->dynamic_tlb_config_) { mutex_name = tlb.first + std::to_string(logical_device_id); if (cleanup_mutexes_in_shm) { named_mutex::remove(mutex_name.c_str()); @@ -266,6 +232,7 @@ void Cluster::create_device( pci_device->get_device_num(), pci_device->revision_id); + // TODO: This will be moved to a dedicated Locking class. initialize_interprocess_mutexes(logical_device_id, clean_system_resources); // MT: Initial BH - hugepages will fail init @@ -330,18 +297,6 @@ void Cluster::construct_cluster( perform_harvesting_on_sdesc = perform_harvesting; - // It is mandatory for all devices to have these TLBs set aside, as the driver needs them to issue remote reads and - // writes. 
- auto architecture_implementation = tt::umd::architecture_implementation::create(arch_name); - dynamic_tlb_config["LARGE_READ_TLB"] = architecture_implementation->get_mem_large_read_tlb(); - dynamic_tlb_config["LARGE_WRITE_TLB"] = architecture_implementation->get_mem_large_write_tlb(); - dynamic_tlb_config["REG_TLB"] = architecture_implementation->get_reg_tlb(); - dynamic_tlb_config["SMALL_READ_WRITE_TLB"] = architecture_implementation->get_small_read_write_tlb(); - - // All dynamic TLBs use Relaxed Ordering by default - for (const auto& tlb : dynamic_tlb_config) { - dynamic_tlb_ordering_modes.insert({tlb.first, TLB_DATA::Relaxed}); - } create_device(local_chip_ids_, num_host_mem_ch_per_mmio_device, skip_driver_allocs, clean_system_resources); // MT: Initial BH - Disable dependency to ethernet firmware @@ -471,20 +426,21 @@ void Cluster::construct_cluster( } } + auto any_architecture_implementation = get_tt_device(*local_chip_ids_.begin())->get_architecture_implementation(); // Default initialize l1_address_params based on detected arch - l1_address_params = architecture_implementation->get_l1_address_params(); + l1_address_params = any_architecture_implementation->get_l1_address_params(); // Default initialize dram_address_params. 
dram_address_params = {0u}; // Default initialize host_address_params based on detected arch - host_address_params = architecture_implementation->get_host_address_params(); + host_address_params = any_architecture_implementation->get_host_address_params(); // Default initialize eth_interface_params based on detected arch - eth_interface_params = architecture_implementation->get_eth_interface_params(); + eth_interface_params = any_architecture_implementation->get_eth_interface_params(); // Default initialize noc_params based on detected arch - noc_params = architecture_implementation->get_noc_params(); + noc_params = any_architecture_implementation->get_noc_params(); } std::unique_ptr Cluster::construct_chip_from_cluster( @@ -1075,24 +1031,7 @@ std::function Cluster::get_fast_pcie_s } tt::Writer Cluster::get_static_tlb_writer(tt_cxy_pair target) { - if (!cluster_desc->is_chip_mmio_capable(target.chip)) { - throw std::runtime_error(fmt::format("Target not in MMIO chip: {}", target.str())); - } - - if (!is_tlb_mapped(target)) { - throw std::runtime_error(fmt::format("TLBs not initialized for core: {}", target.str())); - } - - auto* dev = get_tt_device(target.chip); - if (!dev->get_pci_device()->bar0_wc) { - throw std::runtime_error("No write-combined mapping for BAR0"); - } - - auto tlb_index = map_core_to_tlb_per_chip.at(target.chip).at(tt_xy_pair(target.x, target.y)); - tlb_configuration tlb_description = dev->get_architecture_implementation()->get_tlb_configuration(tlb_index); - - auto* base = reinterpret_cast(dev->get_pci_device()->bar0_wc); - return tt::Writer(base + tlb_description.tlb_offset, tlb_description.size); + return get_tlb_manager(target.chip)->get_static_tlb_writer({target.x, target.y}); } tt::Writer Cluster::get_static_tlb_writer(const chip_id_t chip, const CoreCoord target) { @@ -1119,9 +1058,8 @@ void Cluster::write_device_memory( size_in_bytes, small_access); - if (is_tlb_mapped(target, address, size_in_bytes)) { - tlb_configuration 
tlb_description = dev->get_architecture_implementation()->get_tlb_configuration( - map_core_to_tlb_per_chip.at(target.chip).at(tt_xy_pair(target.x, target.y))); + if (get_tlb_manager(target.chip)->is_tlb_mapped({target.x, target.y}, address, size_in_bytes)) { + tlb_configuration tlb_description = get_tlb_manager(target.chip)->get_tlb_configuration({target.x, target.y}); if (dev->get_pci_device()->bar4_wc != nullptr && tlb_description.size == BH_4GB_TLB_SIZE) { // This is only for Blackhole. If we want to write to DRAM (BAR4 space), we add offset // to which we write so write_block knows it needs to target BAR4 @@ -1133,7 +1071,7 @@ void Cluster::write_device_memory( dev->write_block(tlb_description.tlb_offset + address % tlb_description.size, size_in_bytes, buffer_addr); } } else { - const auto tlb_index = dynamic_tlb_config.at(fallback_tlb); + const auto tlb_index = get_tlb_manager(target.chip)->dynamic_tlb_config_.at(fallback_tlb); const scoped_lock lock(*get_mutex(fallback_tlb, target.chip)); while (size_in_bytes > 0) { @@ -1141,7 +1079,7 @@ void Cluster::write_device_memory( tlb_index, harvested_coord_translation.at(target.chip).at(target), address, - dynamic_tlb_ordering_modes.at(fallback_tlb)); + get_tlb_manager(target.chip)->dynamic_tlb_ordering_modes_.at(fallback_tlb)); uint32_t transfer_size = std::min((uint64_t)size_in_bytes, tlb_size); dev->write_block(mapped_address, transfer_size, buffer_addr); @@ -1168,9 +1106,8 @@ void Cluster::read_device_memory( log_debug(LogSiliconDriver, " tlb_index: {}, tlb_data.has_value(): {}", tlb_index, tlb_data.has_value()); - if (is_tlb_mapped(target, address, size_in_bytes)) { - tlb_configuration tlb_description = dev->get_architecture_implementation()->get_tlb_configuration( - map_core_to_tlb_per_chip.at(target.chip).at(tt_xy_pair(target.x, target.y))); + if (get_tlb_manager(target.chip)->is_tlb_mapped({target.x, target.y}, address, size_in_bytes)) { + tlb_configuration tlb_description = 
get_tlb_manager(target.chip)->get_tlb_configuration({target.x, target.y}); if (dev->get_pci_device()->bar4_wc != nullptr && tlb_description.size == BH_4GB_TLB_SIZE) { // This is only for Blackhole. If we want to read from DRAM (BAR4 space), we add offset // from which we read so read_block knows it needs to target BAR4 @@ -1187,7 +1124,7 @@ void Cluster::read_device_memory( tlb_description.tlb_offset, tlb_description.size); } else { - const auto tlb_index = dynamic_tlb_config.at(fallback_tlb); + const auto tlb_index = get_tlb_manager(target.chip)->dynamic_tlb_config_.at(fallback_tlb); const scoped_lock lock(*get_mutex(fallback_tlb, target.chip)); log_debug(LogSiliconDriver, " dynamic tlb_index: {}", tlb_index); while (size_in_bytes > 0) { @@ -1195,7 +1132,7 @@ void Cluster::read_device_memory( tlb_index, harvested_coord_translation.at(target.chip).at(target), address, - dynamic_tlb_ordering_modes.at(fallback_tlb)); + get_tlb_manager(target.chip)->dynamic_tlb_ordering_modes_.at(fallback_tlb)); uint32_t transfer_size = std::min((uint64_t)size_in_bytes, tlb_size); dev->read_block(mapped_address, transfer_size, buffer_addr); @@ -1339,21 +1276,15 @@ Cluster::~Cluster() { cleanup_shared_host_state(); cluster_desc.reset(); - dynamic_tlb_config.clear(); - tlb_config_map.clear(); - dynamic_tlb_ordering_modes.clear(); } std::optional> Cluster::get_tlb_data_from_target(const tt_cxy_pair& target) { - auto tlb_configuration = get_tlb_configuration(target); + tlb_configuration tlb_configuration = get_tlb_configuration(target); return std::tuple((uint32_t)tlb_configuration.tlb_offset, (uint32_t)tlb_configuration.size); } tlb_configuration Cluster::get_tlb_configuration(const tt_cxy_pair& target) { - log_assert(is_tlb_mapped(target), "TLB not mapped for core: {}", target.str()); - - int tlb_index = map_core_to_tlb_per_chip.at(target.chip).at(tt_xy_pair(target.x, target.y)); - return get_tt_device(target.chip)->get_architecture_implementation()->get_tlb_configuration(tlb_index); + 
return get_tlb_manager(target.chip)->get_tlb_configuration({target.x, target.y}); } std::optional> Cluster::get_tlb_data_from_target(const chip_id_t chip, CoreCoord core) { @@ -1368,32 +1299,8 @@ tlb_configuration Cluster::get_tlb_configuration(const chip_id_t chip, CoreCoord void Cluster::configure_tlb( chip_id_t logical_device_id, tt_xy_pair core, int32_t tlb_index, uint64_t address, uint64_t ordering) { - log_assert( - ordering == TLB_DATA::Strict || ordering == TLB_DATA::Posted || ordering == TLB_DATA::Relaxed, - "Invalid ordering specified in Cluster::configure_tlb"); - if (tlb_config_map.find(logical_device_id) == tlb_config_map.end()) { - tlb_config_map.insert({logical_device_id, {}}); - map_core_to_tlb_per_chip.insert({logical_device_id, {}}); - } - log_debug( - LogSiliconDriver, - "Configuring TLB for chip: {} core: {} tlb_index: {} address: {} ordering: {}", - logical_device_id, - core.str(), - tlb_index, - address, - ordering); - log_assert( - tlb_config_map.at(logical_device_id).find(tlb_index) == tlb_config_map.at(logical_device_id).end(), - "TLB index already configured {}", - tlb_index); - - TTDevice* tt_device = get_tt_device(logical_device_id); - tt_device->set_dynamic_tlb( - tlb_index, harvested_coord_translation.at(logical_device_id).at(core), address, ordering); - uint64_t tlb_size = tt_device->get_architecture_implementation()->get_tlb_configuration(tlb_index).size; - tlb_config_map.at(logical_device_id).insert({tlb_index, (address / tlb_size) * tlb_size}); - map_core_to_tlb_per_chip.at(logical_device_id).insert({core, tlb_index}); + get_tlb_manager(logical_device_id) + ->configure_tlb(core, harvested_coord_translation.at(logical_device_id).at(core), tlb_index, address, ordering); } void Cluster::configure_tlb( @@ -1403,16 +1310,9 @@ void Cluster::configure_tlb( } void Cluster::set_fallback_tlb_ordering_mode(const std::string& fallback_tlb, uint64_t ordering) { - log_assert( - ordering == TLB_DATA::Strict || ordering == TLB_DATA::Posted || 
ordering == TLB_DATA::Relaxed, - "Invalid ordering specified in Cluster::configure_tlb."); - log_assert( - dynamic_tlb_ordering_modes.find(fallback_tlb) != dynamic_tlb_ordering_modes.end(), - "Invalid TLB specified in Cluster::set_fallback_tlb_ordering_mode."); - log_assert( - fallback_tlb != "LARGE_READ_TLB" && fallback_tlb != "LARGE_WRITE_TLB", - "Ordering modes for LARGE_READ_TLB and LARGE_WRITE_TLB cannot be modified."); - dynamic_tlb_ordering_modes.at(fallback_tlb) = ordering; + for (auto& chip_id : local_chip_ids_) { + get_tlb_manager(chip_id)->set_dynamic_tlb_config_ordering(fallback_tlb, ordering); + } } // TODO: this is in the wrong place, it should be in the TTDevice. @@ -1758,14 +1658,20 @@ void* Cluster::host_dma_address(std::uint64_t offset, chip_id_t src_device_id, u } } -// Wrapper for throwing more helpful exception when not-enabled pci intf is accessed. +// Wrapper for throwing a more helpful exception when trying to access non pci enabled interface. inline TTDevice* Cluster::get_tt_device(chip_id_t device_id) const { - log_assert(chips_.find(device_id) != chips_.end(), "Device id {} not found in cluster.", device_id); - auto tt_device = chips_.at(device_id)->get_tt_device(); + auto chip_it = chips_.find(device_id); + log_assert(chip_it != chips_.end(), "Device id {} not found in cluster.", device_id); + auto tt_device = chip_it->second->get_tt_device(); log_assert(tt_device != nullptr, "TTDevice not found for device: {}", device_id); return tt_device; } +// Wrapper for throwing a more helpful exception when trying to access non pci enabled interface. 
+inline TLBManager* Cluster::get_tlb_manager(chip_id_t device_id) const { + return get_tt_device(device_id)->get_tlb_manager(); +} + std::shared_ptr Cluster::get_mutex( const std::string& tlb_name, int logical_device_id) { std::string mutex_name = tlb_name + std::to_string(logical_device_id); @@ -2556,7 +2462,7 @@ void Cluster::pcie_broadcast_write( // Use the specified TLB to broadcast data to all cores included in the [start, end] grid -> GS Only. Use Ethernet // Broadcast for WH. TTDevice* tt_device = get_tt_device(chip); - const auto tlb_index = dynamic_tlb_config.at(fallback_tlb); + const auto tlb_index = get_tlb_manager(chip)->dynamic_tlb_config_.at(fallback_tlb); const uint8_t* buffer_addr = static_cast(mem_ptr); const scoped_lock lock(*get_mutex(fallback_tlb, chip)); while (size_in_bytes > 0) { @@ -2565,7 +2471,7 @@ void Cluster::pcie_broadcast_write( addr, harvested_coord_translation.at(chip).at(start), harvested_coord_translation.at(chip).at(end), - dynamic_tlb_ordering_modes.at(fallback_tlb)); + get_tlb_manager(chip)->dynamic_tlb_ordering_modes_.at(fallback_tlb)); uint64_t transfer_size = std::min((uint64_t)size_in_bytes, tlb_size); tt_device->write_block(mapped_address, transfer_size, buffer_addr); @@ -3071,7 +2977,7 @@ void Cluster::read_mmio_device_register( void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) { TTDevice* tt_device = get_tt_device(core.chip); - const auto tlb_index = dynamic_tlb_config.at(fallback_tlb); + const auto tlb_index = get_tlb_manager(core.chip)->dynamic_tlb_config_.at(fallback_tlb); const scoped_lock lock(*get_mutex(fallback_tlb, core.chip)); log_debug(LogSiliconDriver, " dynamic tlb_index: {}", tlb_index); @@ -3091,7 +2997,7 @@ void Cluster::write_mmio_device_register( const void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) { TTDevice* tt_device = get_tt_device(core.chip); - const auto tlb_index = dynamic_tlb_config.at(fallback_tlb); + 
const auto tlb_index = get_tlb_manager(core.chip)->dynamic_tlb_config_.at(fallback_tlb); const scoped_lock lock(*get_mutex(fallback_tlb, core.chip)); log_debug(LogSiliconDriver, " dynamic tlb_index: {}", tlb_index);
diff --git a/device/tt_device/tlb_manager.cpp b/device/tt_device/tlb_manager.cpp
new file mode 100644
index 00000000..9f5da461
--- /dev/null
+++ b/device/tt_device/tlb_manager.cpp
@@ -0,0 +1,129 @@
+/*
+ * SPDX-FileCopyrightText: (c) 2024 Tenstorrent Inc.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "umd/device/tt_device/tlb_manager.h"
+
+#include "logger.hpp"
+#include "umd/device/tt_device/tt_device.h"
+#include "umd/device/tt_io.hpp"
+#include "umd/device/types/tlb.h"
+
+namespace tt::umd {
+
+// Ordering mode that set_dynamic_tlb_config() assigns to a newly registered dynamic TLB.
+static constexpr uint64_t DEFAULT_ORDERING_MODE = tlb_data::Relaxed;
+
+// Keeps the TTDevice pointer it is given; the methods below reach the hardware through it.
+TLBManager::TLBManager(TTDevice* tt_device) : tt_device_(tt_device) {}
+
+// Programs TLB `tlb_index` on the device to target `translated_core` at `address`, then records the
+// TLB-size-aligned base address for that index and the `core` -> TLB index association.
+// Asserts that `ordering` is Strict, Posted or Relaxed and that `tlb_index` was not configured before.
+void TLBManager::configure_tlb(
+    tt_xy_pair core, tt_xy_pair translated_core, int32_t tlb_index, uint64_t address, uint64_t ordering) {
+    log_assert(
+        ordering == tlb_data::Strict || ordering == tlb_data::Posted || ordering == tlb_data::Relaxed,
+        "Invalid ordering specified in TLBManager::configure_tlb");
+    // No chip id in the message: this function is not given one.
+    log_debug(
+        LogSiliconDriver,
+        "Configuring TLB for core: {} tlb_index: {} address: {} ordering: {}",
+        core.str(),
+        tlb_index,
+        address,
+        ordering);
+    log_assert(tlb_config_map_.find(tlb_index) == tlb_config_map_.end(), "TLB index already configured {}", tlb_index);
+
+    tt_device_->set_dynamic_tlb(tlb_index, translated_core, address, ordering);
+    auto tlb_size = tt_device_->get_architecture_implementation()->get_tlb_configuration(tlb_index).size;
+    // Round the address down to a multiple of the TLB size before recording it.
+    tlb_config_map_.insert({tlb_index, (address / tlb_size) * tlb_size});
+    map_core_to_tlb_.insert({core, tlb_index});
+}
+
+// Registers `tlb_index` under `fallback_tlb_name` and gives it DEFAULT_ORDERING_MODE.
+// Asserts that the name has not been registered before.
+void TLBManager::set_dynamic_tlb_config(std::string fallback_tlb_name, int32_t tlb_index) {
+    log_assert(
+        dynamic_tlb_config_.find(fallback_tlb_name) == dynamic_tlb_config_.end(),
+        "Dynamic TLB already configured for {}",
+        fallback_tlb_name);
+    dynamic_tlb_config_.insert({fallback_tlb_name, tlb_index});
+    dynamic_tlb_ordering_modes_[fallback_tlb_name] = DEFAULT_ORDERING_MODE;
+}
+
+// Changes the ordering mode of a registered dynamic TLB. Asserts on an invalid ordering, on the
+// LARGE_READ_TLB / LARGE_WRITE_TLB names, and on a name that was never registered.
+void TLBManager::set_dynamic_tlb_config_ordering(std::string fallback_tlb_name, uint64_t ordering) {
+    log_assert(
+        ordering == tlb_data::Strict || ordering == tlb_data::Posted || ordering == tlb_data::Relaxed,
+        "Invalid ordering specified in set_dynamic_tlb_config_ordering.");
+    log_assert(
+        fallback_tlb_name != "LARGE_READ_TLB" && fallback_tlb_name != "LARGE_WRITE_TLB",
+        "Ordering modes for LARGE_READ_TLB and LARGE_WRITE_TLB cannot be modified.");
+    log_assert(
+        dynamic_tlb_config_.find(fallback_tlb_name) != dynamic_tlb_config_.end(),
+        "Dynamic TLB not configured {}",
+        fallback_tlb_name);
+
+    dynamic_tlb_ordering_modes_[fallback_tlb_name] = ordering;
+}
+
+// Returns true when [address, address + size_in_bytes) fits in the `tlb_size` window whose base is
+// recorded for `tlb_index`; returns false when nothing is recorded for that index.
+bool TLBManager::address_in_tlb_space(uint64_t address, uint32_t size_in_bytes, int32_t tlb_index, uint64_t tlb_size) {
+    const auto entry = tlb_config_map_.find(tlb_index);
+    if (entry == tlb_config_map_.end()) {
+        return false;
+    }
+    const auto mapped_address = entry->second;
+    return address >= mapped_address && (address + size_in_bytes <= mapped_address + tlb_size);
+}
+
+// Returns true when a TLB has been configured for `core`.
+bool TLBManager::is_tlb_mapped(tt_xy_pair core) { return map_core_to_tlb_.find(core) != map_core_to_tlb_.end(); }
+
+// Returns true when `core` has a TLB and the requested range lies inside that TLB's recorded window.
+bool TLBManager::is_tlb_mapped(tt_xy_pair core, uint64_t address, uint32_t size_in_bytes) {
+    if (!is_tlb_mapped(core)) {
+        return false;
+    }
+
+    int32_t tlb_index = map_core_to_tlb_.at(core);
+    tlb_configuration tlb_description = tt_device_->get_architecture_implementation()->get_tlb_configuration(tlb_index);
+
+    return address_in_tlb_space(address, size_in_bytes, tlb_index, tlb_description.size);
+}
+
+// Builds a tt::Writer over the write-combined BAR0 window of the TLB configured for `core`.
+// Throws std::runtime_error when the core has no TLB or BAR0 has no write-combined mapping.
+tt::Writer TLBManager::get_static_tlb_writer(tt_xy_pair core) {
+    if (!is_tlb_mapped(core)) {
+        throw std::runtime_error(fmt::format("TLBs not initialized for core: {}", core.str()));
+    }
+
+    if (!tt_device_->get_pci_device()->bar0_wc) {
+        throw std::runtime_error("No write-combined mapping for BAR0");
+    }
+
+    auto tlb_index = map_core_to_tlb_.at(core);
+    // Named tlb_description so the local does not hide the tlb_data type.
+    auto tlb_description = tt_device_->get_architecture_implementation()->get_tlb_configuration(tlb_index);
+
+    auto* base = reinterpret_cast<uint8_t*>(tt_device_->get_pci_device()->bar0_wc);
+
+    return tt::Writer(base + tlb_description.tlb_offset, tlb_description.size);
+}
+
+// Returns the architecture's TLB configuration for the TLB mapped to `core`; asserts that the core is mapped.
+tlb_configuration TLBManager::get_tlb_configuration(tt_xy_pair core) {
+    log_assert(is_tlb_mapped(core), "TLB not mapped for core: {}", core.str());
+
+    int tlb_index = map_core_to_tlb_.at(core);
+    return tt_device_->get_architecture_implementation()->get_tlb_configuration(tlb_index);
+}
+
+}  // namespace tt::umd
diff --git a/device/tt_device/tt_device.cpp b/device/tt_device/tt_device.cpp index 8f74bad9..0f3d5db3 100644 --- a/device/tt_device/tt_device.cpp +++ b/device/tt_device/tt_device.cpp @@ -15,6 +15,7 @@ TTDevice::TTDevice( std::unique_ptr pci_device, std::unique_ptr architecture_impl) : pci_device_(std::move(pci_device)), architecture_impl_(std::move(architecture_impl)), + tlb_manager_(std::make_unique(this)), arch(architecture_impl_->get_architecture()) {} /* static */ std::unique_ptr TTDevice::create(int pci_device_number) { @@ -36,6 +37,10 @@ architecture_implementation *TTDevice::get_architecture_implementation() { retur PCIDevice *TTDevice::get_pci_device() { return pci_device_.get(); } +TLBManager *TTDevice::get_tlb_manager() { return tlb_manager_.get(); } + +tt::ARCH TTDevice::get_arch() { return arch; } + bool TTDevice::is_hardware_hung() { volatile const void *addr = reinterpret_cast(pci_device_->bar0_uc) + (architecture_impl_->get_arc_reset_scratch_offset() + 6 * 4) -
diff --git a/tests/api/CMakeLists.txt b/tests/api/CMakeLists.txt index 8d12a1ae..eb189b9b 100644 --- a/tests/api/CMakeLists.txt +++ b/tests/api/CMakeLists.txt @@ -7,6 +7,7 @@ set(API_TESTS_SRCS test_core_coord_translation_bh.cpp test_mockup_device.cpp test_soc_descriptor.cpp + test_tlb_manager.cpp ) add_executable(api_tests 
${API_TESTS_SRCS})
diff --git a/tests/api/test_tlb_manager.cpp b/tests/api/test_tlb_manager.cpp
new file mode 100644
index 00000000..b9103fb0
--- /dev/null
+++ b/tests/api/test_tlb_manager.cpp
@@ -0,0 +1,84 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+// This file holds TLBManager specific API examples.
+
+#include <gtest/gtest.h>
+
+#include "umd/device/tt_device/tt_device.h"
+#include "umd/device/tt_io.hpp"
+#include "umd/device/tt_soc_descriptor.h"
+
+using namespace tt::umd;
+
+// Creates a TTDevice for the first enumerated PCI device, or returns nullptr when none is enumerated.
+std::unique_ptr<TTDevice> get_tt_device() {
+    const auto pci_device_ids = PCIDevice::enumerate_devices();
+    if (pci_device_ids.empty()) {
+        return nullptr;
+    }
+    return TTDevice::create(pci_device_ids[0]);
+}
+
+// TODO: Once default auto TLB setup is in, check it is setup properly.
+TEST(ApiTLBManager, ManualTLBConfiguration) {
+    std::unique_ptr<TTDevice> tt_device = get_tt_device();
+
+    if (tt_device == nullptr) {
+        GTEST_SKIP() << "No chips present on the system. Skipping test.";
+    }
+
+    TLBManager* tlb_manager = tt_device->get_tlb_manager();
+    tt_SocDescriptor soc_desc = tt_SocDescriptor::get_soc_descriptor_path(tt_device->get_arch());
+
+    // TODO: This should be part of TTDevice interface, not Cluster or Chip.
+    // Configure TLBs.
+    auto get_static_tlb_index = [&](tt_xy_pair core) -> int {
+        // TODO: Make this per arch.
+        bool is_worker_core = soc_desc.is_worker_core(core);
+        if (!is_worker_core) {
+            return -1;
+        }
+
+        auto tlb_index = core.x + core.y * tt_device->get_architecture_implementation()->get_grid_size_x();
+
+        auto tlb_1m_base_and_count = tt_device->get_architecture_implementation()->get_tlb_1m_base_and_count();
+        auto tlb_2m_base_and_count = tt_device->get_architecture_implementation()->get_tlb_2m_base_and_count();
+
+        // Use either 1mb or 2mb tlbs.
+        if (tlb_1m_base_and_count.second > 0) {
+            // Expect that tlb index is within the number of 1mb TLBs.
+            EXPECT_TRUE(tlb_index < tlb_1m_base_and_count.second);
+            tlb_index += tlb_1m_base_and_count.first;
+        } else {
+            // Expect that tlb index is within the number of 2mb TLBs.
+            EXPECT_TRUE(tlb_index < tlb_2m_base_and_count.second);
+            tlb_index += tlb_2m_base_and_count.first;
+        }
+
+        return tlb_index;
+    };
+
+    std::int32_t c_zero_address = 0;
+
+    for (tt_xy_pair core : soc_desc.get_cores(CoreType::TENSIX)) {
+        tlb_manager->configure_tlb(core, core, get_static_tlb_index(core), c_zero_address, tlb_data::Relaxed);
+    }
+
+    // So now that we have configured TLBs we can use it to interface with the TTDevice.
+    auto any_worker_core = soc_desc.get_cores(CoreType::TENSIX)[0];
+    tlb_configuration tlb_description = tlb_manager->get_tlb_configuration(any_worker_core);
+
+    // TODO: Maybe accept tlb_index only?
+    uint64_t address_l1_to_write = 0;
+    // One byte per element, so size() is the byte count passed to write_block.
+    std::vector<uint8_t> buffer_to_write = {0x01, 0x02, 0x03, 0x04};
+    tt_device->write_block(
+        tlb_description.tlb_offset + address_l1_to_write, buffer_to_write.size(), buffer_to_write.data());
+
+    // Another way to write to the TLB.
+    // TODO: This should be converted to AbstractIO writer.
+    tt::Writer writer = tlb_manager->get_static_tlb_writer(any_worker_core);
+    writer.write(address_l1_to_write, buffer_to_write[0]);
+}