From 66898acfed710d2e3233dbce2d5b422bbc6607e0 Mon Sep 17 00:00:00 2001 From: Bojan Rosko Date: Wed, 18 Dec 2024 08:45:30 +0000 Subject: [PATCH 01/17] wip --- device/api/umd/device/chip/chip.h | 2 ++ device/api/umd/device/chip/local_chip.h | 9 ++++++ device/api/umd/device/cluster.h | 3 -- device/chip/chip.cpp | 2 ++ device/chip/local_chip.cpp | 1 + device/chip/tlb_manager.cpp | 43 +++++++++++++++++++++++++ device/chip/tlb_manager.h | 25 ++++++++++++++ device/cluster.cpp | 27 +--------------- 8 files changed, 83 insertions(+), 29 deletions(-) create mode 100644 device/chip/tlb_manager.cpp create mode 100644 device/chip/tlb_manager.h diff --git a/device/api/umd/device/chip/chip.h b/device/api/umd/device/chip/chip.h index aa9a0151..77cfb6e3 100644 --- a/device/api/umd/device/chip/chip.h +++ b/device/api/umd/device/chip/chip.h @@ -12,6 +12,7 @@ namespace tt::umd { class TTDevice; +class TLBManager; // An abstract class that represents a chip. class Chip { @@ -23,6 +24,7 @@ class Chip { tt_SocDescriptor& get_soc_descriptor(); virtual TTDevice* get_tt_device(); + virtual TLBManager* get_tlb_manager(); virtual bool is_mmio_capable() const = 0; diff --git a/device/api/umd/device/chip/local_chip.h b/device/api/umd/device/chip/local_chip.h index 4ad58cee..2f06ccd5 100644 --- a/device/api/umd/device/chip/local_chip.h +++ b/device/api/umd/device/chip/local_chip.h @@ -9,6 +9,9 @@ #include "umd/device/chip/chip.h" namespace tt::umd { + +class TLBManager; + class LocalChip : public Chip { public: LocalChip(tt_SocDescriptor soc_descriptor, int pci_device_id); @@ -17,7 +20,13 @@ class LocalChip : public Chip { bool is_mmio_capable() const override; + // TLB related functions + // void configure_tlb(tt_xy_pair core, int32_t tlb_index, uint64_t address, uint64_t ordering); + + // TODO: remove this function once write and read is moved to chip class. + private: std::unique_ptr tt_device_; + std::unique_ptr tlb_manager_; }; } // namespace tt::umd diff --git a/device/api/umd/device/cluster.h b/device/api/umd/device/cluster.h index 6df72001..d9b30f4f 100644 --- a/device/api/umd/device/cluster.h +++ b/device/api/umd/device/cluster.h @@ -841,9 +841,6 @@ class Cluster : public tt_device { uint64_t address, uint32_t size_in_bytes, int32_t tlb_index, uint64_t tlb_size, uint32_t chip); bool is_tlb_mapped(tt_cxy_pair target); bool is_tlb_mapped(tt_cxy_pair target, uint64_t address, uint32_t size_in_bytes); - // Note that these maps holds only entries for local PCIe chips. - std::map> tlb_config_map = {}; - std::unordered_map> map_core_to_tlb_per_chip = {}; std::shared_ptr get_mutex(const std::string& tlb_name, int logical_device_id); virtual uint32_t get_harvested_noc_rows_for_chip( diff --git a/device/chip/chip.cpp b/device/chip/chip.cpp index 9f995f77..5b28b3e1 100644 --- a/device/chip/chip.cpp +++ b/device/chip/chip.cpp @@ -14,4 +14,6 @@ tt_SocDescriptor& Chip::get_soc_descriptor() { return soc_descriptor_; } TTDevice* Chip::get_tt_device() { return nullptr; } +TLBManager* Chip::get_tlb_manager() { return nullptr; } + } // namespace tt::umd diff --git a/device/chip/local_chip.cpp b/device/chip/local_chip.cpp index e56a23d6..179145e3 100644 --- a/device/chip/local_chip.cpp +++ b/device/chip/local_chip.cpp @@ -16,4 +16,5 @@ LocalChip::LocalChip(tt_SocDescriptor soc_descriptor, int pci_device_id) : TTDevice* LocalChip::get_tt_device() { return tt_device_.get(); } bool LocalChip::is_mmio_capable() const { return true; } + } // namespace tt::umd diff --git a/device/chip/tlb_manager.cpp b/device/chip/tlb_manager.cpp new file mode 100644 index 00000000..ceb76889 --- /dev/null +++ b/device/chip/tlb_manager.cpp @@ -0,0 +1,43 @@ +/* + * SPDX-FileCopyrightText: (c) 2024 Tenstorrent Inc. + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "chip/tlb_manager.h" + +#include "common/logger.hpp" +#include "device/types/tlb.h" +#include "umd/device/tt_device/tt_device.h" + +namespace tt::umd { + +TLBManager::TLBManager(TTDevice* tt_device) : tt_device_(tt_device) {} + +void TLBManager::configure_tlb(tt_xy_pair core, int32_t tlb_index, uint64_t address, uint64_t ordering) { + log_assert( + ordering == tlb_data::Strict || ordering == tlb_data::Posted || ordering == tlb_data::Relaxed, + "Invalid ordering specified in Cluster::configure_tlb"); + if (tlb_config_map.find(logical_device_id) == tlb_config_map.end()) { + tlb_config_map.insert({logical_device_id, {}}); + map_core_to_tlb_per_chip.insert({logical_device_id, {}}); + } + log_debug( + LogSiliconDriver, + "Configuring TLB for chip: {} core: {} tlb_index: {} address: {} ordering: {}", + logical_device_id, + core.str(), + tlb_index, + address, + ordering); + log_assert( + tlb_config_map.at(logical_device_id).find(tlb_index) == tlb_config_map.at(logical_device_id).end(), + "TLB index already configured {}", + tlb_index); + + tt_device_->set_dynamic_tlb(tlb_index, core, address, harvested_coord_translation.at(logical_device_id), ordering); + auto tlb_size = std::get<1>(tt_device_->get_architecture_implementation()->describe_tlb(tlb_index).value()); + tlb_config_map.at(logical_device_id).insert({tlb_index, (address / tlb_size) * tlb_size}); + map_core_to_tlb_per_chip.at(logical_device_id).insert({core, tlb_index}); +} +}; // namespace tt::umd diff --git a/device/chip/tlb_manager.h b/device/chip/tlb_manager.h new file mode 100644 index 00000000..ab9824f0 --- /dev/null +++ b/device/chip/tlb_manager.h @@ -0,0 +1,25 @@ +/* + * SPDX-FileCopyrightText: (c) 2024 Tenstorrent Inc. + * + * SPDX-License-Identifier: Apache-2.0 + */ +#pragma once + +#include "umd/device/tt_xy_pair.h" + +namespace tt::umd { + +class TTDevice; + +class TLBManager { +public: + TLBManager(TTDevice* tt_device); + void configure_tlb(tt_xy_pair core, int32_t tlb_index, uint64_t address, uint64_t ordering); + +private: + TTDevice* tt_device_; + std::unordered_map tlb_config_map_; + std::unordered_map map_core_to_tlb_; +}; + +} // namespace tt::umd diff --git a/device/cluster.cpp b/device/cluster.cpp index a592fe67..758929ce 100644 --- a/device/cluster.cpp +++ b/device/cluster.cpp @@ -1368,32 +1368,7 @@ tlb_configuration Cluster::get_tlb_configuration(const chip_id_t chip, CoreCoord void Cluster::configure_tlb( chip_id_t logical_device_id, tt_xy_pair core, int32_t tlb_index, uint64_t address, uint64_t ordering) { - log_assert( - ordering == TLB_DATA::Strict || ordering == TLB_DATA::Posted || ordering == TLB_DATA::Relaxed, - "Invalid ordering specified in Cluster::configure_tlb"); - if (tlb_config_map.find(logical_device_id) == tlb_config_map.end()) { - tlb_config_map.insert({logical_device_id, {}}); - map_core_to_tlb_per_chip.insert({logical_device_id, {}}); - } - log_debug( - LogSiliconDriver, - "Configuring TLB for chip: {} core: {} tlb_index: {} address: {} ordering: {}", - logical_device_id, - core.str(), - tlb_index, - address, - ordering); - log_assert( - tlb_config_map.at(logical_device_id).find(tlb_index) == tlb_config_map.at(logical_device_id).end(), - "TLB index already configured {}", - tlb_index); - - TTDevice* tt_device = get_tt_device(logical_device_id); - tt_device->set_dynamic_tlb( - tlb_index, harvested_coord_translation.at(logical_device_id).at(core), address, ordering); - uint64_t tlb_size = tt_device->get_architecture_implementation()->get_tlb_configuration(tlb_index).size; - tlb_config_map.at(logical_device_id).insert({tlb_index, (address / tlb_size) * tlb_size}); - map_core_to_tlb_per_chip.at(logical_device_id).insert({core, tlb_index}); + chips_.at(logical_device_id)->configure_tlb(core, tlb_index, address, ordering); } void Cluster::configure_tlb( From 42d5e584a45f241dcaf2bcc6af0cbf0442b4d685 Mon Sep 17 00:00:00 2001 From: Bojan Rosko Date: Wed, 18 Dec 2024 10:04:58 +0000 Subject: [PATCH 02/17] wip2 --- device/api/umd/device/chip/local_chip.h | 1 + device/api/umd/device/cluster.h | 2 -- device/chip/local_chip.cpp | 20 +++++++++++++++- device/chip/tlb_manager.cpp | 32 ++++++++++++++++--------- device/chip/tlb_manager.h | 13 ++++++++-- device/cluster.cpp | 21 ++++++---------- 6 files changed, 59 insertions(+), 30 deletions(-) diff --git a/device/api/umd/device/chip/local_chip.h b/device/api/umd/device/chip/local_chip.h index 2f06ccd5..b5b82773 100644 --- a/device/api/umd/device/chip/local_chip.h +++ b/device/api/umd/device/chip/local_chip.h @@ -17,6 +17,7 @@ class LocalChip : public Chip { LocalChip(tt_SocDescriptor soc_descriptor, int pci_device_id); TTDevice* get_tt_device() override; + TLBManager* get_tlb_manager() override; bool is_mmio_capable() const override; diff --git a/device/api/umd/device/cluster.h b/device/api/umd/device/cluster.h index d9b30f4f..c0a05087 100644 --- a/device/api/umd/device/cluster.h +++ b/device/api/umd/device/cluster.h @@ -911,8 +911,6 @@ class Cluster : public tt_device { std::unordered_set eth_cores = {}; std::unordered_set dram_cores = {}; - std::unordered_map dynamic_tlb_config = {}; - std::unordered_map dynamic_tlb_ordering_modes = {}; std::map, std::unordered_map>>> bcast_header_cache = {}; bool perform_harvesting_on_sdesc = false; bool use_ethernet_ordered_writes = true; diff --git a/device/chip/local_chip.cpp b/device/chip/local_chip.cpp index 179145e3..3f3f796e 100644 --- a/device/chip/local_chip.cpp +++ b/device/chip/local_chip.cpp @@ -6,15 +6,33 @@ #include "umd/device/chip/local_chip.h" +#include "device/chip/tlb_manager.h" #include "umd/device/tt_device/tt_device.h" namespace tt::umd { LocalChip::LocalChip(tt_SocDescriptor soc_descriptor, int pci_device_id) : - Chip(soc_descriptor), tt_device_(TTDevice::create(pci_device_id)) {} + Chip(soc_descriptor), + tt_device_(TTDevice::create(pci_device_id)), + tlb_manager_(std::make_unique(tt_device_.get())) { + // Setup default dynamic tlbs. + tlb_manager_->set_dynamic_tlb( + "LARGE_READ_TLB", tt_device_->get_architecture_implementation()->get_mem_large_read_tlb()); + tlb_manager_->set_dynamic_tlb_ordering("LARGE_READ_TLB", tlb_data::Relaxed); + tlb_manager_->set_dynamic_tlb( + "LARGE_WRITE_TLB", tt_device_->get_architecture_implementation()->get_mem_large_write_tlb()); + tlb_manager_->set_dynamic_tlb_ordering("LARGE_WRITE_TLB", tlb_data::Relaxed); + tlb_manager_->set_dynamic_tlb("REG_TLB", tt_device_->get_architecture_implementation()->get_reg_tlb()); + tlb_manager_->set_dynamic_tlb_ordering("REG_TLB", tlb_data::Relaxed); + tlb_manager_->set_dynamic_tlb( + "SMALL_READ_WRITE_TLB", tt_device_->get_architecture_implementation()->get_small_read_write_tlb()); + tlb_manager_->set_dynamic_tlb_ordering("SMALL_READ_WRITE_TLB", tlb_data::Relaxed); +} TTDevice* LocalChip::get_tt_device() { return tt_device_.get(); } +TLBManager* LocalChip::get_tlb_manager() { return tlb_manager_.get(); } + bool LocalChip::is_mmio_capable() const { return true; } } // namespace tt::umd diff --git a/device/chip/tlb_manager.cpp b/device/chip/tlb_manager.cpp index ceb76889..8b4e655b 100644 --- a/device/chip/tlb_manager.cpp +++ b/device/chip/tlb_manager.cpp @@ -18,10 +18,6 @@ void TLBManager::configure_tlb(tt_xy_pair core, int32_t tlb_index, uint64_t addr log_assert( ordering == tlb_data::Strict || ordering == tlb_data::Posted || ordering == tlb_data::Relaxed, "Invalid ordering specified in Cluster::configure_tlb"); - if (tlb_config_map.find(logical_device_id) == tlb_config_map.end()) { - tlb_config_map.insert({logical_device_id, {}}); - map_core_to_tlb_per_chip.insert({logical_device_id, {}}); - } log_debug( LogSiliconDriver, "Configuring TLB for chip: {} core: {} tlb_index: {} address: {} ordering: {}", @@ -30,14 +26,28 @@ void TLBManager::configure_tlb(tt_xy_pair core, int32_t tlb_index, uint64_t addr tlb_index, address, ordering); - log_assert( - tlb_config_map.at(logical_device_id).find(tlb_index) == tlb_config_map.at(logical_device_id).end(), - "TLB index already configured {}", - tlb_index); + log_assert(tlb_config_map_.find(tlb_index) == tlb_config_map_.end(), "TLB index already configured {}", tlb_index); - tt_device_->set_dynamic_tlb(tlb_index, core, address, harvested_coord_translation.at(logical_device_id), ordering); + tt_device_->set_dynamic_tlb(tlb_index, core, address, ordering); auto tlb_size = std::get<1>(tt_device_->get_architecture_implementation()->describe_tlb(tlb_index).value()); - tlb_config_map.at(logical_device_id).insert({tlb_index, (address / tlb_size) * tlb_size}); - map_core_to_tlb_per_chip.at(logical_device_id).insert({core, tlb_index}); + tlb_config_map_.insert({tlb_index, (address / tlb_size) * tlb_size}); + map_core_to_tlb_.insert({core, tlb_index}); +} + +void TLBManager::set_dynamic_tlb(std::string fallback_tlb_name, int32_t tlb_index) { + log_assert( + dynamic_tlb_config_.find(fallback_tlb_name) == dynamic_tlb_config_.end(), + "Dynamic TLB already configured for {}", + fallback_tlb_name); + dynamic_tlb_config_.insert({fallback_tlb_name, tlb_index}); +} + +void TLBManager::set_dynamic_tlb_ordering(std::string fallback_tlb_name, uint64_t ordering) { + log_assert( + dynamic_tlb_config_.find(fallback_tlb_name) != dynamic_tlb_config_.end(), + "Dynamic TLB not configured {}", + fallback_tlb_name); + + dynamic_tlb_ordering_modes_[fallback_tlb_name] = ordering; } }; // namespace tt::umd diff --git a/device/chip/tlb_manager.h b/device/chip/tlb_manager.h index ab9824f0..1f17336d 100644 --- a/device/chip/tlb_manager.h +++ b/device/chip/tlb_manager.h @@ -14,12 +14,21 @@ class TTDevice; class TLBManager { public: TLBManager(TTDevice* tt_device); + void configure_tlb(tt_xy_pair core, int32_t tlb_index, uint64_t address, uint64_t ordering); -private: - TTDevice* tt_device_; + void set_dynamic_tlb(std::string fallback_tlb_name, int32_t tlb_index); + void set_dynamic_tlb_ordering(std::string fallback_tlb_name, uint64_t ordering); + + // TODO: the following members will be moved to private once enough stuff is moved out of cluster. std::unordered_map tlb_config_map_; std::unordered_map map_core_to_tlb_; + + std::unordered_map dynamic_tlb_config_; + std::unordered_map dynamic_tlb_ordering_modes_; + +private: + TTDevice* tt_device_; }; } // namespace tt::umd diff --git a/device/cluster.cpp b/device/cluster.cpp index 758929ce..08f31b60 100644 --- a/device/cluster.cpp +++ b/device/cluster.cpp @@ -36,6 +36,7 @@ #include #include +#include "device/chip/tlb_manager.h" #include "api/umd/device/tt_core_coordinates.h" #include "logger.hpp" #include "umd/device/architecture_implementation.h" @@ -178,6 +179,7 @@ bool Cluster::is_tlb_mapped(tt_cxy_pair target, uint64_t address, uint32_t size_ return address_in_tlb_space(address, size_in_bytes, tlb_index, tlb_description.size, target.chip); } +void Cluster::initialize_interprocess_mutexes(int logical_device_id, bool cleanup_mutexes_in_shm) { void Cluster::initialize_interprocess_mutexes(int logical_device_id, bool cleanup_mutexes_in_shm) { // These mutexes are intended to be based on physical devices/pci-intf not logical. Set these up ahead of time here // (during device init) since its unsafe to modify shared state during multithreaded runtime. cleanup_mutexes_in_shm @@ -191,7 +193,7 @@ void Cluster::initialize_interprocess_mutexes(int logical_device_id, bool cleanu std::string mutex_name = ""; // Initialize Dynamic TLB mutexes - for (auto& tlb : dynamic_tlb_config) { + for (auto& tlb : chips_.at(logical_device_id)->get_tlb_manager()->dynamic_tlb_config_) { mutex_name = tlb.first + std::to_string(logical_device_id); if (cleanup_mutexes_in_shm) { named_mutex::remove(mutex_name.c_str()); @@ -266,6 +268,7 @@ void Cluster::create_device( pci_device->get_device_num(), pci_device->revision_id); + // TODO: This will be moved to a dedicated Locking class. initialize_interprocess_mutexes(logical_device_id, clean_system_resources); // MT: Initial BH - hugepages will fail init @@ -330,18 +333,6 @@ void Cluster::construct_cluster( perform_harvesting_on_sdesc = perform_harvesting; - // It is mandatory for all devices to have these TLBs set aside, as the driver needs them to issue remote reads and - // writes. - auto architecture_implementation = tt::umd::architecture_implementation::create(arch_name); - dynamic_tlb_config["LARGE_READ_TLB"] = architecture_implementation->get_mem_large_read_tlb(); - dynamic_tlb_config["LARGE_WRITE_TLB"] = architecture_implementation->get_mem_large_write_tlb(); - dynamic_tlb_config["REG_TLB"] = architecture_implementation->get_reg_tlb(); - dynamic_tlb_config["SMALL_READ_WRITE_TLB"] = architecture_implementation->get_small_read_write_tlb(); - - // All dynamic TLBs use Relaxed Ordering by default - for (const auto& tlb : dynamic_tlb_config) { - dynamic_tlb_ordering_modes.insert({tlb.first, TLB_DATA::Relaxed}); - } create_device(local_chip_ids_, num_host_mem_ch_per_mmio_device, skip_driver_allocs, clean_system_resources); // MT: Initial BH - Disable dependency to ethernet firmware @@ -1368,7 +1359,9 @@ tlb_configuration Cluster::get_tlb_configuration(const chip_id_t chip, CoreCoord void Cluster::configure_tlb( chip_id_t logical_device_id, tt_xy_pair core, int32_t tlb_index, uint64_t address, uint64_t ordering) { - chips_.at(logical_device_id)->configure_tlb(core, tlb_index, address, ordering); + chips_.at(logical_device_id) + ->get_tlb_manager() + ->configure_tlb(harvested_coord_translation.at(logical_device_id).at(core), tlb_index, address, ordering); } void Cluster::configure_tlb( From e1d1de8822fdccecd79ef38b4918fb5af853adc5 Mon Sep 17 00:00:00 2001 From: Bojan Rosko Date: Wed, 18 Dec 2024 11:36:43 +0000 Subject: [PATCH 03/17] wip3 --- device/api/umd/device/cluster.h | 7 +- device/api/umd/device/tt_io.hpp | 2 +- device/chip/tlb_manager.cpp | 48 ++++++++++ device/chip/tlb_manager.h | 10 +++ device/cluster.cpp | 153 ++++++++++---------------------- 5 files changed, 109 insertions(+), 111 deletions(-) diff --git a/device/api/umd/device/cluster.h b/device/api/umd/device/cluster.h index c0a05087..a363d018 100644 --- a/device/api/umd/device/cluster.h +++ b/device/api/umd/device/cluster.h @@ -619,6 +619,7 @@ class Cluster : public tt_device { // TODO: This should be accessible through public API, probably to be moved to tt_device. PCIDevice* get_pci_device(int device_id) const; TTDevice* get_tt_device(chip_id_t device_id) const; + TLBManager* get_tlb_manager(chip_id_t device_id) const; const tt_SocDescriptor& get_soc_descriptor(chip_id_t chip_id) const; // Existing API we want to remove. UMD is transitioning to use CoreCoord instead of tt_xy_pair. @@ -836,12 +837,6 @@ class Cluster : public tt_device { uint32_t* return_3 = nullptr, uint32_t* return_4 = nullptr); - // TODO: These will be moved to a dedicated class for TLB management - bool address_in_tlb_space( - uint64_t address, uint32_t size_in_bytes, int32_t tlb_index, uint64_t tlb_size, uint32_t chip); - bool is_tlb_mapped(tt_cxy_pair target); - bool is_tlb_mapped(tt_cxy_pair target, uint64_t address, uint32_t size_in_bytes); - std::shared_ptr get_mutex(const std::string& tlb_name, int logical_device_id); virtual uint32_t get_harvested_noc_rows_for_chip( int logical_device_id); // Returns one-hot encoded harvesting mask for PCIe mapped chips diff --git a/device/api/umd/device/tt_io.hpp b/device/api/umd/device/tt_io.hpp index 174903cb..99a7f4d8 100644 --- a/device/api/umd/device/tt_io.hpp +++ b/device/api/umd/device/tt_io.hpp @@ -23,7 +23,7 @@ class Cluster; * It is the caller's responsibility to manage the lifetime of Writer objects. */ class Writer { - friend class tt::umd::Cluster; + friend class tt::umd::TLBManager; public: /** diff --git a/device/chip/tlb_manager.cpp b/device/chip/tlb_manager.cpp index 8b4e655b..63c80dc7 100644 --- a/device/chip/tlb_manager.cpp +++ b/device/chip/tlb_manager.cpp @@ -9,6 +9,7 @@ #include "common/logger.hpp" #include "device/types/tlb.h" #include "umd/device/tt_device/tt_device.h" +#include "umd/device/tt_io.hpp" namespace tt::umd { @@ -43,6 +44,12 @@ void TLBManager::set_dynamic_tlb(std::string fallback_tlb_name, int32_t tlb_inde } void TLBManager::set_dynamic_tlb_ordering(std::string fallback_tlb_name, uint64_t ordering) { + log_assert( + ordering == tlb_data::Strict || ordering == tlb_data::Posted || ordering == tlb_data::Relaxed, + "Invalid ordering specified in set_dynamic_tlb_ordering."); + log_assert( + fallback_tlb_name != "LARGE_READ_TLB" && fallback_tlb_name != "LARGE_WRITE_TLB", + "Ordering modes for LARGE_READ_TLB and LARGE_WRITE_TLB cannot be modified."); log_assert( dynamic_tlb_config_.find(fallback_tlb_name) != dynamic_tlb_config_.end(), "Dynamic TLB not configured {}", @@ -50,4 +57,45 @@ void TLBManager::set_dynamic_tlb_ordering(std::string fallback_tlb_name, uint64_ dynamic_tlb_ordering_modes_[fallback_tlb_name] = ordering; } + +bool TLBManager::address_in_tlb_space(uint64_t address, uint32_t size_in_bytes, int32_t tlb_index, uint64_t tlb_size) { + if (tlb_config_map_.find(tlb_index) != tlb_config_map_.end()) { + auto mapped_address = tlb_config_map_.at(tlb_index); + return address >= mapped_address && (address + size_in_bytes <= mapped_address + tlb_size); + } + return false; +} + +bool TLBManager::is_tlb_mapped(tt_xy_pair core) { return map_core_to_tlb_.find(core) != map_core_to_tlb_.end(); } + +bool TLBManager::is_tlb_mapped(tt_xy_pair core, uint64_t address, uint32_t size_in_bytes) { + if (!is_tlb_mapped(core)) { + return false; + } + + int32_t tlb_index = map_core_to_tlb_.at(core); + auto tlb_description = tt_device_->get_architecture_implementation()->describe_tlb(tlb_index); + + return tlb_description.has_value() && + address_in_tlb_space(address, size_in_bytes, tlb_index, std::get<1>(tlb_description.value())); +} + +tt::Writer TLBManager::get_static_tlb_writer(tt_xy_pair core) { + if (!is_tlb_mapped(core)) { + throw std::runtime_error(fmt::format("TLBs not initialized for core: {}", core.str())); + } + + if (!tt_device_->get_pci_device()->bar0_wc) { + throw std::runtime_error("No write-combined mapping for BAR0"); + } + + auto tlb_index = map_core_to_tlb_.at(core); + auto tlb_data = tt_device_->get_architecture_implementation()->describe_tlb(tlb_index); + + auto [tlb_offset, tlb_size] = tlb_data.value(); + auto* base = reinterpret_cast(tt_device_->get_pci_device()->bar0_wc); + + return tt::Writer(base + tlb_offset, tlb_size); +} + }; // namespace tt::umd diff --git a/device/chip/tlb_manager.h b/device/chip/tlb_manager.h index 1f17336d..07e0dfca 100644 --- a/device/chip/tlb_manager.h +++ b/device/chip/tlb_manager.h @@ -7,6 +7,10 @@ #include "umd/device/tt_xy_pair.h" +namespace tt { +class Writer; +} + namespace tt::umd { class TTDevice; @@ -20,6 +24,12 @@ class TLBManager { void set_dynamic_tlb(std::string fallback_tlb_name, int32_t tlb_index); void set_dynamic_tlb_ordering(std::string fallback_tlb_name, uint64_t ordering); + bool address_in_tlb_space(uint64_t address, uint32_t size_in_bytes, int32_t tlb_index, uint64_t tlb_size); + bool is_tlb_mapped(tt_xy_pair core); + bool is_tlb_mapped(tt_xy_pair core, uint64_t address, uint32_t size_in_bytes); + + tt::Writer TLBManager::get_static_tlb_writer(tt_xy_pair core); + // TODO: the following members will be moved to private once enough stuff is moved out of cluster. std::unordered_map tlb_config_map_; std::unordered_map map_core_to_tlb_; diff --git a/device/cluster.cpp b/device/cluster.cpp index 08f31b60..658e0d20 100644 --- a/device/cluster.cpp +++ b/device/cluster.cpp @@ -144,42 +144,6 @@ std::unordered_map Cluster::get_virtual_soc_descrip return soc_descs; } -bool Cluster::address_in_tlb_space( - uint64_t address, uint32_t size_in_bytes, int32_t tlb_index, uint64_t tlb_size, std::uint32_t chip) { - const auto& tlb_map = tlb_config_map.at(chip); - const auto it = tlb_map.find(tlb_index); - if (it != tlb_map.end()) { - auto mapped_address = it->second; - return address >= mapped_address && (address + size_in_bytes <= mapped_address + tlb_size); - } - return false; -} - -bool Cluster::is_tlb_mapped(tt_cxy_pair target) { - if (map_core_to_tlb_per_chip.find(target.chip) == map_core_to_tlb_per_chip.end()) { - return false; - } - - auto& map_core_to_tlb = map_core_to_tlb_per_chip.at(target.chip); - tt_xy_pair target_core = tt_xy_pair(target.x, target.y); - - return map_core_to_tlb.find(target_core) != map_core_to_tlb.end(); -} - -bool Cluster::is_tlb_mapped(tt_cxy_pair target, uint64_t address, uint32_t size_in_bytes) { - if (!is_tlb_mapped(target)) { - return false; - } - - auto* dev = get_tt_device(target.chip); - - int32_t tlb_index = map_core_to_tlb_per_chip.at(target.chip).at(tt_xy_pair(target.x, target.y)); - tlb_configuration tlb_description = dev->get_architecture_implementation()->get_tlb_configuration(tlb_index); - - return address_in_tlb_space(address, size_in_bytes, tlb_index, tlb_description.size, target.chip); -} - -void Cluster::initialize_interprocess_mutexes(int logical_device_id, bool cleanup_mutexes_in_shm) { void Cluster::initialize_interprocess_mutexes(int logical_device_id, bool cleanup_mutexes_in_shm) { // These mutexes are intended to be based on physical devices/pci-intf not logical. Set these up ahead of time here // (during device init) since its unsafe to modify shared state during multithreaded runtime. cleanup_mutexes_in_shm @@ -193,7 +157,7 @@ void Cluster::initialize_interprocess_mutexes(int logical_device_id, bool cleanu std::string mutex_name = ""; // Initialize Dynamic TLB mutexes - for (auto& tlb : chips_.at(logical_device_id)->get_tlb_manager()->dynamic_tlb_config_) { + for (auto& tlb : get_tlb_manager(logical_device_id)->dynamic_tlb_config_) { mutex_name = tlb.first + std::to_string(logical_device_id); if (cleanup_mutexes_in_shm) { named_mutex::remove(mutex_name.c_str()); @@ -1066,24 +1030,7 @@ std::function Cluster::get_fast_pcie_s } tt::Writer Cluster::get_static_tlb_writer(tt_cxy_pair target) { - if (!cluster_desc->is_chip_mmio_capable(target.chip)) { - throw std::runtime_error(fmt::format("Target not in MMIO chip: {}", target.str())); - } - - if (!is_tlb_mapped(target)) { - throw std::runtime_error(fmt::format("TLBs not initialized for core: {}", target.str())); - } - - auto* dev = get_tt_device(target.chip); - if (!dev->get_pci_device()->bar0_wc) { - throw std::runtime_error("No write-combined mapping for BAR0"); - } - - auto tlb_index = map_core_to_tlb_per_chip.at(target.chip).at(tt_xy_pair(target.x, target.y)); - tlb_configuration tlb_description = dev->get_architecture_implementation()->get_tlb_configuration(tlb_index); - - auto* base = reinterpret_cast(dev->get_pci_device()->bar0_wc); - return tt::Writer(base + tlb_description.tlb_offset, tlb_description.size); + return get_tlb_manager(target.chip)->get_static_tlb_writer({target.x, target.y}); } tt::Writer Cluster::get_static_tlb_writer(const chip_id_t chip, const CoreCoord target) { @@ -1110,7 +1057,7 @@ void Cluster::write_device_memory( size_in_bytes, small_access); - if (is_tlb_mapped(target, address, size_in_bytes)) { + if (get_tlb_manager(target.chip)->is_tlb_mapped({target.x, target.y}, address, size_in_bytes)) { tlb_configuration tlb_description = dev->get_architecture_implementation()->get_tlb_configuration( map_core_to_tlb_per_chip.at(target.chip).at(tt_xy_pair(target.x, target.y))); if (dev->get_pci_device()->bar4_wc != nullptr && tlb_description.size == BH_4GB_TLB_SIZE) { @@ -1124,7 +1071,7 @@ void Cluster::write_device_memory( dev->write_block(tlb_description.tlb_offset + address % tlb_description.size, size_in_bytes, buffer_addr); } } else { - const auto tlb_index = dynamic_tlb_config.at(fallback_tlb); + const auto tlb_index = get_tlb_manager(target.chip)->dynamic_tlb_config_.at(fallback_tlb); const scoped_lock lock(*get_mutex(fallback_tlb, target.chip)); while (size_in_bytes > 0) { @@ -1132,7 +1079,7 @@ void Cluster::write_device_memory( tlb_index, harvested_coord_translation.at(target.chip).at(target), address, - dynamic_tlb_ordering_modes.at(fallback_tlb)); + get_tlb_manager(target.chip)->dynamic_tlb_ordering_modes_.at(fallback_tlb)); uint32_t transfer_size = std::min((uint64_t)size_in_bytes, tlb_size); dev->write_block(mapped_address, transfer_size, buffer_addr); @@ -1178,7 +1125,7 @@ void Cluster::read_device_memory( tlb_description.tlb_offset, tlb_description.size); } else { - const auto tlb_index = dynamic_tlb_config.at(fallback_tlb); + const auto tlb_index = get_tlb_manager(target.chip)->dynamic_tlb_config_.at(fallback_tlb); const scoped_lock lock(*get_mutex(fallback_tlb, target.chip)); log_debug(LogSiliconDriver, " dynamic tlb_index: {}", tlb_index); while (size_in_bytes > 0) { @@ -1186,7 +1133,7 @@ void Cluster::read_device_memory( tlb_index, harvested_coord_translation.at(target.chip).at(target), address, - dynamic_tlb_ordering_modes.at(fallback_tlb)); + get_tlb_manager(target.chip)->dynamic_tlb_ordering_modes_.at(fallback_tlb)); uint32_t transfer_size = std::min((uint64_t)size_in_bytes, tlb_size); dev->read_block(mapped_address, transfer_size, buffer_addr); @@ -1330,13 +1277,10 @@ Cluster::~Cluster() { cleanup_shared_host_state(); cluster_desc.reset(); - dynamic_tlb_config.clear(); - tlb_config_map.clear(); - dynamic_tlb_ordering_modes.clear(); } std::optional> Cluster::get_tlb_data_from_target(const tt_cxy_pair& target) { - auto tlb_configuration = get_tlb_configuration(target); + tlb_configuration tlb_configuration = get_tlb_configuration(target); return std::tuple((uint32_t)tlb_configuration.tlb_offset, (uint32_t)tlb_configuration.size); } @@ -1359,8 +1303,7 @@ tlb_configuration Cluster::get_tlb_configuration(const chip_id_t chip, CoreCoord void Cluster::configure_tlb( chip_id_t logical_device_id, tt_xy_pair core, int32_t tlb_index, uint64_t address, uint64_t ordering) { - chips_.at(logical_device_id) - ->get_tlb_manager() + get_tlb_manager(logical_device_id) ->configure_tlb(harvested_coord_translation.at(logical_device_id).at(core), tlb_index, address, ordering); } @@ -1371,16 +1314,9 @@ void Cluster::configure_tlb( } void Cluster::set_fallback_tlb_ordering_mode(const std::string& fallback_tlb, uint64_t ordering) { - log_assert( - ordering == TLB_DATA::Strict || ordering == TLB_DATA::Posted || ordering == TLB_DATA::Relaxed, - "Invalid ordering specified in Cluster::configure_tlb."); - log_assert( - dynamic_tlb_ordering_modes.find(fallback_tlb) != dynamic_tlb_ordering_modes.end(), - "Invalid TLB specified in Cluster::set_fallback_tlb_ordering_mode."); - log_assert( - fallback_tlb != "LARGE_READ_TLB" && fallback_tlb != "LARGE_WRITE_TLB", - "Ordering modes for LARGE_READ_TLB and LARGE_WRITE_TLB cannot be modified."); - dynamic_tlb_ordering_modes.at(fallback_tlb) = ordering; + for (auto& chip_id : local_chip_ids_) { + get_tlb_manager(chip_id)->set_dynamic_tlb_ordering(fallback_tlb, ordering); + } } // TODO: this is in the wrong place, it should be in the TTDevice. @@ -1614,8 +1550,8 @@ int Cluster::iatu_configure_peer_region( // TODO: stop doing this. It's related to HUGEPAGE_CHANNEL_3_SIZE_LIMIT. if (peer_region_id == 3) { - region_id_to_use = 4; // Hack use region 4 for channel 3..this ensures that we have a smaller chan 3 address - // space with the correct start offset + region_id_to_use = 4; // Hack use region 4 for channel 3..this ensures that we have a smaller chan 3 + // address space with the correct start offset } TTDevice* tt_device = get_tt_device(logical_device_id); @@ -1726,7 +1662,7 @@ void* Cluster::host_dma_address(std::uint64_t offset, chip_id_t src_device_id, u } } -// Wrapper for throwing more helpful exception when not-enabled pci intf is accessed. +// Wrapper for throwing a more helpful exception when trying to access non pci enabled interface. inline TTDevice* Cluster::get_tt_device(chip_id_t device_id) const { log_assert(chips_.find(device_id) != chips_.end(), "Device id {} not found in cluster.", device_id); auto tt_device = chips_.at(device_id)->get_tt_device(); @@ -1734,6 +1670,14 @@ inline TTDevice* Cluster::get_tt_device(chip_id_t device_id) const { return tt_device; } +// Wrapper for throwing a more helpful exception when trying to access non pci enabled interface. +inline TLBManager* Cluster::get_tlb_manager(chip_id_t device_id) const { + log_assert(chips_.find(device_id) != chips_.end(), "Device id {} not found in cluster.", device_id); + auto tlb_manager = chips_.at(device_id)->get_tlb_manager(); + log_assert(tlb_manager != nullptr, "TLBManager not found for device: {}", device_id); + return tlb_manager; +} + std::shared_ptr Cluster::get_mutex( const std::string& tlb_name, int logical_device_id) { std::string mutex_name = tlb_name + std::to_string(logical_device_id); @@ -1779,16 +1723,17 @@ bool Cluster::is_non_mmio_cmd_q_full(uint32_t curr_wptr, uint32_t curr_rptr) { * writes/reads to/from those wormhole chips that aren't memory mapped or directly host connected. * To get the data to or from those other chips, there is a memory transfer protocol - initiated on * the host side but carried out by any number of the ethernet cores (the ethernet core pool is dictated - * by `this->NUM_ETH_CORES_FOR_NON_MMIO_TRANSFERS`) on the MMIO chips (e.g. typically just the one chip in a galaxy). + * by `this->NUM_ETH_CORES_FOR_NON_MMIO_TRANSFERS`) on the MMIO chips (e.g. typically just the one chip in a + * galaxy). * * There is a command queue structure in ethernet core FW to accept these read/write commands. However, there is no * atomic increment (from host side) for the write pointers of these queues, nor is there any sort of other hardware * mutual exclusion (as of WH) from host side when populating commands into the queue (as in when the host pushes a * write command into the ethernet core's queue). * - * Therefore, any of these non_mmio commands from host side need to be synchronized so they don't accidentally corrupt - * each other. The finest granularity possible to synchronize on would be the command slot and wrptr (per core), - * but wrptr updates also need to be coordinated: + * Therefore, any of these non_mmio commands from host side need to be synchronized so they don't accidentally + * corrupt each other. The finest granularity possible to synchronize on would be the command slot and wrptr (per + * core), but wrptr updates also need to be coordinated: * - you can't increment wrptr unless you are writing to the next index and your write is complete * - if two threads could guarantee separate command slots, they'd need to order their wrptr updates from lowest to * highest and based on completion of command writes. @@ -1924,8 +1869,8 @@ void Cluster::write_to_non_mmio_device( uint32_t alignment_mask = sizeof(uint32_t) - 1; block_size = (block_size + alignment_mask) & ~alignment_mask; } - // For 4 byte aligned data, transfer_size always == block_size. For unaligned data, transfer_size < block_size - // in the last block + // For 4 byte aligned data, transfer_size always == block_size. For unaligned data, transfer_size < + // block_size in the last block uint64_t transfer_size = std::min(block_size, size_in_bytes - offset); // Host side data size that needs to be copied // Use block mode for broadcast @@ -2064,9 +2009,9 @@ void Cluster::write_to_non_mmio_device( } /* - * Note that this function is required to acquire the `NON_MMIO_MUTEX_NAME` mutex for interacting with the ethernet core - * (host) command queue DO NOT use `active_core` or issue any pcie reads/writes to the ethernet core prior to acquiring - * the mutex. For extra information, see the "NON_MMIO_MUTEX Usage" above + * Note that this function is required to acquire the `NON_MMIO_MUTEX_NAME` mutex for interacting with the ethernet + * core (host) command queue DO NOT use `active_core` or issue any pcie reads/writes to the ethernet core prior to + * acquiring the mutex. For extra information, see the "NON_MMIO_MUTEX Usage" above */ void Cluster::read_from_non_mmio_device(void* mem_ptr, tt_cxy_pair core, uint64_t address, uint32_t size_in_bytes) { using data_word_t = uint32_t; @@ -2435,14 +2380,14 @@ std::unordered_map>>& Cluster::get_ether // Rack byte to be set in header uint32_t rack_byte = eth_coords.rack % 4; // 1st level grouping: Group broadcasts based on the MMIO chip they must go through - // Nebula + Galaxy Topology assumption: Disjoint sets can only be present in the first shelf, with each - // set connected to host through its closest MMIO chip For the first shelf, pass broadcasts to specific - // chips through their closest MMIO chip All other shelves are fully connected galaxy grids. These are - // connected to all MMIO devices. Use any (or the first) MMIO device in the list. + // Nebula + Galaxy Topology assumption: Disjoint sets can only be present in the first shelf, with + // each set connected to host through its closest MMIO chip For the first shelf, pass broadcasts to + // specific chips through their closest MMIO chip All other shelves are fully connected galaxy + // grids. These are connected to all MMIO devices. Use any (or the first) MMIO device in the list. chip_id_t closest_mmio_chip = 0; if (eth_coords.rack == 0 && eth_coords.shelf == 0) { - // Shelf 0 + Rack 0: Either an MMIO chip or a remote chip potentially connected to host through its - // own MMIO counterpart. + // Shelf 0 + Rack 0: Either an MMIO chip or a remote chip potentially connected to host through + // its own MMIO counterpart. closest_mmio_chip = cluster_desc->get_closest_mmio_capable_chip(chip); } else { // All other shelves: Group these under the same/first MMIO chip, since all MMIO chips are @@ -2453,8 +2398,8 @@ std::unordered_map>>& Cluster::get_ether broadcast_mask_for_target_chips_per_group.end()) { broadcast_mask_for_target_chips_per_group.insert({closest_mmio_chip, {}}); } - // For each target physical chip id (local to a shelf), generate headers based on all racks and shelves - // that contain this physical id. + // For each target physical chip id (local to a shelf), generate headers based on all racks and + // shelves that contain this physical id. if (broadcast_mask_for_target_chips_per_group.at(closest_mmio_chip).find(physical_chip_id) == broadcast_mask_for_target_chips_per_group.at(closest_mmio_chip).end()) { // Target seen for the first time. @@ -2472,8 +2417,8 @@ std::unordered_map>>& Cluster::get_ether } } } - // 2nd level grouping: For each MMIO group, further group the chips based on their rack and shelf headers. The - // number of groups after this step represent the final set of broadcast grids. + // 2nd level grouping: For each MMIO group, further group the chips based on their rack and shelf headers. + // The number of groups after this step represent the final set of broadcast grids. for (auto& mmio_group : broadcast_mask_for_target_chips_per_group) { for (auto& chip : mmio_group.second) { // Generate a hash for this MMIO Chip + Rack + Shelf group @@ -2521,10 +2466,10 @@ void Cluster::pcie_broadcast_write( const tt_xy_pair& start, const tt_xy_pair& end, const std::string& fallback_tlb) { - // Use the specified TLB to broadcast data to all cores included in the [start, end] grid -> GS Only. Use Ethernet - // Broadcast for WH. + // Use the specified TLB to broadcast data to all cores included in the [start, end] grid -> GS Only. Use + // Ethernet Broadcast for WH. TTDevice* tt_device = get_tt_device(chip); - const auto tlb_index = dynamic_tlb_config.at(fallback_tlb); + const auto tlb_index = get_tlb_manager(chip)->dynamic_tlb_config_.at(fallback_tlb); const uint8_t* buffer_addr = static_cast(mem_ptr); const scoped_lock lock(*get_mutex(fallback_tlb, chip)); while (size_in_bytes > 0) { @@ -2533,7 +2478,7 @@ void Cluster::pcie_broadcast_write( addr, harvested_coord_translation.at(chip).at(start), harvested_coord_translation.at(chip).at(end), - dynamic_tlb_ordering_modes.at(fallback_tlb)); + get_tlb_manager(chip)->dynamic_tlb_ordering_modes_.at(fallback_tlb)); uint64_t transfer_size = std::min((uint64_t)size_in_bytes, tlb_size); tt_device->write_block(mapped_address, transfer_size, buffer_addr); @@ -3039,7 +2984,7 @@ void Cluster::read_mmio_device_register( void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) { TTDevice* tt_device = get_tt_device(core.chip); - const auto tlb_index = dynamic_tlb_config.at(fallback_tlb); + const auto tlb_index = get_tlb_manager(core.chip)->dynamic_tlb_config_.at(fallback_tlb); const scoped_lock lock(*get_mutex(fallback_tlb, core.chip)); log_debug(LogSiliconDriver, " dynamic tlb_index: {}", tlb_index); @@ -3059,7 +3004,7 @@ void Cluster::write_mmio_device_register( const void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) { TTDevice* tt_device = get_tt_device(core.chip); - const auto tlb_index = dynamic_tlb_config.at(fallback_tlb); + const auto tlb_index = get_tlb_manager(core.chip)->dynamic_tlb_config_.at(fallback_tlb); const scoped_lock lock(*get_mutex(fallback_tlb, core.chip)); log_debug(LogSiliconDriver, " dynamic tlb_index: {}", tlb_index); From 09886ef321f2fe57b7bdd3b44cd4ff6fba05ea73 Mon Sep 17 00:00:00 2001 From: Bojan Rosko Date: Wed, 18 Dec 2024 14:38:15 +0000 Subject: [PATCH 04/17] wip4 --- device/chip/local_chip.cpp | 2 +- device/chip/tlb_manager.cpp | 21 +++++++++++++-------- device/chip/tlb_manager.h | 3 ++- device/cluster.cpp | 22 ++++++---------------- 4 files changed, 22 insertions(+), 26 deletions(-) diff --git a/device/chip/local_chip.cpp b/device/chip/local_chip.cpp index 3f3f796e..8edd293e 100644 --- a/device/chip/local_chip.cpp +++ b/device/chip/local_chip.cpp @@ -6,7 +6,7 @@ #include "umd/device/chip/local_chip.h" -#include "device/chip/tlb_manager.h" +#include "chip/tlb_manager.h" #include "umd/device/tt_device/tt_device.h" namespace tt::umd { diff --git a/device/chip/tlb_manager.cpp b/device/chip/tlb_manager.cpp index 63c80dc7..a41e333b 100644 --- a/device/chip/tlb_manager.cpp +++ b/device/chip/tlb_manager.cpp @@ -7,9 +7,9 @@ #include "chip/tlb_manager.h" #include "common/logger.hpp" -#include "device/types/tlb.h" #include "umd/device/tt_device/tt_device.h" #include "umd/device/tt_io.hpp" +#include "umd/device/types/tlb.h" namespace tt::umd { @@ -30,7 +30,7 @@ void TLBManager::configure_tlb(tt_xy_pair core, int32_t tlb_index, uint64_t addr log_assert(tlb_config_map_.find(tlb_index) == tlb_config_map_.end(), "TLB index already configured {}", tlb_index); tt_device_->set_dynamic_tlb(tlb_index, core, address, ordering); - auto tlb_size = std::get<1>(tt_device_->get_architecture_implementation()->describe_tlb(tlb_index).value()); + auto tlb_size = tt_device_->get_architecture_implementation()->get_tlb_configuration(tlb_index).size; tlb_config_map_.insert({tlb_index, (address / tlb_size) * tlb_size}); map_core_to_tlb_.insert({core, tlb_index}); } @@ -74,10 +74,9 @@ bool TLBManager::is_tlb_mapped(tt_xy_pair core, uint64_t address, uint32_t size_ } int32_t tlb_index = map_core_to_tlb_.at(core); - auto tlb_description = tt_device_->get_architecture_implementation()->describe_tlb(tlb_index); + auto tlb_description = tt_device_->get_architecture_implementation()->get_tlb_configuration(tlb_index); - return tlb_description.has_value() && - address_in_tlb_space(address, size_in_bytes, tlb_index, std::get<1>(tlb_description.value())); + return address_in_tlb_space(address, size_in_bytes, tlb_index, tlb_description.size); } tt::Writer TLBManager::get_static_tlb_writer(tt_xy_pair core) { @@ -90,12 +89,18 @@ tt::Writer TLBManager::get_static_tlb_writer(tt_xy_pair core) { } auto tlb_index = map_core_to_tlb_.at(core); - auto tlb_data = tt_device_->get_architecture_implementation()->describe_tlb(tlb_index); + auto tlb_data = tt_device_->get_architecture_implementation()->get_tlb_configuration(tlb_index); - auto [tlb_offset, tlb_size] = tlb_data.value(); auto* base = reinterpret_cast(tt_device_->get_pci_device()->bar0_wc); - return tt::Writer(base + tlb_offset, tlb_size); + return tt::Writer(base + tlb_data.tlb_offset, tlb_data.size); +} + +tlb_configuration TLBManager::get_tlb_configuration(tt_xy_pair core) { + log_assert(is_tlb_mapped(core), "TLB not mapped for core: {}", core.str()); + + int tlb_index = map_core_to_tlb_.at(core); + return tt_device_->get_architecture_implementation()->get_tlb_configuration(tlb_index); } }; // namespace tt::umd diff --git a/device/chip/tlb_manager.h b/device/chip/tlb_manager.h index 07e0dfca..b279eb0b 100644 --- a/device/chip/tlb_manager.h +++ b/device/chip/tlb_manager.h @@ -28,7 +28,8 @@ class TLBManager { bool is_tlb_mapped(tt_xy_pair core); bool is_tlb_mapped(tt_xy_pair core, uint64_t address, uint32_t size_in_bytes); - tt::Writer TLBManager::get_static_tlb_writer(tt_xy_pair core); + tt::Writer get_static_tlb_writer(tt_xy_pair core); + tlb_configuration get_tlb_configuration(tt_xy_pair core); // TODO: the following members will be moved to private once enough stuff is moved out of cluster. std::unordered_map tlb_config_map_; diff --git a/device/cluster.cpp b/device/cluster.cpp index 658e0d20..6f72dff2 100644 --- a/device/cluster.cpp +++ b/device/cluster.cpp @@ -36,7 +36,7 @@ #include #include -#include "device/chip/tlb_manager.h" +#include "chip/tlb_manager.h" #include "api/umd/device/tt_core_coordinates.h" #include "logger.hpp" #include "umd/device/architecture_implementation.h" @@ -1058,8 +1058,7 @@ void Cluster::write_device_memory( small_access); if (get_tlb_manager(target.chip)->is_tlb_mapped({target.x, target.y}, address, size_in_bytes)) { - tlb_configuration tlb_description = dev->get_architecture_implementation()->get_tlb_configuration( - map_core_to_tlb_per_chip.at(target.chip).at(tt_xy_pair(target.x, target.y))); + auto tlb_description = get_tlb_manager(target.chip)->get_tlb_configuration({target.x, target.y}); if (dev->get_pci_device()->bar4_wc != nullptr && tlb_description.size == BH_4GB_TLB_SIZE) { // This is only for Blackhole. If we want to write to DRAM (BAR4 space), we add offset // to which we write so write_block knows it needs to target BAR4 @@ -1106,9 +1105,8 @@ void Cluster::read_device_memory( log_debug(LogSiliconDriver, " tlb_index: {}, tlb_data.has_value(): {}", tlb_index, tlb_data.has_value()); - if (is_tlb_mapped(target, address, size_in_bytes)) { - tlb_configuration tlb_description = dev->get_architecture_implementation()->get_tlb_configuration( - map_core_to_tlb_per_chip.at(target.chip).at(tt_xy_pair(target.x, target.y))); + if (get_tlb_manager(target.chip)->is_tlb_mapped({target.x, target.y}, address, size_in_bytes)) { + auto tlb_description = get_tlb_manager(target.chip)->get_tlb_configuration({target.x, target.y}); if (dev->get_pci_device()->bar4_wc != nullptr && tlb_description.size == BH_4GB_TLB_SIZE) { // This is only for Blackhole. If we want to read from DRAM (BAR4 space), we add offset // from which we read so read_block knows it needs to target BAR4 @@ -1279,16 +1277,8 @@ Cluster::~Cluster() { cluster_desc.reset(); } -std::optional> Cluster::get_tlb_data_from_target(const tt_cxy_pair& target) { - tlb_configuration tlb_configuration = get_tlb_configuration(target); - return std::tuple((uint32_t)tlb_configuration.tlb_offset, (uint32_t)tlb_configuration.size); -} - -tlb_configuration Cluster::get_tlb_configuration(const tt_cxy_pair& target) { - log_assert(is_tlb_mapped(target), "TLB not mapped for core: {}", target.str()); - - int tlb_index = map_core_to_tlb_per_chip.at(target.chip).at(tt_xy_pair(target.x, target.y)); - return get_tt_device(target.chip)->get_architecture_implementation()->get_tlb_configuration(tlb_index); +tlb_configuration Cluster::get_tlb_data_from_target(const tt_cxy_pair& target) { + return get_tlb_manager(target.chip)->get_tlb_configuration({target.x, target.y}); } std::optional> Cluster::get_tlb_data_from_target(const chip_id_t chip, CoreCoord core) { From 0832da2fbd56e9ae5872fcca14021c10035f6956 Mon Sep 17 00:00:00 2001 From: Bojan Rosko Date: Wed, 18 Dec 2024 14:51:22 +0000 Subject: [PATCH 05/17] should work --- device/CMakeLists.txt | 1 + device/chip/tlb_manager.cpp | 2 +- device/chip/tlb_manager.h | 1 + device/cluster.cpp | 9 +++++---- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/device/CMakeLists.txt b/device/CMakeLists.txt index 151afff0..8a314896 100644 --- a/device/CMakeLists.txt +++ b/device/CMakeLists.txt @@ -26,6 +26,7 @@ target_sources( chip/local_chip.cpp chip/mock_chip.cpp chip/remote_chip.cpp + chip/tlb_manager.cpp cluster.cpp coordinate_manager.cpp cpuset_lib.cpp diff --git a/device/chip/tlb_manager.cpp b/device/chip/tlb_manager.cpp index a41e333b..bcef26ec 100644 --- a/device/chip/tlb_manager.cpp +++ b/device/chip/tlb_manager.cpp @@ -6,7 +6,7 @@ #include "chip/tlb_manager.h" -#include "common/logger.hpp" +#include "logger.hpp" #include "umd/device/tt_device/tt_device.h" #include "umd/device/tt_io.hpp" #include "umd/device/types/tlb.h" diff --git a/device/chip/tlb_manager.h b/device/chip/tlb_manager.h index b279eb0b..d2ab47b2 100644 --- a/device/chip/tlb_manager.h +++ b/device/chip/tlb_manager.h @@ -6,6 +6,7 @@ #pragma once #include "umd/device/tt_xy_pair.h" +#include "umd/device/types/tlb.h" namespace tt { class Writer; diff --git a/device/cluster.cpp b/device/cluster.cpp index 6f72dff2..a3233303 100644 --- a/device/cluster.cpp +++ b/device/cluster.cpp @@ -426,20 +426,21 @@ void Cluster::construct_cluster( } } + auto any_architecture_implementation = get_tt_device(*local_chip_ids_.begin())->get_architecture_implementation(); // Default initialize l1_address_params based on detected arch - l1_address_params = architecture_implementation->get_l1_address_params(); + l1_address_params = any_architecture_implementation->get_l1_address_params(); // Default initialize dram_address_params. dram_address_params = {0u}; // Default initialize host_address_params based on detected arch - host_address_params = architecture_implementation->get_host_address_params(); + host_address_params = any_architecture_implementation->get_host_address_params(); // Default initialize eth_interface_params based on detected arch - eth_interface_params = architecture_implementation->get_eth_interface_params(); + eth_interface_params = any_architecture_implementation->get_eth_interface_params(); // Default initialize noc_params based on detected arch - noc_params = architecture_implementation->get_noc_params(); + noc_params = any_architecture_implementation->get_noc_params(); } std::unique_ptr Cluster::construct_chip_from_cluster( From 242d09e736f4b1db0e28b205d48c1570ec4330c7 Mon Sep 17 00:00:00 2001 From: Bojan Rosko Date: Wed, 18 Dec 2024 15:12:38 +0000 Subject: [PATCH 06/17] minor changes selfinitiated --- device/api/umd/device/chip/local_chip.h | 5 --- device/chip/local_chip.cpp | 16 ++++---- device/chip/tlb_manager.cpp | 6 +-- device/chip/tlb_manager.h | 4 +- device/cluster.cpp | 49 ++++++++++++------------- 5 files changed, 37 insertions(+), 43 deletions(-) diff --git a/device/api/umd/device/chip/local_chip.h b/device/api/umd/device/chip/local_chip.h index b5b82773..55b09a43 100644 --- a/device/api/umd/device/chip/local_chip.h +++ b/device/api/umd/device/chip/local_chip.h @@ -21,11 +21,6 @@ class LocalChip : public Chip { bool is_mmio_capable() const override; - // TLB related functions - // void configure_tlb(tt_xy_pair core, int32_t tlb_index, uint64_t address, uint64_t ordering); - - // TODO: remove this function once write and read is moved to chip class. - private: std::unique_ptr tt_device_; std::unique_ptr tlb_manager_; diff --git a/device/chip/local_chip.cpp b/device/chip/local_chip.cpp index 8edd293e..15d605ce 100644 --- a/device/chip/local_chip.cpp +++ b/device/chip/local_chip.cpp @@ -16,17 +16,17 @@ LocalChip::LocalChip(tt_SocDescriptor soc_descriptor, int pci_device_id) : tt_device_(TTDevice::create(pci_device_id)), tlb_manager_(std::make_unique(tt_device_.get())) { // Setup default dynamic tlbs. - tlb_manager_->set_dynamic_tlb( + tlb_manager_->set_dynamic_tlb_config( "LARGE_READ_TLB", tt_device_->get_architecture_implementation()->get_mem_large_read_tlb()); - tlb_manager_->set_dynamic_tlb_ordering("LARGE_READ_TLB", tlb_data::Relaxed); - tlb_manager_->set_dynamic_tlb( + tlb_manager_->set_dynamic_tlb_config_ordering("LARGE_READ_TLB", tlb_data::Relaxed); + tlb_manager_->set_dynamic_tlb_config( "LARGE_WRITE_TLB", tt_device_->get_architecture_implementation()->get_mem_large_write_tlb()); - tlb_manager_->set_dynamic_tlb_ordering("LARGE_WRITE_TLB", tlb_data::Relaxed); - tlb_manager_->set_dynamic_tlb("REG_TLB", tt_device_->get_architecture_implementation()->get_reg_tlb()); - tlb_manager_->set_dynamic_tlb_ordering("REG_TLB", tlb_data::Relaxed); - tlb_manager_->set_dynamic_tlb( + tlb_manager_->set_dynamic_tlb_config_ordering("LARGE_WRITE_TLB", tlb_data::Relaxed); + tlb_manager_->set_dynamic_tlb_config("REG_TLB", tt_device_->get_architecture_implementation()->get_reg_tlb()); + tlb_manager_->set_dynamic_tlb_config_ordering("REG_TLB", tlb_data::Relaxed); + tlb_manager_->set_dynamic_tlb_config( "SMALL_READ_WRITE_TLB", tt_device_->get_architecture_implementation()->get_small_read_write_tlb()); - tlb_manager_->set_dynamic_tlb_ordering("SMALL_READ_WRITE_TLB", tlb_data::Relaxed); + tlb_manager_->set_dynamic_tlb_config_ordering("SMALL_READ_WRITE_TLB", tlb_data::Relaxed); } TTDevice* LocalChip::get_tt_device() { return tt_device_.get(); } diff --git a/device/chip/tlb_manager.cpp b/device/chip/tlb_manager.cpp index bcef26ec..1ff6df2b 100644 --- a/device/chip/tlb_manager.cpp +++ b/device/chip/tlb_manager.cpp @@ -35,7 +35,7 @@ void TLBManager::configure_tlb(tt_xy_pair core, int32_t tlb_index, uint64_t addr map_core_to_tlb_.insert({core, tlb_index}); } -void TLBManager::set_dynamic_tlb(std::string fallback_tlb_name, int32_t tlb_index) { +void TLBManager::set_dynamic_tlb_config(std::string fallback_tlb_name, int32_t tlb_index) { log_assert( dynamic_tlb_config_.find(fallback_tlb_name) == dynamic_tlb_config_.end(), "Dynamic TLB already configured for {}", @@ -43,10 +43,10 @@ void TLBManager::set_dynamic_tlb(std::string fallback_tlb_name, int32_t tlb_inde dynamic_tlb_config_.insert({fallback_tlb_name, tlb_index}); } -void TLBManager::set_dynamic_tlb_ordering(std::string fallback_tlb_name, uint64_t ordering) { +void TLBManager::set_dynamic_tlb_config_ordering(std::string fallback_tlb_name, uint64_t ordering) { log_assert( ordering == tlb_data::Strict || ordering == tlb_data::Posted || ordering == tlb_data::Relaxed, - "Invalid ordering specified in set_dynamic_tlb_ordering."); + "Invalid ordering specified in set_dynamic_tlb_config_ordering."); log_assert( fallback_tlb_name != "LARGE_READ_TLB" && fallback_tlb_name != "LARGE_WRITE_TLB", "Ordering modes for LARGE_READ_TLB and LARGE_WRITE_TLB cannot be modified."); diff --git a/device/chip/tlb_manager.h b/device/chip/tlb_manager.h index d2ab47b2..b8b2342d 100644 --- a/device/chip/tlb_manager.h +++ b/device/chip/tlb_manager.h @@ -22,8 +22,8 @@ class TLBManager { void configure_tlb(tt_xy_pair core, int32_t tlb_index, uint64_t address, uint64_t ordering); - void set_dynamic_tlb(std::string fallback_tlb_name, int32_t tlb_index); - void set_dynamic_tlb_ordering(std::string fallback_tlb_name, uint64_t ordering); + void set_dynamic_tlb_config(std::string fallback_tlb_name, int32_t tlb_index); + void set_dynamic_tlb_config_ordering(std::string fallback_tlb_name, uint64_t ordering); bool address_in_tlb_space(uint64_t address, uint32_t size_in_bytes, int32_t tlb_index, uint64_t tlb_size); bool is_tlb_mapped(tt_xy_pair core); diff --git a/device/cluster.cpp b/device/cluster.cpp index a3233303..aeb0778b 100644 --- a/device/cluster.cpp +++ b/device/cluster.cpp @@ -1306,7 +1306,7 @@ void Cluster::configure_tlb( void Cluster::set_fallback_tlb_ordering_mode(const std::string& fallback_tlb, uint64_t ordering) { for (auto& chip_id : local_chip_ids_) { - get_tlb_manager(chip_id)->set_dynamic_tlb_ordering(fallback_tlb, ordering); + get_tlb_manager(chip_id)->set_dynamic_tlb_config_ordering(fallback_tlb, ordering); } } @@ -1541,8 +1541,8 @@ int Cluster::iatu_configure_peer_region( // TODO: stop doing this. It's related to HUGEPAGE_CHANNEL_3_SIZE_LIMIT. if (peer_region_id == 3) { - region_id_to_use = 4; // Hack use region 4 for channel 3..this ensures that we have a smaller chan 3 - // address space with the correct start offset + region_id_to_use = 4; // Hack use region 4 for channel 3..this ensures that we have a smaller chan 3 address + // space with the correct start offset } TTDevice* tt_device = get_tt_device(logical_device_id); @@ -1714,17 +1714,16 @@ bool Cluster::is_non_mmio_cmd_q_full(uint32_t curr_wptr, uint32_t curr_rptr) { * writes/reads to/from those wormhole chips that aren't memory mapped or directly host connected. * To get the data to or from those other chips, there is a memory transfer protocol - initiated on * the host side but carried out by any number of the ethernet cores (the ethernet core pool is dictated - * by `this->NUM_ETH_CORES_FOR_NON_MMIO_TRANSFERS`) on the MMIO chips (e.g. typically just the one chip in a - * galaxy). + * by `this->NUM_ETH_CORES_FOR_NON_MMIO_TRANSFERS`) on the MMIO chips (e.g. typically just the one chip in a galaxy). * * There is a command queue structure in ethernet core FW to accept these read/write commands. However, there is no * atomic increment (from host side) for the write pointers of these queues, nor is there any sort of other hardware * mutual exclusion (as of WH) from host side when populating commands into the queue (as in when the host pushes a * write command into the ethernet core's queue). * - * Therefore, any of these non_mmio commands from host side need to be synchronized so they don't accidentally - * corrupt each other. The finest granularity possible to synchronize on would be the command slot and wrptr (per - * core), but wrptr updates also need to be coordinated: + * Therefore, any of these non_mmio commands from host side need to be synchronized so they don't accidentally corrupt + * each other. The finest granularity possible to synchronize on would be the command slot and wrptr (per core), + * but wrptr updates also need to be coordinated: * - you can't increment wrptr unless you are writing to the next index and your write is complete * - if two threads could guarantee separate command slots, they'd need to order their wrptr updates from lowest to * highest and based on completion of command writes. @@ -1860,8 +1859,8 @@ void Cluster::write_to_non_mmio_device( uint32_t alignment_mask = sizeof(uint32_t) - 1; block_size = (block_size + alignment_mask) & ~alignment_mask; } - // For 4 byte aligned data, transfer_size always == block_size. For unaligned data, transfer_size < - // block_size in the last block + // For 4 byte aligned data, transfer_size always == block_size. For unaligned data, transfer_size < block_size + // in the last block uint64_t transfer_size = std::min(block_size, size_in_bytes - offset); // Host side data size that needs to be copied // Use block mode for broadcast @@ -2000,9 +1999,9 @@ void Cluster::write_to_non_mmio_device( } /* - * Note that this function is required to acquire the `NON_MMIO_MUTEX_NAME` mutex for interacting with the ethernet - * core (host) command queue DO NOT use `active_core` or issue any pcie reads/writes to the ethernet core prior to - * acquiring the mutex. For extra information, see the "NON_MMIO_MUTEX Usage" above + * Note that this function is required to acquire the `NON_MMIO_MUTEX_NAME` mutex for interacting with the ethernet core + * (host) command queue DO NOT use `active_core` or issue any pcie reads/writes to the ethernet core prior to acquiring + * the mutex. For extra information, see the "NON_MMIO_MUTEX Usage" above */ void Cluster::read_from_non_mmio_device(void* mem_ptr, tt_cxy_pair core, uint64_t address, uint32_t size_in_bytes) { using data_word_t = uint32_t; @@ -2371,14 +2370,14 @@ std::unordered_map>>& Cluster::get_ether // Rack byte to be set in header uint32_t rack_byte = eth_coords.rack % 4; // 1st level grouping: Group broadcasts based on the MMIO chip they must go through - // Nebula + Galaxy Topology assumption: Disjoint sets can only be present in the first shelf, with - // each set connected to host through its closest MMIO chip For the first shelf, pass broadcasts to - // specific chips through their closest MMIO chip All other shelves are fully connected galaxy - // grids. These are connected to all MMIO devices. Use any (or the first) MMIO device in the list. + // Nebula + Galaxy Topology assumption: Disjoint sets can only be present in the first shelf, with each + // set connected to host through its closest MMIO chip For the first shelf, pass broadcasts to specific + // chips through their closest MMIO chip All other shelves are fully connected galaxy grids. These are + // connected to all MMIO devices. Use any (or the first) MMIO device in the list. chip_id_t closest_mmio_chip = 0; if (eth_coords.rack == 0 && eth_coords.shelf == 0) { - // Shelf 0 + Rack 0: Either an MMIO chip or a remote chip potentially connected to host through - // its own MMIO counterpart. + // Shelf 0 + Rack 0: Either an MMIO chip or a remote chip potentially connected to host through its + // own MMIO counterpart. closest_mmio_chip = cluster_desc->get_closest_mmio_capable_chip(chip); } else { // All other shelves: Group these under the same/first MMIO chip, since all MMIO chips are @@ -2389,8 +2388,8 @@ std::unordered_map>>& Cluster::get_ether broadcast_mask_for_target_chips_per_group.end()) { broadcast_mask_for_target_chips_per_group.insert({closest_mmio_chip, {}}); } - // For each target physical chip id (local to a shelf), generate headers based on all racks and - // shelves that contain this physical id. + // For each target physical chip id (local to a shelf), generate headers based on all racks and shelves + // that contain this physical id. if (broadcast_mask_for_target_chips_per_group.at(closest_mmio_chip).find(physical_chip_id) == broadcast_mask_for_target_chips_per_group.at(closest_mmio_chip).end()) { // Target seen for the first time. @@ -2408,8 +2407,8 @@ std::unordered_map>>& Cluster::get_ether } } } - // 2nd level grouping: For each MMIO group, further group the chips based on their rack and shelf headers. - // The number of groups after this step represent the final set of broadcast grids. + // 2nd level grouping: For each MMIO group, further group the chips based on their rack and shelf headers. The + // number of groups after this step represent the final set of broadcast grids. for (auto& mmio_group : broadcast_mask_for_target_chips_per_group) { for (auto& chip : mmio_group.second) { // Generate a hash for this MMIO Chip + Rack + Shelf group @@ -2457,8 +2456,8 @@ void Cluster::pcie_broadcast_write( const tt_xy_pair& start, const tt_xy_pair& end, const std::string& fallback_tlb) { - // Use the specified TLB to broadcast data to all cores included in the [start, end] grid -> GS Only. Use - // Ethernet Broadcast for WH. + // Use the specified TLB to broadcast data to all cores included in the [start, end] grid -> GS Only. Use Ethernet + // Broadcast for WH. TTDevice* tt_device = get_tt_device(chip); const auto tlb_index = get_tlb_manager(chip)->dynamic_tlb_config_.at(fallback_tlb); const uint8_t* buffer_addr = static_cast(mem_ptr); From 7a1ef2fb317f422777949ecfe7d854fa3cb79702 Mon Sep 17 00:00:00 2001 From: Bojan Rosko Date: Tue, 24 Dec 2024 14:05:21 +0000 Subject: [PATCH 07/17] moved tlbmanager --- device/CMakeLists.txt | 2 +- device/api/umd/device/chip/chip.h | 2 -- device/api/umd/device/chip/local_chip.h | 4 ---- device/api/umd/device/tt_device/tt_device.h | 5 +++++ device/chip/chip.cpp | 2 -- device/chip/local_chip.cpp | 25 +++++++++------------ device/cluster.cpp | 7 ++---- device/{chip => tt_device}/tlb_manager.cpp | 2 +- device/{chip => tt_device}/tlb_manager.h | 0 device/tt_device/tt_device.cpp | 3 +++ 10 files changed, 23 insertions(+), 29 deletions(-) rename device/{chip => tt_device}/tlb_manager.cpp (99%) rename device/{chip => tt_device}/tlb_manager.h (100%) diff --git a/device/CMakeLists.txt b/device/CMakeLists.txt index 8a314896..7eed3b49 100644 --- a/device/CMakeLists.txt +++ b/device/CMakeLists.txt @@ -26,7 +26,7 @@ target_sources( chip/local_chip.cpp chip/mock_chip.cpp chip/remote_chip.cpp - chip/tlb_manager.cpp + tt_device/tlb_manager.cpp cluster.cpp coordinate_manager.cpp cpuset_lib.cpp diff --git a/device/api/umd/device/chip/chip.h b/device/api/umd/device/chip/chip.h index 77cfb6e3..aa9a0151 100644 --- a/device/api/umd/device/chip/chip.h +++ b/device/api/umd/device/chip/chip.h @@ -12,7 +12,6 @@ namespace tt::umd { class TTDevice; -class TLBManager; // An abstract class that represents a chip. class Chip { @@ -24,7 +23,6 @@ class Chip { tt_SocDescriptor& get_soc_descriptor(); virtual TTDevice* get_tt_device(); - virtual TLBManager* get_tlb_manager(); virtual bool is_mmio_capable() const = 0; diff --git a/device/api/umd/device/chip/local_chip.h b/device/api/umd/device/chip/local_chip.h index 55b09a43..b2a8e201 100644 --- a/device/api/umd/device/chip/local_chip.h +++ b/device/api/umd/device/chip/local_chip.h @@ -10,19 +10,15 @@ namespace tt::umd { -class TLBManager; - class LocalChip : public Chip { public: LocalChip(tt_SocDescriptor soc_descriptor, int pci_device_id); TTDevice* get_tt_device() override; - TLBManager* get_tlb_manager() override; bool is_mmio_capable() const override; private: std::unique_ptr tt_device_; - std::unique_ptr tlb_manager_; }; } // namespace tt::umd diff --git a/device/api/umd/device/tt_device/tt_device.h b/device/api/umd/device/tt_device/tt_device.h index 9a35c8d8..6fd52d75 100644 --- a/device/api/umd/device/tt_device/tt_device.h +++ b/device/api/umd/device/tt_device/tt_device.h @@ -6,6 +6,7 @@ #pragma once +#include "tt_device/tlb_manager.h" #include "umd/device/architecture_implementation.h" #include "umd/device/pci_device.hpp" @@ -28,6 +29,8 @@ struct dynamic_tlb { namespace tt::umd { +class TLBManager; + class TTDevice { public: /** @@ -39,6 +42,7 @@ class TTDevice { architecture_implementation *get_architecture_implementation(); PCIDevice *get_pci_device(); + TLBManager *get_tlb_manager(); void detect_hang_read(uint32_t data_read = c_hang_read_value); @@ -114,6 +118,7 @@ class TTDevice { protected: std::unique_ptr pci_device_; std::unique_ptr architecture_impl_; + std::unique_ptr tlb_manager_; tt::ARCH arch; bool is_hardware_hung(); diff --git a/device/chip/chip.cpp b/device/chip/chip.cpp index 5b28b3e1..9f995f77 100644 --- a/device/chip/chip.cpp +++ b/device/chip/chip.cpp @@ -14,6 +14,4 @@ tt_SocDescriptor& Chip::get_soc_descriptor() { return soc_descriptor_; } TTDevice* Chip::get_tt_device() { return nullptr; } -TLBManager* Chip::get_tlb_manager() { return nullptr; } - } // namespace tt::umd diff --git a/device/chip/local_chip.cpp b/device/chip/local_chip.cpp index 15d605ce..e67b1300 100644 --- a/device/chip/local_chip.cpp +++ b/device/chip/local_chip.cpp @@ -6,33 +6,30 @@ #include "umd/device/chip/local_chip.h" -#include "chip/tlb_manager.h" +#include "tt_device/tlb_manager.h" #include "umd/device/tt_device/tt_device.h" namespace tt::umd { LocalChip::LocalChip(tt_SocDescriptor soc_descriptor, int pci_device_id) : - Chip(soc_descriptor), - tt_device_(TTDevice::create(pci_device_id)), - tlb_manager_(std::make_unique(tt_device_.get())) { + Chip(soc_descriptor), tt_device_(TTDevice::create(pci_device_id)) { + auto tlb_manager = tt_device_->get_tlb_manager(); // Setup default dynamic tlbs. - tlb_manager_->set_dynamic_tlb_config( + tlb_manager->set_dynamic_tlb_config( "LARGE_READ_TLB", tt_device_->get_architecture_implementation()->get_mem_large_read_tlb()); - tlb_manager_->set_dynamic_tlb_config_ordering("LARGE_READ_TLB", tlb_data::Relaxed); - tlb_manager_->set_dynamic_tlb_config( + tlb_manager->set_dynamic_tlb_config_ordering("LARGE_READ_TLB", tlb_data::Relaxed); + tlb_manager->set_dynamic_tlb_config( "LARGE_WRITE_TLB", tt_device_->get_architecture_implementation()->get_mem_large_write_tlb()); - tlb_manager_->set_dynamic_tlb_config_ordering("LARGE_WRITE_TLB", tlb_data::Relaxed); - tlb_manager_->set_dynamic_tlb_config("REG_TLB", tt_device_->get_architecture_implementation()->get_reg_tlb()); - tlb_manager_->set_dynamic_tlb_config_ordering("REG_TLB", tlb_data::Relaxed); - tlb_manager_->set_dynamic_tlb_config( + tlb_manager->set_dynamic_tlb_config_ordering("LARGE_WRITE_TLB", tlb_data::Relaxed); + tlb_manager->set_dynamic_tlb_config("REG_TLB", tt_device_->get_architecture_implementation()->get_reg_tlb()); + tlb_manager->set_dynamic_tlb_config_ordering("REG_TLB", tlb_data::Relaxed); + tlb_manager->set_dynamic_tlb_config( "SMALL_READ_WRITE_TLB", tt_device_->get_architecture_implementation()->get_small_read_write_tlb()); - tlb_manager_->set_dynamic_tlb_config_ordering("SMALL_READ_WRITE_TLB", tlb_data::Relaxed); + tlb_manager->set_dynamic_tlb_config_ordering("SMALL_READ_WRITE_TLB", tlb_data::Relaxed); } TTDevice* LocalChip::get_tt_device() { return tt_device_.get(); } -TLBManager* LocalChip::get_tlb_manager() { return tlb_manager_.get(); } - bool LocalChip::is_mmio_capable() const { return true; } } // namespace tt::umd diff --git a/device/cluster.cpp b/device/cluster.cpp index aeb0778b..1e8a801c 100644 --- a/device/cluster.cpp +++ b/device/cluster.cpp @@ -36,9 +36,9 @@ #include #include -#include "chip/tlb_manager.h" #include "api/umd/device/tt_core_coordinates.h" #include "logger.hpp" +#include "tt_device/tlb_manager.h" #include "umd/device/architecture_implementation.h" #include "umd/device/chip/local_chip.h" #include "umd/device/chip/mock_chip.h" @@ -1663,10 +1663,7 @@ inline TTDevice* Cluster::get_tt_device(chip_id_t device_id) const { // Wrapper for throwing a more helpful exception when trying to access non pci enabled interface. inline TLBManager* Cluster::get_tlb_manager(chip_id_t device_id) const { - log_assert(chips_.find(device_id) != chips_.end(), "Device id {} not found in cluster.", device_id); - auto tlb_manager = chips_.at(device_id)->get_tlb_manager(); - log_assert(tlb_manager != nullptr, "TLBManager not found for device: {}", device_id); - return tlb_manager; + return get_tt_device(device_id)->get_tlb_manager(); } std::shared_ptr Cluster::get_mutex( diff --git a/device/chip/tlb_manager.cpp b/device/tt_device/tlb_manager.cpp similarity index 99% rename from device/chip/tlb_manager.cpp rename to device/tt_device/tlb_manager.cpp index 1ff6df2b..22914c51 100644 --- a/device/chip/tlb_manager.cpp +++ b/device/tt_device/tlb_manager.cpp @@ -4,7 +4,7 @@ * SPDX-License-Identifier: Apache-2.0 */ -#include "chip/tlb_manager.h" +#include "tt_device/tlb_manager.h" #include "logger.hpp" #include "umd/device/tt_device/tt_device.h" diff --git a/device/chip/tlb_manager.h b/device/tt_device/tlb_manager.h similarity index 100% rename from device/chip/tlb_manager.h rename to device/tt_device/tlb_manager.h diff --git a/device/tt_device/tt_device.cpp b/device/tt_device/tt_device.cpp index 8f74bad9..44fed999 100644 --- a/device/tt_device/tt_device.cpp +++ b/device/tt_device/tt_device.cpp @@ -15,6 +15,7 @@ TTDevice::TTDevice( std::unique_ptr pci_device, std::unique_ptr architecture_impl) : pci_device_(std::move(pci_device)), architecture_impl_(std::move(architecture_impl)), + tlb_manager_(std::make_unique(this)), arch(architecture_impl_->get_architecture()) {} /* static */ std::unique_ptr TTDevice::create(int pci_device_number) { @@ -36,6 +37,8 @@ architecture_implementation *TTDevice::get_architecture_implementation() { retur PCIDevice *TTDevice::get_pci_device() { return pci_device_.get(); } +TLBManager *TTDevice::get_tlb_manager() { return tlb_manager_.get(); } + bool TTDevice::is_hardware_hung() { volatile const void *addr = reinterpret_cast(pci_device_->bar0_uc) + (architecture_impl_->get_arc_reset_scratch_offset() + 6 * 4) - From 3fd042186046d9d1328d9721ccadbbabf4faace9 Mon Sep 17 00:00:00 2001 From: Bojan Rosko Date: Tue, 24 Dec 2024 15:31:16 +0000 Subject: [PATCH 08/17] wrote a test --- .../umd/device/architecture_implementation.h | 6 ++ .../api/umd/device/blackhole_implementation.h | 12 +++ .../api/umd/device/grayskull_implementation.h | 14 ++++ device/api/umd/device/tt_device/tt_device.h | 2 + .../api/umd/device/wormhole_implementation.h | 14 ++++ device/tt_device/tt_device.cpp | 2 + tests/api/CMakeLists.txt | 1 + tests/api/test_tlb_manager.cpp | 82 +++++++++++++++++++ 8 files changed, 133 insertions(+) create mode 100644 tests/api/test_tlb_manager.cpp diff --git a/device/api/umd/device/architecture_implementation.h b/device/api/umd/device/architecture_implementation.h index 8efc165b..dbf78be3 100644 --- a/device/api/umd/device/architecture_implementation.h +++ b/device/api/umd/device/architecture_implementation.h @@ -65,6 +65,12 @@ class architecture_implementation { virtual const std::vector& get_t6_x_locations() const = 0; virtual const std::vector& get_t6_y_locations() const = 0; + // TLB related. Move other functions here as well. + virtual std::pair get_tlb_1m_base_and_count() const = 0; + virtual std::pair get_tlb_2m_base_and_count() const = 0; + virtual std::pair get_tlb_16m_base_and_count() const = 0; + virtual std::pair get_tlb_4g_base_and_count() const = 0; + virtual std::tuple multicast_workaround(xy_pair start, xy_pair end) const = 0; virtual tlb_configuration get_tlb_configuration(uint32_t tlb_index) const = 0; virtual std::pair get_tlb_data( diff --git a/device/api/umd/device/blackhole_implementation.h b/device/api/umd/device/blackhole_implementation.h index d2820403..0ac0c805 100644 --- a/device/api/umd/device/blackhole_implementation.h +++ b/device/api/umd/device/blackhole_implementation.h @@ -304,6 +304,18 @@ class blackhole_implementation : public architecture_implementation { const std::vector& get_t6_y_locations() const override { return blackhole::T6_Y_LOCATIONS; } + std::pair get_tlb_1m_base_and_count() const override { return {0, 0}; } + + std::pair get_tlb_2m_base_and_count() const override { + return {blackhole::TLB_BASE_2M, blackhole::TLB_COUNT_2M}; + } + + std::pair get_tlb_16m_base_and_count() const override { return {0, 0}; } + + std::pair get_tlb_4g_base_and_count() const override { + return {blackhole::TLB_BASE_4G, blackhole::TLB_COUNT_4G}; + } + std::tuple multicast_workaround(xy_pair start, xy_pair end) const override; tlb_configuration get_tlb_configuration(uint32_t tlb_index) const override; std::pair get_tlb_data(std::uint32_t tlb_index, const tlb_data& data) const override; diff --git a/device/api/umd/device/grayskull_implementation.h b/device/api/umd/device/grayskull_implementation.h index 2313415a..4e97a507 100644 --- a/device/api/umd/device/grayskull_implementation.h +++ b/device/api/umd/device/grayskull_implementation.h @@ -294,6 +294,20 @@ class grayskull_implementation : public architecture_implementation { const std::vector& get_t6_y_locations() const override { return grayskull::T6_Y_LOCATIONS; } + std::pair get_tlb_1m_base_and_count() const override { + return {grayskull::TLB_BASE_1M, grayskull::TLB_COUNT_1M}; + } + + std::pair get_tlb_2m_base_and_count() const override { + return {grayskull::TLB_BASE_2M, grayskull::TLB_COUNT_2M}; + } + + std::pair get_tlb_16m_base_and_count() const override { + return {grayskull::TLB_BASE_16M, grayskull::TLB_COUNT_16M}; + } + + std::pair get_tlb_4g_base_and_count() const override { return {0, 0}; } + std::tuple multicast_workaround(xy_pair start, xy_pair end) const override; tlb_configuration get_tlb_configuration(uint32_t tlb_index) const override; std::pair get_tlb_data(std::uint32_t tlb_index, const tlb_data& data) const override; diff --git a/device/api/umd/device/tt_device/tt_device.h b/device/api/umd/device/tt_device/tt_device.h index 6fd52d75..6a0afc96 100644 --- a/device/api/umd/device/tt_device/tt_device.h +++ b/device/api/umd/device/tt_device/tt_device.h @@ -44,6 +44,8 @@ class TTDevice { PCIDevice *get_pci_device(); TLBManager *get_tlb_manager(); + tt::ARCH get_arch(); + void detect_hang_read(uint32_t data_read = c_hang_read_value); // Note: byte_addr is (mostly but not always) offset into BAR0. This diff --git a/device/api/umd/device/wormhole_implementation.h b/device/api/umd/device/wormhole_implementation.h index 0df17669..8179614a 100644 --- a/device/api/umd/device/wormhole_implementation.h +++ b/device/api/umd/device/wormhole_implementation.h @@ -328,6 +328,20 @@ class wormhole_implementation : public architecture_implementation { const std::vector& get_t6_y_locations() const override { return wormhole::T6_Y_LOCATIONS; } + std::pair get_tlb_1m_base_and_count() const override { + return {wormhole::TLB_BASE_1M, wormhole::TLB_COUNT_1M}; + } + + std::pair get_tlb_2m_base_and_count() const override { + return {wormhole::TLB_BASE_2M, wormhole::TLB_COUNT_2M}; + } + + std::pair get_tlb_16m_base_and_count() const override { + return {wormhole::TLB_BASE_16M, wormhole::TLB_COUNT_16M}; + } + + std::pair get_tlb_4g_base_and_count() const override { return {0, 0}; } + std::tuple multicast_workaround(xy_pair start, xy_pair end) const override; tlb_configuration get_tlb_configuration(uint32_t tlb_index) const override; std::pair get_tlb_data(std::uint32_t tlb_index, const tlb_data& data) const override; diff --git a/device/tt_device/tt_device.cpp b/device/tt_device/tt_device.cpp index 44fed999..0f3d5db3 100644 --- a/device/tt_device/tt_device.cpp +++ b/device/tt_device/tt_device.cpp @@ -39,6 +39,8 @@ PCIDevice *TTDevice::get_pci_device() { return pci_device_.get(); } TLBManager *TTDevice::get_tlb_manager() { return tlb_manager_.get(); } +tt::ARCH TTDevice::get_arch() { return arch; } + bool TTDevice::is_hardware_hung() { volatile const void *addr = reinterpret_cast(pci_device_->bar0_uc) + (architecture_impl_->get_arc_reset_scratch_offset() + 6 * 4) - diff --git a/tests/api/CMakeLists.txt b/tests/api/CMakeLists.txt index 8d12a1ae..eb189b9b 100644 --- a/tests/api/CMakeLists.txt +++ b/tests/api/CMakeLists.txt @@ -7,6 +7,7 @@ set(API_TESTS_SRCS test_core_coord_translation_bh.cpp test_mockup_device.cpp test_soc_descriptor.cpp + test_tlb_manager.cpp ) add_executable(api_tests ${API_TESTS_SRCS}) diff --git a/tests/api/test_tlb_manager.cpp b/tests/api/test_tlb_manager.cpp new file mode 100644 index 00000000..1f7ab42a --- /dev/null +++ b/tests/api/test_tlb_manager.cpp @@ -0,0 +1,82 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +// This file holds Chip specific API examples. + +#include + +#include "umd/device/tt_device/tt_device.h" +#include "umd/device/tt_io.hpp" +#include "umd/device/tt_soc_descriptor.h" + +using namespace tt::umd; + +std::unique_ptr get_tt_device() { + std::vector pci_device_ids = PCIDevice::enumerate_devices(); + if (pci_device_ids.empty()) { + return nullptr; + } + return TTDevice::create(pci_device_ids[0]); +} + +// TODO: Once default auto TLB setup is in, check it is setup properly. +TEST(ApiTLBManager, ManualTLBConfiguration) { + std::unique_ptr tt_device = get_tt_device(); + + if (tt_device == nullptr) { + GTEST_SKIP() << "No chips present on the system. Skipping test."; + } + + TLBManager* tlb_manager = tt_device->get_tlb_manager(); + tt_SocDescriptor soc_desc = tt_SocDescriptor::get_soc_descriptor_path(tt_device->get_arch()); + + // TODO: This should be part of TTDevice interface, not Cluster or Chip. + // Configure TLBs. + std::function get_static_tlb_index = [&](tt_xy_pair core) -> int { + // TODO: Make this per arch. + bool is_worker_core = soc_desc.is_worker_core(core); + if (!is_worker_core) { + return -1; + } + + auto tlb_index = core.x + core.y * tt_device->get_architecture_implementation()->get_grid_size_x(); + + auto tlb_1m_base_and_count = tt_device->get_architecture_implementation()->get_tlb_1m_base_and_count(); + auto tlb_2m_base_and_count = tt_device->get_architecture_implementation()->get_tlb_2m_base_and_count(); + + // Use either 1mb or 2mb tlbs. + if (tlb_1m_base_and_count.second > 0) { + // Expect that tlb index is within the number of 1mb TLBs. + EXPECT_TRUE(tlb_index < tlb_1m_base_and_count.second); + tlb_index += tlb_1m_base_and_count.first; + } else { + // Expect that tlb index is within the number of 1mb TLBs. + EXPECT_TRUE(tlb_index < tlb_2m_base_and_count.second); + tlb_index += tlb_2m_base_and_count.first; + } + + return tlb_index; + }; + + std::int32_t c_zero_address = 0; + + for (tt_xy_pair core : soc_desc.workers) { + tlb_manager->configure_tlb(core, get_static_tlb_index(core), c_zero_address, tlb_data::Relaxed); + } + + // So now that we have configured TLBs we can use it to interface with the TTDevice. + auto any_worker_core = soc_desc.workers[0]; + auto tlb_description = tlb_manager->get_tlb_configuration(any_worker_core); + + // TODO: Maybe accept tlb_index only? + uint64_t address_l1_to_write = 0; + std::vector buffer_to_write = {0x01, 0x02, 0x03, 0x04}; + tt_device->write_block( + tlb_description.tlb_offset + address_l1_to_write, buffer_to_write.size(), buffer_to_write.data()); + + // Another way to write to the TLB. + // TODO: This should be converted to AbstractIO writer. + tt::Writer writer = tlb_manager->get_static_tlb_writer(any_worker_core); + writer.write(address_l1_to_write, buffer_to_write[0]); +} From 282676d4f02b040afc964057f3e104f48e006a78 Mon Sep 17 00:00:00 2001 From: Bojan Rosko Date: Tue, 24 Dec 2024 15:36:54 +0000 Subject: [PATCH 09/17] minor --- device/cluster.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/device/cluster.cpp b/device/cluster.cpp index 1e8a801c..aaa6a463 100644 --- a/device/cluster.cpp +++ b/device/cluster.cpp @@ -1655,8 +1655,9 @@ void* Cluster::host_dma_address(std::uint64_t offset, chip_id_t src_device_id, u // Wrapper for throwing a more helpful exception when trying to access non pci enabled interface. inline TTDevice* Cluster::get_tt_device(chip_id_t device_id) const { - log_assert(chips_.find(device_id) != chips_.end(), "Device id {} not found in cluster.", device_id); - auto tt_device = chips_.at(device_id)->get_tt_device(); + auto chip_it = chips_.find(device_id); + log_assert(chip_it != chips_.end(), "Device id {} not found in cluster.", device_id); + auto tt_device = chip_it->second->get_tt_device(); log_assert(tt_device != nullptr, "TTDevice not found for device: {}", device_id); return tt_device; } From ced817b6c0a4907803f421ea3ff32f432bdc61e7 Mon Sep 17 00:00:00 2001 From: Bojan Rosko Date: Wed, 25 Dec 2024 08:34:09 +0000 Subject: [PATCH 10/17] minor --- tests/api/test_tlb_manager.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/api/test_tlb_manager.cpp b/tests/api/test_tlb_manager.cpp index 1f7ab42a..80ff266a 100644 --- a/tests/api/test_tlb_manager.cpp +++ b/tests/api/test_tlb_manager.cpp @@ -61,12 +61,12 @@ TEST(ApiTLBManager, ManualTLBConfiguration) { std::int32_t c_zero_address = 0; - for (tt_xy_pair core : soc_desc.workers) { + for (tt_xy_pair core : soc_desc.get_cores(CoreType::TENSIX)) { tlb_manager->configure_tlb(core, get_static_tlb_index(core), c_zero_address, tlb_data::Relaxed); } // So now that we have configured TLBs we can use it to interface with the TTDevice. - auto any_worker_core = soc_desc.workers[0]; + auto any_worker_core = soc_desc.get_cores(CoreType::TENSIX)[0]; auto tlb_description = tlb_manager->get_tlb_configuration(any_worker_core); // TODO: Maybe accept tlb_index only? From 3b17ff598d58e30182c3ec60211ccf5b9cc1ccf9 Mon Sep 17 00:00:00 2001 From: Bojan Rosko Date: Wed, 25 Dec 2024 08:47:34 +0000 Subject: [PATCH 11/17] fix after rebase --- device/cluster.cpp | 11 ++++++++--- device/tt_device/tlb_manager.cpp | 2 +- tests/api/test_tlb_manager.cpp | 2 +- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/device/cluster.cpp b/device/cluster.cpp index aaa6a463..279c07f6 100644 --- a/device/cluster.cpp +++ b/device/cluster.cpp @@ -1059,7 +1059,7 @@ void Cluster::write_device_memory( small_access); if (get_tlb_manager(target.chip)->is_tlb_mapped({target.x, target.y}, address, size_in_bytes)) { - auto tlb_description = get_tlb_manager(target.chip)->get_tlb_configuration({target.x, target.y}); + tlb_configuration tlb_description = get_tlb_manager(target.chip)->get_tlb_configuration({target.x, target.y}); if (dev->get_pci_device()->bar4_wc != nullptr && tlb_description.size == BH_4GB_TLB_SIZE) { // This is only for Blackhole. If we want to write to DRAM (BAR4 space), we add offset // to which we write so write_block knows it needs to target BAR4 @@ -1107,7 +1107,7 @@ void Cluster::read_device_memory( log_debug(LogSiliconDriver, " tlb_index: {}, tlb_data.has_value(): {}", tlb_index, tlb_data.has_value()); if (get_tlb_manager(target.chip)->is_tlb_mapped({target.x, target.y}, address, size_in_bytes)) { - auto tlb_description = get_tlb_manager(target.chip)->get_tlb_configuration({target.x, target.y}); + tlb_configuration tlb_description = get_tlb_manager(target.chip)->get_tlb_configuration({target.x, target.y}); if (dev->get_pci_device()->bar4_wc != nullptr && tlb_description.size == BH_4GB_TLB_SIZE) { // This is only for Blackhole. If we want to read from DRAM (BAR4 space), we add offset // from which we read so read_block knows it needs to target BAR4 @@ -1278,7 +1278,12 @@ Cluster::~Cluster() { cluster_desc.reset(); } -tlb_configuration Cluster::get_tlb_data_from_target(const tt_cxy_pair& target) { +std::optional> Cluster::get_tlb_data_from_target(const tt_cxy_pair& target) { + tlb_configuration tlb_configuration = get_tlb_configuration(target); + return std::tuple((uint32_t)tlb_configuration.tlb_offset, (uint32_t)tlb_configuration.size); +} + +tlb_configuration Cluster::get_tlb_configuration(const tt_cxy_pair& target) { return get_tlb_manager(target.chip)->get_tlb_configuration({target.x, target.y}); } diff --git a/device/tt_device/tlb_manager.cpp b/device/tt_device/tlb_manager.cpp index 22914c51..7bab3787 100644 --- a/device/tt_device/tlb_manager.cpp +++ b/device/tt_device/tlb_manager.cpp @@ -74,7 +74,7 @@ bool TLBManager::is_tlb_mapped(tt_xy_pair core, uint64_t address, uint32_t size_ } int32_t tlb_index = map_core_to_tlb_.at(core); - auto tlb_description = tt_device_->get_architecture_implementation()->get_tlb_configuration(tlb_index); + tlb_configuration tlb_description = tt_device_->get_architecture_implementation()->get_tlb_configuration(tlb_index); return address_in_tlb_space(address, size_in_bytes, tlb_index, tlb_description.size); } diff --git a/tests/api/test_tlb_manager.cpp b/tests/api/test_tlb_manager.cpp index 80ff266a..3c418d6d 100644 --- a/tests/api/test_tlb_manager.cpp +++ b/tests/api/test_tlb_manager.cpp @@ -67,7 +67,7 @@ TEST(ApiTLBManager, ManualTLBConfiguration) { // So now that we have configured TLBs we can use it to interface with the TTDevice. auto any_worker_core = soc_desc.get_cores(CoreType::TENSIX)[0]; - auto tlb_description = tlb_manager->get_tlb_configuration(any_worker_core); + tlb_configuration tlb_description = tlb_manager->get_tlb_configuration(any_worker_core); // TODO: Maybe accept tlb_index only? uint64_t address_l1_to_write = 0; From c0d48c1ac6c2ebc2f1c5e98ee10d2a5a926ccc02 Mon Sep 17 00:00:00 2001 From: Bojan Rosko Date: Wed, 25 Dec 2024 08:51:39 +0000 Subject: [PATCH 12/17] empty From 81409b03e3d5f5225f40de14d7f04965c27912b1 Mon Sep 17 00:00:00 2001 From: Bojan Rosko Date: Wed, 25 Dec 2024 10:07:53 +0000 Subject: [PATCH 13/17] minor build change --- device/{ => api/umd/device}/tt_device/tlb_manager.h | 2 ++ device/api/umd/device/tt_device/tt_device.h | 2 +- device/chip/local_chip.cpp | 2 +- device/cluster.cpp | 2 +- device/tt_device/tlb_manager.cpp | 2 +- 5 files changed, 6 insertions(+), 4 deletions(-) rename device/{ => api/umd/device}/tt_device/tlb_manager.h (98%) diff --git a/device/tt_device/tlb_manager.h b/device/api/umd/device/tt_device/tlb_manager.h similarity index 98% rename from device/tt_device/tlb_manager.h rename to device/api/umd/device/tt_device/tlb_manager.h index b8b2342d..a4690952 100644 --- a/device/tt_device/tlb_manager.h +++ b/device/api/umd/device/tt_device/tlb_manager.h @@ -5,6 +5,8 @@ */ #pragma once +#include + #include "umd/device/tt_xy_pair.h" #include "umd/device/types/tlb.h" diff --git a/device/api/umd/device/tt_device/tt_device.h b/device/api/umd/device/tt_device/tt_device.h index 6a0afc96..3728112d 100644 --- a/device/api/umd/device/tt_device/tt_device.h +++ b/device/api/umd/device/tt_device/tt_device.h @@ -6,9 +6,9 @@ #pragma once -#include "tt_device/tlb_manager.h" #include "umd/device/architecture_implementation.h" #include "umd/device/pci_device.hpp" +#include "umd/device/tt_device/tlb_manager.h" // TODO: Should be moved to blackhole_architecture_implementation.h // See /vendor_ip/synopsys/052021/bh_pcie_ctl_gen5/export/configuration/DWC_pcie_ctl.h diff --git a/device/chip/local_chip.cpp b/device/chip/local_chip.cpp index e67b1300..78f99d6e 100644 --- a/device/chip/local_chip.cpp +++ b/device/chip/local_chip.cpp @@ -6,7 +6,7 @@ #include "umd/device/chip/local_chip.h" -#include "tt_device/tlb_manager.h" +#include "umd/device/tt_device/tlb_manager.h" #include "umd/device/tt_device/tt_device.h" namespace tt::umd { diff --git a/device/cluster.cpp b/device/cluster.cpp index 279c07f6..d7c53b88 100644 --- a/device/cluster.cpp +++ b/device/cluster.cpp @@ -38,7 +38,6 @@ #include "api/umd/device/tt_core_coordinates.h" #include "logger.hpp" -#include "tt_device/tlb_manager.h" #include "umd/device/architecture_implementation.h" #include "umd/device/chip/local_chip.h" #include "umd/device/chip/mock_chip.h" @@ -47,6 +46,7 @@ #include "umd/device/hugepage.h" #include "umd/device/tt_cluster_descriptor.h" #include "umd/device/tt_core_coordinates.h" +#include "umd/device/tt_device/tlb_manager.h" #include "umd/device/tt_soc_descriptor.h" #include "umd/device/types/arch.h" #include "umd/device/types/tlb.h" diff --git a/device/tt_device/tlb_manager.cpp b/device/tt_device/tlb_manager.cpp index 7bab3787..1b06d0d7 100644 --- a/device/tt_device/tlb_manager.cpp +++ b/device/tt_device/tlb_manager.cpp @@ -4,7 +4,7 @@ * SPDX-License-Identifier: Apache-2.0 */ -#include "tt_device/tlb_manager.h" +#include "umd/device/tt_device/tlb_manager.h" #include "logger.hpp" #include "umd/device/tt_device/tt_device.h" From 4ed727de91db8fd3cc4edfa4b9350bd24f54d6ef Mon Sep 17 00:00:00 2001 From: Bojan Rosko Date: Wed, 25 Dec 2024 10:18:10 +0000 Subject: [PATCH 14/17] fix ordering setup --- device/chip/local_chip.cpp | 4 ---- device/tt_device/tlb_manager.cpp | 3 +++ 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/device/chip/local_chip.cpp b/device/chip/local_chip.cpp index 78f99d6e..1bed106c 100644 --- a/device/chip/local_chip.cpp +++ b/device/chip/local_chip.cpp @@ -17,15 +17,11 @@ LocalChip::LocalChip(tt_SocDescriptor soc_descriptor, int pci_device_id) : // Setup default dynamic tlbs. tlb_manager->set_dynamic_tlb_config( "LARGE_READ_TLB", tt_device_->get_architecture_implementation()->get_mem_large_read_tlb()); - tlb_manager->set_dynamic_tlb_config_ordering("LARGE_READ_TLB", tlb_data::Relaxed); tlb_manager->set_dynamic_tlb_config( "LARGE_WRITE_TLB", tt_device_->get_architecture_implementation()->get_mem_large_write_tlb()); - tlb_manager->set_dynamic_tlb_config_ordering("LARGE_WRITE_TLB", tlb_data::Relaxed); tlb_manager->set_dynamic_tlb_config("REG_TLB", tt_device_->get_architecture_implementation()->get_reg_tlb()); - tlb_manager->set_dynamic_tlb_config_ordering("REG_TLB", tlb_data::Relaxed); tlb_manager->set_dynamic_tlb_config( "SMALL_READ_WRITE_TLB", tt_device_->get_architecture_implementation()->get_small_read_write_tlb()); - tlb_manager->set_dynamic_tlb_config_ordering("SMALL_READ_WRITE_TLB", tlb_data::Relaxed); } TTDevice* LocalChip::get_tt_device() { return tt_device_.get(); } diff --git a/device/tt_device/tlb_manager.cpp b/device/tt_device/tlb_manager.cpp index 1b06d0d7..2d6725cd 100644 --- a/device/tt_device/tlb_manager.cpp +++ b/device/tt_device/tlb_manager.cpp @@ -13,6 +13,8 @@ namespace tt::umd { +static constexpr uint64_t DEFAULT_ORDERING_MODE = tlb_data::Relaxed; + TLBManager::TLBManager(TTDevice* tt_device) : tt_device_(tt_device) {} void TLBManager::configure_tlb(tt_xy_pair core, int32_t tlb_index, uint64_t address, uint64_t ordering) { @@ -41,6 +43,7 @@ void TLBManager::set_dynamic_tlb_config(std::string fallback_tlb_name, int32_t t "Dynamic TLB already configured for {}", fallback_tlb_name); dynamic_tlb_config_.insert({fallback_tlb_name, tlb_index}); + dynamic_tlb_ordering_modes_[fallback_tlb_name] = DEFAULT_ORDERING_MODE; } void TLBManager::set_dynamic_tlb_config_ordering(std::string fallback_tlb_name, uint64_t ordering) { From f6b9214c2ce11cede7f9de6c4ff05608b62be2e3 Mon Sep 17 00:00:00 2001 From: Bojan Rosko Date: Wed, 25 Dec 2024 11:04:48 +0000 Subject: [PATCH 15/17] tryout sth --- device/cluster.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/device/cluster.cpp b/device/cluster.cpp index d7c53b88..1e3af127 100644 --- a/device/cluster.cpp +++ b/device/cluster.cpp @@ -1299,6 +1299,11 @@ tlb_configuration Cluster::get_tlb_configuration(const chip_id_t chip, CoreCoord void Cluster::configure_tlb( chip_id_t logical_device_id, tt_xy_pair core, int32_t tlb_index, uint64_t address, uint64_t ordering) { + std::cout << "harvested_coord_translation has input for these chip ides: "; + for (auto& [key, value] : harvested_coord_translation) { + std::cout << key << " "; + } + std::cout << std::endl; get_tlb_manager(logical_device_id) ->configure_tlb(harvested_coord_translation.at(logical_device_id).at(core), tlb_index, address, ordering); } From e33876bb95beb8d9154b14e58ec91433741fd5e6 Mon Sep 17 00:00:00 2001 From: Bojan Rosko Date: Wed, 25 Dec 2024 21:10:01 +0000 Subject: [PATCH 16/17] fixed test --- device/api/umd/device/tt_device/tlb_manager.h | 4 +++- device/cluster.cpp | 7 +------ device/tt_device/tlb_manager.cpp | 4 ++-- tests/api/test_tlb_manager.cpp | 2 +- 4 files changed, 7 insertions(+), 10 deletions(-) diff --git a/device/api/umd/device/tt_device/tlb_manager.h b/device/api/umd/device/tt_device/tlb_manager.h index a4690952..61d67be3 100644 --- a/device/api/umd/device/tt_device/tlb_manager.h +++ b/device/api/umd/device/tt_device/tlb_manager.h @@ -22,7 +22,9 @@ class TLBManager { public: TLBManager(TTDevice* tt_device); - void configure_tlb(tt_xy_pair core, int32_t tlb_index, uint64_t address, uint64_t ordering); + // TODO: Think about proper API which doesn't accept two cores. + // core should be in VIRTUAL coords, and translated_core should be in TRANSLATED coords. + void configure_tlb(tt_xy_pair core, tt_xy_pair translated_core, int32_t tlb_index, uint64_t address, uint64_t ordering); void set_dynamic_tlb_config(std::string fallback_tlb_name, int32_t tlb_index); void set_dynamic_tlb_config_ordering(std::string fallback_tlb_name, uint64_t ordering); diff --git a/device/cluster.cpp b/device/cluster.cpp index 1e3af127..c7c7f629 100644 --- a/device/cluster.cpp +++ b/device/cluster.cpp @@ -1299,13 +1299,8 @@ tlb_configuration Cluster::get_tlb_configuration(const chip_id_t chip, CoreCoord void Cluster::configure_tlb( chip_id_t logical_device_id, tt_xy_pair core, int32_t tlb_index, uint64_t address, uint64_t ordering) { - std::cout << "harvested_coord_translation has input for these chip ides: "; - for (auto& [key, value] : harvested_coord_translation) { - std::cout << key << " "; - } - std::cout << std::endl; get_tlb_manager(logical_device_id) - ->configure_tlb(harvested_coord_translation.at(logical_device_id).at(core), tlb_index, address, ordering); + ->configure_tlb(core, harvested_coord_translation.at(logical_device_id).at(core), tlb_index, address, ordering); } void Cluster::configure_tlb( diff --git a/device/tt_device/tlb_manager.cpp b/device/tt_device/tlb_manager.cpp index 2d6725cd..331b041c 100644 --- a/device/tt_device/tlb_manager.cpp +++ b/device/tt_device/tlb_manager.cpp @@ -17,7 +17,7 @@ static constexpr uint64_t DEFAULT_ORDERING_MODE = tlb_data::Relaxed; TLBManager::TLBManager(TTDevice* tt_device) : tt_device_(tt_device) {} -void TLBManager::configure_tlb(tt_xy_pair core, int32_t tlb_index, uint64_t address, uint64_t ordering) { +void TLBManager::configure_tlb(tt_xy_pair core, tt_xy_pair translated_core, int32_t tlb_index, uint64_t address, uint64_t ordering) { log_assert( ordering == tlb_data::Strict || ordering == tlb_data::Posted || ordering == tlb_data::Relaxed, "Invalid ordering specified in Cluster::configure_tlb"); @@ -31,7 +31,7 @@ void TLBManager::configure_tlb(tt_xy_pair core, int32_t tlb_index, uint64_t addr ordering); log_assert(tlb_config_map_.find(tlb_index) == tlb_config_map_.end(), "TLB index already configured {}", tlb_index); - tt_device_->set_dynamic_tlb(tlb_index, core, address, ordering); + tt_device_->set_dynamic_tlb(tlb_index, translated_core, address, ordering); auto tlb_size = tt_device_->get_architecture_implementation()->get_tlb_configuration(tlb_index).size; tlb_config_map_.insert({tlb_index, (address / tlb_size) * tlb_size}); map_core_to_tlb_.insert({core, tlb_index}); diff --git a/tests/api/test_tlb_manager.cpp b/tests/api/test_tlb_manager.cpp index 3c418d6d..b9103fb0 100644 --- a/tests/api/test_tlb_manager.cpp +++ b/tests/api/test_tlb_manager.cpp @@ -62,7 +62,7 @@ TEST(ApiTLBManager, ManualTLBConfiguration) { std::int32_t c_zero_address = 0; for (tt_xy_pair core : soc_desc.get_cores(CoreType::TENSIX)) { - tlb_manager->configure_tlb(core, get_static_tlb_index(core), c_zero_address, tlb_data::Relaxed); + tlb_manager->configure_tlb(core, core, get_static_tlb_index(core), c_zero_address, tlb_data::Relaxed); } // So now that we have configured TLBs we can use it to interface with the TTDevice. From 594c76279540d4fb7c981c8602cc602b030f5d84 Mon Sep 17 00:00:00 2001 From: Bojan Rosko Date: Wed, 25 Dec 2024 22:03:03 +0000 Subject: [PATCH 17/17] fix clang --- device/api/umd/device/tt_device/tlb_manager.h | 3 ++- device/tt_device/tlb_manager.cpp | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/device/api/umd/device/tt_device/tlb_manager.h b/device/api/umd/device/tt_device/tlb_manager.h index 61d67be3..3ba1f3a4 100644 --- a/device/api/umd/device/tt_device/tlb_manager.h +++ b/device/api/umd/device/tt_device/tlb_manager.h @@ -24,7 +24,8 @@ class TLBManager { // TODO: Think about proper API which doesn't accept two cores. // core should be in VIRTUAL coords, and translated_core should be in TRANSLATED coords. - void configure_tlb(tt_xy_pair core, tt_xy_pair translated_core, int32_t tlb_index, uint64_t address, uint64_t ordering); + void configure_tlb( + tt_xy_pair core, tt_xy_pair translated_core, int32_t tlb_index, uint64_t address, uint64_t ordering); void set_dynamic_tlb_config(std::string fallback_tlb_name, int32_t tlb_index); void set_dynamic_tlb_config_ordering(std::string fallback_tlb_name, uint64_t ordering); diff --git a/device/tt_device/tlb_manager.cpp b/device/tt_device/tlb_manager.cpp index 331b041c..9f5da461 100644 --- a/device/tt_device/tlb_manager.cpp +++ b/device/tt_device/tlb_manager.cpp @@ -17,7 +17,8 @@ static constexpr uint64_t DEFAULT_ORDERING_MODE = tlb_data::Relaxed; TLBManager::TLBManager(TTDevice* tt_device) : tt_device_(tt_device) {} -void TLBManager::configure_tlb(tt_xy_pair core, tt_xy_pair translated_core, int32_t tlb_index, uint64_t address, uint64_t ordering) { +void TLBManager::configure_tlb( + tt_xy_pair core, tt_xy_pair translated_core, int32_t tlb_index, uint64_t address, uint64_t ordering) { log_assert( ordering == tlb_data::Strict || ordering == tlb_data::Posted || ordering == tlb_data::Relaxed, "Invalid ordering specified in Cluster::configure_tlb");