Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TLB Manager #420

Merged
merged 17 commits into from
Dec 26, 2024
1 change: 1 addition & 0 deletions device/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ target_sources(
chip/local_chip.cpp
chip/mock_chip.cpp
chip/remote_chip.cpp
tt_device/tlb_manager.cpp
cluster.cpp
coordinate_manager.cpp
cpuset_lib.cpp
Expand Down
6 changes: 6 additions & 0 deletions device/api/umd/device/architecture_implementation.h
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,12 @@ class architecture_implementation {
virtual const std::vector<uint32_t>& get_t6_x_locations() const = 0;
virtual const std::vector<uint32_t>& get_t6_y_locations() const = 0;

// TLB related. Move other functions here as well.
// Each getter returns a {base, count} pair for the 1M, 2M, 16M and 4G TLBs respectively.
// The implementations visible in this change return {0, 0} for the sizes they have no
// TLB_BASE_* / TLB_COUNT_* constants for.
// NOTE(review): whether "base" is an address offset or an index is not visible from here — confirm
// against the per-architecture constants.
virtual std::pair<uint32_t, uint32_t> get_tlb_1m_base_and_count() const = 0;
virtual std::pair<uint32_t, uint32_t> get_tlb_2m_base_and_count() const = 0;
virtual std::pair<uint32_t, uint32_t> get_tlb_16m_base_and_count() const = 0;
virtual std::pair<uint32_t, uint32_t> get_tlb_4g_base_and_count() const = 0;

virtual std::tuple<xy_pair, xy_pair> multicast_workaround(xy_pair start, xy_pair end) const = 0;
virtual tlb_configuration get_tlb_configuration(uint32_t tlb_index) const = 0;
virtual std::pair<std::uint64_t, std::uint64_t> get_tlb_data(
Expand Down
12 changes: 12 additions & 0 deletions device/api/umd/device/blackhole_implementation.h
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,18 @@ class blackhole_implementation : public architecture_implementation {

// Returns a reference to blackhole::T6_Y_LOCATIONS.
const std::vector<uint32_t>& get_t6_y_locations() const override { return blackhole::T6_Y_LOCATIONS; }

// Blackhole implementation: reports a zero base and a zero count for 1M TLBs.
std::pair<uint32_t, uint32_t> get_tlb_1m_base_and_count() const override {
    const uint32_t base = 0;
    const uint32_t count = 0;
    return {base, count};
}

// Blackhole implementation: returns {blackhole::TLB_BASE_2M, blackhole::TLB_COUNT_2M}.
std::pair<uint32_t, uint32_t> get_tlb_2m_base_and_count() const override {
    const uint32_t base = blackhole::TLB_BASE_2M;
    const uint32_t count = blackhole::TLB_COUNT_2M;
    return {base, count};
}

// Blackhole implementation: reports a zero base and a zero count for 16M TLBs.
std::pair<uint32_t, uint32_t> get_tlb_16m_base_and_count() const override {
    const uint32_t base = 0;
    const uint32_t count = 0;
    return {base, count};
}

// Blackhole implementation: returns {blackhole::TLB_BASE_4G, blackhole::TLB_COUNT_4G}.
std::pair<uint32_t, uint32_t> get_tlb_4g_base_and_count() const override {
    const uint32_t base = blackhole::TLB_BASE_4G;
    const uint32_t count = blackhole::TLB_COUNT_4G;
    return {base, count};
}

std::tuple<xy_pair, xy_pair> multicast_workaround(xy_pair start, xy_pair end) const override;
tlb_configuration get_tlb_configuration(uint32_t tlb_index) const override;
std::pair<std::uint64_t, std::uint64_t> get_tlb_data(std::uint32_t tlb_index, const tlb_data& data) const override;
Expand Down
1 change: 1 addition & 0 deletions device/api/umd/device/chip/local_chip.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include "umd/device/chip/chip.h"

namespace tt::umd {

class LocalChip : public Chip {
public:
LocalChip(tt_SocDescriptor soc_descriptor, int pci_device_id);
Expand Down
12 changes: 1 addition & 11 deletions device/api/umd/device/cluster.h
Original file line number Diff line number Diff line change
Expand Up @@ -619,6 +619,7 @@ class Cluster : public tt_device {
// TODO: This should be accessible through public API, probably to be moved to tt_device.
PCIDevice* get_pci_device(int device_id) const;
TTDevice* get_tt_device(chip_id_t device_id) const;
TLBManager* get_tlb_manager(chip_id_t device_id) const;
const tt_SocDescriptor& get_soc_descriptor(chip_id_t chip_id) const;

// Existing API we want to remove. UMD is transitioning to use CoreCoord instead of tt_xy_pair.
Expand Down Expand Up @@ -836,15 +837,6 @@ class Cluster : public tt_device {
uint32_t* return_3 = nullptr,
uint32_t* return_4 = nullptr);

// TODO: These will be moved to a dedicated class for TLB management
bool address_in_tlb_space(
uint64_t address, uint32_t size_in_bytes, int32_t tlb_index, uint64_t tlb_size, uint32_t chip);
bool is_tlb_mapped(tt_cxy_pair target);
bool is_tlb_mapped(tt_cxy_pair target, uint64_t address, uint32_t size_in_bytes);
// Note that these maps holds only entries for local PCIe chips.
std::map<chip_id_t, std::unordered_map<int32_t, uint64_t>> tlb_config_map = {};
std::unordered_map<chip_id_t, std::unordered_map<tt_xy_pair, std::int32_t>> map_core_to_tlb_per_chip = {};

std::shared_ptr<boost::interprocess::named_mutex> get_mutex(const std::string& tlb_name, int logical_device_id);
virtual uint32_t get_harvested_noc_rows_for_chip(
int logical_device_id); // Returns one-hot encoded harvesting mask for PCIe mapped chips
Expand Down Expand Up @@ -914,8 +906,6 @@ class Cluster : public tt_device {
std::unordered_set<tt_xy_pair> eth_cores = {};
std::unordered_set<tt_xy_pair> dram_cores = {};

std::unordered_map<std::string, std::int32_t> dynamic_tlb_config = {};
std::unordered_map<std::string, uint64_t> dynamic_tlb_ordering_modes = {};
std::map<std::set<chip_id_t>, std::unordered_map<chip_id_t, std::vector<std::vector<int>>>> bcast_header_cache = {};
bool perform_harvesting_on_sdesc = false;
bool use_ethernet_ordered_writes = true;
Expand Down
14 changes: 14 additions & 0 deletions device/api/umd/device/grayskull_implementation.h
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,20 @@ class grayskull_implementation : public architecture_implementation {

// Returns a reference to grayskull::T6_Y_LOCATIONS.
const std::vector<uint32_t>& get_t6_y_locations() const override { return grayskull::T6_Y_LOCATIONS; }

// Grayskull implementation: returns {grayskull::TLB_BASE_1M, grayskull::TLB_COUNT_1M}.
std::pair<uint32_t, uint32_t> get_tlb_1m_base_and_count() const override {
    const uint32_t base = grayskull::TLB_BASE_1M;
    const uint32_t count = grayskull::TLB_COUNT_1M;
    return {base, count};
}

// Grayskull implementation: returns {grayskull::TLB_BASE_2M, grayskull::TLB_COUNT_2M}.
std::pair<uint32_t, uint32_t> get_tlb_2m_base_and_count() const override {
    const uint32_t base = grayskull::TLB_BASE_2M;
    const uint32_t count = grayskull::TLB_COUNT_2M;
    return {base, count};
}

// Grayskull implementation: returns {grayskull::TLB_BASE_16M, grayskull::TLB_COUNT_16M}.
std::pair<uint32_t, uint32_t> get_tlb_16m_base_and_count() const override {
    const uint32_t base = grayskull::TLB_BASE_16M;
    const uint32_t count = grayskull::TLB_COUNT_16M;
    return {base, count};
}

// Grayskull implementation: reports a zero base and a zero count for 4G TLBs.
std::pair<uint32_t, uint32_t> get_tlb_4g_base_and_count() const override {
    const uint32_t base = 0;
    const uint32_t count = 0;
    return {base, count};
}

std::tuple<xy_pair, xy_pair> multicast_workaround(xy_pair start, xy_pair end) const override;
tlb_configuration get_tlb_configuration(uint32_t tlb_index) const override;
std::pair<std::uint64_t, std::uint64_t> get_tlb_data(std::uint32_t tlb_index, const tlb_data& data) const override;
Expand Down
51 changes: 51 additions & 0 deletions device/api/umd/device/tt_device/tlb_manager.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
/*
* SPDX-FileCopyrightText: (c) 2024 Tenstorrent Inc.
*
* SPDX-License-Identifier: Apache-2.0
*/
#pragma once

#include <cstdint>
#include <string>
#include <unordered_map>

#include "umd/device/tt_xy_pair.h"
#include "umd/device/types/tlb.h"

namespace tt {
class Writer;
}

namespace tt::umd {

class TTDevice;

/**
 * Keeps track of TLB configuration for a single TTDevice.
 *
 * NOTE(review): the member functions are defined in tt_device/tlb_manager.cpp, which is not visible
 * from this header; the comments below state only what the declarations themselves establish and
 * mark everything else as an assumption to confirm.
 */
class TLBManager {
public:
    // explicit so that a bare TTDevice* is never implicitly converted into a TLBManager.
    explicit TLBManager(TTDevice* tt_device);

    // TODO: Think about proper API which doesn't accept two cores.
    // core should be in VIRTUAL coords, and translated_core should be in TRANSLATED coords.
    void configure_tlb(
        tt_xy_pair core, tt_xy_pair translated_core, int32_t tlb_index, uint64_t address, uint64_t ordering);

    // Setters taking a fallback TLB name plus the TLB index / ordering value to associate with it.
    // NOTE(review): presumably these populate dynamic_tlb_config_ and dynamic_tlb_ordering_modes_ — confirm.
    void set_dynamic_tlb_config(std::string fallback_tlb_name, int32_t tlb_index);
    void set_dynamic_tlb_config_ordering(std::string fallback_tlb_name, uint64_t ordering);

    // Queries about the current TLB setup. The second is_tlb_mapped overload additionally takes an
    // address and a size in bytes.
    bool address_in_tlb_space(uint64_t address, uint32_t size_in_bytes, int32_t tlb_index, uint64_t tlb_size);
    bool is_tlb_mapped(tt_xy_pair core);
    bool is_tlb_mapped(tt_xy_pair core, uint64_t address, uint32_t size_in_bytes);

    // Per-core accessors. tt::Writer is only forward-declared in this header, so callers need its
    // full definition before calling get_static_tlb_writer.
    tt::Writer get_static_tlb_writer(tt_xy_pair core);
    tlb_configuration get_tlb_configuration(tt_xy_pair core);

    // TODO: the following members will be moved to private once enough stuff is moved out of cluster.
    // NOTE(review): presumably TLB index -> configured address — confirm against configure_tlb.
    std::unordered_map<int32_t, uint64_t> tlb_config_map_;
    // NOTE(review): presumably core -> TLB index assigned to that core — confirm.
    std::unordered_map<tt_xy_pair, std::int32_t> map_core_to_tlb_;

    // Keyed by name; values are an int32_t (TLB index, presumably) and a uint64_t (ordering mode,
    // presumably) respectively.
    std::unordered_map<std::string, std::int32_t> dynamic_tlb_config_;
    std::unordered_map<std::string, uint64_t> dynamic_tlb_ordering_modes_;

private:
    // Device pointer supplied at construction.
    // NOTE(review): presumably the TTDevice that owns this manager, held without ownership — confirm.
    TTDevice* tt_device_;
};

} // namespace tt::umd
7 changes: 7 additions & 0 deletions device/api/umd/device/tt_device/tt_device.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

#include "umd/device/architecture_implementation.h"
#include "umd/device/pci_device.hpp"
#include "umd/device/tt_device/tlb_manager.h"

// TODO: Should be moved to blackhole_architecture_implementation.h
// See /vendor_ip/synopsys/052021/bh_pcie_ctl_gen5/export/configuration/DWC_pcie_ctl.h
Expand All @@ -28,6 +29,8 @@ struct dynamic_tlb {

namespace tt::umd {

class TLBManager;

class TTDevice {
public:
/**
Expand All @@ -39,6 +42,9 @@ class TTDevice {

architecture_implementation *get_architecture_implementation();
PCIDevice *get_pci_device();
TLBManager *get_tlb_manager();

tt::ARCH get_arch();

void detect_hang_read(uint32_t data_read = c_hang_read_value);

Expand Down Expand Up @@ -114,6 +120,7 @@ class TTDevice {
protected:
std::unique_ptr<PCIDevice> pci_device_;
std::unique_ptr<architecture_implementation> architecture_impl_;
std::unique_ptr<TLBManager> tlb_manager_;
tt::ARCH arch;

bool is_hardware_hung();
Expand Down
2 changes: 1 addition & 1 deletion device/api/umd/device/tt_io.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ class Cluster;
* It is the caller's responsibility to manage the lifetime of Writer objects.
*/
class Writer {
friend class tt::umd::Cluster;
friend class tt::umd::TLBManager;

public:
/**
Expand Down
14 changes: 14 additions & 0 deletions device/api/umd/device/wormhole_implementation.h
Original file line number Diff line number Diff line change
Expand Up @@ -328,6 +328,20 @@ class wormhole_implementation : public architecture_implementation {

// Returns a reference to wormhole::T6_Y_LOCATIONS.
const std::vector<uint32_t>& get_t6_y_locations() const override { return wormhole::T6_Y_LOCATIONS; }

// Wormhole implementation: returns {wormhole::TLB_BASE_1M, wormhole::TLB_COUNT_1M}.
std::pair<uint32_t, uint32_t> get_tlb_1m_base_and_count() const override {
    const uint32_t base = wormhole::TLB_BASE_1M;
    const uint32_t count = wormhole::TLB_COUNT_1M;
    return {base, count};
}

// Wormhole implementation: returns {wormhole::TLB_BASE_2M, wormhole::TLB_COUNT_2M}.
std::pair<uint32_t, uint32_t> get_tlb_2m_base_and_count() const override {
    const uint32_t base = wormhole::TLB_BASE_2M;
    const uint32_t count = wormhole::TLB_COUNT_2M;
    return {base, count};
}

// Wormhole implementation: returns {wormhole::TLB_BASE_16M, wormhole::TLB_COUNT_16M}.
std::pair<uint32_t, uint32_t> get_tlb_16m_base_and_count() const override {
    const uint32_t base = wormhole::TLB_BASE_16M;
    const uint32_t count = wormhole::TLB_COUNT_16M;
    return {base, count};
}

// Wormhole implementation: reports a zero base and a zero count for 4G TLBs.
std::pair<uint32_t, uint32_t> get_tlb_4g_base_and_count() const override {
    const uint32_t base = 0;
    const uint32_t count = 0;
    return {base, count};
}

std::tuple<xy_pair, xy_pair> multicast_workaround(xy_pair start, xy_pair end) const override;
tlb_configuration get_tlb_configuration(uint32_t tlb_index) const override;
std::pair<std::uint64_t, std::uint64_t> get_tlb_data(std::uint32_t tlb_index, const tlb_data& data) const override;
Expand Down
14 changes: 13 additions & 1 deletion device/chip/local_chip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,26 @@

#include "umd/device/chip/local_chip.h"

#include "umd/device/tt_device/tlb_manager.h"
#include "umd/device/tt_device/tt_device.h"

namespace tt::umd {

// Builds a LocalChip from a SoC descriptor and a PCI device id: the Chip base receives the
// descriptor, tt_device_ is created via TTDevice::create(pci_device_id), and the four default
// dynamic TLB names are then handed to the device's TLBManager.
LocalChip::LocalChip(tt_SocDescriptor soc_descriptor, int pci_device_id) :
Chip(soc_descriptor), tt_device_(TTDevice::create(pci_device_id)) {}
Chip(soc_descriptor), tt_device_(TTDevice::create(pci_device_id)) {
auto tlb_manager = tt_device_->get_tlb_manager();
// Set up the default dynamic TLBs: each fallback TLB name is passed to set_dynamic_tlb_config
// together with the TLB index reported by the architecture implementation.
tlb_manager->set_dynamic_tlb_config(
"LARGE_READ_TLB", tt_device_->get_architecture_implementation()->get_mem_large_read_tlb());
tlb_manager->set_dynamic_tlb_config(
"LARGE_WRITE_TLB", tt_device_->get_architecture_implementation()->get_mem_large_write_tlb());
tlb_manager->set_dynamic_tlb_config("REG_TLB", tt_device_->get_architecture_implementation()->get_reg_tlb());
tlb_manager->set_dynamic_tlb_config(
"SMALL_READ_WRITE_TLB", tt_device_->get_architecture_implementation()->get_small_read_write_tlb());
}

// Returns the raw pointer obtained from tt_device_.get().
TTDevice* LocalChip::get_tt_device() {
    TTDevice* const device = tt_device_.get();
    return device;
}

// A LocalChip always reports itself as MMIO capable.
bool LocalChip::is_mmio_capable() const {
    constexpr bool mmio_capable = true;
    return mmio_capable;
}

} // namespace tt::umd
Loading
Loading