Skip to content

Commit

Permalink
init
Browse files Browse the repository at this point in the history
  • Loading branch information
broskoTT committed Dec 16, 2024
1 parent 1e214f4 commit 8ed9c36
Show file tree
Hide file tree
Showing 6 changed files with 52 additions and 76 deletions.
27 changes: 8 additions & 19 deletions device/api/umd/device/cluster.h
Original file line number Diff line number Diff line change
Expand Up @@ -107,18 +107,6 @@ class tt_device {
throw std::runtime_error("---- tt_device::set_fallback_tlb_ordering_mode is not implemented\n");
}

/**
* Give UMD a 1:1 function mapping a core to its appropriate static TLB (currently only support a single TLB per
* core).
*
* @param logical_device_id MMIO chip being targeted.
* @param mapping_function Function which maps core to TLB index.
*/
virtual void setup_core_to_tlb_map(
const chip_id_t logical_device_id, std::function<std::int32_t(tt_xy_pair)> mapping_function) {
throw std::runtime_error("---- tt_device::setup_core_to_tlb_map is not implemented\n");
}

/**
* Pass in ethernet cores with active links for a specific MMIO chip. When called, this function will force UMD to
* use a subset of cores from the active_eth_cores_per_chip set for all host->cluster non-MMIO transfers. If this
Expand Down Expand Up @@ -572,8 +560,6 @@ class Cluster : public tt_device {
uint64_t address,
uint64_t ordering = TLB_DATA::Posted);
virtual void set_fallback_tlb_ordering_mode(const std::string& fallback_tlb, uint64_t ordering = TLB_DATA::Posted);
virtual void setup_core_to_tlb_map(
const chip_id_t logical_device_id, std::function<std::int32_t(tt_xy_pair)> mapping_function);
virtual void configure_active_ethernet_cores_for_mmio_device(
chip_id_t mmio_chip, const std::unordered_set<tt_xy_pair>& active_eth_cores_per_chip);
virtual void start_device(const tt_device_params& device_params);
Expand Down Expand Up @@ -794,8 +780,16 @@ class Cluster : public tt_device {
int timeout = 1,
uint32_t* return_3 = nullptr,
uint32_t* return_4 = nullptr);

// TODO: These will be moved to a dedicated class for TLB management
bool address_in_tlb_space(
uint64_t address, uint32_t size_in_bytes, int32_t tlb_index, uint64_t tlb_size, uint32_t chip);
bool is_tlb_mapped(tt_cxy_pair target);
bool is_tlb_mapped(tt_cxy_pair target, uint64_t address, uint32_t size_in_bytes);
// Note that these maps hold only entries for local PCIe chips.
std::map<chip_id_t, std::unordered_map<int32_t, uint64_t>> tlb_config_map = {};
std::unordered_map<chip_id_t, std::unordered_map<tt_xy_pair, std::int32_t>> map_core_to_tlb_per_chip = {};

std::shared_ptr<boost::interprocess::named_mutex> get_mutex(const std::string& tlb_name, int pci_interface_id);
virtual uint32_t get_harvested_noc_rows_for_chip(
int logical_device_id); // Returns one-hot encoded harvesting mask for PCIe mapped chips
Expand Down Expand Up @@ -864,11 +858,6 @@ class Cluster : public tt_device {
std::unordered_map<chip_id_t, std::unordered_set<tt_xy_pair>> workers_per_chip = {};
std::unordered_set<tt_xy_pair> eth_cores = {};
std::unordered_set<tt_xy_pair> dram_cores = {};
std::map<chip_id_t, std::unordered_map<int32_t, uint64_t>> tlb_config_map = {};

// Note that these maps holds only entries for local PCIe chips.
std::unordered_map<chip_id_t, std::function<std::int32_t(tt_xy_pair)>> map_core_to_tlb_per_chip = {};
std::unordered_map<chip_id_t, bool> tlbs_init_per_chip = {};

std::unordered_map<std::string, std::int32_t> dynamic_tlb_config = {};
std::unordered_map<std::string, uint64_t> dynamic_tlb_ordering_modes = {};
Expand Down
87 changes: 44 additions & 43 deletions device/cluster.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,31 @@ bool Cluster::address_in_tlb_space(
return false;
}

// Returns true if a static TLB has been configured (via configure_tlb) for the
// given core on the given chip. This only checks for the existence of a
// mapping; it does not validate any address range — see the
// (target, address, size_in_bytes) overload for that.
bool Cluster::is_tlb_mapped(tt_cxy_pair target) {
    // Single find() whose iterator is reused, instead of find() followed by
    // at() — avoids looking up the same key in the outer map twice.
    auto chip_it = map_core_to_tlb_per_chip.find(target.chip);
    if (chip_it == map_core_to_tlb_per_chip.end()) {
        return false;
    }

    const auto& map_core_to_tlb = chip_it->second;
    return map_core_to_tlb.find(tt_xy_pair(target.x, target.y)) != map_core_to_tlb.end();
}

// Checks whether the range [address, address + size_in_bytes) on the given
// core is fully covered by that core's statically configured TLB window.
// Returns false when no TLB is mapped for the core at all.
bool Cluster::is_tlb_mapped(tt_cxy_pair target, uint64_t address, uint32_t size_in_bytes) {
    // Delegate the existence check to the single-argument overload.
    if (!is_tlb_mapped(target)) {
        return false;
    }

    const tt_xy_pair core(target.x, target.y);
    const int32_t index = map_core_to_tlb_per_chip.at(target.chip).at(core);
    auto description = get_tt_device(target.chip)->get_architecture_implementation()->describe_tlb(index);

    if (!description.has_value()) {
        return false;
    }

    // Element 1 of the description tuple is the TLB size.
    return address_in_tlb_space(address, size_in_bytes, index, std::get<1>(*description), target.chip);
}

void Cluster::initialize_interprocess_mutexes(int pci_interface_id, bool cleanup_mutexes_in_shm) {
// These mutexes are intended to be based on physical devices/pci-intf not logical. Set these up ahead of time here
// (during device init) since its unsafe to modify shared state during multithreaded runtime. cleanup_mutexes_in_shm
Expand Down Expand Up @@ -1074,23 +1099,18 @@ tt::Writer Cluster::get_static_tlb_writer(tt_cxy_pair target) {
throw std::runtime_error(fmt::format("Target not in MMIO chip: {}", target.str()));
}

if (!tlbs_init_per_chip[target.chip] || !map_core_to_tlb_per_chip[target.chip]) {
throw std::runtime_error("TLBs not initialized");
if (!is_tlb_mapped(target)) {
throw std::runtime_error(fmt::format("TLBs not initialized for core: {}", target.str()));
}

auto* dev = get_tt_device(target.chip);

if (!dev->get_pci_device()->bar0_wc) {
throw std::runtime_error("No write-combined mapping for BAR0");
}

auto tlb_index = map_core_to_tlb_per_chip[target.chip](tt_xy_pair(target.x, target.y));
auto tlb_index = map_core_to_tlb_per_chip.at(target.chip).at(tt_xy_pair(target.x, target.y));
auto tlb_data = dev->get_architecture_implementation()->describe_tlb(tlb_index);

if (!tlb_data.has_value()) {
throw std::runtime_error(fmt::format("No TLB mapped to core {}", target.str()));
}

auto [tlb_offset, tlb_size] = tlb_data.value();
auto* base = reinterpret_cast<uint8_t*>(dev->get_pci_device()->bar0_wc);

Expand All @@ -1116,16 +1136,10 @@ void Cluster::write_device_memory(
size_in_bytes,
small_access);

std::int32_t tlb_index = 0;
std::optional<std::tuple<std::uint64_t, std::uint64_t>> tlb_data = std::nullopt;
if (tlbs_init_per_chip[target.chip]) {
tlb_index = map_core_to_tlb_per_chip[target.chip](tt_xy_pair(target.x, target.y));
tlb_data = dev->get_architecture_implementation()->describe_tlb(tlb_index);
}

if (tlb_data.has_value() &&
address_in_tlb_space(address, size_in_bytes, tlb_index, std::get<1>(tlb_data.value()), target.chip)) {
auto [tlb_offset, tlb_size] = tlb_data.value();
if (is_tlb_mapped(target, address, size_in_bytes)) {
auto tlb_description = dev->get_architecture_implementation()->describe_tlb(
map_core_to_tlb_per_chip.at(target.chip).at(tt_xy_pair(target.x, target.y)));
auto [tlb_offset, tlb_size] = tlb_description.value();
if (dev->get_pci_device()->bar4_wc != nullptr && tlb_size == BH_4GB_TLB_SIZE) {
// This is only for Blackhole. If we want to write to DRAM (BAR4 space), we add offset
// to which we write so write_block knows it needs to target BAR4
Expand Down Expand Up @@ -1166,20 +1180,14 @@ void Cluster::read_device_memory(
address,
size_in_bytes);
TTDevice* dev = get_tt_device(target.chip);

uint8_t* buffer_addr = static_cast<uint8_t*>(mem_ptr);

std::int32_t tlb_index = 0;
std::optional<std::tuple<std::uint64_t, std::uint64_t>> tlb_data = std::nullopt;
if (tlbs_init_per_chip[target.chip]) {
tlb_index = map_core_to_tlb_per_chip[target.chip](tt_xy_pair(target.x, target.y));
tlb_data = dev->get_architecture_implementation()->describe_tlb(tlb_index);
}
log_debug(LogSiliconDriver, " tlb_index: {}, tlb_data.has_value(): {}", tlb_index, tlb_data.has_value());

if (tlb_data.has_value() &&
address_in_tlb_space(address, size_in_bytes, tlb_index, std::get<1>(tlb_data.value()), target.chip)) {
auto [tlb_offset, tlb_size] = tlb_data.value();
if (is_tlb_mapped(target, address, size_in_bytes)) {
auto tlb_description = dev->get_architecture_implementation()->describe_tlb(
map_core_to_tlb_per_chip.at(target.chip).at(tt_xy_pair(target.x, target.y)));
auto [tlb_offset, tlb_size] = tlb_description.value();
if (dev->get_pci_device()->bar4_wc != nullptr && tlb_size == BH_4GB_TLB_SIZE) {
// This is only for Blackhole. If we want to read from DRAM (BAR4 space), we add offset
// from which we read so read_block knows it needs to target BAR4
Expand Down Expand Up @@ -1356,15 +1364,12 @@ Cluster::~Cluster() {
}

std::optional<std::tuple<uint32_t, uint32_t>> Cluster::get_tlb_data_from_target(const tt_cxy_pair& target) {
std::int32_t tlb_index = 0;
std::optional<std::tuple<std::uint32_t, std::uint32_t>> tlb_data;

if (tlbs_init_per_chip[target.chip]) {
tlb_index = map_core_to_tlb_per_chip[target.chip](tt_xy_pair(target.x, target.y));
auto architecture_implementation = tt::umd::architecture_implementation::create(arch_name);
tlb_data = architecture_implementation->describe_tlb(tlb_index);
if (!is_tlb_mapped(target)) {
return std::nullopt;
}
return tlb_data;

int tlb_index = map_core_to_tlb_per_chip.at(target.chip).at(tt_xy_pair(target.x, target.y));
return get_tt_device(target.chip)->get_architecture_implementation()->describe_tlb(tlb_index);
}

void Cluster::configure_tlb(
Expand All @@ -1377,8 +1382,10 @@ void Cluster::configure_tlb(
auto tlb_size = std::get<1>(tt_device->get_architecture_implementation()->describe_tlb(tlb_index).value());
if (tlb_config_map.find(logical_device_id) == tlb_config_map.end()) {
tlb_config_map.insert({logical_device_id, {}});
map_core_to_tlb_per_chip.insert({logical_device_id, {}});
}
tlb_config_map[logical_device_id].insert({tlb_index, (address / tlb_size) * tlb_size});
tlb_config_map.at(logical_device_id).insert({tlb_index, (address / tlb_size) * tlb_size});
map_core_to_tlb_per_chip.at(logical_device_id).insert({core, tlb_index});
}

void Cluster::set_fallback_tlb_ordering_mode(const std::string& fallback_tlb, uint64_t ordering) {
Expand Down Expand Up @@ -3321,12 +3328,6 @@ void Cluster::set_driver_eth_interface_params(const tt_driver_eth_interface_para
eth_interface_params = eth_interface_params_;
}

void Cluster::setup_core_to_tlb_map(
const chip_id_t logical_device_id, std::function<std::int32_t(tt_xy_pair)> mapping_function) {
map_core_to_tlb_per_chip[logical_device_id] = mapping_function;
tlbs_init_per_chip[logical_device_id] = true;
}

std::uint32_t Cluster::get_num_dram_channels(std::uint32_t device_id) {
log_assert(
target_devices_in_cluster.find(device_id) != target_devices_in_cluster.end(),
Expand Down
2 changes: 0 additions & 2 deletions tests/api/test_chip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,8 +80,6 @@ TEST(ApiChipTest, ManualTLBConfiguration) {
for (tt_xy_pair core : soc_desc.workers) {
umd_cluster->configure_tlb(mmio_chip, core, get_static_tlb_index(core), c_zero_address);
}

umd_cluster->setup_core_to_tlb_map(mmio_chip, get_static_tlb_index);
}

// Expect not to throw for now configured mmio chip, same one as before.
Expand Down
4 changes: 0 additions & 4 deletions tests/blackhole/test_silicon_driver_bh.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,6 @@ TEST(SiliconDriverBH, CreateDestroy) {
// }
// }
// }
// device.setup_core_to_tlb_map(get_static_tlb_index_callback);

// tt_device_params default_params;
// device.start_device(default_params);
Expand Down Expand Up @@ -292,7 +291,6 @@ TEST(SiliconDriverBH, UnalignedStaticTLB_RW) {
device.configure_tlb(
i, core, get_static_tlb_index_callback(core), l1_mem::address_map::NCRISC_FIRMWARE_BASE);
}
device.setup_core_to_tlb_map(i, get_static_tlb_index_callback);
}
}

Expand Down Expand Up @@ -349,7 +347,6 @@ TEST(SiliconDriverBH, StaticTLB_RW) {
device.configure_tlb(
i, core, get_static_tlb_index_callback(core), l1_mem::address_map::NCRISC_FIRMWARE_BASE);
}
device.setup_core_to_tlb_map(i, get_static_tlb_index_callback);
}
}

Expand Down Expand Up @@ -570,7 +567,6 @@ TEST(SiliconDriverBH, MultiThreadedMemBar) {
// Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE.
device.configure_tlb(i, core, get_static_tlb_index_callback(core), base_addr);
}
device.setup_core_to_tlb_map(i, get_static_tlb_index_callback);
}

tt_device_params default_params;
Expand Down
3 changes: 0 additions & 3 deletions tests/grayskull/test_silicon_driver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,6 @@ TEST(SiliconDriverGS, HarvestingRuntime) {
// Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE.
device.configure_tlb(i, core, get_static_tlb_index(core), l1_mem::address_map::DATA_BUFFER_SPACE_BASE);
}
device.setup_core_to_tlb_map(i, get_static_tlb_index);
}

tt_device_params default_params;
Expand Down Expand Up @@ -199,7 +198,6 @@ TEST(SiliconDriverGS, StaticTLB_RW) {
device.configure_tlb(
i, core, get_static_tlb_index(core), l1_mem::address_map::DATA_BUFFER_SPACE_BASE, TLB_DATA::Posted);
}
device.setup_core_to_tlb_map(i, get_static_tlb_index);
}

tt_device_params default_params;
Expand Down Expand Up @@ -417,7 +415,6 @@ TEST(SiliconDriverGS, MultiThreadedMemBar) { // this tests takes ~5 mins to run
// Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE.
device.configure_tlb(i, core, get_static_tlb_index(core), base_addr);
}
device.setup_core_to_tlb_map(i, get_static_tlb_index);
}

tt_device_params default_params;
Expand Down
5 changes: 0 additions & 5 deletions tests/wormhole/test_silicon_driver_wh.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,6 @@ TEST(SiliconDriverWH, HarvestingRuntime) {
}
}
}
device.setup_core_to_tlb_map(get_static_tlb_index_callback);

tt_device_params default_params;
device.start_device(default_params);
Expand Down Expand Up @@ -233,7 +232,6 @@ TEST(SiliconDriverWH, UnalignedStaticTLB_RW) {
device.configure_tlb(
i, core, get_static_tlb_index_callback(core), l1_mem::address_map::NCRISC_FIRMWARE_BASE);
}
device.setup_core_to_tlb_map(i, get_static_tlb_index_callback);
}
}

Expand Down Expand Up @@ -289,7 +287,6 @@ TEST(SiliconDriverWH, StaticTLB_RW) {
device.configure_tlb(
i, core, get_static_tlb_index_callback(core), l1_mem::address_map::NCRISC_FIRMWARE_BASE);
}
device.setup_core_to_tlb_map(i, get_static_tlb_index_callback);
}
}

Expand Down Expand Up @@ -474,7 +471,6 @@ TEST(SiliconDriverWH, MultiThreadedMemBar) {
// Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE.
device.configure_tlb(i, core, get_static_tlb_index_callback(core), base_addr);
}
device.setup_core_to_tlb_map(i, get_static_tlb_index_callback);
}
}

Expand Down Expand Up @@ -954,7 +950,6 @@ TEST(SiliconDriverWH, LargeAddressTlb) {
cluster.start_device(tt_device_params{});

auto get_static_tlb_index_callback = [](tt_xy_pair target) { return 0; };
cluster.setup_core_to_tlb_map(0, get_static_tlb_index_callback);

// Address of the reset unit in ARC core:
uint64_t arc_reset_noc = 0x880030000ULL;
Expand Down

0 comments on commit 8ed9c36

Please sign in to comment.