Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Change mutex to use logical_device_id #416

Merged
merged 2 commits into from
Dec 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions device/api/umd/device/cluster.h
Original file line number Diff line number Diff line change
Expand Up @@ -633,7 +633,7 @@ class Cluster : public tt_device {
const uint32_t& num_host_mem_ch_per_mmio_device,
const bool skip_driver_allocs,
const bool clean_system_resources);
void initialize_interprocess_mutexes(int pci_interface_id, bool cleanup_mutexes_in_shm);
void initialize_interprocess_mutexes(int logical_device_id, bool cleanup_mutexes_in_shm);
void cleanup_shared_host_state();
void initialize_pcie_devices();
void broadcast_pcie_tensix_risc_reset(chip_id_t chip_id, const TensixSoftResetOptions& cores);
Expand Down Expand Up @@ -747,7 +747,7 @@ class Cluster : public tt_device {
std::map<chip_id_t, std::unordered_map<int32_t, uint64_t>> tlb_config_map = {};
std::unordered_map<chip_id_t, std::unordered_map<tt_xy_pair, std::int32_t>> map_core_to_tlb_per_chip = {};

std::shared_ptr<boost::interprocess::named_mutex> get_mutex(const std::string& tlb_name, int pci_interface_id);
std::shared_ptr<boost::interprocess::named_mutex> get_mutex(const std::string& tlb_name, int logical_device_id);
virtual uint32_t get_harvested_noc_rows_for_chip(
int logical_device_id); // Returns one-hot encoded harvesting mask for PCIe mapped chips
void generate_tensix_broadcast_grids_for_grayskull(
Expand Down
37 changes: 17 additions & 20 deletions device/cluster.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ bool Cluster::is_tlb_mapped(tt_cxy_pair target, uint64_t address, uint32_t size_
address_in_tlb_space(address, size_in_bytes, tlb_index, std::get<1>(tlb_description.value()), target.chip);
}

void Cluster::initialize_interprocess_mutexes(int pci_interface_id, bool cleanup_mutexes_in_shm) {
void Cluster::initialize_interprocess_mutexes(int logical_device_id, bool cleanup_mutexes_in_shm) {
// These mutexes are keyed by logical_device_id rather than the physical PCI interface id. Set these up ahead of time here
// (during device init) since it's unsafe to modify shared state during multithreaded runtime. cleanup_mutexes_in_shm
// is tied to clean_system_resources from the constructor. The main process is responsible for initializing the
Expand All @@ -192,7 +192,7 @@ void Cluster::initialize_interprocess_mutexes(int pci_interface_id, bool cleanup

// Initialize Dynamic TLB mutexes
for (auto& tlb : dynamic_tlb_config) {
mutex_name = tlb.first + std::to_string(pci_interface_id);
mutex_name = tlb.first + std::to_string(logical_device_id);
if (cleanup_mutexes_in_shm) {
named_mutex::remove(mutex_name.c_str());
}
Expand All @@ -201,15 +201,15 @@ void Cluster::initialize_interprocess_mutexes(int pci_interface_id, bool cleanup
}

// Initialize ARC core mutex
mutex_name = fmt::format("ARC_MSG{}", pci_interface_id);
mutex_name = fmt::format("ARC_MSG{}", logical_device_id);
if (cleanup_mutexes_in_shm) {
named_mutex::remove(mutex_name.c_str());
}
hardware_resource_mutex_map[mutex_name] =
std::make_shared<named_mutex>(open_or_create, mutex_name.c_str(), unrestricted_permissions);

if (arch_name == tt::ARCH::WORMHOLE_B0) {
mutex_name = NON_MMIO_MUTEX_NAME + std::to_string(pci_interface_id);
mutex_name = NON_MMIO_MUTEX_NAME + std::to_string(logical_device_id);
// Initialize non-MMIO mutexes for WH devices regardless of number of chips, since these may be used for
// ethernet broadcast
if (cleanup_mutexes_in_shm) {
Expand All @@ -220,7 +220,7 @@ void Cluster::initialize_interprocess_mutexes(int pci_interface_id, bool cleanup
}

// Initialize interprocess mutexes to make host -> device memory barriers atomic
mutex_name = MEM_BARRIER_MUTEX_NAME + std::to_string(pci_interface_id);
mutex_name = MEM_BARRIER_MUTEX_NAME + std::to_string(logical_device_id);
if (cleanup_mutexes_in_shm) {
named_mutex::remove(mutex_name.c_str());
}
Expand Down Expand Up @@ -266,7 +266,7 @@ void Cluster::create_device(
pci_device->get_device_num(),
pci_device->revision_id);

initialize_interprocess_mutexes(pci_device->get_device_num(), clean_system_resources);
initialize_interprocess_mutexes(logical_device_id, clean_system_resources);

// MT: Initial BH - hugepages will fail init
// For using silicon driver without workload to query mission mode params, no need for hugepage.
Expand Down Expand Up @@ -1110,7 +1110,7 @@ void Cluster::write_device_memory(
}
} else {
const auto tlb_index = dynamic_tlb_config.at(fallback_tlb);
const scoped_lock<named_mutex> lock(*get_mutex(fallback_tlb, dev->get_pci_device()->get_device_num()));
const scoped_lock<named_mutex> lock(*get_mutex(fallback_tlb, target.chip));

while (size_in_bytes > 0) {
auto [mapped_address, tlb_size] = dev->set_dynamic_tlb(
Expand Down Expand Up @@ -1158,7 +1158,7 @@ void Cluster::read_device_memory(
log_debug(LogSiliconDriver, " read_block called with tlb_offset: {}, tlb_size: {}", tlb_offset, tlb_size);
} else {
const auto tlb_index = dynamic_tlb_config.at(fallback_tlb);
const scoped_lock<named_mutex> lock(*get_mutex(fallback_tlb, dev->get_pci_device()->get_device_num()));
const scoped_lock<named_mutex> lock(*get_mutex(fallback_tlb, target.chip));
log_debug(LogSiliconDriver, " dynamic tlb_index: {}", tlb_index);
while (size_in_bytes > 0) {
auto [mapped_address, tlb_size] = dev->set_dynamic_tlb(
Expand Down Expand Up @@ -1524,7 +1524,7 @@ int Cluster::pcie_arc_msg(

// Exclusive access for a single process at a time. The mutex is keyed by logical device id.
std::string msg_type = "ARC_MSG";
const scoped_lock<named_mutex> lock(*get_mutex(msg_type, tt_device->get_pci_device()->get_device_num()));
const scoped_lock<named_mutex> lock(*get_mutex(msg_type, logical_device_id));
uint32_t fw_arg = arg0 | (arg1 << 16);
int exit_code = 0;

Expand Down Expand Up @@ -1718,8 +1718,8 @@ inline TTDevice* Cluster::get_tt_device(chip_id_t device_id) const {
}

std::shared_ptr<boost::interprocess::named_mutex> Cluster::get_mutex(
const std::string& tlb_name, int pci_interface_id) {
std::string mutex_name = tlb_name + std::to_string(pci_interface_id);
const std::string& tlb_name, int logical_device_id) {
std::string mutex_name = tlb_name + std::to_string(logical_device_id);
return hardware_resource_mutex_map.at(mutex_name);
}

Expand Down Expand Up @@ -1855,8 +1855,7 @@ void Cluster::write_to_non_mmio_device(
// MUTEX ACQUIRE (NON-MMIO)
// do not locate any ethernet core reads/writes before this acquire
//
const scoped_lock<named_mutex> lock(*get_mutex(
NON_MMIO_MUTEX_NAME, this->get_tt_device(mmio_capable_chip_logical)->get_pci_device()->get_device_num()));
const scoped_lock<named_mutex> lock(*get_mutex(NON_MMIO_MUTEX_NAME, mmio_capable_chip_logical));

int& active_core_for_txn =
non_mmio_transfer_cores_customized ? active_eth_core_idx_per_chip.at(mmio_capable_chip_logical) : active_core;
Expand Down Expand Up @@ -2081,8 +2080,7 @@ void Cluster::read_from_non_mmio_device(void* mem_ptr, tt_cxy_pair core, uint64_
// MUTEX ACQUIRE (NON-MMIO)
// do not locate any ethernet core reads/writes before this acquire
//
const scoped_lock<named_mutex> lock(*get_mutex(
NON_MMIO_MUTEX_NAME, this->get_tt_device(mmio_capable_chip_logical)->get_pci_device()->get_device_num()));
const scoped_lock<named_mutex> lock(*get_mutex(NON_MMIO_MUTEX_NAME, mmio_capable_chip_logical));
const tt_cxy_pair remote_transfer_ethernet_core = remote_transfer_ethernet_cores[mmio_capable_chip_logical].at(0);

read_device_memory(
Expand Down Expand Up @@ -2511,7 +2509,7 @@ void Cluster::pcie_broadcast_write(
TTDevice* tt_device = get_tt_device(chip);
const auto tlb_index = dynamic_tlb_config.at(fallback_tlb);
const uint8_t* buffer_addr = static_cast<const uint8_t*>(mem_ptr);
const scoped_lock<named_mutex> lock(*get_mutex(fallback_tlb, tt_device->get_pci_device()->get_device_num()));
const scoped_lock<named_mutex> lock(*get_mutex(fallback_tlb, chip));
while (size_in_bytes > 0) {
auto [mapped_address, tlb_size] = tt_device->set_dynamic_tlb_broadcast(
tlb_index,
Expand Down Expand Up @@ -2880,8 +2878,7 @@ void Cluster::insert_host_to_device_barrier(
const uint32_t barrier_addr,
const std::string& fallback_tlb) {
// Ensure that this memory barrier is atomic across processes/threads
const scoped_lock<named_mutex> lock(
*get_mutex(MEM_BARRIER_MUTEX_NAME, this->get_tt_device(chip)->get_pci_device()->get_device_num()));
const scoped_lock<named_mutex> lock(*get_mutex(MEM_BARRIER_MUTEX_NAME, chip));
set_membar_flag(chip, cores, tt_MemBarFlag::SET, barrier_addr, fallback_tlb);
set_membar_flag(chip, cores, tt_MemBarFlag::RESET, barrier_addr, fallback_tlb);
}
Expand Down Expand Up @@ -2995,7 +2992,7 @@ void Cluster::read_mmio_device_register(
TTDevice* tt_device = get_tt_device(core.chip);

const auto tlb_index = dynamic_tlb_config.at(fallback_tlb);
const scoped_lock<named_mutex> lock(*get_mutex(fallback_tlb, tt_device->get_pci_device()->get_device_num()));
const scoped_lock<named_mutex> lock(*get_mutex(fallback_tlb, core.chip));
log_debug(LogSiliconDriver, " dynamic tlb_index: {}", tlb_index);

auto [mapped_address, tlb_size] = tt_device->set_dynamic_tlb(
Expand All @@ -3015,7 +3012,7 @@ void Cluster::write_mmio_device_register(
TTDevice* tt_device = get_tt_device(core.chip);

const auto tlb_index = dynamic_tlb_config.at(fallback_tlb);
const scoped_lock<named_mutex> lock(*get_mutex(fallback_tlb, tt_device->get_pci_device()->get_device_num()));
const scoped_lock<named_mutex> lock(*get_mutex(fallback_tlb, core.chip));
log_debug(LogSiliconDriver, " dynamic tlb_index: {}", tlb_index);

auto [mapped_address, tlb_size] = tt_device->set_dynamic_tlb(
Expand Down
Loading